From dff24a914c0a43752c3e450d283b5276f4e5f368 Mon Sep 17 00:00:00 2001 From: har0ke Date: Sun, 5 Jul 2020 15:10:52 +0200 Subject: [PATCH] Generate implementations for specific block-sizes --- CMakeLists.txt | 4 + scripts/generate.py | 109 + scripts/test.py | 47 +- .../detail/RegisterBlocking.h | 4 + src/register_blocking/detail/manual.h | 7568 +++++++++++++++++ 5 files changed, 7708 insertions(+), 24 deletions(-) create mode 100644 scripts/generate.py create mode 100644 src/register_blocking/detail/manual.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 05eabcf..f96f2fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,10 +6,14 @@ set(CMAKE_CXX_STANDARD 17) option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON) option(USE_CLANG "Build with clang instead of gcc" ON) option(WITH_AVX512 "Enable AVX512" OFF) +option(WITH_MANUAL "Enable MANUAL" ON) if(WITH_AVX512) add_compile_definitions(WITH_AVX512) endif() +if(WITH_MANUAL) + add_compile_definitions(WITH_MANUAL) +endif() if(USE_CLANG) set(CMAKE_CXX_COMPILER "clang++") diff --git a/scripts/generate.py b/scripts/generate.py new file mode 100644 index 0000000..d0f093d --- /dev/null +++ b/scripts/generate.py @@ -0,0 +1,109 @@ +import os +code_prefix = """ +namespace detail { + + template + struct RegisterBlocking; +""" + +code_template = """ + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + +%s + + for (int p = 0; p < k; p++) { +%s + } + +%s + + } + }; +""" + +code_postfix = "}" + + +def generate_rows(num_rows, num_columns): + + variables = ["c"] + zero_variables = [] + loop_lines = [] + after_loop = [] + for row_index in range(num_rows): + variables.append("r%d" % row_index) + for column_index in range(num_columns): + zero_variables.append("r%dc%d" % (row_index, column_index)) + + for row_index in range(num_rows): + loop_lines.append("r%d = bwc::BroadcastToVector(A(aRowOffset + %d, p));" % (row_index, row_index)) + + for column_index in range(num_columns): + loop_lines.append("c = bwc::LoadVector(&B(p, bColOffset + %d * bwc::VectorWidth));" % column_index) + for row_index in range(num_rows): + loop_lines.append("r%dc%d += r%d * c;" % (row_index, column_index, row_index)) + + after_loop.append("size_t row = aRowOffset;") + after_loop.append("size_t column;") + for row_index in range(num_rows): + after_loop.append("column = bColOffset;") + for column_index in range(num_columns): + after_loop.append("AddAndStore(&C(row, column), r%dc%d);" % (row_index, column_index)) + if column_index != num_columns - 1: + after_loop.append("column += bwc::VectorWidth;") + if row_index != num_rows - 1: + after_loop.append("++row;") + print(len(variables) + len(zero_variables)) + variables_lines = ["typename bwc::VectorType %s;" % variable for variable in variables] + variables_lines += ["typename bwc::VectorType %s;" % variable for variable in zero_variables] + variables_lines += [variable + " = bwc::XOR(" + variable + ", " + variable + ");" for variable in zero_variables] + + variables_code = "\n".join(" " + line for line in variables_lines) + loop_code = "\n".join(" " + line for line in loop_lines) + after_loop_code = "\n".join(" " + line for line in after_loop) + + return code_template % (num_rows, num_columns, variables_code, loop_code, after_loop_code) + + +max_register_count = 32 + +extra_registers = 1 + + +def get_initial_column_vectors(rows, registers): + if rows == 0: + return 0 + result = int (float(registers - extra_registers - rows) / rows) + if extra_registers + rows + rows * (result + 1) <= registers: + return result + 1 + return result + + +def get_initial_rows(registers): + for rows in reversed(range(registers)): + if extra_registers + rows + rows * rows <= registers: + return rows + return 0 + + +fn = os.path.join(os.path.dirname(__file__), "..", "src/register_blocking/detail/manual.h") +with open(fn, "w") as f: + f.write(code_prefix) + for rows in range(1, get_initial_rows(max_register_count) + 1): + for columns in range(1, get_initial_column_vectors(rows, max_register_count) + 1): + f.write(generate_rows(rows, columns)) + f.write(code_postfix) \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py index 2e5912a..a97102d 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -21,27 +21,20 @@ def check_call_quiet(*args, **kwargs): exit(0) -def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): +def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args): flags = [ "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"), "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON", "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"), - ] + "-DWITH_MANUAL=" + ("OFF" if no_manual else "ON"), + ] build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_") - return build_path + return flags, build_path - -def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): - flags = [ - "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), - "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"), - "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), - "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON", - "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"), - ] - build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_") +def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args): + flags, build_path = get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args) if not os.path.exists(build_path): os.makedirs(build_path) check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags) @@ -58,7 +51,8 @@ very_slow_functions = ["naive_reordered"] slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"] normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"] fast_functions = ["divide_and_conquer_block_avx2", "blas"] -avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"] +avx512_fast_functions = ["divide_and_conquer_block_avx512"] +avx512_normal_functions = ["block_wise_avx512"] if __name__ == '__main__': os.chdir(os.path.join(os.path.dirname(__file__), "..")) @@ -78,11 +72,15 @@ if __name__ == '__main__': parser.add_argument("--gcc", action="store_true") parser.add_argument("--function", type=str, nargs="*") parser.add_argument("--release", action="store_true") + parser.add_argument("--no_manual", action="store_true") options = parser.parse_args() functions = fast_functions + if options.avx512: + functions += avx512_fast_functions + if options.very_slow: functions += very_slow_functions @@ -92,12 +90,13 @@ if __name__ == '__main__': if options.very_slow or options.slow or options.normal: functions += normal_functions - if options.avx512: - functions += avx512_functions + if options.avx512 and (options.very_slow or options.slow or options.normal): + functions += avx512_normal_functions + if options.function: functions = options.function - functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"] + # functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"] extra_args = [] if options.validate: extra_args.append("--validate") @@ -119,22 +118,22 @@ if __name__ == '__main__': already_dumped = False for sizes in matrix_combinations: args = list(sizes) - compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, args) + compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, options.no_manual, args) folder = "x".join(sizes) for fidx, function in enumerate(functions): arguments = [folder, "--algorithm", function] if with_double: arguments.append("--double") - output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args) + output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args) ms = output.decode()[output.decode().find("multiply:") + 10:] if not already_dumped: - build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args), "simd_multiply") - os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path) - os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path) + build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args)[1], "simd_multiply") + # os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path) + # os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path) upp = os.path.expanduser("~/dup.sh") - os.system('bash %s upload main_with_source.s .' % upp) - os.system('bash %s upload main_wo_source.s .' % upp) + # os.system('bash %s upload main_with_source.s .' % upp) + # os.system('bash %s upload main_wo_source.s .' % upp) already_dumped = True if "ms\n" in ms: ms = float(ms.split("ms\n")[0]) diff --git a/src/register_blocking/detail/RegisterBlocking.h b/src/register_blocking/detail/RegisterBlocking.h index 09cac35..8a684f9 100644 --- a/src/register_blocking/detail/RegisterBlocking.h +++ b/src/register_blocking/detail/RegisterBlocking.h @@ -8,6 +8,10 @@ #include "../../Matrix.h" #include "ExtendedBlockWiseConfig.h" +#ifdef WITH_MANUAL +#include "manual.h" +#endif + namespace detail { template< diff --git a/src/register_blocking/detail/manual.h b/src/register_blocking/detail/manual.h new file mode 100644 index 0000000..82afdcf --- /dev/null +++ b/src/register_blocking/detail/manual.h @@ -0,0 +1,7568 @@ + +namespace detail { + + template + struct RegisterBlocking; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + r0c0 = bwc::XOR(r0c0, r0c0); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + typename bwc::VectorType r0c25; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + r0c25 = bwc::XOR(r0c25, r0c25); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 25 * bwc::VectorWidth)); + r0c25 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c25); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + typename bwc::VectorType r0c25; + typename bwc::VectorType r0c26; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + r0c25 = bwc::XOR(r0c25, r0c25); + r0c26 = bwc::XOR(r0c26, r0c26); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 25 * bwc::VectorWidth)); + r0c25 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 26 * bwc::VectorWidth)); + r0c26 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c25); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c26); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + typename bwc::VectorType r0c25; + typename bwc::VectorType r0c26; + typename bwc::VectorType r0c27; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + r0c25 = bwc::XOR(r0c25, r0c25); + r0c26 = bwc::XOR(r0c26, r0c26); + r0c27 = bwc::XOR(r0c27, r0c27); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 25 * bwc::VectorWidth)); + r0c25 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 26 * bwc::VectorWidth)); + r0c26 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 27 * bwc::VectorWidth)); + r0c27 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c25); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c26); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c27); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + typename bwc::VectorType r0c25; + typename bwc::VectorType r0c26; + typename bwc::VectorType r0c27; + typename bwc::VectorType r0c28; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + r0c25 = bwc::XOR(r0c25, r0c25); + r0c26 = bwc::XOR(r0c26, r0c26); + r0c27 = bwc::XOR(r0c27, r0c27); + r0c28 = bwc::XOR(r0c28, r0c28); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 25 * bwc::VectorWidth)); + r0c25 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 26 * bwc::VectorWidth)); + r0c26 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 27 * bwc::VectorWidth)); + r0c27 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 28 * bwc::VectorWidth)); + r0c28 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c25); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c26); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c27); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c28); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r0c14; + typename bwc::VectorType r0c15; + typename bwc::VectorType r0c16; + typename bwc::VectorType r0c17; + typename bwc::VectorType r0c18; + typename bwc::VectorType r0c19; + typename bwc::VectorType r0c20; + typename bwc::VectorType r0c21; + typename bwc::VectorType r0c22; + typename bwc::VectorType r0c23; + typename bwc::VectorType r0c24; + typename bwc::VectorType r0c25; + typename bwc::VectorType r0c26; + typename bwc::VectorType r0c27; + typename bwc::VectorType r0c28; + typename bwc::VectorType r0c29; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r0c14 = bwc::XOR(r0c14, r0c14); + r0c15 = bwc::XOR(r0c15, r0c15); + r0c16 = bwc::XOR(r0c16, r0c16); + r0c17 = bwc::XOR(r0c17, r0c17); + r0c18 = bwc::XOR(r0c18, r0c18); + r0c19 = bwc::XOR(r0c19, r0c19); + r0c20 = bwc::XOR(r0c20, r0c20); + r0c21 = bwc::XOR(r0c21, r0c21); + r0c22 = bwc::XOR(r0c22, r0c22); + r0c23 = bwc::XOR(r0c23, r0c23); + r0c24 = bwc::XOR(r0c24, r0c24); + r0c25 = bwc::XOR(r0c25, r0c25); + r0c26 = bwc::XOR(r0c26, r0c26); + r0c27 = bwc::XOR(r0c27, r0c27); + r0c28 = bwc::XOR(r0c28, r0c28); + r0c29 = bwc::XOR(r0c29, r0c29); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 14 * bwc::VectorWidth)); + r0c14 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 15 * bwc::VectorWidth)); + r0c15 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 16 * bwc::VectorWidth)); + r0c16 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 17 * bwc::VectorWidth)); + r0c17 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 18 * bwc::VectorWidth)); + r0c18 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 19 * bwc::VectorWidth)); + r0c19 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 20 * bwc::VectorWidth)); + r0c20 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 21 * bwc::VectorWidth)); + r0c21 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 22 * bwc::VectorWidth)); + r0c22 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 23 * bwc::VectorWidth)); + r0c23 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 24 * bwc::VectorWidth)); + r0c24 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 25 * bwc::VectorWidth)); + r0c25 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 26 * bwc::VectorWidth)); + r0c26 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 27 * bwc::VectorWidth)); + r0c27 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 28 * bwc::VectorWidth)); + r0c28 += r0 * c; + c = bwc::LoadVector(&B(p, bColOffset + 29 * bwc::VectorWidth)); + r0c29 += r0 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c14); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c15); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c16); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c17); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c18); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c19); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c20); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c21); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c22); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c23); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c24); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c25); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c26); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c27); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c28); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c29); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r1c0; + r0c0 = bwc::XOR(r0c0, r0c0); + r1c0 = bwc::XOR(r1c0, r1c0); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r1c9; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r1c9 = bwc::XOR(r1c9, r1c9); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + r1c9 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c9); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r1c9; + typename bwc::VectorType r1c10; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r1c9 = bwc::XOR(r1c9, r1c9); + r1c10 = bwc::XOR(r1c10, r1c10); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + r1c9 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + r1c10 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c10); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r1c9; + typename bwc::VectorType r1c10; + typename bwc::VectorType r1c11; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r1c9 = bwc::XOR(r1c9, r1c9); + r1c10 = bwc::XOR(r1c10, r1c10); + r1c11 = bwc::XOR(r1c11, r1c11); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + r1c9 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + r1c10 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + r1c11 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c11); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r1c9; + typename bwc::VectorType r1c10; + typename bwc::VectorType r1c11; + typename bwc::VectorType r1c12; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r1c9 = bwc::XOR(r1c9, r1c9); + r1c10 = bwc::XOR(r1c10, r1c10); + r1c11 = bwc::XOR(r1c11, r1c11); + r1c12 = bwc::XOR(r1c12, r1c12); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + r1c9 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + r1c10 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + r1c11 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + r1c12 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c12); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r0c9; + typename bwc::VectorType r0c10; + typename bwc::VectorType r0c11; + typename bwc::VectorType r0c12; + typename bwc::VectorType r0c13; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r1c9; + typename bwc::VectorType r1c10; + typename bwc::VectorType r1c11; + typename bwc::VectorType r1c12; + typename bwc::VectorType r1c13; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r0c9 = bwc::XOR(r0c9, r0c9); + r0c10 = bwc::XOR(r0c10, r0c10); + r0c11 = bwc::XOR(r0c11, r0c11); + r0c12 = bwc::XOR(r0c12, r0c12); + r0c13 = bwc::XOR(r0c13, r0c13); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r1c9 = bwc::XOR(r1c9, r1c9); + r1c10 = bwc::XOR(r1c10, r1c10); + r1c11 = bwc::XOR(r1c11, r1c11); + r1c12 = bwc::XOR(r1c12, r1c12); + r1c13 = bwc::XOR(r1c13, r1c13); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 9 * bwc::VectorWidth)); + r0c9 += r0 * c; + r1c9 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 10 * bwc::VectorWidth)); + r0c10 += r0 * c; + r1c10 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 11 * bwc::VectorWidth)); + r0c11 += r0 * c; + r1c11 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 12 * bwc::VectorWidth)); + r0c12 += r0 * c; + r1c12 += r1 * c; + c = bwc::LoadVector(&B(p, bColOffset + 13 * bwc::VectorWidth)); + r0c13 += r0 * c; + r1c13 += r1 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c13); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c9); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c10); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c11); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c12); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c13); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r1c0; + typename bwc::VectorType r2c0; + r0c0 = bwc::XOR(r0c0, r0c0); + r1c0 = bwc::XOR(r1c0, r1c0); + r2c0 = bwc::XOR(r2c0, r2c0); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r2c5; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r2c5 = bwc::XOR(r2c5, r2c5); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + r2c5 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c5); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r2c5; + typename bwc::VectorType r2c6; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r2c5 = bwc::XOR(r2c5, r2c5); + r2c6 = bwc::XOR(r2c6, r2c6); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + r2c5 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + r2c6 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c6); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r2c5; + typename bwc::VectorType r2c6; + typename bwc::VectorType r2c7; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r2c5 = bwc::XOR(r2c5, r2c5); + r2c6 = bwc::XOR(r2c6, r2c6); + r2c7 = bwc::XOR(r2c7, r2c7); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + r2c5 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + r2c6 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + r2c7 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c7); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r0c6; + typename bwc::VectorType r0c7; + typename bwc::VectorType r0c8; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r1c6; + typename bwc::VectorType r1c7; + typename bwc::VectorType r1c8; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r2c5; + typename bwc::VectorType r2c6; + typename bwc::VectorType r2c7; + typename bwc::VectorType r2c8; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r0c6 = bwc::XOR(r0c6, r0c6); + r0c7 = bwc::XOR(r0c7, r0c7); + r0c8 = bwc::XOR(r0c8, r0c8); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r1c6 = bwc::XOR(r1c6, r1c6); + r1c7 = bwc::XOR(r1c7, r1c7); + r1c8 = bwc::XOR(r1c8, r1c8); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r2c5 = bwc::XOR(r2c5, r2c5); + r2c6 = bwc::XOR(r2c6, r2c6); + r2c7 = bwc::XOR(r2c7, r2c7); + r2c8 = bwc::XOR(r2c8, r2c8); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + r2c5 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 6 * bwc::VectorWidth)); + r0c6 += r0 * c; + r1c6 += r1 * c; + r2c6 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 7 * bwc::VectorWidth)); + r0c7 += r0 * c; + r1c7 += r1 * c; + r2c7 += r2 * c; + c = bwc::LoadVector(&B(p, bColOffset + 8 * bwc::VectorWidth)); + r0c8 += r0 * c; + r1c8 += r1 * c; + r2c8 += r2 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c8); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c8); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c5); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c6); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c7); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c8); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r1c0; + typename bwc::VectorType r2c0; + typename bwc::VectorType r3c0; + r0c0 = bwc::XOR(r0c0, r0c0); + r1c0 = bwc::XOR(r1c0, r1c0); + r2c0 = bwc::XOR(r2c0, r2c0); + r3c0 = bwc::XOR(r3c0, r3c0); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c3); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + typename bwc::VectorType r3c4; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + r3c4 = bwc::XOR(r3c4, r3c4); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + r3c4 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c4); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r0c5; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r1c5; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r2c5; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + typename bwc::VectorType r3c4; + typename bwc::VectorType r3c5; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r0c5 = bwc::XOR(r0c5, r0c5); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r1c5 = bwc::XOR(r1c5, r1c5); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r2c5 = bwc::XOR(r2c5, r2c5); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + r3c4 = bwc::XOR(r3c4, r3c4); + r3c5 = bwc::XOR(r3c5, r3c5); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + r3c4 += r3 * c; + c = bwc::LoadVector(&B(p, bColOffset + 5 * bwc::VectorWidth)); + r0c5 += r0 * c; + r1c5 += r1 * c; + r2c5 += r2 * c; + r3c5 += r3 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c5); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c4); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c5); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r4; + typename bwc::VectorType r0c0; + typename bwc::VectorType r1c0; + typename bwc::VectorType r2c0; + typename bwc::VectorType r3c0; + typename bwc::VectorType r4c0; + r0c0 = bwc::XOR(r0c0, r0c0); + r1c0 = bwc::XOR(r1c0, r1c0); + r2c0 = bwc::XOR(r2c0, r2c0); + r3c0 = bwc::XOR(r3c0, r3c0); + r4c0 = bwc::XOR(r4c0, r4c0); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + r4 = bwc::BroadcastToVector(A(aRowOffset + 4, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + r4c0 += r4 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r4c0); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r4; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r4c0; + typename bwc::VectorType r4c1; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r4c0 = bwc::XOR(r4c0, r4c0); + r4c1 = bwc::XOR(r4c1, r4c1); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + r4 = bwc::BroadcastToVector(A(aRowOffset + 4, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + r4c0 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + r4c1 += r4 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r4c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c1); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r4; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r4c0; + typename bwc::VectorType r4c1; + typename bwc::VectorType r4c2; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r4c0 = bwc::XOR(r4c0, r4c0); + r4c1 = bwc::XOR(r4c1, r4c1); + r4c2 = bwc::XOR(r4c2, r4c2); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + r4 = bwc::BroadcastToVector(A(aRowOffset + 4, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + r4c0 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + r4c1 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + r4c2 += r4 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r4c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c2); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r4; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + typename bwc::VectorType r4c0; + typename bwc::VectorType r4c1; + typename bwc::VectorType r4c2; + typename bwc::VectorType r4c3; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + r4c0 = bwc::XOR(r4c0, r4c0); + r4c1 = bwc::XOR(r4c1, r4c1); + r4c2 = bwc::XOR(r4c2, r4c2); + r4c3 = bwc::XOR(r4c3, r4c3); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + r4 = bwc::BroadcastToVector(A(aRowOffset + 4, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + r4c0 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + r4c1 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + r4c2 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + r4c3 += r4 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c3); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r4c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c3); + + } + }; + + template + struct RegisterBlocking { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType c; + typename bwc::VectorType r0; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType r4; + typename bwc::VectorType r0c0; + typename bwc::VectorType r0c1; + typename bwc::VectorType r0c2; + typename bwc::VectorType r0c3; + typename bwc::VectorType r0c4; + typename bwc::VectorType r1c0; + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r2c0; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r3c0; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + typename bwc::VectorType r3c4; + typename bwc::VectorType r4c0; + typename bwc::VectorType r4c1; + typename bwc::VectorType r4c2; + typename bwc::VectorType r4c3; + typename bwc::VectorType r4c4; + r0c0 = bwc::XOR(r0c0, r0c0); + r0c1 = bwc::XOR(r0c1, r0c1); + r0c2 = bwc::XOR(r0c2, r0c2); + r0c3 = bwc::XOR(r0c3, r0c3); + r0c4 = bwc::XOR(r0c4, r0c4); + r1c0 = bwc::XOR(r1c0, r1c0); + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r2c0 = bwc::XOR(r2c0, r2c0); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r3c0 = bwc::XOR(r3c0, r3c0); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + r3c4 = bwc::XOR(r3c4, r3c4); + r4c0 = bwc::XOR(r4c0, r4c0); + r4c1 = bwc::XOR(r4c1, r4c1); + r4c2 = bwc::XOR(r4c2, r4c2); + r4c3 = bwc::XOR(r4c3, r4c3); + r4c4 = bwc::XOR(r4c4, r4c4); + + for (int p = 0; p < k; p++) { + r0 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r1 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 3, p)); + r4 = bwc::BroadcastToVector(A(aRowOffset + 4, p)); + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r0c0 += r0 * c; + r1c0 += r1 * c; + r2c0 += r2 * c; + r3c0 += r3 * c; + r4c0 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r0c1 += r0 * c; + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + r4c1 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r0c2 += r0 * c; + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + r4c2 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r0c3 += r0 * c; + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + r4c3 += r4 * c; + c = bwc::LoadVector(&B(p, bColOffset + 4 * bwc::VectorWidth)); + r0c4 += r0 * c; + r1c4 += r1 * c; + r2c4 += r2 * c; + r3c4 += r3 * c; + r4c4 += r4 * c; + } + + size_t row = aRowOffset; + size_t column; + column = bColOffset; + AddAndStore(&C(row, column), r0c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r0c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r1c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r1c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r2c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r2c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r3c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r3c4); + ++row; + column = bColOffset; + AddAndStore(&C(row, column), r4c0); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c1); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c2); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c3); + column += bwc::VectorWidth; + AddAndStore(&C(row, column), r4c4); + + } + }; +} \ No newline at end of file