diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2a71546..d1a2671 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,7 +20,8 @@ else()
     set(CMAKE_CXX_COMPILER "g++")
 endif()
 
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
 
 if(OPTIMIZE_FOR_NATIVE)
     CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
diff --git a/plot.py b/plot.py
index 3a4dd38..0035254 100644
--- a/plot.py
+++ b/plot.py
@@ -1,6 +1,53 @@
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+import json
+from pprint import pprint
+
+folder = os.path.join(os.path.dirname(__file__), "data")
+
+files = sorted((os.path.join(folder, fn) for fn in os.listdir(folder) if fn.endswith(".json")), reverse=True)
+
+each_size = {}
+functions = set()
+for fn in files:
+    with open(fn) as f:
+        data = json.load(f)
+    if data["total_size"] not in each_size:
+        each_size[data["total_size"]] = {v: k for k, v in zip(data["means"], data["functions"])}
+    functions = functions.union(set(data["functions"]))
+pprint(each_size)
+pprint(functions)
+
+for function in functions:
+    #if function.startswith("divide") and "naive" in function and not (function.endswith("r2") or function.endswith("r3")):
+    #    continue
+    #if not function.startswith("divide_and_conquer_naive"):
+    #    continue
+    if function.startswith("block_wise"):
+        continue
+    x = []
+    y = []
+    for size in sorted(each_size.keys()):
+        values = each_size[size]
+        if function in values:
+            x.append(size)
+            y.append(float(values[function]) / 1000.)
+    print(x, y)
+    plt.plot(x, y, label=function)
+
+plt.legend()
+plt.xscale("log")
+plt.grid()
+plt.xlim((1e10, 5.3e11))
+plt.ylim((0, 160))
+plt.xlabel("n * p * m")
+plt.ylabel("seconds")
+plt.savefig("test.pdf")
+plt.show()
+exit(0)
 
 a = [
     [1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'],
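Note: plot.py reads the data/*.json files written by scripts/test.py (changed later in this diff). A minimal sketch of one such record; the keys match the writer below, but the file name and every number are invented placeholders:

# hypothetical contents of data/2020.07.04_12-00-00.json
record = {
    "total_size": 500000000000,             # the n * p * m target passed on the command line
    "sizes": [["7937", "7937", "7937"]],    # one [m, p, n] triple per run
    "functions": ["blas", "block_wise_avx2"],
    "times": [[41000.0], [95000.0]],        # per-function runtimes in milliseconds
    "means": ["41000.000", "95000.000"],    # "%.3f" % numpy.mean(times), still milliseconds
}

plot.py keys each_size by total_size, maps each function to its mean, and plots mean / 1000 (seconds) against total_size on a logarithmic x-axis.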
+very_slow_functions = ["naive_reordered"] +slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"] normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"] fast_functions = ["divide_and_conquer_block_avx2", "blas"] avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"] @@ -65,6 +66,7 @@ if __name__ == '__main__': parser.add_argument("--double", action="store_true") parser.add_argument("--gcc", action="store_true") parser.add_argument("--function", type=str, nargs="*") + parser.add_argument("--release", action="store_true") options = parser.parse_args() @@ -84,14 +86,14 @@ if __name__ == '__main__': if options.function: functions = options.function - + # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"] extra_args = [] if options.validate: extra_args.append("--validate") if options.double: extra_args.append("--double") - total = options.total_size + total = int(options.total_size) matrix_combinations = [] i = pow(total, 1. / 3.) for _ in range(20): @@ -99,19 +101,19 @@ if __name__ == '__main__': b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))) c = int(round(total / a / b)) matrix_combinations.append([str(a), str(b), str(c)]) - + times = [[] for f in functions] output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") clang = not options.gcc for sizes in matrix_combinations: args = list(sizes) - compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args) + compile_and_run("..", "builds", "generate_random", True, True, options.avx512, options.release, args) folder = "x".join(sizes) for fidx, function in enumerate(functions): arguments = [folder, "--algorithm", function] if with_double: arguments.append("--double") - output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args) + output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args) ms = output.decode()[output.decode().find("multiply:") + 10:] if "ms\n" in ms: ms = float(ms.split("ms\n")[0]) @@ -127,6 +129,7 @@ if __name__ == '__main__': "times": times, "sizes": matrix_combinations, "functions": functions, - "means": ["%.3f" % numpy.mean(ts) for ts in times] + "means": ["%.3f" % numpy.mean(ts) for ts in times], + "total_size": total }, f) os.rename(output_file + ".cache", output_file) \ No newline at end of file diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h index 425db9d..5557818 100644 --- a/src/DevideAndConquer.h +++ b/src/DevideAndConquer.h @@ -4,6 +4,8 @@ #ifndef SMID_MATRIX_DEVIDEANDCONQUER_H #define SMID_MATRIX_DEVIDEANDCONQUER_H + +#include "register_blocking/BlockWise.h" #include "Boost.h" #include "Naive.h" @@ -107,14 +109,14 @@ struct multiplier_naive_functor { }; -template +template struct multiplier_block_wise : block_wise { typedef block_wise Base; - static constexpr unsigned SplitMultipleM = 5 * Base::NumRows; - static constexpr unsigned SplitMultipleP = 20; - static constexpr unsigned SplitMultipleN = Base::NumColumns; + static constexpr unsigned SplitMultipleM = RowMultiplier * Base::NumRows; + static constexpr unsigned SplitMultipleP = PMultiplier; + static constexpr unsigned SplitMultipleN = ColumnMultiplier * Base::NumColumns; static SplitAction action(unsigned m, unsigned p, unsigned n) { size_t m_p = m / SplitMultipleM; @@ -143,27 +145,27 @@ struct 
diff --git a/scripts/test.py b/scripts/test.py
index ef480f2..1cd4254 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -21,10 +21,11 @@ def check_call_quiet(*args, **kwargs):
     exit(0)
 
 
-def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, args):
+def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
+
     flags = [
         "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
-        "-DCMAKE_BUILD_TYPE=RelWithDebInfo",
+        "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
         "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
         "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
         "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
@@ -42,8 +43,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, args):
     output = subprocess.check_output([os.path.join(build_path, target)] + args)
     return output
 
-very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
-slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
+very_slow_functions = ["naive_reordered"]
+slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
 normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
 fast_functions = ["divide_and_conquer_block_avx2", "blas"]
 avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
@@ -65,6 +66,7 @@ if __name__ == '__main__':
     parser.add_argument("--double", action="store_true")
     parser.add_argument("--gcc", action="store_true")
     parser.add_argument("--function", type=str, nargs="*")
+    parser.add_argument("--release", action="store_true")
 
     options = parser.parse_args()
@@ -84,14 +86,14 @@ if __name__ == '__main__':
 
     if options.function:
         functions = options.function
-
+    # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"]
     extra_args = []
     if options.validate:
         extra_args.append("--validate")
     if options.double:
         extra_args.append("--double")
 
-    total = options.total_size
+    total = int(float(options.total_size))
     matrix_combinations = []
     i = pow(total, 1. / 3.)
     for _ in range(20):
@@ -99,19 +101,19 @@ if __name__ == '__main__':
         b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
         c = int(round(total / a / b))
         matrix_combinations.append([str(a), str(b), str(c)])
-
+
     times = [[] for f in functions]
     output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
     clang = not options.gcc
     for sizes in matrix_combinations:
         args = list(sizes)
-        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
+        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, options.release, args)
         folder = "x".join(sizes)
         for fidx, function in enumerate(functions):
             arguments = [folder, "--algorithm", function]
             if with_double:
                 arguments.append("--double")
-            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
+            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
             ms = output.decode()[output.decode().find("multiply:") + 10:]
             if "ms\n" in ms:
                 ms = float(ms.split("ms\n")[0])
@@ -127,6 +129,7 @@ if __name__ == '__main__':
             "times": times,
             "sizes": matrix_combinations,
             "functions": functions,
-            "means": ["%.3f" % numpy.mean(ts) for ts in times]
+            "means": ["%.3f" % numpy.mean(ts) for ts in times],
+            "total_size": total
         }, f)
     os.rename(output_file + ".cache", output_file)
\ No newline at end of file
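Note: the shape sampling in the benchmark loop above is easier to see in isolation. A standalone restatement (not part of the diff):

# test.py aims for a * b * c ~= total by sampling the first two matrix
# dimensions around the cube root and solving for the third:
import numpy
total = int(float("500000000000.0"))   # as passed by scripts/dumpall.py
i = pow(total, 1. / 3.)                # ~= 7937
a = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
c = int(round(total / a / b))          # pulls the product back to ~total
print(a, b, c, a * b * c)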
diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h
index 425db9d..5557818 100644
--- a/src/DevideAndConquer.h
+++ b/src/DevideAndConquer.h
@@ -4,6 +4,8 @@
 #ifndef SMID_MATRIX_DEVIDEANDCONQUER_H
 #define SMID_MATRIX_DEVIDEANDCONQUER_H
+
+#include "register_blocking/BlockWise.h"
 
 #include "Boost.h"
 #include "Naive.h"
@@ -107,14 +109,14 @@ struct multiplier_naive_functor {
 };
 
-template<typename FloatType, AvxVersion avxVersion>
+template<typename FloatType, AvxVersion avxVersion, unsigned RowMultiplier, unsigned PMultiplier, unsigned ColumnMultiplier>
 struct multiplier_block_wise : block_wise<FloatType, avxVersion> {
     typedef block_wise<FloatType, avxVersion> Base;
 
-    static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
-    static constexpr unsigned SplitMultipleP = 20;
-    static constexpr unsigned SplitMultipleN = Base::NumColumns;
+    static constexpr unsigned SplitMultipleM = RowMultiplier * Base::NumRows;
+    static constexpr unsigned SplitMultipleP = PMultiplier;
+    static constexpr unsigned SplitMultipleN = ColumnMultiplier * Base::NumColumns;
 
     static SplitAction action(unsigned m, unsigned p, unsigned n) {
         size_t m_p = m / SplitMultipleM;
@@ -143,27 +145,27 @@ struct multiplier_block_wise : block_wise<FloatType, avxVersion> {
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
 template<typename T>
@@ -174,10 +176,9 @@ void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer>(C, A, B);
+    _divide_and_conquer>(C, A, B);
 }
 
-
 #ifdef WITH_AVX512
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
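Note: the three new template parameters replace the hard-coded split granularity; the old constants correspond to RowMultiplier = 5, PMultiplier = 20, ColumnMultiplier = 1. A quick illustration, where NumRows = 3 and NumColumns = 24 are assumptions (the AVX2 float kernel implied by the register budgeting in ExtendedBlockWiseConfig.h further down):

NumRows, NumColumns = 3, 24
old = (5 * NumRows, 20, NumColumns)                       # previous hard-coded values
RowMultiplier, PMultiplier, ColumnMultiplier = 5, 20, 1   # same behaviour, now tunable
new = (RowMultiplier * NumRows, PMultiplier, ColumnMultiplier * NumColumns)
print(old == new, new)  # True (15, 20, 24)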
diff --git a/src/main.cpp b/src/main.cpp
index acec4cc..e3868f8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include "register_blocking/RegisterBlocking.h"
+#include "register_blocking/BlockWise.h"
 #include "Boost.h"
 #include "Naive.h"
 #include "DevideAndConquer.h"
diff --git a/src/register_blocking/BlockWise.h b/src/register_blocking/BlockWise.h
new file mode 100644
index 0000000..4ab8827
--- /dev/null
+++ b/src/register_blocking/BlockWise.h
@@ -0,0 +1,111 @@
+//
+// Created by oke on 04.07.20.
+//
+
+#ifndef SMID_MATRIX_BLOCKWISE_H
+#define SMID_MATRIX_BLOCKWISE_H
+
+#include "detail/BlockWiseImpl.h"
+
+#include <immintrin.h>
+
+struct __m128_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m128;
+    static constexpr auto LoadVector = _mm_loadu_ps;
+    static constexpr auto StoreVector = _mm_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm_set1_ps;
+    static constexpr auto XOR = _mm_xor_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m128d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m128d;
+    static constexpr auto LoadVector = _mm_loadu_pd;
+    static constexpr auto StoreVector = _mm_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm_set1_pd;
+    static constexpr auto XOR = _mm_xor_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m256;
+    static constexpr auto LoadVector = _mm256_loadu_ps;
+    static constexpr auto StoreVector = _mm256_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm256_set1_ps;
+    static constexpr auto XOR = _mm256_xor_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m256d;
+    static constexpr auto LoadVector = _mm256_loadu_pd;
+    static constexpr auto StoreVector = _mm256_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm256_set1_pd;
+    static constexpr auto XOR = _mm256_xor_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+#ifdef WITH_AVX512
+struct __m512_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m512;
+    static constexpr auto LoadVector = _mm512_loadu_ps;
+    static constexpr auto StoreVector = _mm512_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm512_set1_ps;
+    static constexpr unsigned Registers = 32;
+};
+
+struct __m512d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m512d;
+    static constexpr auto LoadVector = _mm512_loadu_pd;
+    static constexpr auto StoreVector = _mm512_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm512_set1_pd;
+    static constexpr unsigned Registers = 32;
+};
+#endif
+
+enum AvxVersion {
+    SSE,
+    AVX2,
+#ifdef WITH_AVX512
+    AVX512
+#endif
+};
+
+template<typename FloatType, AvxVersion avxVersion>
+struct block_wise;
+
+template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
+template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
+
+template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
+template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
+
+#ifdef WITH_AVX512
+template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
+template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
+#endif
+
+template<typename FloatType>
+void __attribute__ ((noinline)) block_wise_sse(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    block_wise<FloatType, SSE>::multiply2(C, A, B);
+}
+
+template<typename FloatType>
+void __attribute__ ((noinline)) block_wise_avx2(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    block_wise<FloatType, AVX2>::multiply2(C, A, B);
+}
+
+#ifdef WITH_AVX512
+template<typename FloatType>
+void __attribute__ ((noinline)) block_wise_avx512(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    block_wise<FloatType, AVX512>::multiply2(C, A, B);
+}
+#endif
+
+#endif //SMID_MATRIX_BLOCKWISE_H
diff --git a/src/register_blocking/RegisterBlocking.h b/src/register_blocking/detail/BlockWiseImpl.h
similarity index 50%
rename from src/register_blocking/RegisterBlocking.h
rename to src/register_blocking/detail/BlockWiseImpl.h
index 4d50148..544c3d3 100644
--- a/src/register_blocking/RegisterBlocking.h
+++ b/src/register_blocking/detail/BlockWiseImpl.h
@@ -1,95 +1,14 @@
 //
-// Created by oke on 01.07.20.
+// Created by oke on 04.07.20.
 //
 
-#ifndef SMID_MATRIX_REGISTERBLOCKING_H
-#define SMID_MATRIX_REGISTERBLOCKING_H
+#ifndef SMID_MATRIX_BLOCKWISE_IMPL_H
+#define SMID_MATRIX_BLOCKWISE_IMPL_H
 
-#include "../Matrix.h"
-#include <immintrin.h>
+#include "RegisterBlocking.h"
 
 namespace detail {
 
-    // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
-    // C = floor ((16 - R) / R)
-
-    constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
-        if (R == 0) return 0;
-        return (unsigned) ((double) (Registers - R) / (double) R);
-    }
-
-    constexpr unsigned GetInitialRows(unsigned Registers) {
-        for(unsigned R = Registers; R > 0; --R) {
-            if(R + R * (R + 1) < Registers) {
-                return R;
-            }
-        }
-        return 0;
-    }
-
-    template<typename BlockWiseConfig>
-    struct ExtendedBlockWiseConfig {
-        typedef typename BlockWiseConfig::FloatType FloatType;
-        using VectorType = typename BlockWiseConfig::VectorType;
-
-        static constexpr auto Registers = BlockWiseConfig::Registers;
-        static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
-        static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
-        static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
-        static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
-        static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) {
-            StoreVector(memory, LoadVector(memory) + vector);
-        };
-    };
-
-    template<
-            // template parameter as struct: otherwise some warning about losing alignment information warning
-            typename BitWiseConfig,
-            unsigned _NumRows, unsigned _NumColumnVectors
-    >
-    struct RegisterBlocking {
-        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
-
-        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
-            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
-        };
-
-        static constexpr auto NumRows = _NumRows;
-        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
-
-        template<typename M1, typename M2, typename M3>
-        static void __attribute__ ((noinline))
-        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
-
-            // AVX2 has 16 registers
-            // should be compiled as registers (total: regA * regB)
-            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
-            // iterate over dot-product terms
-            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
-                // Perform the DOT product
-                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
-                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
-                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
-                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
-                        CReg[ai][bi] += aa * bb;
-                    }
-                }
-            }
-            // total regA * regB + regB registers
-
-            // Accumulate the results into C.
-            for (int ai = 0; ai < _NumRows; ai++) {
-                for (int bi = 0; bi < _NumColumnVectors; bi++) {
-                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
-                }
-            }
-        }
-
-    };
-
     template<
             // template parameter as struct: otherwise some warning about losing alignment information warning
             typename BlockWiseConfig
@@ -107,6 +26,7 @@ namespace detail {
             bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
         };
 
+        /*
         template<typename M1, typename M2, typename M3>
        static void multiply(M1 &C, const M2 &A, const M3 &B) {
             assert(C.size1() == A.size1() && C.size2() == B.size2());
@@ -142,6 +62,7 @@ namespace detail {
                 }
             }
         }
+        */
 
         template
         class iterate_columns {
@@ -250,100 +171,4 @@ namespace detail {
     };
 
 }
 
-struct __m128_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m128;
-    static constexpr auto LoadVector = _mm_loadu_ps;
-    static constexpr auto StoreVector = _mm_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm_set1_ps;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m128d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m128d;
-    static constexpr auto LoadVector = _mm_loadu_pd;
-    static constexpr auto StoreVector = _mm_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm_set1_pd;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m256_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m256;
-    static constexpr auto LoadVector = _mm256_loadu_ps;
-    static constexpr auto StoreVector = _mm256_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm256_set1_ps;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m256d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m256d;
-    static constexpr auto LoadVector = _mm256_loadu_pd;
-    static constexpr auto StoreVector = _mm256_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm256_set1_pd;
-    static constexpr unsigned Registers = 16;
-};
-
-#ifdef WITH_AVX512
-struct __m512_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m512;
-    static constexpr auto LoadVector = _mm512_loadu_ps;
-    static constexpr auto StoreVector = _mm512_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm512_set1_ps;
-    static constexpr unsigned Registers = 32;
-};
-
-struct __m512d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m512d;
-    static constexpr auto LoadVector = _mm512_loadu_pd;
-    static constexpr auto StoreVector = _mm512_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm512_set1_pd;
-    static constexpr unsigned Registers = 32;
-};
-#endif
-
-enum AvxVersion {
-    SSE,
-    AVX2,
-#ifdef WITH_AVX512
-    AVX512
-#endif
-};
-
-template<typename FloatType, AvxVersion avxVersion>
-struct block_wise;
-
-template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
-template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
-
-template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
-template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
-
-#ifdef WITH_AVX512
-template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
-template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
-#endif
-
-template<typename FloatType>
-void __attribute__ ((noinline)) block_wise_sse(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    block_wise<FloatType, SSE>::multiply(C, A, B);
-}
-
-template<typename FloatType>
-void __attribute__ ((noinline)) block_wise_avx2(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    block_wise<FloatType, AVX2>::multiply(C, A, B);
-}
-
-#ifdef WITH_AVX512
-template<typename FloatType>
-void __attribute__ ((noinline)) block_wise_avx512(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    block_wise<FloatType, AVX512>::multiply(C, A, B);
-}
-#endif
-
-#endif //SMID_MATRIX_REGISTERBLOCKING_H
+#endif // SMID_MATRIX_BLOCKWISE_IMPL_H
diff --git a/src/register_blocking/detail/ExtendedBlockWiseConfig.h b/src/register_blocking/detail/ExtendedBlockWiseConfig.h
new file mode 100644
index 0000000..04620df
--- /dev/null
+++ b/src/register_blocking/detail/ExtendedBlockWiseConfig.h
@@ -0,0 +1,51 @@
+//
+// Created by oke on 04.07.20.
+//
+
+#ifndef SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
+#define SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
+
+template<typename BlockWiseConfig>
+struct ExtendedBlockWiseConfig {
+
+    typedef typename BlockWiseConfig::FloatType FloatType;
+    using VectorType = typename BlockWiseConfig::VectorType;
+
+    static constexpr auto Registers = BlockWiseConfig::Registers;
+    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
+    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
+    static constexpr auto XOR = BlockWiseConfig::XOR;
+    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
+    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
+    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) {
+        StoreVector(memory, LoadVector(memory) + vector);
+    };
+};
+
+// Choose R rows and C column vectors to maximize (R * C) / (R + C), i.e. the
+// ratio of FMAs to loads/broadcasts, subject to the register budget
+// extra_registers + R + R * C <= Registers. For a fixed R the best C is
+// C = floor((Registers - extra_registers - R) / R).
+
+unsigned constexpr extra_registers = 2;
+
+constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
+    if (R == 0) return 0;
+    auto result = (unsigned) ((double) (Registers - extra_registers - R) / (double) R);
+    if (extra_registers + R + R * (result + 1) <= Registers)
+        return result + 1;
+    return result;
+}
+
+constexpr unsigned GetInitialRows(unsigned Registers) {
+    for (unsigned R = Registers; R > 0; --R) {
+        if (extra_registers + R + R * R <= Registers) {
+            return R;
+        }
+    }
+    return 0;
+}
+
+#endif //SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
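Note: a quick check of what these constexpr helpers pick, restated in Python (the printed values follow directly from the code above):

def initial_rows(registers, extra=2):
    return next(R for R in range(registers, 0, -1) if extra + R + R * R <= registers)

def initial_col_vectors(R, registers, extra=2):
    c = (registers - extra - R) // R
    return c + 1 if extra + R + R * (c + 1) <= registers else c

for regs in (16, 32):       # AVX2 / AVX-512 vector register counts
    R = initial_rows(regs)
    print(regs, R, initial_col_vectors(R, regs))
# 16 3 3   -> 9 accumulators + 3 broadcasts of A leaves 4 registers free
# 32 5 5   -> 25 accumulators + 5 broadcasts of A leaves 2 registers free

The previous heuristic (deleted from RegisterBlocking.h above) had no extra_registers term and chose 3 rows x 4 column vectors for AVX2, using all 16 registers; reserving two registers is what shrinks the block to 3 x 3.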
diff --git a/src/register_blocking/detail/RegisterBlocking.h b/src/register_blocking/detail/RegisterBlocking.h
new file mode 100644
index 0000000..09cac35
--- /dev/null
+++ b/src/register_blocking/detail/RegisterBlocking.h
@@ -0,0 +1,155 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_REGISTERBLOCKING_H
+#define SMID_MATRIX_REGISTERBLOCKING_H
+
+#include "../../Matrix.h"
+#include "ExtendedBlockWiseConfig.h"
+
+namespace detail {
+
+    template<
+            // template parameter as struct: otherwise some warning about losing alignment information warning
+            typename BitWiseConfig,
+            unsigned _NumRows, unsigned _NumColumnVectors
+    >
+    struct RegisterBlocking {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = _NumRows;
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+                        CReg[ai][bi] += aa * bb;
+                    }
+                }
+            }
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            for (int ai = 0; ai < _NumRows; ai++) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+                }
+            }
+        }
+    };
+
+/*
+    template<typename BitWiseConfig>
+    struct RegisterBlocking<BitWiseConfig, 3, 4> {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = 3;
+        static constexpr auto NumColumns = 4 * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType r1c1;
+            typename bwc::VectorType r1c2;
+            typename bwc::VectorType r1c3;
+            typename bwc::VectorType r1c4;
+            typename bwc::VectorType r2c1;
+            typename bwc::VectorType r2c2;
+            typename bwc::VectorType r2c3;
+            typename bwc::VectorType r2c4;
+            typename bwc::VectorType r3c1;
+            typename bwc::VectorType r3c2;
+            typename bwc::VectorType r3c3;
+            typename bwc::VectorType r3c4;
+            typename bwc::VectorType r1;
+            typename bwc::VectorType r2;
+            typename bwc::VectorType r3;
+            typename bwc::VectorType c;
+
+            r1c1 = bwc::XOR(r1c1, r1c1);
+            r1c2 = bwc::XOR(r1c2, r1c2);
+            r1c3 = bwc::XOR(r1c3, r1c3);
+            r1c4 = bwc::XOR(r1c4, r1c4);
+            r2c1 = bwc::XOR(r2c1, r2c1);
+            r2c2 = bwc::XOR(r2c2, r2c2);
+            r2c3 = bwc::XOR(r2c3, r2c3);
+            r2c4 = bwc::XOR(r2c4, r2c4);
+            r3c1 = bwc::XOR(r3c1, r3c1);
+            r3c2 = bwc::XOR(r3c2, r3c2);
+            r3c3 = bwc::XOR(r3c3, r3c3);
+            r3c4 = bwc::XOR(r3c4, r3c4);
+
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                r1 = bwc::BroadcastToVector(A(aRowOffset + 0, p));
+                r2 = bwc::BroadcastToVector(A(aRowOffset + 1, p));
+                r3 = bwc::BroadcastToVector(A(aRowOffset + 2, p));
+
+                c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth));
+                r1c1 += r1 * c;
+                r2c1 += r2 * c;
+                r3c1 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth));
+                r1c2 += r1 * c;
+                r2c2 += r2 * c;
+                r3c2 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth));
+                r1c3 += r1 * c;
+                r2c3 += r2 * c;
+                r3c3 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth));
+                r1c4 += r1 * c;
+                r2c4 += r2 * c;
+                r3c4 += r3 * c;
+            }
+
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 0 * bwc::VectorWidth), r1c1);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 0 * bwc::VectorWidth), r2c1);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 0 * bwc::VectorWidth), r3c1);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 1 * bwc::VectorWidth), r1c2);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 1 * bwc::VectorWidth), r2c2);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 1 * bwc::VectorWidth), r3c2);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 2 * bwc::VectorWidth), r1c3);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 2 * bwc::VectorWidth), r2c3);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 2 * bwc::VectorWidth), r3c3);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 3 * bwc::VectorWidth), r1c4);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 3 * bwc::VectorWidth), r2c4);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 3 * bwc::VectorWidth), r3c4);
+        }
+    };
+*/
+
+}
+
+#endif //SMID_MATRIX_REGISTERBLOCKING_H
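Note: semantically, handle_block performs a small GEMM update on an R x (CV * W) tile of C that lives entirely in vector registers. A NumPy restatement (tile sizes illustrative, not fixed by the code):

import numpy as np
R, CV, W, k = 3, 3, 8, 64          # rows, column vectors, floats per vector, depth
A = np.random.rand(R, k).astype(np.float32)
B = np.random.rand(k, CV * W).astype(np.float32)
C = np.zeros((R, CV * W), dtype=np.float32)
CReg = np.zeros_like(C)            # the accumulator block held in registers
for p in range(k):                 # the dot-product loop
    CReg += np.outer(A[:, p], B[p, :])   # broadcast A value times a row of B
C += CReg                          # AddAndStore back into C
assert np.allclose(C, A @ B, atol=1e-3)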
diff --git a/src/register_blocking/detail/RegisterBlockingManual.h b/src/register_blocking/detail/RegisterBlockingManual.h
new file mode 100644
index 0000000..fd624e3
--- /dev/null
+++ b/src/register_blocking/detail/RegisterBlockingManual.h
@@ -0,0 +1,80 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
+#define SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
+
+#include "../../Matrix.h"
+#include "ExtendedBlockWiseConfig.h"
+
+template<typename BlockWiseConfig, unsigned Rows, unsigned ColumnVectors>
+struct a {
+    typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
+
+    // accumulator block kept in registers while iterating over p
+    typename bwc::VectorType CReg[Rows][ColumnVectors] = {{0.0}};
+
+    template<typename M1, typename M2, typename M3>
+    void do_stuff(int p, int bi, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+        typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+
+        for (int ai = 0; ai < Rows; ai++) { // row index in A (handling regsA rows)
+            typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+            CReg[ai][bi] += aa * bb;
+        }
+    }
+};
+
+namespace detail {
+
+    template<
+            // template parameter as struct: otherwise some warning about losing alignment information warning
+            typename BitWiseConfig,
+            unsigned _NumRows, unsigned _NumColumnVectors
+    >
+    struct RegisterBlockingManual {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = _NumRows;
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+                        CReg[ai][bi] += aa * bb;
+                    }
+                }
+            }
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            for (int ai = 0; ai < _NumRows; ai++) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+                }
+            }
+        }
+
+    };
+
+}
+
+#endif //SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
diff --git a/test.pdf b/test.pdf
new file mode 100644
index 0000000..f5f0089
Binary files /dev/null and b/test.pdf differ
diff --git a/test.svg b/test.svg
new file mode 100644
index 0000000..3c87af6
--- /dev/null
+++ b/test.svg
@@ -0,0 +1,1195 @@
[1195 lines of matplotlib SVG markup for the benchmark plot (the vector version of test.pdf); no textual content survived extraction]