From 45ff460f4aa3ec7942eed1ac4a14fd0b3b3f7894 Mon Sep 17 00:00:00 2001
From: har0ke
Date: Fri, 3 Jul 2020 17:51:59 +0200
Subject: [PATCH] Added avx512

---
 CMakeLists.txt         |   1 +
 plot.py                |   5 +-
 scripts/test.py        |  74 +++++--
 src/BLASMul.h          |   4 +-
 src/Boost.h            |   4 +-
 src/DevideAndConquer.h |  82 ++-----
 src/Naive.h            |   8 +-
 src/RegisterBlocking.h | 483 ++++++++++++++++++++++-------------------
 src/main.cpp           |  36 +--
 9 files changed, 354 insertions(+), 343 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68c2689..63fdcfc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ find_package(Boost REQUIRED COMPONENTS filesystem program_options)
 include_directories("${Boost_INCLUDE_DIRS}")
 
+message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}")
 add_library(simple_matrix src/Matrix.cpp)
 target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
diff --git a/plot.py b/plot.py
index 9c1cfbf..3a4dd38 100644
--- a/plot.py
+++ b/plot.py
@@ -21,9 +21,8 @@ a = [
 def get_i(x):
     return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x + 1])
 
-functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
-             "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
-             "blas", "naive_reordered"]
+[1e9, '172.500', '166.250', '163.900', '127.100', '117.100', '686.450', '35.100', '808.800'],
+functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", "blas", "naive_reordered"]
 
 for i, f in list(enumerate(functions)):
     xy = get_i(i)
diff --git a/scripts/test.py b/scripts/test.py
index b1079b2..f5b4d34 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -1,4 +1,5 @@
 #!/usr/bin/python
+import argparse
 import json
 import os
 import shutil
@@ -28,7 +29,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
         "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
     ]
     build_path = os.path.join(build_path_prefix, " ".join(flags))
-    os.makedirs(build_path, exist_ok=True)
+    if not os.path.exists(build_path):
+        os.makedirs(build_path)
     check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
     check_call_quiet(["make", target], cwd=build_path)
     if False:
@@ -39,37 +41,68 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
     output = subprocess.check_output([os.path.join(build_path, target)] + args)
     return output
 
-
+very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
+slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
+normal_functions = ["block_wise_avx2"]
+fast_functions = ["divide_and_conquer_block_avx2", "blas"]
+avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
 if __name__ == '__main__':
     os.chdir(os.path.join(os.path.dirname(__file__), ".."))
-    os.makedirs("data", exist_ok=True)
+    if not os.path.exists("data"):
+        os.makedirs("data")
     os.chdir("data")
     with_double = False
-    extra_args = []#["--validate"]
-    total = float(sys.argv[1] if len(sys.argv) > 1 else 1e8)
-    sss= []
+    parser = argparse.ArgumentParser()
+    parser.add_argument("total_size", type=float)
+    parser.add_argument("--very_slow", action="store_true")
+    parser.add_argument("--slow", action="store_true")
+    parser.add_argument("--normal", action="store_true")
+    parser.add_argument("--avx512", action="store_true")
+    parser.add_argument("--validate", action="store_true")
parser.add_argument("--double", action="store_true") + parser.add_argument("--function", type=str, nargs="*") + + options = parser.parse_args() + + functions = fast_functions + + if options.very_slow: + functions += very_slow_functions + + if options.very_slow or options.slow: + functions += slow_functions + + if options.very_slow or options.slow or options.normal: + functions += normal_functions + + if options.avx512: + functions += avx512_functions + + if options.function: + functions = options.function + + extra_args = [] + if options.validate: + extra_args.append("--validate") + if options.double: + extra_args.append("--double") + + total = options.total_size + matrix_combinations = [] i = pow(total, 1. / 3.) for _ in range(20): - a = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))) - b = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))) + a = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))) + b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))) + c = int(round(total / a / b)) + matrix_combinations.append([str(a), str(b), str(c)]) - c = round(total / a / b) - sss.append([str(a), str(b), str(c)]) - - functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", - "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", - "blas"] - if total < 1e10: - functions.append("naive_reordered") times = [[] for f in functions] output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") for clang in [True]: - for sizes in sss: + for sizes in matrix_combinations: args = list(sizes) - if with_double: - args.append("--double") compile_and_run("..", "builds", "generate_random", True, True, args) folder = "x".join(sizes) for fidx, function in enumerate(functions): @@ -88,8 +121,9 @@ if __name__ == '__main__': print(["%.3f" % numpy.mean(ts) for ts in times]) with open(output_file + ".cache", "w") as f: json.dump({ + "extr_args": extra_args, "times": times, - "sizes": sss, + "sizes": matrix_combinations, "functions": functions, "means": ["%.3f" % numpy.mean(ts) for ts in times] }, f) diff --git a/src/BLASMul.h b/src/BLASMul.h index 4be61bc..78f8669 100644 --- a/src/BLASMul.h +++ b/src/BLASMul.h @@ -32,10 +32,8 @@ struct blas_functor { }; template -Matrix blas(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2(), 0); +void blas(Matrix &C, const Matrix &A, const Matrix &B) { blas_functor::multiply(C, A, B); - return C; } #endif //SMID_MATRIX_BLASMUL_H diff --git a/src/Boost.h b/src/Boost.h index c25784e..0298195 100644 --- a/src/Boost.h +++ b/src/Boost.h @@ -9,10 +9,8 @@ #include template -Matrix __attribute__ ((noinline)) boost_axpy_mul(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2(), 0); +void __attribute__ ((noinline)) boost_axpy_mul(Matrix &C, const Matrix &A, const Matrix &B) { boost::numeric::ublas::axpy_prod(A, B, C); - return C; } template diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h index 146864c..7fc7a25 100644 --- a/src/DevideAndConquer.h +++ b/src/DevideAndConquer.h @@ -102,17 +102,19 @@ struct multiplier_naive_functor { } return; } - _naive_reordered(C, A, B); + naive_reordered(C, A, B); } }; -template -struct multiplier_block_wise_256 : block_wise_256 { +template +struct multiplier_block_wise : block_wise { - static constexpr unsigned SplitMultipleM = 5 * block_wise_256::NumRows; + typedef block_wise Base; + + static constexpr unsigned SplitMultipleM = 5 * Base::NumRows; static constexpr unsigned SplitMultipleP = 20; - static constexpr unsigned SplitMultipleN = 
diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h
index 146864c..7fc7a25 100644
--- a/src/DevideAndConquer.h
+++ b/src/DevideAndConquer.h
@@ -102,17 +102,19 @@ struct multiplier_naive_functor {
             }
             return;
         }
-        _naive_reordered(C, A, B);
+        naive_reordered(C, A, B);
     }
 };
 
-template<typename FloatType>
-struct multiplier_block_wise_256 : block_wise_256<FloatType> {
+template<typename FloatType, AvxVersion version>
+struct multiplier_block_wise : block_wise<FloatType, version> {
 
-    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
+    typedef block_wise<FloatType, version> Base;
+
+    static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
     static constexpr unsigned SplitMultipleP = 20;
-    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
+    static constexpr unsigned SplitMultipleN = Base::NumColumns;
 
     static SplitAction action(unsigned m, unsigned p, unsigned n) {
         size_t m_p = m / SplitMultipleM;
@@ -138,89 +140,43 @@ struct multiplier_block_wise_256 : block_wise_256 {
 };
 
-template<typename FloatType>
-struct multiplier_block_wise_256_alternative : block_wise_256_alternative<FloatType> {
-
-    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
-    static constexpr unsigned SplitMultipleP = 20;
-    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
-
-    static SplitAction action(unsigned m, unsigned p, unsigned n) {
-        size_t m_p = m / SplitMultipleM;
-        size_t p_p = p / SplitMultipleP;
-        size_t n_p = n / SplitMultipleN;
-        unsigned _SplitMultipleM = SplitMultipleM;
-        unsigned _SplitMultipleP = 1;
-        unsigned _SplitMultipleN = SplitMultipleN;
-
-        auto max_dim = std::max(m_p, std::max(p_p, n_p));
-        if (m <= SplitMultipleM && n <= SplitMultipleN) {
-            return DoNotSplit;
-        } else if (max_dim == m_p) {
-            return SplitA;
-        } else if (max_dim == n_p) {
-            return SplitB;
-        } else if (p > SplitMultipleP){
-            return SplitBoth;
-        } else {
-            return DoNotSplit;
-        }
-    }
-};
-
-
-template<>
-struct block_wise_256<...>;
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_naive_r1(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
+void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
     _divide_and_conquer<multiplier_naive_functor<...>>(C, A, B);
-    return C;
 }
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_naive_r2(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
+void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
     _divide_and_conquer<multiplier_naive_functor<...>>(C, A, B);
-    return C;
 }
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_naive_r3(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
+void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
     _divide_and_conquer<multiplier_naive_functor<...>>(C, A, B);
-    return C;
 }
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_naive_r4(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
+void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
     _divide_and_conquer<multiplier_naive_functor<...>>(C, A, B);
-    return C;
 }
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_naive_r5(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
+void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
     _divide_and_conquer<multiplier_naive_functor<...>>(C, A, B);
-    return C;
 }
 
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_block1(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
-    auto nr = block_wise_256<FloatType>::NumRows;
-    auto nc = block_wise_256<FloatType>::NumColumns;
-    _divide_and_conquer<...>(C, A, B);
-    return C;
+void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    _divide_and_conquer<multiplier_block_wise<FloatType, AVX2>>(C, A, B);
 }
+
+#ifdef WITH_AVX512
 template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) divide_and_conquer_block2(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
-    _divide_and_conquer<...>(C, A, B);
-    return C;
+void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    _divide_and_conquer<multiplier_block_wise<FloatType, AVX512>>(C, A, B);
 }
+#endif
 
 #endif //SMID_MATRIX_DEVIDEANDCONQUER_H
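The SplitAction heuristic above always recurses on the dimension whose tile-adjusted size dominates. A standalone re-statement of the branch order (spelled out fully in the removed _alternative variant); the constants are assumptions taken from the AVX2 float defaults in RegisterBlocking.h (NumRows = 3, so SplitMultipleM = 15; NumColumns = 4 vectors x 8 floats = 32), and the function is illustrative rather than the project's template:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    enum SplitAction { DoNotSplit, SplitA, SplitB, SplitBoth };

    SplitAction action(unsigned m, unsigned p, unsigned n) {
        const unsigned SplitMultipleM = 15, SplitMultipleP = 20, SplitMultipleN = 32;
        std::size_t m_p = m / SplitMultipleM;
        std::size_t p_p = p / SplitMultipleP;
        std::size_t n_p = n / SplitMultipleN;
        std::size_t max_dim = std::max(m_p, std::max(p_p, n_p));
        if (m <= SplitMultipleM && n <= SplitMultipleN) return DoNotSplit; // fits one tile row/col
        if (max_dim == m_p) return SplitA;        // rows of A dominate: halve A (and C) by rows
        if (max_dim == n_p) return SplitB;        // columns of B dominate: halve B (and C) by columns
        if (p > SplitMultipleP) return SplitBoth; // only the shared k dimension is long
        return DoNotSplit;
    }

    int main() {
        std::printf("%d\n", action(4096, 64, 64));  // 1 (SplitA): tall A
        std::printf("%d\n", action(64, 64, 4096));  // 2 (SplitB): wide B
        std::printf("%d\n", action(16, 4096, 33));  // 3 (SplitBoth): long dot products
    }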
diff --git a/src/Naive.h b/src/Naive.h
index 39e4238..d1d5a32 100644
--- a/src/Naive.h
+++ b/src/Naive.h
@@ -21,7 +21,7 @@ Matrix __attribute__ ((noinline)) naive(const Matrix &A, const Matrix &
 }
 
 template<typename M1, typename M2, typename M3>
-void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B) {
+void __attribute__ ((noinline)) naive_reordered(M1 &C, const M2 &A, const M3 &B) {
     assert(A.size2() == B.size1());
     for (int i = 0; i < A.size1(); i++) {
         for (int p = 0; p < B.size1(); p++) {
@@ -31,11 +31,5 @@ void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B
         }
     }
 }
-template<typename FloatType>
-Matrix<FloatType> naive_reordered(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
-    _naive_reordered(C, A, B);
-    return C;
-}
 
 #endif //SMID_MATRIX_NAIVE_H
diff --git a/src/RegisterBlocking.h b/src/RegisterBlocking.h
index 4c49ed6..7b5c50d 100644
--- a/src/RegisterBlocking.h
+++ b/src/RegisterBlocking.h
@@ -8,161 +8,160 @@
 #include "Matrix.h"
 #include <immintrin.h>
 
-struct __m256_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m256;
-    static constexpr auto LoadVector = _mm256_loadu_ps;
-    static constexpr auto StoreVector = _mm256_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
-};
-
-struct __m256d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m256d;
-    static constexpr auto LoadVector = _mm256_loadu_pd;
-    static constexpr auto StoreVector = _mm256_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
-};
-
-template<typename BlockWiseConfig>
-struct ExtendedBlockWiseConfig {
-    typedef typename BlockWiseConfig::FloatType FloatType;
-    using VectorType = typename BlockWiseConfig::VectorType;
-    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
-    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
-    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
-    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
-    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
-};
-template<
-    // template parameter as struct: otherwise some warning about losing alignment information warning
-    typename BitWiseConfig,
-    unsigned _NumRows, unsigned _NumColumnVectors
->
-struct RegisterBlocking {
-    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
-
-    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
-        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
-
-    static constexpr auto NumRows = _NumRows;
-    static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
-
-    template<typename M1, typename M2, typename M3>
-    static void __attribute__ ((noinline)) handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
-
-        // AVX2 has 16 registers
-        // should be compiled as registers (total: regA * regB)
-        typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
-        // iterate over dot-product terms
-        for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
-            // Perform the DOT product
-            for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
-                typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
-                for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
-                    typename bwc::VectorType aa = bwc::BroadcastToVector(&A(aRowOffset + ai, p));
-                    CReg[ai][bi] += aa * bb;
-                }
-            }
-        }
-        // total regA * regB + regB registers
-
-        // Accumulate the results into C.
-        for (int ai = 0; ai < _NumRows; ai++) {
-            for (int bi = 0; bi < _NumColumnVectors; bi++) {
-                AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
-            }
-        }
-    }
-};
-
-template<typename BitWiseConfig, unsigned NumRows>
-struct BestRowRegisterBlocking;
-
-template<typename BitWiseConfig>
-struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {};
-template<typename BitWiseConfig>
-struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {};
-template<typename BitWiseConfig>
-struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {};
-// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
-// C = floor ((16 - R) / R)
-
-template<
-    // template parameter as struct: otherwise some warning about losing alignment information warning
-    typename BlockWiseConfig = __m256_block_wise_config,
-    unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
->
-struct block_wise {
-
-    typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
-
-    static constexpr unsigned NumRows = InitialNumRows;
-    static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
-
-    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
-        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
-
-    template<typename M1, typename M2, typename M3>
-    static void multiply(M1 &C, const M2 &A, const M3 &B) {
-        assert(C.size1() == A.size1() && C.size2() == B.size2());
-        int m = A.size1();
-        int k = A.size2();
-        int n = B.size2();
-        int m_i = 0;
-        for(; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
-            int n_i = 0;
-            for(; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors * bwc::VectorWidth) {
-                RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A, m_i, B, n_i);
-            }
-            // calculate remaining columns (like naive_reordered)
-            if(n_i < n) {
-                for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
-                    for (int p = 0; p < k; ++p) {
-                        for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
-                            C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
-                        }
-                    }
-                }
-            }
-        }
-        // calculate remaining rows (like naive_reordered)
-        for (; m_i < m; ++m_i) {
-            for (int p = 0; p < k; p++) {
-                for (int n_i = 0; n_i < n; ++n_i) {
-                    C(m_i, n_i) += A(m_i, p) * B(p, n_i);
-                }
-            }
-        }
-    }
-
-    template<unsigned CurrentNumColumnVectors, unsigned CurrentNumRows>
-    class iterate_columns {
-        /**
-         * consume all n_i's possible
-         */
-        template<typename M1, typename M2, typename M3>
-        static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
-                   n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
-                RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
-                    ::handle_block(k, C, A, m_i, B, n_i);
-            }
-        }
-
-    public:
-
-        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            consume(k, m_i, n_i, n, C, A, B);
-            if(CurrentNumColumnVectors > 1) {
-                // try with less column vectors
-                iterate_columns<CurrentNumColumnVectors - 1, CurrentNumRows>::iterate(k, m_i, n_i, n, C, A, B);
-            } else {
-                // do rest of columns manually
-                if(n_i < n) {
-                    for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
-                        for (int p = 0; p < k; ++p) {
-                            for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
-                                C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
-                            }
+enum AvxVersion {
+    AVX2,
+
+#ifdef WITH_AVX512
+    AVX512
+#endif
+};
+
+namespace detail {
+
+    struct __m256_block_wise_config {
+        using FloatType = float;
+        using VectorType = __m256;
+        static constexpr auto LoadVector = _mm256_loadu_ps;
+        static constexpr auto StoreVector = _mm256_storeu_ps;
+        static constexpr auto BroadcastToVector = _mm256_set1_ps;
+    };
+
+    struct __m256d_block_wise_config {
+        using FloatType = double;
+        using VectorType = __m256d;
+        static constexpr auto LoadVector = _mm256_loadu_pd;
+        static constexpr auto StoreVector = _mm256_storeu_pd;
+        static constexpr auto BroadcastToVector = _mm256_set1_pd;
+    };
+
+#ifdef WITH_AVX512
+    struct __m512_block_wise_config {
+        using FloatType = float;
+        using VectorType = __m512;
+        static constexpr auto LoadVector = _mm512_loadu_ps;
+        static constexpr auto StoreVector = _mm512_storeu_ps;
+        static constexpr auto BroadcastToVector = _mm512_set1_ps;
+    };
+
+    struct __m512d_block_wise_config {
+        using FloatType = double;
+        using VectorType = __m512d;
+        static constexpr auto LoadVector = _mm512_loadu_pd;
+        static constexpr auto StoreVector = _mm512_storeu_pd;
+        static constexpr auto BroadcastToVector = _mm512_set1_pd;
+    };
+#endif
+
+    template<typename BlockWiseConfig>
+    struct ExtendedBlockWiseConfig {
+        typedef typename BlockWiseConfig::FloatType FloatType;
+        using VectorType = typename BlockWiseConfig::VectorType;
+        static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
+        static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
+        static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
+        static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
+        static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory,
+                                                                                                   LoadVector(memory) +
+                                                                                                   vector);
+        };
+    };
+
+    template<
+        // template parameter as struct: otherwise a warning about losing alignment information
+        typename BitWiseConfig,
+        unsigned _NumRows, unsigned _NumColumnVectors
+    >
+    struct RegisterBlocking {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = _NumRows;
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                for (int bi = 0;
+                     bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+                        CReg[ai][bi] += aa * bb;
+                    }
+                }
+            }
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            for (int ai = 0; ai < _NumRows; ai++) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+                }
+            }
+        }
+    };
+
+    template<typename BitWiseConfig, unsigned NumRows>
+    struct BestRowRegisterBlocking;
+
+    template<typename BitWiseConfig>
+    struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {
+    };
+    template<typename BitWiseConfig>
+    struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {
+    };
+    template<typename BitWiseConfig>
+    struct BestRowRegisterBlocking<...> : public RegisterBlocking<...> {
+    };
+    // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
+    // C = floor ((16 - R) / R)
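The arithmetic behind those two comment lines, spelled out: a tile of R accumulator rows times C column vectors occupies R*C registers, plus C vectors loaded from B (and a broadcast from A), against the 16 ymm registers AVX2 offers. A standalone check of the comment's own formula; the loop bounds and the "score" label are illustrative choices, not from the patch:

    #include <cstdio>

    int main() {
        for (int R = 1; R <= 6; ++R) {
            int C = (16 - R) / R;  // the comment's C = floor((16 - R) / R)
            std::printf("R=%d C=%d tile-registers=%d score=%.2f\n",
                        R, C, R + R * C,                  // accumulators + B vectors
                        double(R * C) / double(R + C));   // (R*C)/(R+C), FMAs per load
        }
        // R=3 gives C=4: 15 registers and the top score of 1.71 (tied with
        // R=4, which would need all 16), matching the 3x4 default that
        // block_wise below uses via RegisterBlocking<.., 3, 4>.
    }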
+
+    template<
+        // template parameter as struct: otherwise a warning about losing alignment information
+        typename BlockWiseConfig = __m256_block_wise_config,
+        unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
+    >
+    struct block_wise {
+
+        typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
+
+        static constexpr unsigned NumRows = InitialNumRows;
+        static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        template<typename M1, typename M2, typename M3>
+        static void multiply(M1 &C, const M2 &A, const M3 &B) {
+            assert(C.size1() == A.size1() && C.size2() == B.size2());
+            int m = A.size1();
+            int k = A.size2();
+            int n = B.size2();
+            int m_i = 0;
+            for (; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
+                int n_i = 0;
+                for (; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors *
+                                                                                     bwc::VectorWidth) {
+                    RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A,
+                                                                                                             m_i, B,
+                                                                                                             n_i);
+                }
+                // calculate remaining columns (like naive_reordered)
+                if (n_i < n) {
+                    for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
+                        for (int p = 0; p < k; ++p) {
+                            for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
+                                C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
@@ -170,121 +169,147 @@ struct block_wise {
                             }
                         }
                     }
-                    n_i = n;
                 }
-                assert(n_i == n);
-            }
-        };
+            }
+
+            // calculate remaining rows (like naive_reordered)
+            for (; m_i < m; ++m_i) {
+                for (int p = 0; p < k; p++) {
+                    for (int n_i = 0; n_i < n; ++n_i) {
+                        C(m_i, n_i) += A(m_i, p) * B(p, n_i);
+                    }
+                }
+            }
+        }
 
-    template<unsigned CurrentNumRows>
-    class iterate_columns<0, CurrentNumRows> {
-    public:
-        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            throw std::runtime_error("should not be called, this is a bug.");
-        }
-
-    };
-
-    template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
-    class iterate_rows {
-
-        template<typename M1, typename M2, typename M3>
-        static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
-                size_t n_i = 0;
-                iterate_columns<CurrentNumColumnVectors, CurrentNumRows>::iterate(k, m_i, n_i, n, C, A, B);
-                assert(n_i == n);
-            }
-        }
-
-    public:
-
-        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            consume(k, m_i, m, n, C, A, B);
-            if(CurrentNumRows > 1) {
-                // try with less num rows vectors
-                iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
-            }
-        }
-    };
-
-    template<unsigned CurrentNumColumnVectors>
-    class iterate_rows<0, CurrentNumColumnVectors> {
-
-    public:
-        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            throw std::runtime_error("should not be called, this is a bug.");
-        }
-    };
-
-    template<typename M1, typename M2, typename M3>
-    static void multiply2(M1 &C, const M2 &A, const M3 &B) {
-        assert(C.size1() == A.size1() && C.size2() == B.size2());
-        int m = A.size1();
-        int k = A.size2();
-        int n = B.size2();
-        size_t m_i = 0;
-        iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
-        assert(m_i == m);
-    }
-
-};
-
-template<typename FloatType>
-struct block_wise_256;
-
-template<>
-struct block_wise_256<float> : public block_wise<__m256_block_wise_config> {
-    template<typename M1, typename M2, typename M3>
-    void operator()(M1 &C, const M2 &A, const M3 &B) const {
-        return multiply(C, A, B);
-    }
-
-};
-
-template<>
-struct block_wise_256<double> : public block_wise<__m256d_block_wise_config> {
-    template<typename M1, typename M2, typename M3>
-    void operator()(M1 &C, const M2 &A, const M3 &B) const {
-        return multiply(C, A, B);
-    }
-};
-
-template<typename FloatType>
-struct block_wise_256_alternative;
-
-template<>
-struct block_wise_256_alternative<float> : public block_wise<__m256_block_wise_config> {
-    template<typename M1, typename M2, typename M3>
-    void operator()(M1 &C, const M2 &A, const M3 &B) const {
-        return multiply2(C, A, B);
-    }
-};
-
-template<>
-struct block_wise_256_alternative<double> : public block_wise<__m256d_block_wise_config> {
-    template<typename M1, typename M2, typename M3>
-    void operator()(M1 &C, const M2 &A, const M3 &B) const {
-        return multiply(C, A, B);
-    }
-};
-
-template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) block_wise_256_f(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
-    block_wise_256<FloatType>::multiply(C, A, B);
-    return C;
-}
-
-template<typename FloatType>
-Matrix<FloatType> __attribute__ ((noinline)) block_wise_256_f2(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-    Matrix<FloatType> C(A.size1(), B.size2(), 0);
-    block_wise_256<FloatType>::multiply2(C, A, B);
-    return C;
-}
+        template<unsigned CurrentNumColumnVectors, unsigned CurrentNumRows>
+        class iterate_columns {
+            /**
+             * consume all n_i's possible
+             */
+            template<typename M1, typename M2, typename M3>
+            static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
+                       n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
+                    RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
+                        ::handle_block(k, C, A, m_i, B, n_i);
+                }
+            }
+
+        public:
+
+            template<typename M1, typename M2, typename M3>
+            static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                consume(k, m_i, n_i, n, C, A, B);
+                if (CurrentNumColumnVectors > 1) {
+                    // try with less column vectors
+                    iterate_columns<CurrentNumColumnVectors - 1, CurrentNumRows>::iterate(k, m_i, n_i, n, C, A, B);
+                } else {
+                    // do rest of columns manually
+                    if (n_i < n) {
+                        for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
+                            for (int p = 0; p < k; ++p) {
+                                for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
+                                    C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
+                                }
+                            }
+                        }
+                    }
+                    n_i = n;
+                }
+                assert(n_i == n);
+            }
+        };
+
+        template<unsigned CurrentNumRows>
+        class iterate_columns<0, CurrentNumRows> {
+        public:
+            template<typename M1, typename M2, typename M3>
+            static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                throw std::runtime_error("should not be called, this is a bug.");
+            }
+
+        };
+
+        template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
+        class iterate_rows {
+
+            template<typename M1, typename M2, typename M3>
+            static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
+                    size_t n_i = 0;
+                    iterate_columns<CurrentNumColumnVectors, CurrentNumRows>::iterate(k, m_i, n_i, n, C, A, B);
+                    assert(n_i == n);
+                }
+            }
+
+        public:
+
+            template<typename M1, typename M2, typename M3>
+            static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                consume(k, m_i, m, n, C, A, B);
+                if (CurrentNumRows > 1) {
+                    // try with less num rows vectors
+                    iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
+                }
+                assert(m_i == m);
+            }
+        };
+
+        template<unsigned CurrentNumColumnVectors>
+        class iterate_rows<0, CurrentNumColumnVectors> {
+
+        public:
+            template<typename M1, typename M2, typename M3>
+            static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+                throw std::runtime_error("should not be called, this is a bug.");
+            }
+        };
+
+        template<typename M1, typename M2, typename M3>
+        static void multiply2(M1 &C, const M2 &A, const M3 &B) {
+            assert(C.size1() == A.size1() && C.size2() == B.size2());
+            int m = A.size1();
+            int k = A.size2();
+            int n = B.size2();
+            size_t m_i = 0;
+            iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
+            assert(m_i == m);
+        }
+    };
+
+    template<typename BlockWiseConfig>
+    struct block_wise_base : block_wise<BlockWiseConfig> {
+
+        template<typename M1, typename M2, typename M3>
+        void operator()(M1 &C, const M2 &A, const M3 &B) const {
+            return block_wise<BlockWiseConfig>::multiply2(C, A, B);
+        }
+    };
+
+}
+
+template<typename FloatType, AvxVersion version>
+struct block_wise;
+
+template<> struct block_wise<float, AVX2> : public detail::block_wise_base<detail::__m256_block_wise_config> {};
+template<> struct block_wise<double, AVX2> : public detail::block_wise_base<detail::__m256d_block_wise_config> {};
+
+#ifdef WITH_AVX512
+    template<> struct block_wise<float, AVX512> : public detail::block_wise_base<detail::__m512_block_wise_config> {};
+    template<> struct block_wise<double, AVX512> : public detail::block_wise_base<detail::__m512d_block_wise_config> {};
+#endif
+
+template<typename FloatType>
+void __attribute__ ((noinline)) block_wise_avx2(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    block_wise<FloatType, AVX2>::multiply(C, A, B);
+}
+
+#ifdef WITH_AVX512
+template<typename FloatType>
+void __attribute__ ((noinline)) block_wise_avx512(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    block_wise<FloatType, AVX512>::multiply(C, A, B);
+}
+#endif
+
 #endif //SMID_MATRIX_REGISTERBLOCKING_H
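A detail worth noting in the rewritten configs: BroadcastToVector switches from _mm256_broadcast_ss/_mm256_broadcast_sd to _mm256_set1_ps/_mm256_set1_pd, and handle_block now passes A(aRowOffset + ai, p) by value instead of by address. The broadcast_ss form takes a pointer and has no direct _mm512 counterpart, while the set1 form takes a value and exists at both vector widths, so one config shape covers AVX2 and AVX-512. A minimal illustration; the function names are ours, and __AVX512F__ is the compiler's feature macro rather than the project's WITH_AVX512:

    #include <immintrin.h>

    __m256 broadcast_old(const float *x) { return _mm256_broadcast_ss(x); } // pointer in (AVX only)
    __m256 broadcast_new(float x)        { return _mm256_set1_ps(x); }      // value in

    #ifdef __AVX512F__
    __m512 broadcast_512(float x)        { return _mm512_set1_ps(x); }      // same shape at 512 bits
    #endif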
diff --git a/src/main.cpp b/src/main.cpp
index e6c1d28..037fb07 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -12,12 +12,13 @@ using namespace std::chrono;
 namespace po = boost::program_options;
 
 template<typename FloatType>
-using BinaryMatrixOp = Matrix<FloatType> (*)(const Matrix<FloatType> &A, const Matrix<FloatType> &B);
+using BinaryMatrixOp = void (*)(Matrix<FloatType> &C, const Matrix<FloatType> &A, const Matrix<FloatType> &B);
 
 template<typename FloatType>
 Matrix<FloatType> run_function(BinaryMatrixOp<FloatType> f, const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
+    Matrix<FloatType> C(A.size1(), B.size2(), 0);
     auto a = steady_clock::now();
-    auto C = f(A, B);
+    f(C, A, B);
     auto b = steady_clock::now();
     auto ms = std::chrono::duration_cast<milliseconds>(b - a).count();
     if (ms > 1000) {
@@ -49,27 +50,32 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
     // use the result to prevent compiler to optimize...
     double use_result = 0;
-    /*TEST_IF(test_function_name, naive, A, B)
-    TEST_IF(test_function_name, boost_blocked_mul_64, A, B)
-    TEST_IF(test_function_name, boost_blocked_mul_256, A, B)
-    TEST_IF(test_function_name, boost_mul, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)*/
     TEST_IF(test_function_name, naive_reordered, A, B)
-    TEST_IF(test_function_name, block_wise_256_f, A, B)
-    TEST_IF(test_function_name, block_wise_256_f2, A, B)
+    TEST_IF(test_function_name, block_wise_avx2, A, B)
+
+#ifdef WITH_AVX512
+    TEST_IF(test_function_name, block_wise_avx512, A, B)
+#endif
+
     TEST_IF(test_function_name, boost_axpy_mul, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_block1, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_block2, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
+
+#ifdef WITH_AVX512
+    TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B)
+#endif
+
+    TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
     TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)
     TEST_IF(test_function_name, blas, A, B)
     if(validate) {
         std::cout << "Validating matrix" << std::endl;
-        auto C2 = boost_axpy_mul(A, B);
+        Matrix<FloatType> C2(A.size1(), B.size2(), 0);
+        boost_axpy_mul(C2, A, B);
         if(C.size1() != C2.size1() || C.size2() != C2.size2())
             throw std::runtime_error("Result matrix has invalid size.");
         for(auto i = 0; i < C2.size1(); ++i) {