Added avx512

2020-07-03 17:51:59 +02:00 · 2020-07-03 17:51:59 +02:00 · 45ff460f4a
commit 45ff460f4a
parent 520407e45d
9 changed files with 354 additions and 343 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,6 +45,7 @@ find_package(Boost REQUIRED COMPONENTS filesystem program_options)
 include_directories("${Boost_INCLUDE_DIRS}")
 message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}")
 add_library(simple_matrix src/Matrix.cpp)
 target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
--- a/plot.py
+++ b/plot.py
@ -21,9 +21,8 @@ a = [
 def get_i(x):
    return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x  + 1])
-functions =  ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
+[1e9,          '172.500',         '166.250',           '163.900',        '127.100',                   '117.100',                   '686.450',                     '35.100', '808.800'],
-              "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
+functions =  ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", "blas",   "naive_reordered"]
              "blas", "naive_reordered"]
 for i, f in list(enumerate(functions)):
    xy = get_i(i)
--- a/scripts/test.py
+++ b/scripts/test.py
@ -1,4 +1,5 @@
 #!/usr/bin/python
 import argparse
 import json
 import os
 import shutil
@ -28,7 +29,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
        "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
    ]
    build_path = os.path.join(build_path_prefix, " ".join(flags))
-    os.makedirs(build_path, exist_ok=True)
+    if not os.path.exists(build_path):
        os.makedirs(build_path)
    check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
    check_call_quiet(["make", target], cwd=build_path)
    if False:
@ -39,37 +41,68 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
        output = subprocess.check_output([os.path.join(build_path, target)] + args)
        return output
-
+very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
 slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
 normal_functions = ["block_wise_avx2"]
 fast_functions = ["divide_and_conquer_block_avx2", "blas"]
 avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
 if __name__ == '__main__':
    os.chdir(os.path.join(os.path.dirname(__file__), ".."))
-    os.makedirs("data", exist_ok=True)
+    if not os.path.exists("data"):
        os.makedirs("data")
    os.chdir("data")
    with_double = False
    extra_args = []#["--validate"]
-    total = float(sys.argv[1] if len(sys.argv) > 1 else 1e8)
+    parser = argparse.ArgumentParser()
-    sss= []
+    parser.add_argument("total_size", type=float)
    parser.add_argument("--very_slow", action="store_true")
    parser.add_argument("--slow", action="store_true")
    parser.add_argument("--normal", action="store_true")
    parser.add_argument("--avx512", action="store_true")
    parser.add_argument("--validate", action="store_true")
    parser.add_argument("--double", action="store_true")
    parser.add_argument("--function", type=str, nargs="*")
    options = parser.parse_args()
    functions = fast_functions
    if options.very_slow:
        functions += very_slow_functions
    if options.very_slow or options.slow:
        functions += slow_functions
    if options.very_slow or options.slow or options.normal:
        functions += normal_functions
    if options.avx512:
        functions += avx512_functions
    if options.function:
        functions = options.function
    extra_args = []
    if options.validate:
        extra_args.append("--validate")
    if options.double:
        extra_args.append("--double")
    total = options.total_size
    matrix_combinations = []
    i = pow(total, 1. / 3.)
    for _ in range(20):
-        a = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
+        a = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
-        b = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
+        b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
        c = int(round(total / a / b))
        matrix_combinations.append([str(a), str(b), str(c)])
        c = round(total / a / b)
        sss.append([str(a), str(b), str(c)])
    functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
                  "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
                  "blas"]
    if total < 1e10:
        functions.append("naive_reordered")
    times = [[] for f in functions]
    output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
    for clang in [True]:
-        for sizes in sss:
+        for sizes in matrix_combinations:
            args = list(sizes)
            if with_double:
                args.append("--double")
            compile_and_run("..", "builds", "generate_random", True, True, args)
            folder = "x".join(sizes)
            for fidx, function in enumerate(functions):
@ -88,8 +121,9 @@ if __name__ == '__main__':
            print(["%.3f" % numpy.mean(ts) for ts in times])
            with open(output_file + ".cache", "w") as f:
                json.dump({
                    "extr_args": extra_args,
                    "times": times,
-                    "sizes": sss,
+                    "sizes": matrix_combinations,
                    "functions": functions,
                    "means": ["%.3f" % numpy.mean(ts) for ts in times]
                }, f)
--- a/src/BLASMul.h
+++ b/src/BLASMul.h
@ -32,10 +32,8 @@ struct blas_functor<double> {
 };
 template<typename T>
-Matrix<T> blas(const Matrix<T> &A, const Matrix<T> &B) {
+void blas(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    blas_functor<T>::multiply(C, A, B);
    return C;
 }
 #endif //SMID_MATRIX_BLASMUL_H
--- a/src/Boost.h
+++ b/src/Boost.h
@ -9,10 +9,8 @@
 #include <boost/numeric/ublas/operation.hpp>
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) boost_axpy_mul(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) boost_axpy_mul(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    boost::numeric::ublas::axpy_prod(A, B, C);
    return C;
 }
 template<typename T>
--- a/src/DevideAndConquer.h
+++ b/src/DevideAndConquer.h
@ -102,17 +102,19 @@ struct multiplier_naive_functor {
            }
            return;
        }
-        _naive_reordered(C, A, B);
+        naive_reordered(C, A, B);
    }
 };
-template<typename FloatType>
+template<typename FloatType, AvxVersion version>
-struct multiplier_block_wise_256 : block_wise_256<FloatType> {
+struct multiplier_block_wise : block_wise<FloatType, version> {
-    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
+    typedef block_wise<FloatType, version> Base;
    static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
    static constexpr unsigned SplitMultipleP = 20;
-    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
+    static constexpr unsigned SplitMultipleN = Base::NumColumns;
    static SplitAction action(unsigned m, unsigned p, unsigned n) {
        size_t m_p = m / SplitMultipleM;
@ -138,89 +140,43 @@ struct multiplier_block_wise_256 : block_wise_256<FloatType> {
 };
 template<typename FloatType>
 struct multiplier_block_wise_256_alternative : block_wise_256_alternative<FloatType> {
    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
    static constexpr unsigned SplitMultipleP = 20;
    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
    static SplitAction action(unsigned m, unsigned p, unsigned n) {
        size_t m_p = m / SplitMultipleM;
        size_t p_p = p / SplitMultipleP;
        size_t n_p = n / SplitMultipleN;
        unsigned _SplitMultipleM = SplitMultipleM;
        unsigned _SplitMultipleP = 1;
        unsigned _SplitMultipleN = SplitMultipleN;
        auto max_dim = std::max(m_p, std::max(p_p, n_p));
        if (m <= SplitMultipleM && n <= SplitMultipleN) {
            return DoNotSplit;
        } else if (max_dim == m_p) {
            return SplitA;
        } else if (max_dim == n_p) {
            return SplitB;
        } else if (p > SplitMultipleP){
            return SplitBoth;
        } else {
            return DoNotSplit;
        }
    }
 };
 template<>
 struct block_wise_256<float>;
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r1(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
    return C;
 }
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r2(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
    return C;
 }
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r3(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
    return C;
 }
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r4(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
    return C;
 }
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r5(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
    return C;
 }
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block1(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
    auto nr = block_wise_256<T>::NumRows;
    auto nc = block_wise_256<T>::NumColumns;
    _divide_and_conquer<multiplier_block_wise_256<T>>(C, A, B);
    return C;
 }
 #ifdef WITH_AVX512
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block2(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_block_wise<T, AVX512>>(C, A, B);
    _divide_and_conquer<multiplier_block_wise_256_alternative<T>>(C, A, B);
    return C;
 }
 #endif
 #endif //SMID_MATRIX_DEVIDEANDCONQUER_H
--- a/src/Naive.h
+++ b/src/Naive.h
@ -21,7 +21,7 @@ Matrix<T> __attribute__ ((noinline)) naive(const Matrix<T> &A, const Matrix<T> &
 }
 template<typename M1, typename M2, typename M3>
-void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B) {
+void __attribute__ ((noinline)) naive_reordered(M1 &C, const M2 &A, const M3 &B) {
    assert(A.size2() == B.size1());
    for (int i = 0; i < A.size1(); i++) {
        for (int p = 0; p < B.size1(); p++) {
@ -31,11 +31,5 @@ void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B
        }
    }
 }
 template<typename T>
 Matrix<T> naive_reordered(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    _naive_reordered(C, A, B);
    return C;
 }
 #endif //SMID_MATRIX_NAIVE_H
--- a/src/RegisterBlocking.h
+++ b/src/RegisterBlocking.h
@ -8,161 +8,160 @@
 #include "Matrix.h"
 #include <immintrin.h>
-struct __m256_block_wise_config {
+enum AvxVersion {
-    using FloatType = float;
+    AVX2,
    using VectorType = __m256;
    static constexpr auto LoadVector = _mm256_loadu_ps;
    static constexpr auto StoreVector = _mm256_storeu_ps;
    static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
 };
-struct __m256d_block_wise_config {
+#ifdef WITH_AVX512
-    using FloatType = double;
+    AVX512
-    using VectorType = __m256d;
+#endif
    static constexpr auto LoadVector = _mm256_loadu_pd;
    static constexpr auto StoreVector = _mm256_storeu_pd;
    static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
 };
 template<typename BlockWiseConfig>
 struct ExtendedBlockWiseConfig {
    typedef typename BlockWiseConfig::FloatType FloatType;
    using VectorType = typename BlockWiseConfig::VectorType;
    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
 };
 template<
        // template parameter as struct: otherwise some warning about losing alignment information warning
        typename BitWiseConfig,
        unsigned _NumRows, unsigned _NumColumnVectors
 >
 struct RegisterBlocking {
    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
    static constexpr auto NumRows = _NumRows;
    static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
    template<typename M1, typename M2, typename M3>
    static void __attribute__ ((noinline))  handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
        // AVX2 has 16 registers
        // should be compiled as registers (total: regA * regB)
        typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
        // iterate over dot-product terms
        for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
            // Perform the DOT product
            for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
                typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
                for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
                    typename bwc::VectorType aa = bwc::BroadcastToVector(&A(aRowOffset + ai, p));
                    CReg[ai][bi] += aa * bb;
                }
            }
        }
        // total regA * regB + regB registers
        // Accumulate the results into C.
        for (int ai = 0; ai < _NumRows; ai++) {
            for (int bi = 0; bi < _NumColumnVectors; bi++) {
                AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
            }
        }
    }
 };
-template<typename BitWiseConfig, unsigned Rows>
+namespace detail {
 struct BestRowRegisterBlocking;
-template<typename BitWiseConfig>
+    struct __m256_block_wise_config {
-struct BestRowRegisterBlocking<BitWiseConfig, 1> : public RegisterBlocking<BitWiseConfig, 1, 15> {};
+        using FloatType = float;
-template<typename BitWiseConfig>
+        using VectorType = __m256;
-struct BestRowRegisterBlocking<BitWiseConfig, 2> : public RegisterBlocking<BitWiseConfig, 2, 7> {};
+        static constexpr auto LoadVector = _mm256_loadu_ps;
-template<typename BitWiseConfig>
+        static constexpr auto StoreVector = _mm256_storeu_ps;
-struct BestRowRegisterBlocking<BitWiseConfig, 3> : public RegisterBlocking<BitWiseConfig, 3, 4> {};
+        static constexpr auto BroadcastToVector = _mm256_set1_ps;
-// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
+    };
 // C = floor ((16 - R) / R)
-template<
+    struct __m256d_block_wise_config {
-        // template parameter as struct: otherwise some warning about losing alignment information warning
+        using FloatType = double;
-        typename BlockWiseConfig = __m256_block_wise_config,
+        using VectorType = __m256d;
-        unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
+        static constexpr auto LoadVector = _mm256_loadu_pd;
->
+        static constexpr auto StoreVector = _mm256_storeu_pd;
-struct block_wise {
+        static constexpr auto BroadcastToVector = _mm256_set1_pd;
    };
-    typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
+#ifdef WITH_AVX512
    struct __m512_block_wise_config {
        using FloatType = float;
        using VectorType = __m512;
        static constexpr auto LoadVector = _mm512_loadu_ps;
        static constexpr auto StoreVector = _mm512_storeu_ps;
        static constexpr auto BroadcastToVector = _mm512_set1_ps;
    };
-    static constexpr unsigned NumRows = InitialNumRows;
+    struct __m512d_block_wise_config {
-    static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
+        using FloatType = double;
        using VectorType = __m512d;
        static constexpr auto LoadVector = _mm512_loadu_pd;
        static constexpr auto StoreVector = _mm512_storeu_pd;
        static constexpr auto BroadcastToVector = _mm512_set1_pd;
    };
 #endif
-    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+    template<typename BlockWiseConfig>
-        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
+    struct ExtendedBlockWiseConfig {
        typedef typename BlockWiseConfig::FloatType FloatType;
        using VectorType = typename BlockWiseConfig::VectorType;
        static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
        static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
        static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
        static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
        static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory,
                                                                                                   LoadVector(memory) +
                                                                                                   vector);
        };
    };
-    template<typename M1, typename M2, typename M3>
+    template<
-    static void multiply(M1 &C, const M2 &A, const M3 &B) {
+            // template parameter as struct: otherwise some warning about losing alignment information warning
-        assert(C.size1() == A.size1() && C.size2() == B.size2());
+            typename BitWiseConfig,
-        int m = A.size1();
+            unsigned _NumRows, unsigned _NumColumnVectors
-        int k = A.size2();
+    >
-        int n = B.size2();
+    struct RegisterBlocking {
-        int m_i = 0;
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
-        for(; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
+
-            int n_i = 0;
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
-            for(; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors * bwc::VectorWidth) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
-                RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A, m_i, B, n_i);
+        };
-            }
+
-            // calculate remaining columns (like naive_reordered)
+        static constexpr auto NumRows = _NumRows;
-            if(n_i < n) {
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
-                for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
+
-                    for (int p = 0; p < k; ++p) {
+        template<typename M1, typename M2, typename M3>
-                        for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
+        static void __attribute__ ((noinline))
-                            C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
-                        }
+
            // AVX2 has 16 registers
            // should be compiled as registers (total: regA * regB)
            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
            // iterate over dot-product terms
            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
                // Perform the DOT product
                for (int bi = 0;
                     bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
                        CReg[ai][bi] += aa * bb;
                    }
                }
            }
-        }
+            // total regA * regB + regB registers
-        // calculate remaining rows (like naive_reordered)
+
-        for (; m_i < m; ++m_i) {
+            // Accumulate the results into C.
-            for (int p = 0; p < k; p++) {
+            for (int ai = 0; ai < _NumRows; ai++) {
-                for (int n_i = 0; n_i < n; ++n_i) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
-                    C(m_i, n_i) += A(m_i, p) * B(p, n_i);
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
                }
            }
        }
    }
-    template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
+    };
    class iterate_columns {
        /**
         * consume all n_i's possible
         */
        template<typename M1, typename M2, typename M3>
        static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
            for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
                    n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
                RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
                ::handle_block(k, C, A, m_i, B, n_i);
            }
        }
-    public:
+    template<typename BitWiseConfig, unsigned Rows>
    struct BestRowRegisterBlocking;
    template<typename BitWiseConfig>
    struct BestRowRegisterBlocking<BitWiseConfig, 1> : public RegisterBlocking<BitWiseConfig, 1, 15> {
    };
    template<typename BitWiseConfig>
    struct BestRowRegisterBlocking<BitWiseConfig, 2> : public RegisterBlocking<BitWiseConfig, 2, 7> {
    };
    template<typename BitWiseConfig>
    struct BestRowRegisterBlocking<BitWiseConfig, 3> : public RegisterBlocking<BitWiseConfig, 3, 4> {
    };
    // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
    // C = floor ((16 - R) / R)
    template<
            // template parameter as struct: otherwise some warning about losing alignment information warning
            typename BlockWiseConfig = __m256_block_wise_config,
            unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
    >
    struct block_wise {
        typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
        static constexpr unsigned NumRows = InitialNumRows;
        static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
        };
        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+        static void multiply(M1 &C, const M2 &A, const M3 &B) {
-            consume(k, m_i, n_i, n, C, A, B);
+            assert(C.size1() == A.size1() && C.size2() == B.size2());
-            if(CurrentNumColumnVectors > 1) {
+            int m = A.size1();
-                // try with less column vectors
+            int k = A.size2();
-                iterate_columns<CurrentNumRows, CurrentNumColumnVectors - 1>::iterate(k, m_i, n_i, n, C, A, B);
+            int n = B.size2();
-            } else {
+            int m_i = 0;
-                // do rest of columns manually
+            for (; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
-                if(n_i < n) {
+                int n_i = 0;
-                    for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
+                for (; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors *
                                                                                     bwc::VectorWidth) {
                    RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A,
                                                                                                             m_i, B,
                                                                                                             n_i);
                }
                // calculate remaining columns (like naive_reordered)
                if (n_i < n) {
                    for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
                        for (int p = 0; p < k; ++p) {
                            for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
                                C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
@ -170,121 +169,147 @@ struct block_wise {
                        }
                    }
                }
                n_i = n;
            }
-            assert(n_i == n);
+            // calculate remaining rows (like naive_reordered)
-        }
+            for (; m_i < m; ++m_i) {
-    };
+                for (int p = 0; p < k; p++) {
-
+                    for (int n_i = 0; n_i < n; ++n_i) {
-    template<unsigned CurrentNumRows>
+                        C(m_i, n_i) += A(m_i, p) * B(p, n_i);
-    class iterate_columns<CurrentNumRows, 0> {
+                    }
-    public:
+                }
-        template<typename M1, typename M2, typename M3>
+            }
        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
            throw std::runtime_error("should not be called, this is a bug.");
        }
-    };
+        template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
        class iterate_columns {
            /**
             * consume all n_i's possible
             */
            template<typename M1, typename M2, typename M3>
            static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
                for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
                       n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
                    RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
                    ::handle_block(k, C, A, m_i, B, n_i);
                }
            }
-    template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
+        public:
    class iterate_rows {
-        template<typename M1, typename M2, typename M3>
+            template<typename M1, typename M2, typename M3>
-        static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
-            for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
+                consume(k, m_i, n_i, n, C, A, B);
-                size_t n_i = 0;
+                if (CurrentNumColumnVectors > 1) {
-                iterate_columns<CurrentNumRows, CurrentNumColumnVectors>::iterate(k, m_i, n_i, n, C, A, B);
+                    // try with less column vectors
                    iterate_columns<CurrentNumRows, CurrentNumColumnVectors - 1>::iterate(k, m_i, n_i, n, C, A, B);
                } else {
                    // do rest of columns manually
                    if (n_i < n) {
                        for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
                            for (int p = 0; p < k; ++p) {
                                for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
                                    C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
                                }
                            }
                        }
                    }
                    n_i = n;
                }
                assert(n_i == n);
            }
-        }
+        };
-    public:
+        template<unsigned CurrentNumRows>
        class iterate_columns<CurrentNumRows, 0> {
        public:
            template<typename M1, typename M2, typename M3>
            static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
                throw std::runtime_error("should not be called, this is a bug.");
            }
        };
        template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
        class iterate_rows {
            template<typename M1, typename M2, typename M3>
            static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
                for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
                    size_t n_i = 0;
                    iterate_columns<CurrentNumRows, CurrentNumColumnVectors>::iterate(k, m_i, n_i, n, C, A, B);
                    assert(n_i == n);
                }
            }
        public:
            template<typename M1, typename M2, typename M3>
            static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
                consume(k, m_i, m, n, C, A, B);
                if (CurrentNumRows > 1) {
                    // try with less num rows vectors
                    iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
                }
                assert(m_i == m);
            }
        };
        template<unsigned CurrentNumColumnVectors>
        class iterate_rows<0, CurrentNumColumnVectors> {
        public:
            template<typename M1, typename M2, typename M3>
            static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
                throw std::runtime_error("should not be called, this is a bug.");
            }
        };
        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+        static void multiply2(M1 &C, const M2 &A, const M3 &B) {
-            consume(k, m_i, m, n, C, A, B);
+            assert(C.size1() == A.size1() && C.size2() == B.size2());
-            if(CurrentNumRows > 1) {
+            int m = A.size1();
-                // try with less num rows vectors
+            int k = A.size2();
-                iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
+            int n = B.size2();
-            }
+            size_t m_i = 0;
            iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
            assert(m_i == m);
        }
    };
-    template<unsigned CurrentNumColumnVectors>
+    template<typename BlockWiseConfig>
-    class iterate_rows<0, CurrentNumColumnVectors> {
+    struct block_wise_base : block_wise<BlockWiseConfig> {
    public:
        template<typename M1, typename M2, typename M3>
-        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+        void operator()(M1 &C, const M2 &A, const M3 &B) const {
-            throw std::runtime_error("should not be called, this is a bug.");
+            return block_wise<BlockWiseConfig>::multiply2(C, A, B);
        }
    };
    template<typename M1, typename M2, typename M3>
    static void multiply2(M1 &C, const M2 &A, const M3 &B) {
        assert(C.size1() == A.size1() && C.size2() == B.size2());
        int m = A.size1();
        int k = A.size2();
        int n = B.size2();
        size_t m_i = 0;
        iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
        assert(m_i == m);
    }
 };
 template<typename FloatType>
 struct block_wise_256;
 template<>
 struct block_wise_256<float> : public block_wise<__m256_block_wise_config> {
    template<typename M1, typename M2, typename M3>
    void operator()(M1 &C, const M2 &A, const M3 &B) const {
        return multiply(C, A, B);
    }
 };
 template<>
 struct block_wise_256<double> : public block_wise<__m256d_block_wise_config> {
    template<typename M1, typename M2, typename M3>
    void operator()(M1 &C, const M2 &A, const M3 &B) const {
        return multiply(C, A, B);
    }
 };
 template<typename FloatType>
 struct block_wise_256_alternative;
 template<>
 struct block_wise_256_alternative<float> : public block_wise<__m256_block_wise_config> {
    template<typename M1, typename M2, typename M3>
    void operator()(M1 &C, const M2 &A, const M3 &B) const {
        return multiply2(C, A, B);
    }
 };
 template<>
 struct block_wise_256_alternative<double> : public block_wise<__m256d_block_wise_config> {
    template<typename M1, typename M2, typename M3>
    void operator()(M1 &C, const M2 &A, const M3 &B) const {
        return multiply(C, A, B);
    }
 };
 template<typename T>
 Matrix<T> __attribute__ ((noinline)) block_wise_256_f(const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    block_wise_256<T>::multiply(C, A, B);
    return C;
 }
 template<typename FloatType, AvxVersion avxVersion>
 struct block_wise;
 template<> struct block_wise<float, AVX2> : public detail::block_wise_base<detail::__m256_block_wise_config> {};
 template<> struct block_wise<double, AVX2> : public detail::block_wise_base<detail::__m256d_block_wise_config> {};
 #ifdef WITH_AVX512
    template<> struct block_wise<float, AVX512> : public detail::block_wise_base<detail::__m512_block_wise_config> {};
    template<> struct block_wise<double, AVX512> : public detail::block_wise_base<detail::__m512d_block_wise_config> {};
 #endif
 template<typename T>
-Matrix<T> __attribute__ ((noinline)) block_wise_256_f2(const Matrix<T> &A, const Matrix<T> &B) {
+void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2(), 0);
+    block_wise<T, AVX2>::multiply(C, A, B);
    block_wise_256<T>::multiply2(C, A, B);
    return C;
 }
 #ifdef WITH_AVX512
 template<typename T>
 void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    block_wise<T, AVX512>::multiply(C, A, B);
 }
 #endif
 #endif //SMID_MATRIX_REGISTERBLOCKING_H
--- a/src/main.cpp
+++ b/src/main.cpp
@ -12,12 +12,13 @@ using namespace std::chrono;
 namespace po = boost::program_options;
 template<typename T>
-using BinaryMatrixOp = Matrix<T> (*)(const Matrix<T> &A, const Matrix<T> &B);
+using BinaryMatrixOp = void (*)(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B);
 template<typename T>
 Matrix<T> run_function(BinaryMatrixOp<T> f, const Matrix<T> &A, const Matrix<T> &B) {
    Matrix<T> C(A.size1(), B.size2(), 0);
    auto a = steady_clock::now();
-    auto C = f(A, B);
+    f(C, A, B);
    auto b = steady_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
    if (ms > 1000) {
@ -49,27 +50,32 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
    // use the result to prevent compiler to optimize...
    double use_result = 0;
    /*TEST_IF(test_function_name, naive, A, B)
    TEST_IF(test_function_name, boost_blocked_mul_64, A, B)
    TEST_IF(test_function_name, boost_blocked_mul_256, A, B)
    TEST_IF(test_function_name, boost_mul, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)*/
    TEST_IF(test_function_name, naive_reordered, A, B)
-    TEST_IF(test_function_name, block_wise_256_f, A, B)
+    TEST_IF(test_function_name, block_wise_avx2, A, B)
-    TEST_IF(test_function_name, block_wise_256_f2, A, B)
+
 #ifdef WITH_AVX512
    TEST_IF(test_function_name, block_wise_avx512, A, B)
 #endif
    TEST_IF(test_function_name, boost_axpy_mul, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_block1, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
-    TEST_IF(test_function_name, divide_and_conquer_block2, A, B)
+
 #ifdef WITH_AVX512
    TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B)
 #endif
    TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
    TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)
    TEST_IF(test_function_name, blas, A, B)
    if(validate) {
        std::cout << "Validating matrix" << std::endl;
-        auto C2 = boost_axpy_mul(A, B);
+        Matrix<FloatType> C2(A.size1(), B.size2(), 0);
        boost_axpy_mul(C2, A, B);
        if(C.size1() != C2.size1() || C.size2() != C2.size2())
            throw std::runtime_error("Result matrix has invalid size.");
        for(auto i = 0; i < C2.size1(); ++i) {