From 520407e45d711dd949e6f5e2dd97067fb790ef5b Mon Sep 17 00:00:00 2001
From: har0ke <dev@ohargens.de>
Date: Fri, 3 Jul 2020 15:13:40 +0200
Subject: [PATCH] Added Devide and Conqour approach

---
 CMakeLists.txt         |  14 +-
 plot.py                |  46 +++++++
 scripts/test.py        |  58 ++++++++-
 src/BLASMul.h          |  41 ++++++
 src/Boost.h            |  31 +++++
 src/DevideAndConquer.h | 226 ++++++++++++++++++++++++++++++++
 src/Naive.h            |  41 ++++++
 src/RegisterBlocking.h | 290 +++++++++++++++++++++++++++++++++++++++++
 src/main.cpp           | 213 +++++-------------------------
 9 files changed, 768 insertions(+), 192 deletions(-)
 create mode 100644 plot.py
 create mode 100644 src/BLASMul.h
 create mode 100644 src/Boost.h
 create mode 100644 src/DevideAndConquer.h
 create mode 100644 src/Naive.h
 create mode 100644 src/RegisterBlocking.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f4fa8d..68c2689 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(smid_matrix CXX)
 include(CheckCXXCompilerFlag)
 set(CMAKE_CXX_STANDARD 17)
 option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
-option(USE_CLANG "Build with clang instead of gcc" OFF)
+option(USE_CLANG "Build with clang instead of gcc" ON)
 set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run")
 
 add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}")
@@ -15,6 +15,8 @@ else()
     set(CMAKE_CXX_COMPILER "g++")
 endif()
 
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+
 if(OPTIMIZE_FOR_NATIVE)
     CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
     if(COMPILER_SUPPORTS_MARCH_NATIVE)
@@ -31,12 +33,20 @@ else()
     endif()
 endif()
 
+set(BLA_VENDER OpenBLAS)
+find_package(BLAS REQUIRED)
+if(BLAS_FOUND)
+    include_directories(${BLAS_INCLUDE_DIRS})
+ELSE()
+    message("OpenBLAS NOT found.")
+endif(BLAS_FOUND)
+
 find_package(Boost REQUIRED COMPONENTS filesystem program_options)
 
 include_directories("${Boost_INCLUDE_DIRS}")
 
 add_library(simple_matrix src/Matrix.cpp)
-target_link_libraries(simple_matrix ${Boost_LIBRARIES})
+target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
 
 add_executable(simd_multiply src/main.cpp)
 target_link_libraries(simd_multiply simple_matrix)
diff --git a/plot.py b/plot.py
new file mode 100644
index 0000000..9c1cfbf
--- /dev/null
+++ b/plot.py
@@ -0,0 +1,46 @@
+
+import matplotlib.pyplot as plt
+import numpy as np
+a = [
+
+[1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'],
+[5e6, '0.200', '0.100', '0.200', '2.000', '1.800', '4.300', '0.200', '3.400'],
+[1e7, '1.400', '1.350', '1.500', '4.000', '3.500', '4.750', '0.600', '7.750'],
+[5e7, '7.450', '6.700', '8.100', '13.900', '11.950', '37.550', '2.350', '38.200'],
+[1e8, '13.900', '13.150', '13.600', '15.300', '13.600', '100.200', '2.900', '75.300'],
+[5e8, '81.100', '76.600', '74.350', '82.200', '75.150', '385.000', '14.600', '377.600'],
+[1e9, '172.500', '166.250', '163.900', '127.100', '117.100', '686.450', '35.100', '808.800'],
+[5e9, '1228.550', '1208.650', '1048.350', '532.850', '508.050', '2317.350', '127.750', '3998.350'],
+[1e10,'2577.050', '2533.500', '2415.700', '922.000', '892.750', '3702.750', '298.550', '8060.550'],
+[5e10, '15281.650', '15105.350', '12339.050', '4593.650', '4418.700', '16350.150', '1266.350', '40414.250'],
+[1e11, '32127.000', '31768.100', '24630.650', '8223.350', '8152.700', '32005.850', '2425.700', '81039.750'],
+[5e11, '249427.000', '248249.333', '124935.833', '39844.500', '39466.167', '138008.000', '11864.500', '402842.250']
+
+]
+
+def get_i(x):
+    return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x  + 1])
+
+functions =  ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
+              "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
+              "blas", "naive_reordered"]
+
+for i, f in list(enumerate(functions)):
+    xy = get_i(i)
+    plt.plot(xy[:,0], xy[:,1] / 1000., label=f)
+    print(xy)
+plt.legend()
+plt.xscale("log")
+plt.xlim((5e9, 1e12))
+plt.show()
+
+
+for i, f in list(enumerate(functions)):
+    xy = get_i(i)
+    xy_l = get_i(len(functions) - 1)
+    plt.plot(xy[:,0], xy[:,1] / xy[:, 1], label=f)
+    print(xy)
+plt.legend()
+plt.xscale("log")
+plt.xlim((5e9, 1e12))
+plt.show()
\ No newline at end of file
diff --git a/scripts/test.py b/scripts/test.py
index 3a4a6ca..b1079b2 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -1,6 +1,13 @@
 #!/usr/bin/python
+import json
 import os
+import shutil
 import subprocess
+import sys
+from datetime import datetime
+from pprint import pprint
+
+import numpy
 
 
 def check_call_quiet(*args, **kwargs):
@@ -8,7 +15,7 @@ def check_call_quiet(*args, **kwargs):
     output, err = p.communicate()
     rc = p.returncode
     if rc != 0:
-        print(output)
+        print(output.decode())
         print(err)
         exit(0)
 
@@ -16,7 +23,7 @@ def check_call_quiet(*args, **kwargs):
 def compile_and_run(source_path, build_path_prefix, target, native, use_clang, args):
     flags = [
         "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
-        "-DCMAKE_BUILD_TYPE=Release",
+        "-DCMAKE_BUILD_TYPE=RelWithDebInfo",
         "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
         "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
     ]
@@ -29,7 +36,9 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
                               + [os.path.join(build_path, target)] + args)
     else:
         print(" ".join([os.path.join(build_path, target)] + args))
-        subprocess.check_call([os.path.join(build_path, target)] + args)
+        output = subprocess.check_output([os.path.join(build_path, target)] + args)
+        return output
+
 
 
 if __name__ == '__main__':
@@ -37,16 +46,51 @@ if __name__ == '__main__':
     os.makedirs("data", exist_ok=True)
     os.chdir("data")
     with_double = False
-    extra_args = ["--validate"]
+    extra_args = []#["--validate"]
+
+    total = float(sys.argv[1] if len(sys.argv) > 1 else 1e8)
+    sss= []
+    i = pow(total, 1. / 3.)
+    for _ in range(20):
+        a = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
+        b = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
+
+        c = round(total / a / b)
+        sss.append([str(a), str(b), str(c)])
+
+    functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
+                  "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
+                  "blas"]
+    if total < 1e10:
+        functions.append("naive_reordered")
+    times = [[] for f in functions]
+    output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
     for clang in [True]:
-        for sizes in [["2048", "2048", "999"]]:  # [["873", "1456", "999"]]:
+        for sizes in sss:
             args = list(sizes)
             if with_double:
                 args.append("--double")
             compile_and_run("..", "builds", "generate_random", True, True, args)
             folder = "x".join(sizes)
-            for function in ["block_wise_256_f", "boost_axpy_mul"]:  # ["naive_reordered", "block_wise_256_f", "boost_blocked_mul_64", "boost_blocked_mul_256", "boost_mul", "boost_axpy_mul"]:
+            for fidx, function in enumerate(functions):
                 arguments = [folder, "--algorithm", function]
                 if with_double:
                     arguments.append("--double")
-                compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args)
+                output = compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args)
+                ms = output.decode()[output.decode().find("multiply:") + 10:]
+                if "ms\n" in ms:
+                    ms = float(ms.split("ms\n")[0])
+                else:
+                    ms = float(ms.split("s\n")[0]) * 1000
+                times[fidx].append(ms)
+
+            shutil.rmtree(folder)
+            print(["%.3f" % numpy.mean(ts) for ts in times])
+            with open(output_file + ".cache", "w") as f:
+                json.dump({
+                    "times": times,
+                    "sizes": sss,
+                    "functions": functions,
+                    "means": ["%.3f" % numpy.mean(ts) for ts in times]
+                }, f)
+            os.rename(output_file + ".cache", output_file)
\ No newline at end of file
diff --git a/src/BLASMul.h b/src/BLASMul.h
new file mode 100644
index 0000000..4be61bc
--- /dev/null
+++ b/src/BLASMul.h
@@ -0,0 +1,41 @@
+//
+// Created by oke on 02.07.20.
+//
+
+#ifndef SMID_MATRIX_BLASMUL_H
+#define SMID_MATRIX_BLASMUL_H
+
+#include <cblas/cblas.h>
+#include "Matrix.h"
+
+template<typename T>
+struct blas_functor;
+
+template<>
+struct blas_functor<float> {
+
+    static void multiply(Matrix<float> &C, const Matrix<float> &A, const Matrix<float> &B) {
+        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, C.size1(), C.size2(), A.size2(), 1.,
+                    &A(0, 0), A.size2(), &B(0, 0), B.size2(), 1, &C(0, 0), C.size2());
+    }
+
+};
+
+template<>
+struct blas_functor<double> {
+
+    static void multiply(Matrix<double> &C, const Matrix<double> &A, const Matrix<double> &B) {
+        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, C.size1(), C.size2(), A.size2(), 1,
+                    &A(0, 0), A.size2(), &B(0, 0), B.size2(), 1, &C(0, 0), C.size2());
+    }
+
+};
+
+template<typename T>
+Matrix<T> blas(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    blas_functor<T>::multiply(C, A, B);
+    return C;
+}
+
+#endif //SMID_MATRIX_BLASMUL_H
diff --git a/src/Boost.h b/src/Boost.h
new file mode 100644
index 0000000..c25784e
--- /dev/null
+++ b/src/Boost.h
@@ -0,0 +1,31 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_BOOST_H
+#define SMID_MATRIX_BOOST_H
+
+#include <boost/numeric/ublas/operation_blocked.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) boost_axpy_mul(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    boost::numeric::ublas::axpy_prod(A, B, C);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix<T> &A, const Matrix<T> &B) {
+    return boost::numeric::ublas::block_prod<Matrix<T>, 64>(A, B);
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix<T> &A, const Matrix<T> &B) {
+    return boost::numeric::ublas::block_prod<Matrix<T>, 256>(A, B);
+}
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) boost_mul(const Matrix<T> &A, const Matrix<T> &B) {
+    return boost::numeric::ublas::prod(A, B);
+}
+#endif //SMID_MATRIX_BOOST_H
diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h
new file mode 100644
index 0000000..146864c
--- /dev/null
+++ b/src/DevideAndConquer.h
@@ -0,0 +1,226 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_DEVIDEANDCONQUER_H
+#define SMID_MATRIX_DEVIDEANDCONQUER_H
+#include "Boost.h"
+#include "Naive.h"
+
+enum SplitAction {
+    SplitA, SplitB, SplitBoth, DoNotSplit
+};
+
+size_t splitByMultiple(size_t value, size_t multiple) {
+    auto result = (value / (multiple * 2)) * multiple;
+    return result == 0 ? 1 : result;
+}
+
+template<typename Multiplier, typename M1, typename M2, typename M3>
+void _divide_and_conquer(M1 &C, const M2 &A, const M3 &B) {
+    assert(C.size1() == A.size1() && C.size2() == B.size2());
+    assert(A.size2() == B.size1());
+    size_t n = A.size1();
+    size_t p = A.size2();
+    size_t m = B.size2();
+
+    switch (Multiplier::action(n, p, m)) {
+        case DoNotSplit:
+            constexpr Multiplier multiplier;
+            multiplier(C, A, B);
+            break;
+        case SplitA:
+            {
+                size_t split_index = splitByMultiple(n, Multiplier::SplitMultipleM);
+                if (split_index == 0) split_index = 1;
+                const auto A1 = boost::numeric::ublas::project(A, {0, split_index}, {0, A.size2()});
+                const auto A2 = boost::numeric::ublas::project(A, {split_index, A.size1()}, {0, A.size2()});
+                auto C1 = boost::numeric::ublas::project(C, {0, split_index}, {0, C.size2()});
+                auto C2 = boost::numeric::ublas::project(C, {split_index, C.size1()}, {0, C.size2()});
+                _divide_and_conquer<Multiplier>(C1, A1, B);
+                _divide_and_conquer<Multiplier>(C2, A2, B);
+            }
+            break;
+        case SplitB:
+            {
+                size_t split_index = splitByMultiple(m, Multiplier::SplitMultipleN);
+                if (split_index == 0) split_index = 1;
+                auto B1 = boost::numeric::ublas::project(B, {0, B.size1()}, {0, split_index});
+                auto B2 = boost::numeric::ublas::project(B, {0, B.size1()}, {split_index, B.size2()});
+                auto C1 = boost::numeric::ublas::project(C, {0, C.size1()}, {0, split_index});
+                auto C2 = boost::numeric::ublas::project(C, {0, C.size1()}, {split_index, C.size2()});
+                _divide_and_conquer<Multiplier>(C1, A, B1);
+                _divide_and_conquer<Multiplier>(C2, A, B2);
+            }
+            break;
+        case SplitBoth:
+            {
+                size_t split_index = splitByMultiple(p, Multiplier::SplitMultipleP);
+                auto B1 = boost::numeric::ublas::project(B, {0, split_index}, {0, B.size2()});
+                auto B2 = boost::numeric::ublas::project(B, {split_index, B.size1()}, {0, B.size2()});
+                auto A1 = boost::numeric::ublas::project(A, {0, A.size1()}, {0, split_index});
+                auto A2 = boost::numeric::ublas::project(A, {0, A.size1()}, {split_index, A.size2()});
+                _divide_and_conquer<Multiplier>(C, A1, B1);
+                _divide_and_conquer<Multiplier>(C, A2, B2);
+            }
+            break;
+        default:
+            std::cerr << "Invalid split action" << std::endl;
+    }
+}
+
+template<unsigned NumRows, unsigned NumColumns>
+struct multiplier_naive_functor {
+
+    static constexpr unsigned SplitMultipleN = NumRows;
+    static constexpr unsigned SplitMultipleP = 1;
+    static constexpr unsigned SplitMultipleM = NumColumns;
+
+    static SplitAction action(unsigned m, unsigned p, unsigned n) {
+        size_t max_dim = std::max(std::max(m, n), p);
+        if (m <= NumRows && n <= NumColumns) {
+            return DoNotSplit;
+        } else if (max_dim == m) {
+            return SplitA;
+        } else if (max_dim == n) {
+            return SplitB;
+        } else {
+            return SplitBoth;
+        }
+    }
+
+
+    template<typename M1, typename M2, typename M3>
+    void operator()(M1 &C, const M2 &A, const M3 &B) const {
+        if(C.size1() == NumRows && C.size2() == NumColumns) {
+            for (int i = 0; i < NumRows; i++) {
+                for (int p = 0; p < B.size1(); p++) {
+                    for (int j = 0; j < NumColumns; j++) {
+                        C(i, j) += A(i, p) * B(p, j);
+                    }
+                }
+            }
+            return;
+        }
+        _naive_reordered(C, A, B);
+    }
+};
+
+
+template<typename FloatType>
+struct multiplier_block_wise_256 : block_wise_256<FloatType> {
+
+    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
+    static constexpr unsigned SplitMultipleP = 20;
+    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
+
+    static SplitAction action(unsigned m, unsigned p, unsigned n) {
+        size_t m_p = m / SplitMultipleM;
+        size_t p_p = p / SplitMultipleP;
+        size_t n_p = n / SplitMultipleN;
+        unsigned _SplitMultipleM = SplitMultipleM;
+        unsigned _SplitMultipleP = 1;
+        unsigned _SplitMultipleN = SplitMultipleN;
+
+        auto max_dim = std::max(m_p, std::max(p_p, n_p));
+        if (m <= SplitMultipleM && n <= SplitMultipleN) {
+            return DoNotSplit;
+        } else if (max_dim == m_p) {
+            return SplitA;
+        } else if (max_dim == n_p) {
+            return SplitB;
+        } else if (p > SplitMultipleP){
+            return SplitBoth;
+        } else {
+            return DoNotSplit;
+        }
+    }
+
+};
+
+template<typename FloatType>
+struct multiplier_block_wise_256_alternative : block_wise_256_alternative<FloatType> {
+
+    static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
+    static constexpr unsigned SplitMultipleP = 20;
+    static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
+
+    static SplitAction action(unsigned m, unsigned p, unsigned n) {
+        size_t m_p = m / SplitMultipleM;
+        size_t p_p = p / SplitMultipleP;
+        size_t n_p = n / SplitMultipleN;
+        unsigned _SplitMultipleM = SplitMultipleM;
+        unsigned _SplitMultipleP = 1;
+        unsigned _SplitMultipleN = SplitMultipleN;
+
+        auto max_dim = std::max(m_p, std::max(p_p, n_p));
+        if (m <= SplitMultipleM && n <= SplitMultipleN) {
+            return DoNotSplit;
+        } else if (max_dim == m_p) {
+            return SplitA;
+        } else if (max_dim == n_p) {
+            return SplitB;
+        } else if (p > SplitMultipleP){
+            return SplitBoth;
+        } else {
+            return DoNotSplit;
+        }
+    }
+};
+
+
+template<>
+struct block_wise_256<float>;
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r1(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r2(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r3(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r4(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r5(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block1(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    auto nr = block_wise_256<T>::NumRows;
+    auto nc = block_wise_256<T>::NumColumns;
+    _divide_and_conquer<multiplier_block_wise_256<T>>(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block2(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _divide_and_conquer<multiplier_block_wise_256_alternative<T>>(C, A, B);
+    return C;
+}
+
+#endif //SMID_MATRIX_DEVIDEANDCONQUER_H
diff --git a/src/Naive.h b/src/Naive.h
new file mode 100644
index 0000000..39e4238
--- /dev/null
+++ b/src/Naive.h
@@ -0,0 +1,41 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_NAIVE_H
+#define SMID_MATRIX_NAIVE_H
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) naive(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    assert(A.size2() == B.size1());
+
+    for (int i = 0; i < A.size1(); i++) {
+        for (int j = 0; j < B.size2(); j++) {
+            for (int p = 0; p < B.size1(); p++) {
+                C(i, j) += A(i, p) * B(p, j);
+            }
+        }
+    }
+    return C;
+}
+
+template<typename M1, typename M2, typename M3>
+void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B) {
+    assert(A.size2() == B.size1());
+    for (int i = 0; i < A.size1(); i++) {
+        for (int p = 0; p < B.size1(); p++) {
+            for (int j = 0; j < B.size2(); j++) {
+                C(i, j) += A(i, p) * B(p, j);
+            }
+        }
+    }
+}
+template<typename T>
+Matrix<T> naive_reordered(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    _naive_reordered(C, A, B);
+    return C;
+}
+
+#endif //SMID_MATRIX_NAIVE_H
diff --git a/src/RegisterBlocking.h b/src/RegisterBlocking.h
new file mode 100644
index 0000000..4c49ed6
--- /dev/null
+++ b/src/RegisterBlocking.h
@@ -0,0 +1,290 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_REGISTERBLOCKING_H
+#define SMID_MATRIX_REGISTERBLOCKING_H
+
+#include "Matrix.h"
+#include <immintrin.h>
+
+struct __m256_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m256;
+    static constexpr auto LoadVector = _mm256_loadu_ps;
+    static constexpr auto StoreVector = _mm256_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
+};
+
+struct __m256d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m256d;
+    static constexpr auto LoadVector = _mm256_loadu_pd;
+    static constexpr auto StoreVector = _mm256_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
+};
+
+template<typename BlockWiseConfig>
+struct ExtendedBlockWiseConfig {
+    typedef typename BlockWiseConfig::FloatType FloatType;
+    using VectorType = typename BlockWiseConfig::VectorType;
+    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
+    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
+    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
+    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
+    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
+};
+template<
+        // template parameter as struct: otherwise some warning about losing alignment information warning
+        typename BitWiseConfig,
+        unsigned _NumRows, unsigned _NumColumnVectors
+>
+struct RegisterBlocking {
+    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
+
+    static constexpr auto NumRows = _NumRows;
+    static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+    template<typename M1, typename M2, typename M3>
+    static void __attribute__ ((noinline))  handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+        // AVX2 has 16 registers
+        // should be compiled as registers (total: regA * regB)
+        typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+        // iterate over dot-product terms
+        for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+            // Perform the DOT product
+            for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                    typename bwc::VectorType aa = bwc::BroadcastToVector(&A(aRowOffset + ai, p));
+                    CReg[ai][bi] += aa * bb;
+                }
+            }
+        }
+        // total regA * regB + regB registers
+
+        // Accumulate the results into C.
+        for (int ai = 0; ai < _NumRows; ai++) {
+            for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+            }
+        }
+    }
+
+};
+
+template<typename BitWiseConfig, unsigned Rows>
+struct BestRowRegisterBlocking;
+
+template<typename BitWiseConfig>
+struct BestRowRegisterBlocking<BitWiseConfig, 1> : public RegisterBlocking<BitWiseConfig, 1, 15> {};
+template<typename BitWiseConfig>
+struct BestRowRegisterBlocking<BitWiseConfig, 2> : public RegisterBlocking<BitWiseConfig, 2, 7> {};
+template<typename BitWiseConfig>
+struct BestRowRegisterBlocking<BitWiseConfig, 3> : public RegisterBlocking<BitWiseConfig, 3, 4> {};
+// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
+// C = floor ((16 - R) / R)
+
+template<
+        // template parameter as struct: otherwise some warning about losing alignment information warning
+        typename BlockWiseConfig = __m256_block_wise_config,
+        unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
+>
+struct block_wise {
+
+    typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
+
+    static constexpr unsigned NumRows = InitialNumRows;
+    static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
+
+    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
+
+    template<typename M1, typename M2, typename M3>
+    static void multiply(M1 &C, const M2 &A, const M3 &B) {
+        assert(C.size1() == A.size1() && C.size2() == B.size2());
+        int m = A.size1();
+        int k = A.size2();
+        int n = B.size2();
+        int m_i = 0;
+        for(; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
+            int n_i = 0;
+            for(; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors * bwc::VectorWidth) {
+                RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A, m_i, B, n_i);
+            }
+            // calculate remaining columns (like naive_reordered)
+            if(n_i < n) {
+                for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
+                    for (int p = 0; p < k; ++p) {
+                        for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
+                            C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
+                        }
+                    }
+                }
+            }
+        }
+        // calculate remaining rows (like naive_reordered)
+        for (; m_i < m; ++m_i) {
+            for (int p = 0; p < k; p++) {
+                for (int n_i = 0; n_i < n; ++n_i) {
+                    C(m_i, n_i) += A(m_i, p) * B(p, n_i);
+                }
+            }
+        }
+    }
+
+    template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
+    class iterate_columns {
+        /**
+         * consume all n_i's possible
+         */
+        template<typename M1, typename M2, typename M3>
+        static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
+                    n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
+                RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
+                ::handle_block(k, C, A, m_i, B, n_i);
+            }
+        }
+
+    public:
+
+        template<typename M1, typename M2, typename M3>
+        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            consume(k, m_i, n_i, n, C, A, B);
+            if(CurrentNumColumnVectors > 1) {
+                // try with less column vectors
+                iterate_columns<CurrentNumRows, CurrentNumColumnVectors - 1>::iterate(k, m_i, n_i, n, C, A, B);
+            } else {
+                // do rest of columns manually
+                if(n_i < n) {
+                    for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
+                        for (int p = 0; p < k; ++p) {
+                            for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
+                                C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
+                            }
+                        }
+                    }
+                }
+                n_i = n;
+            }
+            assert(n_i == n);
+        }
+    };
+
+    template<unsigned CurrentNumRows>
+    class iterate_columns<CurrentNumRows, 0> {
+    public:
+        template<typename M1, typename M2, typename M3>
+        static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            throw std::runtime_error("should not be called, this is a bug.");
+        }
+
+    };
+
+    template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
+    class iterate_rows {
+
+        template<typename M1, typename M2, typename M3>
+        static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
+                size_t n_i = 0;
+                iterate_columns<CurrentNumRows, CurrentNumColumnVectors>::iterate(k, m_i, n_i, n, C, A, B);
+                assert(n_i == n);
+            }
+        }
+
+    public:
+
+        template<typename M1, typename M2, typename M3>
+        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            consume(k, m_i, m, n, C, A, B);
+            if(CurrentNumRows > 1) {
+                // try with less num rows vectors
+                iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
+            }
+            assert(m_i == m);
+        }
+    };
+
+    template<unsigned CurrentNumColumnVectors>
+    class iterate_rows<0, CurrentNumColumnVectors> {
+
+    public:
+        template<typename M1, typename M2, typename M3>
+        static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
+            throw std::runtime_error("should not be called, this is a bug.");
+        }
+    };
+
+    template<typename M1, typename M2, typename M3>
+    static void multiply2(M1 &C, const M2 &A, const M3 &B) {
+        assert(C.size1() == A.size1() && C.size2() == B.size2());
+        int m = A.size1();
+        int k = A.size2();
+        int n = B.size2();
+        size_t m_i = 0;
+        iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
+        assert(m_i == m);
+    }
+
+};
+
+template<typename FloatType>
+struct block_wise_256;
+
+template<>
+struct block_wise_256<float> : public block_wise<__m256_block_wise_config> {
+    template<typename M1, typename M2, typename M3>
+    void operator()(M1 &C, const M2 &A, const M3 &B) const {
+        return multiply(C, A, B);
+    }
+
+};
+
+template<>
+struct block_wise_256<double> : public block_wise<__m256d_block_wise_config> {
+    template<typename M1, typename M2, typename M3>
+    void operator()(M1 &C, const M2 &A, const M3 &B) const {
+        return multiply(C, A, B);
+    }
+};
+
+template<typename FloatType>
+struct block_wise_256_alternative;
+
+template<>
+struct block_wise_256_alternative<float> : public block_wise<__m256_block_wise_config> {
+    template<typename M1, typename M2, typename M3>
+    void operator()(M1 &C, const M2 &A, const M3 &B) const {
+        return multiply2(C, A, B);
+    }
+};
+
+template<>
+struct block_wise_256_alternative<double> : public block_wise<__m256d_block_wise_config> {
+    template<typename M1, typename M2, typename M3>
+    void operator()(M1 &C, const M2 &A, const M3 &B) const {
+        return multiply(C, A, B);
+    }
+};
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) block_wise_256_f(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    block_wise_256<T>::multiply(C, A, B);
+    return C;
+}
+
+template<typename T>
+Matrix<T> __attribute__ ((noinline)) block_wise_256_f2(const Matrix<T> &A, const Matrix<T> &B) {
+    Matrix<T> C(A.size1(), B.size2(), 0);
+    block_wise_256<T>::multiply2(C, A, B);
+    return C;
+}
+
+#endif //SMID_MATRIX_REGISTERBLOCKING_H
diff --git a/src/main.cpp b/src/main.cpp
index 04a2af7..e6c1d28 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -2,60 +2,15 @@
 #include <chrono>
 #include <iostream>
 #include <cassert>
-#include <boost/numeric/ublas/operation_blocked.hpp>
-#include <boost/numeric/ublas/operation.hpp>
 #include <boost/program_options.hpp>
-#include <immintrin.h>
-
+#include "RegisterBlocking.h"
+#include "Boost.h"
+#include "Naive.h"
+#include "DevideAndConquer.h"
+#include "BLASMul.h"
 using namespace std::chrono;
 namespace po = boost::program_options;
 
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) naive(const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2());
-    assert(A.size2() == B.size1());
-
-    for (int i = 0; i < A.size1(); i++) {
-        for (int j = 0; j < B.size2(); j++) {
-            for (int p = 0; p < B.size1(); p++) {
-                C(i, j) += A(i, p) * B(p, j);
-            }
-        }
-    }
-
-    return C;
-}
-
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) naive_reordered(const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2());
-    assert(A.size2() == B.size1());
-    for (int i = 0; i < A.size1(); i++) {
-        for (int p = 0; p < B.size1(); p++) {
-            for (int j = 0; j < B.size2(); j++) {
-                C(i, j) += A(i, p) * B(p, j);
-            }
-        }
-    }
-    return C;
-}
-
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) register_blocking(const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2());
-    assert(A.size2() == B.size1());
-
-    for (int i = 0; i < A.size1(); i++) {
-        for (int p = 0; p < B.size1(); p++) {
-            for (int j = 0; j < B.size2(); j++) {
-                C(i, j) += A(i, p) * B(p, j);
-            }
-        }
-    }
-
-    return C;
-}
-
 template<typename T>
 using BinaryMatrixOp = Matrix<T> (*)(const Matrix<T> &A, const Matrix<T> &B);
 
@@ -64,137 +19,15 @@ Matrix<T> run_function(BinaryMatrixOp<T> f, const Matrix<T> &A, const Matrix<T>
     auto a = steady_clock::now();
     auto C = f(A, B);
     auto b = steady_clock::now();
-    std::cout << "multiply: " << std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count() << "ms" << std::endl;
-    return C;
-}
-
-struct __m256_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m256;
-    static constexpr auto LoadVector = _mm256_loadu_ps;
-    static constexpr auto StoreVector = _mm256_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
-};
-
-struct __m256d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m256d;
-    static constexpr auto LoadVector = _mm256_loadu_pd;
-    static constexpr auto StoreVector = _mm256_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
-};
-
-template<
-        // template parameter as struct: otherwise some warning about losing alignment information warning
-        typename BlockWiseConfig = __m256_block_wise_config,
-        unsigned numRows = 3, unsigned numColumns_width = 4
->
-struct block_wise {
-    using FloatType = typename BlockWiseConfig::FloatType;
-    using VectorType = typename BlockWiseConfig::VectorType;
-    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
-    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
-    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
-    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
-    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
-
-    Matrix<FloatType> operator()(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
-        Matrix<FloatType> C(A.size1(), B.size2());
-        int m = A.size1();
-        int k = A.size2();
-        int n = B.size2();
-        int m_i = 0;
-        for(; m_i + numRows <= m; m_i += numRows) { // row
-            int n_i = 0;
-            for(; n_i + numColumns_width * VectorWidth <= n; n_i += numColumns_width * VectorWidth) {
-                handle_block(k, C, A, m_i, B, n_i);
-            }
-            // calculate remaining columns (like naive_reordered)
-            if(n_i < n) {
-                for (auto m_i_o = 0; m_i_o < numRows; ++m_i_o) {
-                    for (int p = 0; p < k; ++p) {
-                        for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
-                            C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
-                        }
-                    }
-                }
-            }
-        }
-        // calculate remaining rows (like naive_reordered)
-        for (; m_i < m; ++m_i) {
-            for (int p = 0; p < k; p++) {
-                for (int n_i = 0; n_i < n; ++n_i) {
-                    C(m_i, n_i) += A(m_i, p) * B(p, n_i);
-                }
-            }
-        }
-        return C;
-
+    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
+    if (ms > 1000) {
+        std::cout << "multiply: " << ms / 1000. << "s" << std::endl;
+    } else {
+        std::cout << "multiply: " << ms << "ms" << std::endl;
     }
-    void __attribute__ ((noinline))  handle_block(int k, Matrix<FloatType> &C, const Matrix<FloatType> &A, int aRowOffset, const Matrix<FloatType> &B, int bColOffset) {
-
-        // AVX2 has 16 registers
-        // should be compiled as registers (total: regA * regB)
-        VectorType CReg[numRows][numColumns_width] = {{0.0}};
-        // iterate over dot-product terms
-        for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
-            // Perform the DOT product
-            for (int bi = 0; bi < numColumns_width; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
-                VectorType bb = LoadVector(&B(p, bColOffset + bi * VectorWidth));
-                for (int ai = 0; ai < numRows; ai++) { // row index in A (handling regsA rows)
-                    VectorType aa = BroadcastToVector(&A(aRowOffset + ai, p));
-                    CReg[ai][bi] += aa * bb;
-                }
-            }
-        }
-        // total regA * regB + regB registers
-
-        // Accumulate the results into C.
-        for (int ai = 0; ai < numRows; ai++) {
-            for (int bi = 0; bi < numColumns_width; bi++) {
-                AddAndStore(&C(aRowOffset + ai, bColOffset + bi * VectorWidth), CReg[ai][bi]);
-            }
-        }
-    }
-
-};
-
-template<typename FloatType>
-struct block_wise_256;
-
-template<>
-struct block_wise_256<float> :
-        public block_wise<__m256_block_wise_config> {};
-template<>
-struct block_wise_256<double> :
-        public block_wise<__m256d_block_wise_config> {};
-
-
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) block_wise_256_f(const Matrix<T> &A, const Matrix<T> &B) {
-    return block_wise_256<T>()(A, B);
-}
-
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) boost_axpy_mul(const Matrix<T> &A, const Matrix<T> &B) {
-    Matrix<T> C(A.size1(), B.size2());
-    boost::numeric::ublas::axpy_prod(A, B, C);
     return C;
 }
 
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix<T> &A, const Matrix<T> &B) {
-    return boost::numeric::ublas::block_prod<Matrix<T>, 64>(A, B);
-}
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix<T> &A, const Matrix<T> &B) {
-    return boost::numeric::ublas::block_prod<Matrix<T>, 256>(A, B);
-}
-template<typename T>
-Matrix<T> __attribute__ ((noinline)) boost_mul(const Matrix<T> &A, const Matrix<T> &B) {
-    return boost::numeric::ublas::prod(A, B);
-}
-
 #define TEST_IF(test_function_name, function, A, B) {\
     if(test_function_name == #function) {\
         C = run_function(function, A, B);\
@@ -216,13 +49,23 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
 
     // use the result to prevent compiler to optimize...
     double use_result = 0;
-    TEST_IF(test_function_name, naive, A, B)
-    TEST_IF(test_function_name, naive_reordered, A, B)
-    TEST_IF(test_function_name, block_wise_256_f, A, B)
-    TEST_IF(test_function_name, boost_axpy_mul, A, B)
+    /*TEST_IF(test_function_name, naive, A, B)
     TEST_IF(test_function_name, boost_blocked_mul_64, A, B)
     TEST_IF(test_function_name, boost_blocked_mul_256, A, B)
     TEST_IF(test_function_name, boost_mul, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)*/
+
+    TEST_IF(test_function_name, naive_reordered, A, B)
+    TEST_IF(test_function_name, block_wise_256_f, A, B)
+    TEST_IF(test_function_name, block_wise_256_f2, A, B)
+    TEST_IF(test_function_name, boost_axpy_mul, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_block1, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_block2, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B)
+    TEST_IF(test_function_name, blas, A, B)
 
     if(validate) {
         std::cout << "Validating matrix" << std::endl;
@@ -231,8 +74,10 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
             throw std::runtime_error("Result matrix has invalid size.");
         for(auto i = 0; i < C2.size1(); ++i) {
             for(auto j = 0; j < C2.size2(); ++j) {
-                if(abs(C(i, j) - C2(i, j)) > 1e-3)
-                    throw std::runtime_error("Result matrix is invalid.");
+                if(abs(C(i, j) - C2(i, j)) > 1e-1) {
+                    std::cerr << i << ", " << j << " is wrong with values " << C(i, j) << ", " << C2(i, j) << std::endl;
+                    exit(-1);
+                }
             }
         }
         std::cout << "Matrix seems fine" << std::endl;
@@ -241,6 +86,8 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
     return use_result == 0 ? -1 : 0;
 }
 
+
+
 int main(int argc, char* argv[]) {
     po::options_description desc("Multiply two matrices");
     desc.add_options()