From 520407e45d711dd949e6f5e2dd97067fb790ef5b Mon Sep 17 00:00:00 2001 From: har0ke Date: Fri, 3 Jul 2020 15:13:40 +0200 Subject: [PATCH] Added Devide and Conqour approach --- CMakeLists.txt | 14 +- plot.py | 46 +++++++ scripts/test.py | 58 ++++++++- src/BLASMul.h | 41 ++++++ src/Boost.h | 31 +++++ src/DevideAndConquer.h | 226 ++++++++++++++++++++++++++++++++ src/Naive.h | 41 ++++++ src/RegisterBlocking.h | 290 +++++++++++++++++++++++++++++++++++++++++ src/main.cpp | 213 +++++------------------------- 9 files changed, 768 insertions(+), 192 deletions(-) create mode 100644 plot.py create mode 100644 src/BLASMul.h create mode 100644 src/Boost.h create mode 100644 src/DevideAndConquer.h create mode 100644 src/Naive.h create mode 100644 src/RegisterBlocking.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f4fa8d..68c2689 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(smid_matrix CXX) include(CheckCXXCompilerFlag) set(CMAKE_CXX_STANDARD 17) option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON) -option(USE_CLANG "Build with clang instead of gcc" OFF) +option(USE_CLANG "Build with clang instead of gcc" ON) set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run") add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}") @@ -15,6 +15,8 @@ else() set(CMAKE_CXX_COMPILER "g++") endif() +set(CMAKE_CXX_FLAGS_RELEASE "-O3") + if(OPTIMIZE_FOR_NATIVE) CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) if(COMPILER_SUPPORTS_MARCH_NATIVE) @@ -31,12 +33,20 @@ else() endif() endif() +set(BLA_VENDER OpenBLAS) +find_package(BLAS REQUIRED) +if(BLAS_FOUND) + include_directories(${BLAS_INCLUDE_DIRS}) +ELSE() + message("OpenBLAS NOT found.") +endif(BLAS_FOUND) + find_package(Boost REQUIRED COMPONENTS filesystem program_options) include_directories("${Boost_INCLUDE_DIRS}") add_library(simple_matrix src/Matrix.cpp) -target_link_libraries(simple_matrix ${Boost_LIBRARIES}) +target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES}) add_executable(simd_multiply src/main.cpp) target_link_libraries(simd_multiply simple_matrix) diff --git a/plot.py b/plot.py new file mode 100644 index 0000000..9c1cfbf --- /dev/null +++ b/plot.py @@ -0,0 +1,46 @@ + +import matplotlib.pyplot as plt +import numpy as np +a = [ + +[1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'], +[5e6, '0.200', '0.100', '0.200', '2.000', '1.800', '4.300', '0.200', '3.400'], +[1e7, '1.400', '1.350', '1.500', '4.000', '3.500', '4.750', '0.600', '7.750'], +[5e7, '7.450', '6.700', '8.100', '13.900', '11.950', '37.550', '2.350', '38.200'], +[1e8, '13.900', '13.150', '13.600', '15.300', '13.600', '100.200', '2.900', '75.300'], +[5e8, '81.100', '76.600', '74.350', '82.200', '75.150', '385.000', '14.600', '377.600'], +[1e9, '172.500', '166.250', '163.900', '127.100', '117.100', '686.450', '35.100', '808.800'], +[5e9, '1228.550', '1208.650', '1048.350', '532.850', '508.050', '2317.350', '127.750', '3998.350'], +[1e10,'2577.050', '2533.500', '2415.700', '922.000', '892.750', '3702.750', '298.550', '8060.550'], +[5e10, '15281.650', '15105.350', '12339.050', '4593.650', '4418.700', '16350.150', '1266.350', '40414.250'], +[1e11, '32127.000', '31768.100', '24630.650', '8223.350', '8152.700', '32005.850', '2425.700', '81039.750'], +[5e11, '249427.000', '248249.333', '124935.833', '39844.500', '39466.167', '138008.000', '11864.500', '402842.250'] + +] + +def get_i(x): + return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x + 1]) + +functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", + "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", + "blas", "naive_reordered"] + +for i, f in list(enumerate(functions)): + xy = get_i(i) + plt.plot(xy[:,0], xy[:,1] / 1000., label=f) + print(xy) +plt.legend() +plt.xscale("log") +plt.xlim((5e9, 1e12)) +plt.show() + + +for i, f in list(enumerate(functions)): + xy = get_i(i) + xy_l = get_i(len(functions) - 1) + plt.plot(xy[:,0], xy[:,1] / xy[:, 1], label=f) + print(xy) +plt.legend() +plt.xscale("log") +plt.xlim((5e9, 1e12)) +plt.show() \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py index 3a4a6ca..b1079b2 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -1,6 +1,13 @@ #!/usr/bin/python +import json import os +import shutil import subprocess +import sys +from datetime import datetime +from pprint import pprint + +import numpy def check_call_quiet(*args, **kwargs): @@ -8,7 +15,7 @@ def check_call_quiet(*args, **kwargs): output, err = p.communicate() rc = p.returncode if rc != 0: - print(output) + print(output.decode()) print(err) exit(0) @@ -16,7 +23,7 @@ def check_call_quiet(*args, **kwargs): def compile_and_run(source_path, build_path_prefix, target, native, use_clang, args): flags = [ "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), - "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_BUILD_TYPE=RelWithDebInfo", "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON" ] @@ -29,7 +36,9 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a + [os.path.join(build_path, target)] + args) else: print(" ".join([os.path.join(build_path, target)] + args)) - subprocess.check_call([os.path.join(build_path, target)] + args) + output = subprocess.check_output([os.path.join(build_path, target)] + args) + return output + if __name__ == '__main__': @@ -37,16 +46,51 @@ if __name__ == '__main__': os.makedirs("data", exist_ok=True) os.chdir("data") with_double = False - extra_args = ["--validate"] + extra_args = []#["--validate"] + + total = float(sys.argv[1] if len(sys.argv) > 1 else 1e8) + sss= [] + i = pow(total, 1. / 3.) + for _ in range(20): + a = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))) + b = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))) + + c = round(total / a / b) + sss.append([str(a), str(b), str(c)]) + + functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", + "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", + "blas"] + if total < 1e10: + functions.append("naive_reordered") + times = [[] for f in functions] + output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") for clang in [True]: - for sizes in [["2048", "2048", "999"]]: # [["873", "1456", "999"]]: + for sizes in sss: args = list(sizes) if with_double: args.append("--double") compile_and_run("..", "builds", "generate_random", True, True, args) folder = "x".join(sizes) - for function in ["block_wise_256_f", "boost_axpy_mul"]: # ["naive_reordered", "block_wise_256_f", "boost_blocked_mul_64", "boost_blocked_mul_256", "boost_mul", "boost_axpy_mul"]: + for fidx, function in enumerate(functions): arguments = [folder, "--algorithm", function] if with_double: arguments.append("--double") - compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args) + output = compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args) + ms = output.decode()[output.decode().find("multiply:") + 10:] + if "ms\n" in ms: + ms = float(ms.split("ms\n")[0]) + else: + ms = float(ms.split("s\n")[0]) * 1000 + times[fidx].append(ms) + + shutil.rmtree(folder) + print(["%.3f" % numpy.mean(ts) for ts in times]) + with open(output_file + ".cache", "w") as f: + json.dump({ + "times": times, + "sizes": sss, + "functions": functions, + "means": ["%.3f" % numpy.mean(ts) for ts in times] + }, f) + os.rename(output_file + ".cache", output_file) \ No newline at end of file diff --git a/src/BLASMul.h b/src/BLASMul.h new file mode 100644 index 0000000..4be61bc --- /dev/null +++ b/src/BLASMul.h @@ -0,0 +1,41 @@ +// +// Created by oke on 02.07.20. +// + +#ifndef SMID_MATRIX_BLASMUL_H +#define SMID_MATRIX_BLASMUL_H + +#include +#include "Matrix.h" + +template +struct blas_functor; + +template<> +struct blas_functor { + + static void multiply(Matrix &C, const Matrix &A, const Matrix &B) { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, C.size1(), C.size2(), A.size2(), 1., + &A(0, 0), A.size2(), &B(0, 0), B.size2(), 1, &C(0, 0), C.size2()); + } + +}; + +template<> +struct blas_functor { + + static void multiply(Matrix &C, const Matrix &A, const Matrix &B) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, C.size1(), C.size2(), A.size2(), 1, + &A(0, 0), A.size2(), &B(0, 0), B.size2(), 1, &C(0, 0), C.size2()); + } + +}; + +template +Matrix blas(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + blas_functor::multiply(C, A, B); + return C; +} + +#endif //SMID_MATRIX_BLASMUL_H diff --git a/src/Boost.h b/src/Boost.h new file mode 100644 index 0000000..c25784e --- /dev/null +++ b/src/Boost.h @@ -0,0 +1,31 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_BOOST_H +#define SMID_MATRIX_BOOST_H + +#include +#include + +template +Matrix __attribute__ ((noinline)) boost_axpy_mul(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + boost::numeric::ublas::axpy_prod(A, B, C); + return C; +} + +template +Matrix __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::block_prod, 64>(A, B); +} + +template +Matrix __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::block_prod, 256>(A, B); +} +template +Matrix __attribute__ ((noinline)) boost_mul(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::prod(A, B); +} +#endif //SMID_MATRIX_BOOST_H diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h new file mode 100644 index 0000000..146864c --- /dev/null +++ b/src/DevideAndConquer.h @@ -0,0 +1,226 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_DEVIDEANDCONQUER_H +#define SMID_MATRIX_DEVIDEANDCONQUER_H +#include "Boost.h" +#include "Naive.h" + +enum SplitAction { + SplitA, SplitB, SplitBoth, DoNotSplit +}; + +size_t splitByMultiple(size_t value, size_t multiple) { + auto result = (value / (multiple * 2)) * multiple; + return result == 0 ? 1 : result; +} + +template +void _divide_and_conquer(M1 &C, const M2 &A, const M3 &B) { + assert(C.size1() == A.size1() && C.size2() == B.size2()); + assert(A.size2() == B.size1()); + size_t n = A.size1(); + size_t p = A.size2(); + size_t m = B.size2(); + + switch (Multiplier::action(n, p, m)) { + case DoNotSplit: + constexpr Multiplier multiplier; + multiplier(C, A, B); + break; + case SplitA: + { + size_t split_index = splitByMultiple(n, Multiplier::SplitMultipleM); + if (split_index == 0) split_index = 1; + const auto A1 = boost::numeric::ublas::project(A, {0, split_index}, {0, A.size2()}); + const auto A2 = boost::numeric::ublas::project(A, {split_index, A.size1()}, {0, A.size2()}); + auto C1 = boost::numeric::ublas::project(C, {0, split_index}, {0, C.size2()}); + auto C2 = boost::numeric::ublas::project(C, {split_index, C.size1()}, {0, C.size2()}); + _divide_and_conquer(C1, A1, B); + _divide_and_conquer(C2, A2, B); + } + break; + case SplitB: + { + size_t split_index = splitByMultiple(m, Multiplier::SplitMultipleN); + if (split_index == 0) split_index = 1; + auto B1 = boost::numeric::ublas::project(B, {0, B.size1()}, {0, split_index}); + auto B2 = boost::numeric::ublas::project(B, {0, B.size1()}, {split_index, B.size2()}); + auto C1 = boost::numeric::ublas::project(C, {0, C.size1()}, {0, split_index}); + auto C2 = boost::numeric::ublas::project(C, {0, C.size1()}, {split_index, C.size2()}); + _divide_and_conquer(C1, A, B1); + _divide_and_conquer(C2, A, B2); + } + break; + case SplitBoth: + { + size_t split_index = splitByMultiple(p, Multiplier::SplitMultipleP); + auto B1 = boost::numeric::ublas::project(B, {0, split_index}, {0, B.size2()}); + auto B2 = boost::numeric::ublas::project(B, {split_index, B.size1()}, {0, B.size2()}); + auto A1 = boost::numeric::ublas::project(A, {0, A.size1()}, {0, split_index}); + auto A2 = boost::numeric::ublas::project(A, {0, A.size1()}, {split_index, A.size2()}); + _divide_and_conquer(C, A1, B1); + _divide_and_conquer(C, A2, B2); + } + break; + default: + std::cerr << "Invalid split action" << std::endl; + } +} + +template +struct multiplier_naive_functor { + + static constexpr unsigned SplitMultipleN = NumRows; + static constexpr unsigned SplitMultipleP = 1; + static constexpr unsigned SplitMultipleM = NumColumns; + + static SplitAction action(unsigned m, unsigned p, unsigned n) { + size_t max_dim = std::max(std::max(m, n), p); + if (m <= NumRows && n <= NumColumns) { + return DoNotSplit; + } else if (max_dim == m) { + return SplitA; + } else if (max_dim == n) { + return SplitB; + } else { + return SplitBoth; + } + } + + + template + void operator()(M1 &C, const M2 &A, const M3 &B) const { + if(C.size1() == NumRows && C.size2() == NumColumns) { + for (int i = 0; i < NumRows; i++) { + for (int p = 0; p < B.size1(); p++) { + for (int j = 0; j < NumColumns; j++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } + return; + } + _naive_reordered(C, A, B); + } +}; + + +template +struct multiplier_block_wise_256 : block_wise_256 { + + static constexpr unsigned SplitMultipleM = 5 * block_wise_256::NumRows; + static constexpr unsigned SplitMultipleP = 20; + static constexpr unsigned SplitMultipleN = block_wise_256::NumColumns; + + static SplitAction action(unsigned m, unsigned p, unsigned n) { + size_t m_p = m / SplitMultipleM; + size_t p_p = p / SplitMultipleP; + size_t n_p = n / SplitMultipleN; + unsigned _SplitMultipleM = SplitMultipleM; + unsigned _SplitMultipleP = 1; + unsigned _SplitMultipleN = SplitMultipleN; + + auto max_dim = std::max(m_p, std::max(p_p, n_p)); + if (m <= SplitMultipleM && n <= SplitMultipleN) { + return DoNotSplit; + } else if (max_dim == m_p) { + return SplitA; + } else if (max_dim == n_p) { + return SplitB; + } else if (p > SplitMultipleP){ + return SplitBoth; + } else { + return DoNotSplit; + } + } + +}; + +template +struct multiplier_block_wise_256_alternative : block_wise_256_alternative { + + static constexpr unsigned SplitMultipleM = 5 * block_wise_256::NumRows; + static constexpr unsigned SplitMultipleP = 20; + static constexpr unsigned SplitMultipleN = block_wise_256::NumColumns; + + static SplitAction action(unsigned m, unsigned p, unsigned n) { + size_t m_p = m / SplitMultipleM; + size_t p_p = p / SplitMultipleP; + size_t n_p = n / SplitMultipleN; + unsigned _SplitMultipleM = SplitMultipleM; + unsigned _SplitMultipleP = 1; + unsigned _SplitMultipleN = SplitMultipleN; + + auto max_dim = std::max(m_p, std::max(p_p, n_p)); + if (m <= SplitMultipleM && n <= SplitMultipleN) { + return DoNotSplit; + } else if (max_dim == m_p) { + return SplitA; + } else if (max_dim == n_p) { + return SplitB; + } else if (p > SplitMultipleP){ + return SplitBoth; + } else { + return DoNotSplit; + } + } +}; + + +template<> +struct block_wise_256; + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_naive_r1(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_naive_r2(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_naive_r3(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_naive_r4(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_naive_r5(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_block1(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + auto nr = block_wise_256::NumRows; + auto nc = block_wise_256::NumColumns; + _divide_and_conquer>(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) divide_and_conquer_block2(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _divide_and_conquer>(C, A, B); + return C; +} + +#endif //SMID_MATRIX_DEVIDEANDCONQUER_H diff --git a/src/Naive.h b/src/Naive.h new file mode 100644 index 0000000..39e4238 --- /dev/null +++ b/src/Naive.h @@ -0,0 +1,41 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_NAIVE_H +#define SMID_MATRIX_NAIVE_H + +template +Matrix __attribute__ ((noinline)) naive(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + assert(A.size2() == B.size1()); + + for (int i = 0; i < A.size1(); i++) { + for (int j = 0; j < B.size2(); j++) { + for (int p = 0; p < B.size1(); p++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } + return C; +} + +template +void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B) { + assert(A.size2() == B.size1()); + for (int i = 0; i < A.size1(); i++) { + for (int p = 0; p < B.size1(); p++) { + for (int j = 0; j < B.size2(); j++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } +} +template +Matrix naive_reordered(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + _naive_reordered(C, A, B); + return C; +} + +#endif //SMID_MATRIX_NAIVE_H diff --git a/src/RegisterBlocking.h b/src/RegisterBlocking.h new file mode 100644 index 0000000..4c49ed6 --- /dev/null +++ b/src/RegisterBlocking.h @@ -0,0 +1,290 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_REGISTERBLOCKING_H +#define SMID_MATRIX_REGISTERBLOCKING_H + +#include "Matrix.h" +#include + +struct __m256_block_wise_config { + using FloatType = float; + using VectorType = __m256; + static constexpr auto LoadVector = _mm256_loadu_ps; + static constexpr auto StoreVector = _mm256_storeu_ps; + static constexpr auto BroadcastToVector = _mm256_broadcast_ss; +}; + +struct __m256d_block_wise_config { + using FloatType = double; + using VectorType = __m256d; + static constexpr auto LoadVector = _mm256_loadu_pd; + static constexpr auto StoreVector = _mm256_storeu_pd; + static constexpr auto BroadcastToVector = _mm256_broadcast_sd; +}; + +template +struct ExtendedBlockWiseConfig { + typedef typename BlockWiseConfig::FloatType FloatType; + using VectorType = typename BlockWiseConfig::VectorType; + static constexpr auto LoadVector = BlockWiseConfig::LoadVector; + static constexpr auto StoreVector = BlockWiseConfig::StoreVector; + static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector; + static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType); + static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); }; +}; +template< + // template parameter as struct: otherwise some warning about losing alignment information warning + typename BitWiseConfig, + unsigned _NumRows, unsigned _NumColumnVectors +> +struct RegisterBlocking { + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); }; + + static constexpr auto NumRows = _NumRows; + static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + // AVX2 has 16 registers + // should be compiled as registers (total: regA * regB) + typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}}; + // iterate over dot-product terms + for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) + // Perform the DOT product + for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) + typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth)); + for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows) + typename bwc::VectorType aa = bwc::BroadcastToVector(&A(aRowOffset + ai, p)); + CReg[ai][bi] += aa * bb; + } + } + } + // total regA * regB + regB registers + + // Accumulate the results into C. + for (int ai = 0; ai < _NumRows; ai++) { + for (int bi = 0; bi < _NumColumnVectors; bi++) { + AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]); + } + } + } + +}; + +template +struct BestRowRegisterBlocking; + +template +struct BestRowRegisterBlocking : public RegisterBlocking {}; +template +struct BestRowRegisterBlocking : public RegisterBlocking {}; +template +struct BestRowRegisterBlocking : public RegisterBlocking {}; +// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16 +// C = floor ((16 - R) / R) + +template< + // template parameter as struct: otherwise some warning about losing alignment information warning + typename BlockWiseConfig = __m256_block_wise_config, + unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4 +> +struct block_wise { + + typedef ExtendedBlockWiseConfig bwc; + + static constexpr unsigned NumRows = InitialNumRows; + static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); }; + + template + static void multiply(M1 &C, const M2 &A, const M3 &B) { + assert(C.size1() == A.size1() && C.size2() == B.size2()); + int m = A.size1(); + int k = A.size2(); + int n = B.size2(); + int m_i = 0; + for(; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row + int n_i = 0; + for(; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors * bwc::VectorWidth) { + RegisterBlocking::handle_block(k, C, A, m_i, B, n_i); + } + // calculate remaining columns (like naive_reordered) + if(n_i < n) { + for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) { + for (int p = 0; p < k; ++p) { + for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) { + C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest); + } + } + } + } + } + // calculate remaining rows (like naive_reordered) + for (; m_i < m; ++m_i) { + for (int p = 0; p < k; p++) { + for (int n_i = 0; n_i < n; ++n_i) { + C(m_i, n_i) += A(m_i, p) * B(p, n_i); + } + } + } + } + + template + class iterate_columns { + /** + * consume all n_i's possible + */ + template + static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) { + for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n; + n_i += CurrentNumColumnVectors * bwc::VectorWidth) { + RegisterBlocking + ::handle_block(k, C, A, m_i, B, n_i); + } + } + + public: + + template + static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) { + consume(k, m_i, n_i, n, C, A, B); + if(CurrentNumColumnVectors > 1) { + // try with less column vectors + iterate_columns::iterate(k, m_i, n_i, n, C, A, B); + } else { + // do rest of columns manually + if(n_i < n) { + for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) { + for (int p = 0; p < k; ++p) { + for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) { + C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest); + } + } + } + } + n_i = n; + } + assert(n_i == n); + } + }; + + template + class iterate_columns { + public: + template + static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) { + throw std::runtime_error("should not be called, this is a bug."); + } + + }; + + template + class iterate_rows { + + template + static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) { + for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) { + size_t n_i = 0; + iterate_columns::iterate(k, m_i, n_i, n, C, A, B); + assert(n_i == n); + } + } + + public: + + template + static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) { + consume(k, m_i, m, n, C, A, B); + if(CurrentNumRows > 1) { + // try with less num rows vectors + iterate_rows::iterate(k, m_i, m, n, C, A, B); + } + assert(m_i == m); + } + }; + + template + class iterate_rows<0, CurrentNumColumnVectors> { + + public: + template + static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) { + throw std::runtime_error("should not be called, this is a bug."); + } + }; + + template + static void multiply2(M1 &C, const M2 &A, const M3 &B) { + assert(C.size1() == A.size1() && C.size2() == B.size2()); + int m = A.size1(); + int k = A.size2(); + int n = B.size2(); + size_t m_i = 0; + iterate_rows::iterate(k, m_i, m, n, C, A, B); + assert(m_i == m); + } + +}; + +template +struct block_wise_256; + +template<> +struct block_wise_256 : public block_wise<__m256_block_wise_config> { + template + void operator()(M1 &C, const M2 &A, const M3 &B) const { + return multiply(C, A, B); + } + +}; + +template<> +struct block_wise_256 : public block_wise<__m256d_block_wise_config> { + template + void operator()(M1 &C, const M2 &A, const M3 &B) const { + return multiply(C, A, B); + } +}; + +template +struct block_wise_256_alternative; + +template<> +struct block_wise_256_alternative : public block_wise<__m256_block_wise_config> { + template + void operator()(M1 &C, const M2 &A, const M3 &B) const { + return multiply2(C, A, B); + } +}; + +template<> +struct block_wise_256_alternative : public block_wise<__m256d_block_wise_config> { + template + void operator()(M1 &C, const M2 &A, const M3 &B) const { + return multiply(C, A, B); + } +}; + +template +Matrix __attribute__ ((noinline)) block_wise_256_f(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + block_wise_256::multiply(C, A, B); + return C; +} + +template +Matrix __attribute__ ((noinline)) block_wise_256_f2(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2(), 0); + block_wise_256::multiply2(C, A, B); + return C; +} + +#endif //SMID_MATRIX_REGISTERBLOCKING_H diff --git a/src/main.cpp b/src/main.cpp index 04a2af7..e6c1d28 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,60 +2,15 @@ #include #include #include -#include -#include #include -#include - +#include "RegisterBlocking.h" +#include "Boost.h" +#include "Naive.h" +#include "DevideAndConquer.h" +#include "BLASMul.h" using namespace std::chrono; namespace po = boost::program_options; -template -Matrix __attribute__ ((noinline)) naive(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2()); - assert(A.size2() == B.size1()); - - for (int i = 0; i < A.size1(); i++) { - for (int j = 0; j < B.size2(); j++) { - for (int p = 0; p < B.size1(); p++) { - C(i, j) += A(i, p) * B(p, j); - } - } - } - - return C; -} - -template -Matrix __attribute__ ((noinline)) naive_reordered(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2()); - assert(A.size2() == B.size1()); - for (int i = 0; i < A.size1(); i++) { - for (int p = 0; p < B.size1(); p++) { - for (int j = 0; j < B.size2(); j++) { - C(i, j) += A(i, p) * B(p, j); - } - } - } - return C; -} - -template -Matrix __attribute__ ((noinline)) register_blocking(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2()); - assert(A.size2() == B.size1()); - - for (int i = 0; i < A.size1(); i++) { - for (int p = 0; p < B.size1(); p++) { - for (int j = 0; j < B.size2(); j++) { - C(i, j) += A(i, p) * B(p, j); - } - } - } - - return C; -} - template using BinaryMatrixOp = Matrix (*)(const Matrix &A, const Matrix &B); @@ -64,137 +19,15 @@ Matrix run_function(BinaryMatrixOp f, const Matrix &A, const Matrix auto a = steady_clock::now(); auto C = f(A, B); auto b = steady_clock::now(); - std::cout << "multiply: " << std::chrono::duration_cast(b - a).count() << "ms" << std::endl; - return C; -} - -struct __m256_block_wise_config { - using FloatType = float; - using VectorType = __m256; - static constexpr auto LoadVector = _mm256_loadu_ps; - static constexpr auto StoreVector = _mm256_storeu_ps; - static constexpr auto BroadcastToVector = _mm256_broadcast_ss; -}; - -struct __m256d_block_wise_config { - using FloatType = double; - using VectorType = __m256d; - static constexpr auto LoadVector = _mm256_loadu_pd; - static constexpr auto StoreVector = _mm256_storeu_pd; - static constexpr auto BroadcastToVector = _mm256_broadcast_sd; -}; - -template< - // template parameter as struct: otherwise some warning about losing alignment information warning - typename BlockWiseConfig = __m256_block_wise_config, - unsigned numRows = 3, unsigned numColumns_width = 4 -> -struct block_wise { - using FloatType = typename BlockWiseConfig::FloatType; - using VectorType = typename BlockWiseConfig::VectorType; - static constexpr auto LoadVector = BlockWiseConfig::LoadVector; - static constexpr auto StoreVector = BlockWiseConfig::StoreVector; - static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector; - static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType); - static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); }; - - Matrix operator()(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2()); - int m = A.size1(); - int k = A.size2(); - int n = B.size2(); - int m_i = 0; - for(; m_i + numRows <= m; m_i += numRows) { // row - int n_i = 0; - for(; n_i + numColumns_width * VectorWidth <= n; n_i += numColumns_width * VectorWidth) { - handle_block(k, C, A, m_i, B, n_i); - } - // calculate remaining columns (like naive_reordered) - if(n_i < n) { - for (auto m_i_o = 0; m_i_o < numRows; ++m_i_o) { - for (int p = 0; p < k; ++p) { - for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) { - C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest); - } - } - } - } - } - // calculate remaining rows (like naive_reordered) - for (; m_i < m; ++m_i) { - for (int p = 0; p < k; p++) { - for (int n_i = 0; n_i < n; ++n_i) { - C(m_i, n_i) += A(m_i, p) * B(p, n_i); - } - } - } - return C; - + auto ms = std::chrono::duration_cast(b - a).count(); + if (ms > 1000) { + std::cout << "multiply: " << ms / 1000. << "s" << std::endl; + } else { + std::cout << "multiply: " << ms << "ms" << std::endl; } - void __attribute__ ((noinline)) handle_block(int k, Matrix &C, const Matrix &A, int aRowOffset, const Matrix &B, int bColOffset) { - - // AVX2 has 16 registers - // should be compiled as registers (total: regA * regB) - VectorType CReg[numRows][numColumns_width] = {{0.0}}; - // iterate over dot-product terms - for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) - // Perform the DOT product - for (int bi = 0; bi < numColumns_width; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) - VectorType bb = LoadVector(&B(p, bColOffset + bi * VectorWidth)); - for (int ai = 0; ai < numRows; ai++) { // row index in A (handling regsA rows) - VectorType aa = BroadcastToVector(&A(aRowOffset + ai, p)); - CReg[ai][bi] += aa * bb; - } - } - } - // total regA * regB + regB registers - - // Accumulate the results into C. - for (int ai = 0; ai < numRows; ai++) { - for (int bi = 0; bi < numColumns_width; bi++) { - AddAndStore(&C(aRowOffset + ai, bColOffset + bi * VectorWidth), CReg[ai][bi]); - } - } - } - -}; - -template -struct block_wise_256; - -template<> -struct block_wise_256 : - public block_wise<__m256_block_wise_config> {}; -template<> -struct block_wise_256 : - public block_wise<__m256d_block_wise_config> {}; - - -template -Matrix __attribute__ ((noinline)) block_wise_256_f(const Matrix &A, const Matrix &B) { - return block_wise_256()(A, B); -} - -template -Matrix __attribute__ ((noinline)) boost_axpy_mul(const Matrix &A, const Matrix &B) { - Matrix C(A.size1(), B.size2()); - boost::numeric::ublas::axpy_prod(A, B, C); return C; } -template -Matrix __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix &A, const Matrix &B) { - return boost::numeric::ublas::block_prod, 64>(A, B); -} -template -Matrix __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix &A, const Matrix &B) { - return boost::numeric::ublas::block_prod, 256>(A, B); -} -template -Matrix __attribute__ ((noinline)) boost_mul(const Matrix &A, const Matrix &B) { - return boost::numeric::ublas::prod(A, B); -} - #define TEST_IF(test_function_name, function, A, B) {\ if(test_function_name == #function) {\ C = run_function(function, A, B);\ @@ -216,13 +49,23 @@ int main_work(const std::string &test_function_name, const std::string &input_fo // use the result to prevent compiler to optimize... double use_result = 0; - TEST_IF(test_function_name, naive, A, B) - TEST_IF(test_function_name, naive_reordered, A, B) - TEST_IF(test_function_name, block_wise_256_f, A, B) - TEST_IF(test_function_name, boost_axpy_mul, A, B) + /*TEST_IF(test_function_name, naive, A, B) TEST_IF(test_function_name, boost_blocked_mul_64, A, B) TEST_IF(test_function_name, boost_blocked_mul_256, A, B) TEST_IF(test_function_name, boost_mul, A, B) + TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B) + TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B) + TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B) + TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)*/ + + TEST_IF(test_function_name, naive_reordered, A, B) + TEST_IF(test_function_name, block_wise_256_f, A, B) + TEST_IF(test_function_name, block_wise_256_f2, A, B) + TEST_IF(test_function_name, boost_axpy_mul, A, B) + TEST_IF(test_function_name, divide_and_conquer_block1, A, B) + TEST_IF(test_function_name, divide_and_conquer_block2, A, B) + TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B) + TEST_IF(test_function_name, blas, A, B) if(validate) { std::cout << "Validating matrix" << std::endl; @@ -231,8 +74,10 @@ int main_work(const std::string &test_function_name, const std::string &input_fo throw std::runtime_error("Result matrix has invalid size."); for(auto i = 0; i < C2.size1(); ++i) { for(auto j = 0; j < C2.size2(); ++j) { - if(abs(C(i, j) - C2(i, j)) > 1e-3) - throw std::runtime_error("Result matrix is invalid."); + if(abs(C(i, j) - C2(i, j)) > 1e-1) { + std::cerr << i << ", " << j << " is wrong with values " << C(i, j) << ", " << C2(i, j) << std::endl; + exit(-1); + } } } std::cout << "Matrix seems fine" << std::endl; @@ -241,6 +86,8 @@ int main_work(const std::string &test_function_name, const std::string &input_fo return use_result == 0 ? -1 : 0; } + + int main(int argc, char* argv[]) { po::options_description desc("Multiply two matrices"); desc.add_options()