From 8634a597ba40c517ce9c5ccb11394d1f70cd6638 Mon Sep 17 00:00:00 2001 From: har0ke Date: Sun, 5 Jul 2020 02:28:29 +0200 Subject: [PATCH] Reorganize register blocking --- CMakeLists.txt | 3 +- plot.py | 51 + scripts/dumpall.py | 10 + scripts/test.py | 23 +- src/DevideAndConquer.h | 23 +- src/main.cpp | 2 +- src/register_blocking/BlockWise.h | 111 ++ .../BlockWiseImpl.h} | 189 +-- .../detail/ExtendedBlockWiseConfig.h | 51 + .../detail/RegisterBlocking.h | 155 +++ .../detail/RegisterBlockingManual.h | 80 ++ test.pdf | Bin 0 -> 12322 bytes test.svg | 1195 +++++++++++++++++ 13 files changed, 1688 insertions(+), 205 deletions(-) create mode 100755 scripts/dumpall.py create mode 100644 src/register_blocking/BlockWise.h rename src/register_blocking/{RegisterBlocking.h => detail/BlockWiseImpl.h} (50%) create mode 100644 src/register_blocking/detail/ExtendedBlockWiseConfig.h create mode 100644 src/register_blocking/detail/RegisterBlocking.h create mode 100644 src/register_blocking/detail/RegisterBlockingManual.h create mode 100644 test.pdf create mode 100644 test.svg diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a71546..d1a2671 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,8 @@ else() set(CMAKE_CXX_COMPILER "g++") endif() -set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") if(OPTIMIZE_FOR_NATIVE) CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) diff --git a/plot.py b/plot.py index 3a4dd38..0035254 100644 --- a/plot.py +++ b/plot.py @@ -1,6 +1,57 @@ import matplotlib.pyplot as plt import numpy as np +import os +import json +from pprint import pprint + +folder = os.path.join(os.path.dirname(__file__), "data") + +files = list(reversed(sorted([os.path.join(folder, fn) for fn in os.listdir(folder) if fn.endswith(".json")]))) + + +each_size = { + +} +functions = set() +for fn in files: + with open(fn) as f: + data =json.load(f) + if data["total_size"] not in each_size: + each_size[data["total_size"]] = {v: k for k, v in zip(data["means"], data["functions"])} + functions = functions.union(set(data["functions"])) +pprint(each_size) +pprint(functions) + +for function in functions: + #if function.startswith("divide") and "naive" in function and not (function.endswith("r2") or function.endswith("r3")): + # continue + #if not function.startswith("divide_and_conquer_naive"): + # continue + if function.startswith("block_wise"): + continue + x = [] + y = [] + for size in sorted(each_size.keys()): + values = each_size[size] + if function in values: + x.append(pow(size, 1.)) + y.append(float(values[function]) / 1000.) + print(x, y) + plt.plot(x, y, label=function) + +from matplotlib import rc + +plt.legend() +plt.xscale("log") +plt.grid() +plt.xlim((1e10, 5.3e11)) +plt.ylim((0, 160)) +plt.savefig("test.pdf") +plt.xlabel("n * p * m") +plt.ylabel("seconds") +plt.show() +exit(0) a = [ [1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'], diff --git a/scripts/dumpall.py b/scripts/dumpall.py new file mode 100755 index 0000000..d7c64bf --- /dev/null +++ b/scripts/dumpall.py @@ -0,0 +1,10 @@ +#!/usr/bin/python +import os +import sys +import subprocess +for e in range(8, 13): + for p in [1, 5]: + number = str(float(str(p) + "e" + str(e))) + args = [os.path.join(os.path.dirname(__file__), "test.py"), number] + sys.argv[1:] + print(" ".join(args)) + subprocess.check_call(args) \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py index ef480f2..1cd4254 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -21,10 +21,11 @@ def check_call_quiet(*args, **kwargs): exit(0) -def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, args): +def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): + flags = [ "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), - "-DCMAKE_BUILD_TYPE=RelWithDebInfo", + "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"), "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON", "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"), @@ -42,8 +43,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a output = subprocess.check_output([os.path.join(build_path, target)] + args) return output -very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"] -slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"] +very_slow_functions = ["naive_reordered"] +slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"] normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"] fast_functions = ["divide_and_conquer_block_avx2", "blas"] avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"] @@ -65,6 +66,7 @@ if __name__ == '__main__': parser.add_argument("--double", action="store_true") parser.add_argument("--gcc", action="store_true") parser.add_argument("--function", type=str, nargs="*") + parser.add_argument("--release", action="store_true") options = parser.parse_args() @@ -84,14 +86,14 @@ if __name__ == '__main__': if options.function: functions = options.function - + # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"] extra_args = [] if options.validate: extra_args.append("--validate") if options.double: extra_args.append("--double") - total = options.total_size + total = int(options.total_size) matrix_combinations = [] i = pow(total, 1. / 3.) for _ in range(20): @@ -99,19 +101,19 @@ if __name__ == '__main__': b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))) c = int(round(total / a / b)) matrix_combinations.append([str(a), str(b), str(c)]) - + times = [[] for f in functions] output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") clang = not options.gcc for sizes in matrix_combinations: args = list(sizes) - compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args) + compile_and_run("..", "builds", "generate_random", True, True, options.avx512, options.release, args) folder = "x".join(sizes) for fidx, function in enumerate(functions): arguments = [folder, "--algorithm", function] if with_double: arguments.append("--double") - output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args) + output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args) ms = output.decode()[output.decode().find("multiply:") + 10:] if "ms\n" in ms: ms = float(ms.split("ms\n")[0]) @@ -127,6 +129,7 @@ if __name__ == '__main__': "times": times, "sizes": matrix_combinations, "functions": functions, - "means": ["%.3f" % numpy.mean(ts) for ts in times] + "means": ["%.3f" % numpy.mean(ts) for ts in times], + "total_size": total }, f) os.rename(output_file + ".cache", output_file) \ No newline at end of file diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h index 425db9d..5557818 100644 --- a/src/DevideAndConquer.h +++ b/src/DevideAndConquer.h @@ -4,6 +4,8 @@ #ifndef SMID_MATRIX_DEVIDEANDCONQUER_H #define SMID_MATRIX_DEVIDEANDCONQUER_H + +#include "register_blocking/BlockWise.h" #include "Boost.h" #include "Naive.h" @@ -107,14 +109,14 @@ struct multiplier_naive_functor { }; -template +template struct multiplier_block_wise : block_wise { typedef block_wise Base; - static constexpr unsigned SplitMultipleM = 5 * Base::NumRows; - static constexpr unsigned SplitMultipleP = 20; - static constexpr unsigned SplitMultipleN = Base::NumColumns; + static constexpr unsigned SplitMultipleM = RowMultiplier * Base::NumRows; + static constexpr unsigned SplitMultipleP = PMultiplier; + static constexpr unsigned SplitMultipleN = ColumnMultiplier * Base::NumColumns; static SplitAction action(unsigned m, unsigned p, unsigned n) { size_t m_p = m / SplitMultipleM; @@ -143,27 +145,27 @@ struct multiplier_block_wise : block_wise { template void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } template void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } template void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } template void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } template void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } template @@ -174,10 +176,9 @@ void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix &C, const template void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } - #ifdef WITH_AVX512 template void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix &C, const Matrix &A, const Matrix &B) { diff --git a/src/main.cpp b/src/main.cpp index acec4cc..e3868f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,7 +3,7 @@ #include #include #include -#include "register_blocking/RegisterBlocking.h" +#include "register_blocking/BlockWise.h" #include "Boost.h" #include "Naive.h" #include "DevideAndConquer.h" diff --git a/src/register_blocking/BlockWise.h b/src/register_blocking/BlockWise.h new file mode 100644 index 0000000..4ab8827 --- /dev/null +++ b/src/register_blocking/BlockWise.h @@ -0,0 +1,111 @@ +// +// Created by oke on 04.07.20. +// + +#ifndef SMID_MATRIX_BLOCKWISE_H +#define SMID_MATRIX_BLOCKWISE_H + +#include "detail/BlockWiseImpl.h" + +#include + +struct __m128_block_wise_config { + using FloatType = float; + using VectorType = __m128; + static constexpr auto LoadVector = _mm_loadu_ps; + static constexpr auto StoreVector = _mm_storeu_ps; + static constexpr auto BroadcastToVector = _mm_set1_ps; + static constexpr auto XOR = _mm_xor_ps; + static constexpr unsigned Registers = 16; +}; + +struct __m128d_block_wise_config { + using FloatType = double; + using VectorType = __m128d; + static constexpr auto LoadVector = _mm_loadu_pd; + static constexpr auto StoreVector = _mm_storeu_pd; + static constexpr auto BroadcastToVector = _mm_set1_pd; + static constexpr auto XOR = _mm_xor_pd; + static constexpr unsigned Registers = 16; +}; + +struct __m256_block_wise_config { + using FloatType = float; + using VectorType = __m256; + static constexpr auto LoadVector = _mm256_loadu_ps; + static constexpr auto StoreVector = _mm256_storeu_ps; + static constexpr auto BroadcastToVector = _mm256_set1_ps; + static constexpr auto XOR = _mm256_xor_ps; + static constexpr unsigned Registers = 16; +}; + +struct __m256d_block_wise_config { + using FloatType = double; + using VectorType = __m256d; + static constexpr auto LoadVector = _mm256_loadu_pd; + static constexpr auto StoreVector = _mm256_storeu_pd; + static constexpr auto BroadcastToVector = _mm256_set1_pd; + static constexpr auto XOR = _mm256_xor_pd; + static constexpr unsigned Registers = 16; +}; + +#ifdef WITH_AVX512 +struct __m512_block_wise_config { + using FloatType = float; + using VectorType = __m512; + static constexpr auto LoadVector = _mm512_loadu_ps; + static constexpr auto StoreVector = _mm512_storeu_ps; + static constexpr auto BroadcastToVector = _mm512_set1_ps; + static constexpr unsigned Registers = 32; +}; + +struct __m512d_block_wise_config { + using FloatType = double; + using VectorType = __m512d; + static constexpr auto LoadVector = _mm512_loadu_pd; + static constexpr auto StoreVector = _mm512_storeu_pd; + static constexpr auto BroadcastToVector = _mm512_set1_pd; + static constexpr unsigned Registers = 32; +}; +#endif + +enum AvxVersion { + SSE, + AVX2, +#ifdef WITH_AVX512 + AVX512 +#endif +}; + +template +struct block_wise; + +template<> struct block_wise : public detail::block_wise_base<__m128_block_wise_config> {}; +template<> struct block_wise : public detail::block_wise_base<__m128d_block_wise_config> {}; + +template<> struct block_wise : public detail::block_wise_base<__m256_block_wise_config> {}; +template<> struct block_wise : public detail::block_wise_base<__m256d_block_wise_config> {}; + +#ifdef WITH_AVX512 +template<> struct block_wise : public detail::block_wise_base<__m512_block_wise_config> {}; + template<> struct block_wise : public detail::block_wise_base<__m512d_block_wise_config> {}; +#endif + +template +void __attribute__ ((noinline)) block_wise_sse(Matrix &C, const Matrix &A, const Matrix &B) { + block_wise::multiply2(C, A, B); +} + +template +void __attribute__ ((noinline)) block_wise_avx2(Matrix &C, const Matrix &A, const Matrix &B) { + block_wise::multiply2(C, A, B); +} + +#ifdef WITH_AVX512 +template +void __attribute__ ((noinline)) block_wise_avx512(Matrix &C, const Matrix &A, const Matrix &B) { + block_wise::multiply2(C, A, B); +} +#endif + +#endif //SMID_MATRIX_BLOCKWISE_H diff --git a/src/register_blocking/RegisterBlocking.h b/src/register_blocking/detail/BlockWiseImpl.h similarity index 50% rename from src/register_blocking/RegisterBlocking.h rename to src/register_blocking/detail/BlockWiseImpl.h index 4d50148..544c3d3 100644 --- a/src/register_blocking/RegisterBlocking.h +++ b/src/register_blocking/detail/BlockWiseImpl.h @@ -1,95 +1,14 @@ // -// Created by oke on 01.07.20. +// Created by oke on 04.07.20. // -#ifndef SMID_MATRIX_REGISTERBLOCKING_H -#define SMID_MATRIX_REGISTERBLOCKING_H +#ifndef SMID_MATRIX_BLOCKWISE_IMPL_H +#define SMID_MATRIX_BLOCKWISE_IMPL_H -#include "../Matrix.h" -#include +#include "RegisterBlocking.h" namespace detail { - // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16 - // C = floor ((16 - R) / R) - - constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) { - if (R == 0) return 0; - return (unsigned) ((double) (Registers - R) / (double) R); - } - - constexpr unsigned GetInitialRows(unsigned Registers) { - for(unsigned R = Registers; R > 0; --R) { - if(R + R * (R + 1) < Registers) { - return R; - } - } - return 0; - } - - - template - struct ExtendedBlockWiseConfig { - typedef typename BlockWiseConfig::FloatType FloatType; - using VectorType = typename BlockWiseConfig::VectorType; - - static constexpr auto Registers = BlockWiseConfig::Registers; - static constexpr auto LoadVector = BlockWiseConfig::LoadVector; - static constexpr auto StoreVector = BlockWiseConfig::StoreVector; - static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector; - static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType); - static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, - LoadVector(memory) + - vector); - }; - }; - - template< - // template parameter as struct: otherwise some warning about losing alignment information warning - typename BitWiseConfig, - unsigned _NumRows, unsigned _NumColumnVectors - > - struct RegisterBlocking { - typedef ExtendedBlockWiseConfig bwc; - - static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { - bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); - }; - - static constexpr auto NumRows = _NumRows; - static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth; - - template - static void __attribute__ ((noinline)) - handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { - - // AVX2 has 16 registers - // should be compiled as registers (total: regA * regB) - typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}}; - // iterate over dot-product terms - for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) - // Perform the DOT product - for (int bi = 0; - bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) - typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth)); - for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows) - typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p)); - CReg[ai][bi] += aa * bb; - } - } - } - // total regA * regB + regB registers - - // Accumulate the results into C. - for (int ai = 0; ai < _NumRows; ai++) { - for (int bi = 0; bi < _NumColumnVectors; bi++) { - AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]); - } - } - } - - }; - template< // template parameter as struct: otherwise some warning about losing alignment information warning typename BlockWiseConfig @@ -107,6 +26,7 @@ namespace detail { bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); }; + /* template static void multiply(M1 &C, const M2 &A, const M3 &B) { assert(C.size1() == A.size1() && C.size2() == B.size2()); @@ -142,6 +62,7 @@ namespace detail { } } } + */ template class iterate_columns { @@ -250,100 +171,4 @@ namespace detail { }; } - -struct __m128_block_wise_config { - using FloatType = float; - using VectorType = __m128; - static constexpr auto LoadVector = _mm_loadu_ps; - static constexpr auto StoreVector = _mm_storeu_ps; - static constexpr auto BroadcastToVector = _mm_set1_ps; - static constexpr unsigned Registers = 16; -}; - -struct __m128d_block_wise_config { - using FloatType = double; - using VectorType = __m128d; - static constexpr auto LoadVector = _mm_loadu_pd; - static constexpr auto StoreVector = _mm_storeu_pd; - static constexpr auto BroadcastToVector = _mm_set1_pd; - static constexpr unsigned Registers = 16; -}; - -struct __m256_block_wise_config { - using FloatType = float; - using VectorType = __m256; - static constexpr auto LoadVector = _mm256_loadu_ps; - static constexpr auto StoreVector = _mm256_storeu_ps; - static constexpr auto BroadcastToVector = _mm256_set1_ps; - static constexpr unsigned Registers = 16; -}; - -struct __m256d_block_wise_config { - using FloatType = double; - using VectorType = __m256d; - static constexpr auto LoadVector = _mm256_loadu_pd; - static constexpr auto StoreVector = _mm256_storeu_pd; - static constexpr auto BroadcastToVector = _mm256_set1_pd; - static constexpr unsigned Registers = 16; -}; - -#ifdef WITH_AVX512 -struct __m512_block_wise_config { - using FloatType = float; - using VectorType = __m512; - static constexpr auto LoadVector = _mm512_loadu_ps; - static constexpr auto StoreVector = _mm512_storeu_ps; - static constexpr auto BroadcastToVector = _mm512_set1_ps; - static constexpr unsigned Registers = 32; -}; - -struct __m512d_block_wise_config { - using FloatType = double; - using VectorType = __m512d; - static constexpr auto LoadVector = _mm512_loadu_pd; - static constexpr auto StoreVector = _mm512_storeu_pd; - static constexpr auto BroadcastToVector = _mm512_set1_pd; - static constexpr unsigned Registers = 32; -}; -#endif - -enum AvxVersion { - SSE, - AVX2, -#ifdef WITH_AVX512 - AVX512 -#endif -}; - -template -struct block_wise; - -template<> struct block_wise : public detail::block_wise_base<__m128_block_wise_config> {}; -template<> struct block_wise : public detail::block_wise_base<__m128d_block_wise_config> {}; - -template<> struct block_wise : public detail::block_wise_base<__m256_block_wise_config> {}; -template<> struct block_wise : public detail::block_wise_base<__m256d_block_wise_config> {}; - -#ifdef WITH_AVX512 - template<> struct block_wise : public detail::block_wise_base<__m512_block_wise_config> {}; - template<> struct block_wise : public detail::block_wise_base<__m512d_block_wise_config> {}; -#endif - -template -void __attribute__ ((noinline)) block_wise_sse(Matrix &C, const Matrix &A, const Matrix &B) { - block_wise::multiply(C, A, B); -} - -template -void __attribute__ ((noinline)) block_wise_avx2(Matrix &C, const Matrix &A, const Matrix &B) { - block_wise::multiply(C, A, B); -} - -#ifdef WITH_AVX512 -template -void __attribute__ ((noinline)) block_wise_avx512(Matrix &C, const Matrix &A, const Matrix &B) { - block_wise::multiply(C, A, B); -} -#endif - -#endif //SMID_MATRIX_REGISTERBLOCKING_H +#endif // SMID_MATRIX_BLOCKWISE_IMPL_H diff --git a/src/register_blocking/detail/ExtendedBlockWiseConfig.h b/src/register_blocking/detail/ExtendedBlockWiseConfig.h new file mode 100644 index 0000000..04620df --- /dev/null +++ b/src/register_blocking/detail/ExtendedBlockWiseConfig.h @@ -0,0 +1,51 @@ +// +// Created by oke on 04.07.20. +// + +#ifndef SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H +#define SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H + +template +struct ExtendedBlockWiseConfig { + + typedef typename BlockWiseConfig::FloatType FloatType; + using VectorType = typename BlockWiseConfig::VectorType; + + static constexpr auto Registers = BlockWiseConfig::Registers; + static constexpr auto LoadVector = BlockWiseConfig::LoadVector; + static constexpr auto StoreVector = BlockWiseConfig::StoreVector; + static constexpr auto XOR = BlockWiseConfig::XOR; + static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector; + static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType); + static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { + StoreVector(memory, LoadVector(memory) + vector); + }; +}; + +// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16 +// C = floor ((16 - R) / R) + +// maximize = (R * C) / (R + C) for 1 + R + R * C < 16 => any fixed r -> largest C with < 16 +// C = floor ((16 - R) / R) + +unsigned constexpr extra_registers = 2; + + +constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) { + if (R == 0) return 0; + auto result = (unsigned) ((double) (Registers - extra_registers - R) / (double) R); + if (extra_registers + R + R * (result + 1) <= Registers) + return result + 1; + return result; +} + +constexpr unsigned GetInitialRows(unsigned Registers) { + for(unsigned R = Registers; R > 0; --R) { + if(extra_registers + R + R * R <= Registers) { + return R; + } + } + return 0; +} + +#endif //SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H diff --git a/src/register_blocking/detail/RegisterBlocking.h b/src/register_blocking/detail/RegisterBlocking.h new file mode 100644 index 0000000..09cac35 --- /dev/null +++ b/src/register_blocking/detail/RegisterBlocking.h @@ -0,0 +1,155 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_REGISTERBLOCKING_H +#define SMID_MATRIX_REGISTERBLOCKING_H + +#include "../../Matrix.h" +#include "ExtendedBlockWiseConfig.h" + +namespace detail { + + template< + // template parameter as struct: otherwise some warning about losing alignment information warning + typename BitWiseConfig, + unsigned _NumRows, unsigned _NumColumnVectors + > + struct RegisterBlocking { + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = _NumRows; + static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + // AVX2 has 16 registers + // should be compiled as registers (total: regA * regB) + typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}}; + // iterate over dot-product terms + for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) + // Perform the DOT product + for (int bi = 0; + bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) + typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth)); + for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows) + typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p)); + CReg[ai][bi] += aa * bb; + } + } + } + // total regA * regB + regB registers + + // Accumulate the results into C. + for (int ai = 0; ai < _NumRows; ai++) { + for (int bi = 0; bi < _NumColumnVectors; bi++) { + AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]); + } + } + } + }; + +/* + template + struct RegisterBlocking { + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = 3; + static constexpr auto NumColumns = 4 * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + // AVX2 has 16 registers + // should be compiled as registers (total: regA * regB) + typename bwc::VectorType r1c1; + typename bwc::VectorType r1c2; + typename bwc::VectorType r1c3; + typename bwc::VectorType r1c4; + typename bwc::VectorType r2c1; + typename bwc::VectorType r2c2; + typename bwc::VectorType r2c3; + typename bwc::VectorType r2c4; + typename bwc::VectorType r3c1; + typename bwc::VectorType r3c2; + typename bwc::VectorType r3c3; + typename bwc::VectorType r3c4; + typename bwc::VectorType r1; + typename bwc::VectorType r2; + typename bwc::VectorType r3; + typename bwc::VectorType c; + + r1c1 = bwc::XOR(r1c1, r1c1); + r1c2 = bwc::XOR(r1c2, r1c2); + r1c3 = bwc::XOR(r1c3, r1c3); + r1c4 = bwc::XOR(r1c4, r1c4); + r2c1 = bwc::XOR(r2c1, r2c1); + r2c2 = bwc::XOR(r2c2, r2c2); + r2c3 = bwc::XOR(r2c3, r2c3); + r2c4 = bwc::XOR(r2c4, r2c4); + r3c1 = bwc::XOR(r3c1, r3c1); + r3c2 = bwc::XOR(r3c2, r3c2); + r3c3 = bwc::XOR(r3c3, r3c3); + r3c4 = bwc::XOR(r3c4, r3c4); + + // iterate over dot-product terms + for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) + // Perform the DOT product + r1 = bwc::BroadcastToVector(A(aRowOffset + 0, p)); + r2 = bwc::BroadcastToVector(A(aRowOffset + 1, p)); + r3 = bwc::BroadcastToVector(A(aRowOffset + 2, p)); + + c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth)); + r1c1 += r1 * c; + r2c1 += r2 * c; + r3c1 += r3 * c; + + c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth)); + r1c2 += r1 * c; + r2c2 += r2 * c; + r3c2 += r3 * c; + + c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth)); + r1c3 += r1 * c; + r2c3 += r2 * c; + r3c3 += r3 * c; + + c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth)); + r1c4 += r1 * c; + r2c4 += r2 * c; + r3c4 += r3 * c; + } + + // total regA * regB + regB registers + + // Accumulate the results into C. + AddAndStore(&C(aRowOffset + 0, bColOffset + 0 * bwc::VectorWidth), r1c1); + AddAndStore(&C(aRowOffset + 1, bColOffset + 0 * bwc::VectorWidth), r2c1); + AddAndStore(&C(aRowOffset + 2, bColOffset + 0 * bwc::VectorWidth), r3c1); + AddAndStore(&C(aRowOffset + 0, bColOffset + 1 * bwc::VectorWidth), r1c2); + AddAndStore(&C(aRowOffset + 1, bColOffset + 1 * bwc::VectorWidth), r2c2); + AddAndStore(&C(aRowOffset + 2, bColOffset + 1 * bwc::VectorWidth), r3c2); + AddAndStore(&C(aRowOffset + 0, bColOffset + 2 * bwc::VectorWidth), r1c3); + AddAndStore(&C(aRowOffset + 1, bColOffset + 2 * bwc::VectorWidth), r2c3); + AddAndStore(&C(aRowOffset + 2, bColOffset + 2 * bwc::VectorWidth), r3c3); + AddAndStore(&C(aRowOffset + 0, bColOffset + 3 * bwc::VectorWidth), r1c4); + AddAndStore(&C(aRowOffset + 1, bColOffset + 3 * bwc::VectorWidth), r2c4); + AddAndStore(&C(aRowOffset + 2, bColOffset + 3 * bwc::VectorWidth), r3c4); + } + }; +*/ + +} + +#endif //SMID_MATRIX_REGISTERBLOCKING_H diff --git a/src/register_blocking/detail/RegisterBlockingManual.h b/src/register_blocking/detail/RegisterBlockingManual.h new file mode 100644 index 0000000..fd624e3 --- /dev/null +++ b/src/register_blocking/detail/RegisterBlockingManual.h @@ -0,0 +1,80 @@ +// +// Created by oke on 01.07.20. +// + +#ifndef SMID_MATRIX_REGISTERBLOCKINGMANUAL_H +#define SMID_MATRIX_REGISTERBLOCKINGMANUAL_H + +#include "../../Matrix.h" +#include "ExtendedBlockWiseConfig.h" + +template +struct a { + typedef ExtendedBlockWiseConfig bwc; + + + template + void do_stuff(int p, int bi, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth)); + + for (int ai = 0; ai < Rows; ai++) { // row index in A (handling regsA rows) + typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p)); + CReg[ai][bi] += aa * bb; + } + + } + +}; + + +namespace detail { + + template< + // template parameter as struct: otherwise some warning about losing alignment information warning + typename BitWiseConfig, + unsigned _NumRows, unsigned _NumColumnVectors + > + struct RegisterBlockingManual { + typedef ExtendedBlockWiseConfig bwc; + + static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) { + bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); + }; + + static constexpr auto NumRows = _NumRows; + static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth; + + template + static void __attribute__ ((noinline)) + handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) { + + // AVX2 has 16 registers + // should be compiled as registers (total: regA * regB) + typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}}; + // iterate over dot-product terms + for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) + // Perform the DOT product + for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) + typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth)); + for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows) + typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p)); + CReg[ai][bi] += aa * bb; + } + } + } + // total regA * regB + regB registers + + // Accumulate the results into C. + for (int ai = 0; ai < _NumRows; ai++) { + for (int bi = 0; bi < _NumColumnVectors; bi++) { + AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]); + } + } + } + + }; + +} + +#endif //SMID_MATRIX_REGISTERBLOCKINGMANUAL_H diff --git a/test.pdf b/test.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f5f0089cf7d1c5a1a87c4b5b0d32ca38af760bcd GIT binary patch literal 12322 zcmb_?2{cvT_jp3_keP(K2$8(`@l0hV^Av^h%v0uhst_rfOqt76Xfl&jk}0z^_>fP^ zP{vH5lHa*c^`Wfqvi|GOI_unb&OUoO`|Q2Xy<1RMN#y`q9ET9R-3JvsK%gNM^dY37zncw&R3v$koL%i8IXQ%li!}@=xn_Z~pO>nE7s<;8!mTb+ zadq*6u&d9yBrh);4;Kgz|3n~lJzT8}Y`h>-Ku1Xh;J5Mff{^Mb0ZoOUz2eVa9WsZI z2KFRt2Nyetu)5qt$I{Wp$_tVpBma*bGztv-5e8Oh*@9=JiW!`Zk`f-X2y!jj*16s}a`h8Xdwc{o+~0!5OFtLaI0eJu2B) zxmp9YYudQjdD%l~cz5*Sg`Qp>Hl&jXztrk$%`T>@^brdqBDP857$M?`N>0}aRP z7c!Z6j;g0U_cZO%^ncAwuv9CUj&Y*jlP#y89SIHPrHnDP5# zl=JZq3yO*+=6JcQt$wAGCy0BTpPjEaK1FE#Hay6_yE3wWOla>p^Rj1!a_#*>T(8F5 zwfc*!X{6@edyZA%|2bD1TBjDBFumep6M}s`JQ&}t!!}FzkeVWhn`>85AycB7NZ`BF z?-cE#7F>IHRd%RTYZ{@OmEPPR>=(xz3v?+r>_p!ENwwmR>zW2zaM5_I} zC%Kt5>WZm~A3AEcJ6=Aij#g4@6HkuK=s_hFFSOGAs7VcW&h*xMXWk<6gvs!@?n(q> z(i7v8#FU)i&J?xhp^=#v#`B-HJk_ylO-Yy!tUt!z)q1gyO;}KB-)NPV9PcUGw!=pJ zh(%rZOIZ2N7kbzl10g1r#bz_-X?3$}A4|l(*ceW4l}o~^>*XKLH7Yg{=~umzZ++NE zZOblwJ!#YC5>dMgclPVqDKwSjvvTv=ygCxESrFiwHKKGpPc-QWL(TaMY(@G;n4HnQ zd{uk(>WrF89K8sVxz|H;Y>weM$3&Jgjp8-)0zxH+#2!e9Ch0Ht<14G=PbpF=;rC0M zQ>3RS<;7iA`6Np3Eo$~v;S-{%8zZ?b!1BV^Jxx_Z6z{^)5&}z#sCxxddUKg#=X)5s zJg zO`t(Q6@?Qwc3r&P{og`ro1cG z4!%^DX@vUW2Gz-s+S|eNi`&Xpq7MEudhvTxR~wDRXO;NqiA#?QS8683wu)T~@r;v7 zH{J6sBV73@?iyOoN;IU3D|OgLx>!jwn*YOryC%8gh|AsMV|BUVd_i|6xhuMd!lNE_ z6}XER+e{^OvGu!tz|t$6&{d$m#(q9@N#mgIleu`x<(W1^Nv}F`dXbA$Oa@_hIW) zt#)49)Yo~R8Sgf3vy8pp{4V+IuDU6DXli0~c+%3l{K1>yIns#s2rj}PTJHIX-u+9> zS3CMD78c)r{T@<~5}C@ImdG>qLaGEXaFFZ8p@r_YnX1PpV#54q_Z-L!f#6vA(_{aP znq*H7yL>bbC-EzkqT$%KnK*z&;noEK*{jl5YfV*ic*yIsW4X$#!^xD#>k%Zu9j1st8+r48no-(|BX@3;a>gK&7U9H;Vi0KtL4(di7mLmA%p6zvvZH)g(NlRx^^sy> znK#}6I^r&ztxSy8m4=JjdgBuv*hk}sX=Pc;223V{rIQOk?Mjgx51CSV^*s8nJzq$p zvGACnMW2C?GK;l#nTgAVm+6x?_T_p;cyd9bJ?A6zhb@f@PSH!diXnT%8sl2%Fe*2- z(k1;GLt>KW{NqRbmMNTPJ*79;4LFO~jN&m^iFJxs)>HSO;-jxx*gtnB@9F-}>DsM% zvnx(wSaod<3n#S0nKK9cqWE4tX*q+e4AT0D3zER}ewOOl?WR*9-j~=MYWFnZiLqg# zZ#B}qM=jVlGC#!DITc?)Vu;t=nWsUtN_?VG0(7UkvsZBkM^0K79Wi-)0U38 zdR>5Rz%@=_;1l~{A1&uy1>I8DZ_qTCiJy~7+MDS^a(A3g;N#Emw!$|oUeMgBMQdSU zuaVly%+1a07gf_PKUi=2Dmr;FrOAA{N``$mZF#7$9@X)Pz#dw0*PV&Wk}W}>j~^q( z?$W_PoIP`&l^QJ)4}E+xzN?#*;voDrBE0>g8lL;8_6dy+jRK~9Iz1sb=?<6N8Eu~^ z7B87_(vj?$$PZsG?(LmX7IR~l2(Qix0f$%TA7m9w?dhs8UHp+Axcr?m*U%ti19fdS z%fn#S8yQ|lx`>jGevEil{4!VcNsIR4BtztaYld|Z1IrJ!nK>m@q2t4iG`D-E%Q(2s zaV>HfOy858s0|;Lt%#%n6xf30>#qocs4>%)jcm3&*!&u2_)L_71ll)n0t*q}rf&fgdfL%{7Q& z?n=FDiMT;qCnb##Wv5OY^?d$TZpGf*RO@eDVb*5x|I@T^_;seGtXmbMh^8NtrRJou zSQZOR_f?DUeJp!e#qBKLv4dBSv0OX%M1#IC&h@>jio7h{W6rbhvCrB$&&s!jXCF$P z=VD6oi}**y<)e>V+rwIHG;u3u-7&Y^7fD~t-VCO07iMZZ88;oR@Ic*=-Q6(5U2{6W zs6FQOnVeGR;ulxO`C`H~)CGkSLkvV$N<>UWKmMc4dS7kuOsp1eKW(Yg!5+2w1X#$c{>L)bBw;FF85za znBgz&z47vsv7la3%*&gXD`ZxpGp2t~Uc7#kd=Xn8Trg`dasNwuSR!Vf_GI-nT&U4B zz3n0uhjKn^d8Fs{Prf~6n$lvJk3dqEq2m(i^8^o$UOk7^44L;Zf3G)F?d)hb%r_}c z$Xzt?IPoCjjYns)WuXpF6SA?y(B#IEw^@qQXZNu(ZaMJo4E4Yxo|S3NTQ#rmpY^B> zwBBlQxZCAWy<#Vq+EICC(zh$@AG3AuIS)+jgN(axpS|H_d5$4ri^!t_hNx?|dag%? z$6w9o?V66x{%l%f9!1hMHSMMHv+ZE&>Aw=gjGgG@azD!KbWrwFmL8^5kBd>uE@=8K zMt`;wpzUgW`&`kkT|wAl@38u{5jTN>^j>*7^ql*>1RdvvYa&Oke5RIuPC;3{+3n?bgG6G$F6$S>1z7>K3fClzdAl zUJ0c5-F=Z+c3)B5P%8WVo8ND@7V@Ms3I)2B zT#9RHxfRad?GtdTtlf&Y+c*8&*Y6bKTW)Y|a2((}_-5i0alNC}ijik|0?|-TJ>=KX zRq9yr3g}unQP91k`ib_|C=TV&IlQ9=YO(^u?xZVnEiUni5z372pUKSUKQQW=X%R01 z%PGT(rrpE3)W3ak57<($9I)*zV;2v<$p#$4m;B9eNRV7-aac^bE=u=4j)EnELkKZ! zyA2@{VRqwG>V40L(bqKvqI46@ehSv%ceC-+!aADDS7$0x*1$}BTH^; z_{Ij029!o+H=gb?a5USc9ev9oI4{`oT^0AR#>=RKt?eU}kqul<8!!gH`q&I(e>k-b zm6X7AM4VGoYrR`UDG7^B;HyMKP-^TA~wS2k6Lb=X|*oS7=$y%!muVvJTS}Em z=L<0&eV=&4-ZPBcO*Jg4(5siqmtvR99eMsWfAV{U*z-pMkr-;{SHv9zx1lmpG;3Dt z5#PvfN|$fO=4HM2x$L|c7F+-5`AhZ+ZNj%v!JXSfLvGM&&S+itjyUvTSR}xM_RF+; z)$F1BX#p450iCud6AR`2(b;DTovr1Cy} zB>RJXoPKa+okP6Wv^~LdKiwTxUfUf3aaml~nWWhJCo%^;awprCRNdxJ%&QhornWwN zVm-7Y+;(1MGAM;t&-|_RsV*y*S5BwczvSO9xt%i<-rHu5y-qcm$1|}^o#!bPzJa8h z&g;;BTtjrLm=)19Ao$V^wgpW{v^VE8|2VlfHCut8z!I0N*%TKk_5A8>5glGR4a>-f zkwY%l1VK?{WbA#xg$~4YaMUo{!rPY##frE@dtwM3Ry3cR_k{)_*-e!7!S%jc>{o`9 zUX({7cNdyM9oZJI%OH;i#+~#vNvs$6=?`mdTe;)0px-wo%j24PF3#)o-KC+q=Jo>* zwPi0VJ*Zwf@8q5xRvhB3;>^ZIiesAsczoaB0EtQ=j%@B~Gp6qsv46(VGtu`j;(6gPCZ&rRn+V1*P5 z#-N`R5lNL!FBV}iy)7xr0$-t`ACoqob>HQL_nF}UgM#^m?V>t9KJK!u&0HJs1(NK| zc^3?pu+Hyk>1wzr@zM8so;Q1^zL$@pP4T7C$dBH_TEhv^h@Gd@7(W#m?H)6*^o|;z z$E0zN9=l&2pIKRo&CQGDA24ZsFHrdoD@qaNC}(e-`s0 zO`OXjgM$KZ8sB<`cv4E@(1Xjn%JYl+t##@7M-^JP$BL)k>`XjIOuuDu*jK3ZGPjxB z%NvyL$Ep#PRu^JjpYa^M@r2Z&>Hg{j#j9IQMMoPU^$5PU;HZw5c6*c)SR@b_k7H=8Q`dGpiK zBf3z5L~|JtY_VmmWJH$R8NYOjTfd(wPf3wYy=h3q05eonxmRGX)APg~F$c=b9x*By zXeifoS1WrrIbF-WZ7OEoF`FlLZ(!%UKwpXOgLCJ4wqEHBH7A|faap>;Vo^3w$k~97 zUr(@`-Xi(_Q=Yp@^zz%R(zv$eAi@mhed9kAGe}dv+F@&ut*XzO<`p)4w!GhiFZ7N< zWu8P=cwf)Wot8Eq9vx2nK>5xhvsuad*|3dd`VNx)=QfAH|~1jpVyt5wKvAb%09)93V+u6xR@bRlK0W{`aQb1;6tU7 zY3jP@;Cq&5vQm9c@u%*5ClBqHE$>4F$v;M!vQ$he~Z1NB=93I7eG+-y#R#ja1I z59kvw?nTo#FC#*LZOlxk@7C=8w&JC3L`|LV+$=NbQL7{vB6hh~&*}YZ72E4iKi?~8 zIXk7++bDI;sZrUiDdDM|lGq^nnas1$soq7d-AlJynu@pABYPIOj*REgHY?O$z?RgC zUaS>uEVLNo&kyu7$ga)X`{u)i6}dW3&QOj=0`^akOCtpWevh)1|B;(fD*G%D3KM{37P0jgk||E-bsg?#u9-OF2~LAM0(h zCATgmi9V$~oZRA^3H@mN#CH?JlzaEnVc~K;cHhw^v*OB!Y3dl$1?BUtY;9vtljeP| zVG3HM{S#Pow3QdR0?Viv(mss#I3=A`DJRzK7Yri!1bEC;Cm&^O%2QKgfAIjJoz=*u z@g`w0`FwI$ck}0e6BoEntx#a!2T^RWDVt61u;}%shu7Ee0H)Xb*ud>f+qK8R>0U*S zlaKF&3-7;w;E;ki@@SVvNVi~bPVJ!ao&~`|1-VNT$u0VI3mnC4?Fa6uwlkL>Kv3zi z$}VDr+4;NMyFW_#krL%7?=fkMaQJPH-Lmc8zzM%N?XV}(+?amm*eko3_SPh8T+mXB zlU4t|e`r2C9_JMzwBK_n&?Aq{F{_D&9WtqWvpsvi$nL$YvQcV99@P>@&h=k5zS>|X zD8$9|R;W;`%CXYYNoECZ}I|6+igOqiOQte)zN~iQ{FMRysZ}+_vPRm36J( z!c)ztvk|o{x+%f&wE2mbUi8rl} z1vDXfCrP7X)AUb`j#o4=9K5V?$G6(((J2K3RN@A*Y&t>4t-re*&?8=@l&5JPVGmI7 zd@)tnd}U$f6XH2VScD=Zb--P>UWX}RfB#IQQOUqm(#{&+nj8N<_vYpxGO??5?KIT7 zW&Ut?^>!Z~+pV+Nc(E|9G0KBh2#El9Kh64{32!buRp4tU9a~+@eoptMXVeDXZyJ+o zb#kNWCd}Fy8Y`J!+3d_tiLZ|i7trFRpP8}bV=IM!`RYy%3C?692+Qb)0wfvdSlSKthGadk11>+HIox{ zW0vyLti9Q%<#M0TOYcMHH24&6)>87SzE@F=X#41QzX`qLAst5Oc+6}@^w{2uw4>}N zYaWzHPqvDog~eC}ES%!{QdHR%2d>e{hmOzGa_&^hcDm+@&+v{738{Kv-ksyU@InBQ z+IUfyp4*N3%?sHbeJpKL$wH3}MYvKV;}{J&uN3vrp1qZ0nP^~SAN~G_?9_$h6H62| zU!=cnAoFH(Wc>QjfW@Q0eYd03umg`F7@_b~MGUQ)?E3VT@5aq&SodbsQX9~c&SZNs ziF#IaZ81z_i0K@*MUP*8Ne<+vjyl%e55j>fH5pGD3`9p2HHWU(VByNlQEN`R&0 z&c59x+->g*KYeAcx_TtIV_4;Zpna(7xPe1of;;OoN<8(b$f#M?L#E1%JmWXGt8Yt+ zt~yEP{#!kgH@5P^sQ<)?f5oFW5NWgd7Y39=taZP5-DU<51gq>29JkoEp0Q3pQtiFs zbh1O5#Jr+l(ilhMpV!nT@<~HZN|Uin{Ba>a-5qbi@#o1<`u?dKNkxU7Z{AzD?GieE znnS|kqeILtyw^Zzu6Vq{nL5GGNkQI$A0O#(`8jfV_)sw`Hd)X*KH$(YDUr#ztWEKL zC8T!pz~GImv(MAHm$*|WK1&z731$Z)abgHli~WXVS(|`7K4(%RRgZ1|J{NVNS-$25)@G1 z;Yd2-Z2-zTK)r`O$pfxF@r2MnYf+S4K$!(xTp}ZbKq@)d+SguPX$OG}0DSbwGF=g!FXqgYX0hY2#pL?*$R@5YpSl+Q!4v%GJXLB1%9= z5`?sbkX8t!H2{LpC;;IAIxyn|A)O)QNeJlzkV8l}2Lr5P84PJyJ z{UD@2WRCb*f%BVJu&+oJ2M-fj0ir zDgtTXZRtfW5rPYau)nHiR<&>JVC`iO)D4@hK5US`zJjjJu;o3i;KCUqh6tA#xv8yc z6vzcL49^^ZFRpN1jGZTh#jJ9@33>ly6%-YL8YTxTc^5lp8wdsJc}RX_27x+oU~3J$ zY)&45^|@N{^oPojCchb5Bge1vXbKTfsQ(`g@&Aeki5+n&68Uaf{5b-ET z0=$jDNZ=tXcts31Jm3?ahvBi{jTN~EUpE2#|7wU|^H=~2HyFpSh6l+W8U-sr5(gSU zM3jUiB*1TQLBa##5^w{@2SdaoK!XV2jchAFZ34E3wGHNr{2SnaZR1yiB`2WpU=IPWunla` zh>~DWV7ma$e{K6^6Tf;`DPZ3b=J)$t8hS5Kt%i$r?h#xgD6bfykdg;U{unA%D^XRg>hngIV>XAdY}tCxbgdn4cAw z&JeHw^4LiT1K(@_3KwAHt52>FhD-`Uj%L8qlIPtZ^1DL-=?)BMb<6{}*wrWCY=2UL zV;zu|yatqY{tTXQNCY-d9`%NROIUsSFA9RqA`kh1v$Xo;2drxK$sfXz2_ja5EbzZD z=zpGdZGcb#*11akzxlP%ULL(#&M6OY{0f%83sR9*K=m+Uq&(&q82W#LVSj`eR-iz2?HV_0TgZ}RUxni zO7Zisu|${J9o|!h*>2CmaU#Cx0mP-~6G7c-X|((;#C19+M;jkFp*P zWQ!nqTR#S9z$bG(K2Txv7k?N6>`eZEBf`F8{TLRr!5AJspzGm4ocT*HI5goe{o$}6 zx&3o3oR0iChL`voj`%kXJQ{Wz>v_TBVBhw~7#>9RKgT3Nce}2d#d3ceJD;kX@U=V_W$~r2D{{wDF BOgaDn literal 0 HcmV?d00001 diff --git a/test.svg b/test.svg new file mode 100644 index 0000000..3c87af6 --- /dev/null +++ b/test.svg @@ -0,0 +1,1195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +