diff --git a/scripts/test.py b/scripts/test.py index 40b59f0..ef480f2 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -17,7 +17,7 @@ def check_call_quiet(*args, **kwargs): rc = p.returncode if rc != 0: print(output.decode()) - print(err) + print(err.decode()) exit(0) @@ -44,7 +44,7 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"] slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"] -normal_functions = ["block_wise_avx2"] +normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"] fast_functions = ["divide_and_conquer_block_avx2", "blas"] avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"] @@ -63,6 +63,7 @@ if __name__ == '__main__': parser.add_argument("--avx512", action="store_true") parser.add_argument("--validate", action="store_true") parser.add_argument("--double", action="store_true") + parser.add_argument("--gcc", action="store_true") parser.add_argument("--function", type=str, nargs="*") options = parser.parse_args() @@ -101,31 +102,31 @@ if __name__ == '__main__': times = [[] for f in functions] output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") - for clang in [True]: - for sizes in matrix_combinations: - args = list(sizes) - compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args) - folder = "x".join(sizes) - for fidx, function in enumerate(functions): - arguments = [folder, "--algorithm", function] - if with_double: - arguments.append("--double") - output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args) - ms = output.decode()[output.decode().find("multiply:") + 10:] - if "ms\n" in ms: - ms = float(ms.split("ms\n")[0]) - else: - ms = float(ms.split("s\n")[0]) * 1000 - times[fidx].append(ms) + clang = not options.gcc + for sizes in matrix_combinations: + args = list(sizes) + compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args) + folder = "x".join(sizes) + for fidx, function in enumerate(functions): + arguments = [folder, "--algorithm", function] + if with_double: + arguments.append("--double") + output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args) + ms = output.decode()[output.decode().find("multiply:") + 10:] + if "ms\n" in ms: + ms = float(ms.split("ms\n")[0]) + else: + ms = float(ms.split("s\n")[0]) * 1000 + times[fidx].append(ms) - shutil.rmtree(folder) - print(["%.3f" % numpy.mean(ts) for ts in times]) - with open(output_file + ".cache", "w") as f: - json.dump({ - "extr_args": extra_args, - "times": times, - "sizes": matrix_combinations, - "functions": functions, - "means": ["%.3f" % numpy.mean(ts) for ts in times] - }, f) - os.rename(output_file + ".cache", output_file) \ No newline at end of file + shutil.rmtree(folder) + print(["%.3f" % numpy.mean(ts) for ts in times]) + with open(output_file + ".cache", "w") as f: + json.dump({ + "extr_args": extra_args, + "times": times, + "sizes": matrix_combinations, + "functions": functions, + "means": ["%.3f" % numpy.mean(ts) for ts in times] + }, f) + os.rename(output_file + ".cache", output_file) \ No newline at end of file diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h index 7fc7a25..425db9d 100644 --- a/src/DevideAndConquer.h +++ b/src/DevideAndConquer.h @@ -166,6 +166,12 @@ void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix &C, const _divide_and_conquer>(C, A, B); } +template +void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} + + template void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix &C, const Matrix &A, const Matrix &B) { _divide_and_conquer>(C, A, B); diff --git a/src/main.cpp b/src/main.cpp index 037fb07..acec4cc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,7 +3,7 @@ #include #include #include -#include "RegisterBlocking.h" +#include "register_blocking/RegisterBlocking.h" #include "Boost.h" #include "Naive.h" #include "DevideAndConquer.h" @@ -52,6 +52,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo double use_result = 0; TEST_IF(test_function_name, naive_reordered, A, B) + TEST_IF(test_function_name, block_wise_sse, A, B) TEST_IF(test_function_name, block_wise_avx2, A, B) #ifdef WITH_AVX512 @@ -59,6 +60,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo #endif TEST_IF(test_function_name, boost_axpy_mul, A, B) + TEST_IF(test_function_name, divide_and_conquer_block_sse, A, B) TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B) #ifdef WITH_AVX512 diff --git a/src/RegisterBlocking.h b/src/register_blocking/RegisterBlocking.h similarity index 82% rename from src/RegisterBlocking.h rename to src/register_blocking/RegisterBlocking.h index 2dbfa6b..4d50148 100644 --- a/src/RegisterBlocking.h +++ b/src/register_blocking/RegisterBlocking.h @@ -5,58 +5,11 @@ #ifndef SMID_MATRIX_REGISTERBLOCKING_H #define SMID_MATRIX_REGISTERBLOCKING_H -#include "Matrix.h" +#include "../Matrix.h" #include -enum AvxVersion { - AVX2, - -#ifdef WITH_AVX512 - AVX512 -#endif - -}; - namespace detail { - struct __m256_block_wise_config { - using FloatType = float; - using VectorType = __m256; - static constexpr auto LoadVector = _mm256_loadu_ps; - static constexpr auto StoreVector = _mm256_storeu_ps; - static constexpr auto BroadcastToVector = _mm256_set1_ps; - static constexpr unsigned Registers = 16; - }; - - struct __m256d_block_wise_config { - using FloatType = double; - using VectorType = __m256d; - static constexpr auto LoadVector = _mm256_loadu_pd; - static constexpr auto StoreVector = _mm256_storeu_pd; - static constexpr auto BroadcastToVector = _mm256_set1_pd; - static constexpr unsigned Registers = 16; - }; - -#ifdef WITH_AVX512 - struct __m512_block_wise_config { - using FloatType = float; - using VectorType = __m512; - static constexpr auto LoadVector = _mm512_loadu_ps; - static constexpr auto StoreVector = _mm512_storeu_ps; - static constexpr auto BroadcastToVector = _mm512_set1_ps; - static constexpr unsigned Registers = 32; - }; - - struct __m512d_block_wise_config { - using FloatType = double; - using VectorType = __m512d; - static constexpr auto LoadVector = _mm512_loadu_pd; - static constexpr auto StoreVector = _mm512_storeu_pd; - static constexpr auto BroadcastToVector = _mm512_set1_pd; - static constexpr unsigned Registers = 32; - }; -#endif - // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16 // C = floor ((16 - R) / R) @@ -139,7 +92,7 @@ namespace detail { template< // template parameter as struct: otherwise some warning about losing alignment information warning - typename BlockWiseConfig = __m256_block_wise_config + typename BlockWiseConfig > struct block_wise { @@ -298,23 +251,94 @@ namespace detail { }; } +struct __m128_block_wise_config { + using FloatType = float; + using VectorType = __m128; + static constexpr auto LoadVector = _mm_loadu_ps; + static constexpr auto StoreVector = _mm_storeu_ps; + static constexpr auto BroadcastToVector = _mm_set1_ps; + static constexpr unsigned Registers = 16; +}; + +struct __m128d_block_wise_config { + using FloatType = double; + using VectorType = __m128d; + static constexpr auto LoadVector = _mm_loadu_pd; + static constexpr auto StoreVector = _mm_storeu_pd; + static constexpr auto BroadcastToVector = _mm_set1_pd; + static constexpr unsigned Registers = 16; +}; + +struct __m256_block_wise_config { + using FloatType = float; + using VectorType = __m256; + static constexpr auto LoadVector = _mm256_loadu_ps; + static constexpr auto StoreVector = _mm256_storeu_ps; + static constexpr auto BroadcastToVector = _mm256_set1_ps; + static constexpr unsigned Registers = 16; +}; + +struct __m256d_block_wise_config { + using FloatType = double; + using VectorType = __m256d; + static constexpr auto LoadVector = _mm256_loadu_pd; + static constexpr auto StoreVector = _mm256_storeu_pd; + static constexpr auto BroadcastToVector = _mm256_set1_pd; + static constexpr unsigned Registers = 16; +}; + +#ifdef WITH_AVX512 +struct __m512_block_wise_config { + using FloatType = float; + using VectorType = __m512; + static constexpr auto LoadVector = _mm512_loadu_ps; + static constexpr auto StoreVector = _mm512_storeu_ps; + static constexpr auto BroadcastToVector = _mm512_set1_ps; + static constexpr unsigned Registers = 32; +}; + +struct __m512d_block_wise_config { + using FloatType = double; + using VectorType = __m512d; + static constexpr auto LoadVector = _mm512_loadu_pd; + static constexpr auto StoreVector = _mm512_storeu_pd; + static constexpr auto BroadcastToVector = _mm512_set1_pd; + static constexpr unsigned Registers = 32; +}; +#endif + +enum AvxVersion { + SSE, + AVX2, +#ifdef WITH_AVX512 + AVX512 +#endif +}; + template struct block_wise; -template<> struct block_wise : public detail::block_wise_base {}; -template<> struct block_wise : public detail::block_wise_base {}; +template<> struct block_wise : public detail::block_wise_base<__m128_block_wise_config> {}; +template<> struct block_wise : public detail::block_wise_base<__m128d_block_wise_config> {}; + +template<> struct block_wise : public detail::block_wise_base<__m256_block_wise_config> {}; +template<> struct block_wise : public detail::block_wise_base<__m256d_block_wise_config> {}; #ifdef WITH_AVX512 - template<> struct block_wise : public detail::block_wise_base {}; - template<> struct block_wise : public detail::block_wise_base {}; + template<> struct block_wise : public detail::block_wise_base<__m512_block_wise_config> {}; + template<> struct block_wise : public detail::block_wise_base<__m512d_block_wise_config> {}; #endif +template +void __attribute__ ((noinline)) block_wise_sse(Matrix &C, const Matrix &A, const Matrix &B) { + block_wise::multiply(C, A, B); +} + template void __attribute__ ((noinline)) block_wise_avx2(Matrix &C, const Matrix &A, const Matrix &B) { block_wise::multiply(C, A, B); } - #ifdef WITH_AVX512 template void __attribute__ ((noinline)) block_wise_avx512(Matrix &C, const Matrix &A, const Matrix &B) {