From fffd9ce319d439ae82df3edec3a9c8b31fc5e148 Mon Sep 17 00:00:00 2001 From: har0ke Date: Sun, 5 Jul 2020 03:54:36 +0200 Subject: [PATCH] Compare different block configurations --- CMakeLists.txt | 10 +++--- scripts/test.py | 33 +++++++++++++++---- src/DevideAndConquer.h | 24 +++++++++++++- src/main.cpp | 14 ++++++-- src/register_blocking/BlockWise.h | 2 ++ .../detail/ExtendedBlockWiseConfig.h | 2 +- 6 files changed, 68 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1a2671..05eabcf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,6 @@ set(CMAKE_CXX_STANDARD 17) option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON) option(USE_CLANG "Build with clang instead of gcc" ON) option(WITH_AVX512 "Enable AVX512" OFF) -set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run") - -add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}") if(WITH_AVX512) add_compile_definitions(WITH_AVX512) @@ -21,6 +18,8 @@ else() endif() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") if(OPTIMIZE_FOR_NATIVE) @@ -49,9 +48,8 @@ endif(BLAS_FOUND) find_package(Boost REQUIRED COMPONENTS filesystem program_options) -include_directories("${Boost_INCLUDE_DIRS}") +include_directories("${Boost_INCLUDE_DIRS} ${BLAS_INCLUDE_DIRS}") -message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}") add_library(simple_matrix src/Matrix.cpp) target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES}) @@ -59,4 +57,4 @@ add_executable(simd_multiply src/main.cpp) target_link_libraries(simd_multiply simple_matrix) add_executable(generate_random src/generate_random.cpp) -target_link_libraries(generate_random simple_matrix) \ No newline at end of file +target_link_libraries(generate_random simple_matrix) diff --git a/scripts/test.py b/scripts/test.py index c5e9462..2e5912a 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -16,13 +16,12 @@ def check_call_quiet(*args, **kwargs): output, err = p.communicate() rc = p.returncode if rc != 0: - print(output.decode()) - print(err.decode()) + print(output.decode("utf-8")) + print(err.decode("utf-8")) exit(0) -def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): - +def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): flags = [ "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"), @@ -30,7 +29,19 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON", "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"), ] - build_path = os.path.join(build_path_prefix, " ".join(flags)) + build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_") + return build_path + + +def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args): + flags = [ + "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), + "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"), + "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), + "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON", + "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"), + ] + build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_") if not os.path.exists(build_path): os.makedirs(build_path) check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags) @@ -86,7 +97,7 @@ if __name__ == '__main__': if options.function: functions = options.function - # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"] + functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"] extra_args = [] if options.validate: extra_args.append("--validate") @@ -105,6 +116,7 @@ if __name__ == '__main__': times = [[] for f in functions] output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json") clang = not options.gcc + already_dumped = False for sizes in matrix_combinations: args = list(sizes) compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, args) @@ -115,6 +127,15 @@ if __name__ == '__main__': arguments.append("--double") output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args) ms = output.decode()[output.decode().find("multiply:") + 10:] + + if not already_dumped: + build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args), "simd_multiply") + os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path) + os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path) + upp = os.path.expanduser("~/dup.sh") + os.system('bash %s upload main_with_source.s .' % upp) + os.system('bash %s upload main_wo_source.s .' % upp) + already_dumped = True if "ms\n" in ms: ms = float(ms.split("ms\n")[0]) else: diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h index 5557818..e421482 100644 --- a/src/DevideAndConquer.h +++ b/src/DevideAndConquer.h @@ -182,8 +182,30 @@ void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix &C, cons #ifdef WITH_AVX512 template void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix &C, const Matrix &A, const Matrix &B) { - _divide_and_conquer>(C, A, B); + _divide_and_conquer>(C, A, B); } +template +void __attribute__ ((noinline)) divide_and_conquer_block_avx5120(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} +template +void __attribute__ ((noinline)) divide_and_conquer_block_avx5121(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} +template +void __attribute__ ((noinline)) divide_and_conquer_block_avx5122(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} +template +void __attribute__ ((noinline)) divide_and_conquer_block_avx5123(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} +template +void __attribute__ ((noinline)) divide_and_conquer_block_avx5124(Matrix &C, const Matrix &A, const Matrix &B) { + _divide_and_conquer>(C, A, B); +} + + #endif #endif //SMID_MATRIX_DEVIDEANDCONQUER_H diff --git a/src/main.cpp b/src/main.cpp index e3868f8..ae0dbfd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -64,6 +64,11 @@ int main_work(const std::string &test_function_name, const std::string &input_fo TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B) #ifdef WITH_AVX512 + TEST_IF(test_function_name, divide_and_conquer_block_avx5120, A, B) + TEST_IF(test_function_name, divide_and_conquer_block_avx5121, A, B) + TEST_IF(test_function_name, divide_and_conquer_block_avx5122, A, B) + TEST_IF(test_function_name, divide_and_conquer_block_avx5123, A, B) + TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B) #endif @@ -110,9 +115,12 @@ int main(int argc, char* argv[]) { po::store(po::command_line_parser(argc, argv). options(desc).positional(p).run(), vm); po::notify(vm); - - std::string test_function_name = vm.count("algorithm") ? vm["algorithm"].as() : DEFAULT_TEST_FUNCTION_NAME; - + std::string test_function_name; + if (vm.count("algorithm")) { + test_function_name = vm["algorithm"].as(); + } else { + throw std::runtime_error("missing algorithm"); + } if(vm.count("double")) { return main_work(test_function_name, vm["input-folder"].as(), vm.count("validate")); } else { diff --git a/src/register_blocking/BlockWise.h b/src/register_blocking/BlockWise.h index 4ab8827..eb5d9ae 100644 --- a/src/register_blocking/BlockWise.h +++ b/src/register_blocking/BlockWise.h @@ -56,6 +56,7 @@ struct __m512_block_wise_config { static constexpr auto LoadVector = _mm512_loadu_ps; static constexpr auto StoreVector = _mm512_storeu_ps; static constexpr auto BroadcastToVector = _mm512_set1_ps; + static constexpr auto XOR = _mm512_xor_ps; static constexpr unsigned Registers = 32; }; @@ -65,6 +66,7 @@ struct __m512d_block_wise_config { static constexpr auto LoadVector = _mm512_loadu_pd; static constexpr auto StoreVector = _mm512_storeu_pd; static constexpr auto BroadcastToVector = _mm512_set1_pd; + static constexpr auto XOR = _mm512_xor_pd; static constexpr unsigned Registers = 32; }; #endif diff --git a/src/register_blocking/detail/ExtendedBlockWiseConfig.h b/src/register_blocking/detail/ExtendedBlockWiseConfig.h index 04620df..f59279f 100644 --- a/src/register_blocking/detail/ExtendedBlockWiseConfig.h +++ b/src/register_blocking/detail/ExtendedBlockWiseConfig.h @@ -28,7 +28,7 @@ struct ExtendedBlockWiseConfig { // maximize = (R * C) / (R + C) for 1 + R + R * C < 16 => any fixed r -> largest C with < 16 // C = floor ((16 - R) / R) -unsigned constexpr extra_registers = 2; +unsigned constexpr extra_registers = 1; constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {