Compare different block configurations

This commit is contained in:
har0ke 2020-07-05 03:54:36 +02:00
parent 7b727cb1dc
commit fffd9ce319
6 changed files with 68 additions and 17 deletions

View File

@ -6,9 +6,6 @@ set(CMAKE_CXX_STANDARD 17)
# User-facing build switches. Each can be overridden on the command line
# with -D<NAME>=ON|OFF.
# OPTIMIZE_FOR_NATIVE: build with -march=native (default ON).
option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
# USE_CLANG: build with clang instead of gcc (default ON).
option(USE_CLANG "Build with clang instead of gcc" ON)
# WITH_AVX512: enable the AVX512 code paths (default OFF).
option(WITH_AVX512 "Enable AVX512" OFF)
# Name of the function to run by default; stored in the cache as a STRING so
# it can be overridden with -DDEFAULT_TEST_FUNCTION_NAME=<name>.
set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run")
# Expose the selected name to the sources as a preprocessor macro whose value
# is a quoted string literal.
add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}")
if(WITH_AVX512)
add_compile_definitions(WITH_AVX512)
@ -21,6 +18,8 @@ else()
endif()
# Force -O3 for both optimized configurations by appending to whatever flags
# are already set for them.
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3")
# -mfma is appended to the flags shared by every configuration.
string(APPEND CMAKE_CXX_FLAGS " -mfma")
if(OPTIMIZE_FOR_NATIVE)
@ -49,9 +48,8 @@ endif(BLAS_FOUND)
# Boost is mandatory; the filesystem and program_options components are requested.
find_package(Boost REQUIRED COMPONENTS filesystem program_options)
# Register the header search paths in a single call. The variables are
# expanded UNQUOTED on purpose: each directory becomes its own argument, and
# an empty BLAS_INCLUDE_DIRS contributes nothing. Quoting both expansions in
# one string would glue them together with a space and yield a directory name
# that does not exist.
include_directories(${Boost_INCLUDE_DIRS} ${BLAS_INCLUDE_DIRS})
# Print the libraries that will be linked, as a configure-time diagnostic.
message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}")
# Matrix library built from src/Matrix.cpp, linked against Boost and BLAS.
add_library(simple_matrix src/Matrix.cpp)
target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
@ -59,4 +57,4 @@ add_executable(simd_multiply src/main.cpp)
# Link the simd_multiply executable against the matrix library.
target_link_libraries(simd_multiply simple_matrix)
# Second executable, built from src/generate_random.cpp, linked against the
# same matrix library. The link line is stated once; repeating it adds nothing.
add_executable(generate_random src/generate_random.cpp)
target_link_libraries(generate_random simple_matrix)

View File

@ -16,13 +16,12 @@ def check_call_quiet(*args, **kwargs):
output, err = p.communicate()
rc = p.returncode
if rc != 0:
print(output.decode())
print(err.decode())
print(output.decode("utf-8"))
print(err.decode("utf-8"))
exit(0)
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
flags = [
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
@ -30,7 +29,19 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
]
build_path = os.path.join(build_path_prefix, " ".join(flags))
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
return build_path
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
flags = [
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
]
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
if not os.path.exists(build_path):
os.makedirs(build_path)
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
@ -86,7 +97,7 @@ if __name__ == '__main__':
if options.function:
functions = options.function
# functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"]
functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"]
extra_args = []
if options.validate:
extra_args.append("--validate")
@ -105,6 +116,7 @@ if __name__ == '__main__':
times = [[] for f in functions]
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
clang = not options.gcc
already_dumped = False
for sizes in matrix_combinations:
args = list(sizes)
compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, args)
@ -115,6 +127,15 @@ if __name__ == '__main__':
arguments.append("--double")
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
ms = output.decode()[output.decode().find("multiply:") + 10:]
if not already_dumped:
build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args), "simd_multiply")
os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path)
os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path)
upp = os.path.expanduser("~/dup.sh")
os.system('bash %s upload main_with_source.s .' % upp)
os.system('bash %s upload main_wo_source.s .' % upp)
already_dumped = True
if "ms\n" in ms:
ms = float(ms.split("ms\n")[0])
else:

View File

@ -182,8 +182,30 @@ void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, cons
#ifdef WITH_AVX512
/**
 * Forwards C, A and B to _divide_and_conquer using the AVX512 block-wise
 * multiplier instantiated with the template arguments <6, 5, 75>.
 *
 * The work is dispatched exactly once. An additional call using the
 * multiplier's default template arguments would run the whole operation a
 * second time on the same C.
 *
 * Marked noinline so the function is always emitted as its own symbol.
 *
 * @tparam T element type of the matrices
 * @param C  matrix passed as the (mutable) first argument
 * @param A  first read-only operand
 * @param B  second read-only operand
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    _divide_and_conquer<multiplier_block_wise<T, AVX512, 6, 5, 75>>(C, A, B);
}
/**
 * Variant of the AVX512 block-wise routine that instantiates the multiplier
 * with the template arguments <6, 1, 75> and hands C, A and B to
 * _divide_and_conquer. Kept noinline so it is emitted as its own symbol.
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx5120(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    // Name the configured multiplier once, then dispatch with it.
    using Kernel = multiplier_block_wise<T, AVX512, 6, 1, 75>;
    _divide_and_conquer<Kernel>(C, A, B);
}
/**
 * Variant of the AVX512 block-wise routine that instantiates the multiplier
 * with the template arguments <6, 1, 20> and hands C, A and B to
 * _divide_and_conquer. Kept noinline so it is emitted as its own symbol.
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx5121(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    // Name the configured multiplier once, then dispatch with it.
    using Kernel = multiplier_block_wise<T, AVX512, 6, 1, 20>;
    _divide_and_conquer<Kernel>(C, A, B);
}
/**
 * Variant of the AVX512 block-wise routine that instantiates the multiplier
 * with the template arguments <15, 1, 75> and hands C, A and B to
 * _divide_and_conquer. Kept noinline so it is emitted as its own symbol.
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx5122(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    // Name the configured multiplier once, then dispatch with it.
    using Kernel = multiplier_block_wise<T, AVX512, 15, 1, 75>;
    _divide_and_conquer<Kernel>(C, A, B);
}
/**
 * Variant of the AVX512 block-wise routine that instantiates the multiplier
 * with the template arguments <15, 1, 100> and hands C, A and B to
 * _divide_and_conquer. Kept noinline so it is emitted as its own symbol.
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx5123(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    // Name the configured multiplier once, then dispatch with it.
    using Kernel = multiplier_block_wise<T, AVX512, 15, 1, 100>;
    _divide_and_conquer<Kernel>(C, A, B);
}
/**
 * Variant of the AVX512 block-wise routine that instantiates the multiplier
 * with the template arguments <15, 1, 150> and hands C, A and B to
 * _divide_and_conquer. Kept noinline so it is emitted as its own symbol.
 */
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx5124(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
    // Name the configured multiplier once, then dispatch with it.
    using Kernel = multiplier_block_wise<T, AVX512, 15, 1, 150>;
    _divide_and_conquer<Kernel>(C, A, B);
}
#endif
#endif //SMID_MATRIX_DEVIDEANDCONQUER_H

View File

@ -64,6 +64,11 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
#ifdef WITH_AVX512
TEST_IF(test_function_name, divide_and_conquer_block_avx5120, A, B)
TEST_IF(test_function_name, divide_and_conquer_block_avx5121, A, B)
TEST_IF(test_function_name, divide_and_conquer_block_avx5122, A, B)
TEST_IF(test_function_name, divide_and_conquer_block_avx5123, A, B)
TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B)
#endif
@ -110,9 +115,12 @@ int main(int argc, char* argv[]) {
po::store(po::command_line_parser(argc, argv).
options(desc).positional(p).run(), vm);
po::notify(vm);
std::string test_function_name = vm.count("algorithm") ? vm["algorithm"].as<std::string>() : DEFAULT_TEST_FUNCTION_NAME;
std::string test_function_name;
if (vm.count("algorithm")) {
test_function_name = vm["algorithm"].as<std::string>();
} else {
throw std::runtime_error("missing algorithm");
}
if(vm.count("double")) {
return main_work<double>(test_function_name, vm["input-folder"].as<std::string>(), vm.count("validate"));
} else {

View File

@ -56,6 +56,7 @@ struct __m512_block_wise_config {
static constexpr auto LoadVector = _mm512_loadu_ps;
static constexpr auto StoreVector = _mm512_storeu_ps;
static constexpr auto BroadcastToVector = _mm512_set1_ps;
static constexpr auto XOR = _mm512_xor_ps;
static constexpr unsigned Registers = 32;
};
@ -65,6 +66,7 @@ struct __m512d_block_wise_config {
static constexpr auto LoadVector = _mm512_loadu_pd;
static constexpr auto StoreVector = _mm512_storeu_pd;
static constexpr auto BroadcastToVector = _mm512_set1_pd;
static constexpr auto XOR = _mm512_xor_pd;
static constexpr unsigned Registers = 32;
};
#endif

View File

@ -28,7 +28,7 @@ struct ExtendedBlockWiseConfig {
// maximize = (R * C) / (R + C) for 1 + R + R * C < 16 => any fixed r -> largest C with < 16
// C = floor ((16 - R) / R)
unsigned constexpr extra_registers = 2;
unsigned constexpr extra_registers = 1;
constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {