Compare different block configurations
This commit is contained in:
parent
7b727cb1dc
commit
fffd9ce319
@ -6,9 +6,6 @@ set(CMAKE_CXX_STANDARD 17)
|
||||
option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
|
||||
option(USE_CLANG "Build with clang instead of gcc" ON)
|
||||
option(WITH_AVX512 "Enable AVX512" OFF)
|
||||
set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run")
|
||||
|
||||
add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}")
|
||||
|
||||
if(WITH_AVX512)
|
||||
add_compile_definitions(WITH_AVX512)
|
||||
@ -21,6 +18,8 @@ else()
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3")
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
|
||||
|
||||
if(OPTIMIZE_FOR_NATIVE)
|
||||
@ -49,9 +48,8 @@ endif(BLAS_FOUND)
|
||||
|
||||
find_package(Boost REQUIRED COMPONENTS filesystem program_options)
|
||||
|
||||
include_directories("${Boost_INCLUDE_DIRS}")
|
||||
include_directories("${Boost_INCLUDE_DIRS} ${BLAS_INCLUDE_DIRS}")
|
||||
|
||||
message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}")
|
||||
add_library(simple_matrix src/Matrix.cpp)
|
||||
target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
|
||||
|
||||
@ -59,4 +57,4 @@ add_executable(simd_multiply src/main.cpp)
|
||||
target_link_libraries(simd_multiply simple_matrix)
|
||||
|
||||
add_executable(generate_random src/generate_random.cpp)
|
||||
target_link_libraries(generate_random simple_matrix)
|
||||
target_link_libraries(generate_random simple_matrix)
|
||||
|
@ -16,13 +16,12 @@ def check_call_quiet(*args, **kwargs):
|
||||
output, err = p.communicate()
|
||||
rc = p.returncode
|
||||
if rc != 0:
|
||||
print(output.decode())
|
||||
print(err.decode())
|
||||
print(output.decode("utf-8"))
|
||||
print(err.decode("utf-8"))
|
||||
exit(0)
|
||||
|
||||
|
||||
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
|
||||
|
||||
def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
|
||||
flags = [
|
||||
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
||||
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
||||
@ -30,7 +29,19 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
|
||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
||||
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
||||
]
|
||||
build_path = os.path.join(build_path_prefix, " ".join(flags))
|
||||
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
||||
return build_path
|
||||
|
||||
|
||||
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
|
||||
flags = [
|
||||
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
||||
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
||||
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
||||
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
||||
]
|
||||
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
||||
if not os.path.exists(build_path):
|
||||
os.makedirs(build_path)
|
||||
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
||||
@ -86,7 +97,7 @@ if __name__ == '__main__':
|
||||
|
||||
if options.function:
|
||||
functions = options.function
|
||||
# functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"]
|
||||
functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"]
|
||||
extra_args = []
|
||||
if options.validate:
|
||||
extra_args.append("--validate")
|
||||
@ -105,6 +116,7 @@ if __name__ == '__main__':
|
||||
times = [[] for f in functions]
|
||||
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
||||
clang = not options.gcc
|
||||
already_dumped = False
|
||||
for sizes in matrix_combinations:
|
||||
args = list(sizes)
|
||||
compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, args)
|
||||
@ -115,6 +127,15 @@ if __name__ == '__main__':
|
||||
arguments.append("--double")
|
||||
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
|
||||
ms = output.decode()[output.decode().find("multiply:") + 10:]
|
||||
|
||||
if not already_dumped:
|
||||
build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args), "simd_multiply")
|
||||
os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path)
|
||||
os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path)
|
||||
upp = os.path.expanduser("~/dup.sh")
|
||||
os.system('bash %s upload main_with_source.s .' % upp)
|
||||
os.system('bash %s upload main_wo_source.s .' % upp)
|
||||
already_dumped = True
|
||||
if "ms\n" in ms:
|
||||
ms = float(ms.split("ms\n")[0])
|
||||
else:
|
||||
|
@ -182,8 +182,30 @@ void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, cons
|
||||
#ifdef WITH_AVX512
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512>>(C, A, B);
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 6, 5, 75>>(C, A, B);
|
||||
}
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx5120(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 6, 1, 75>>(C, A, B);
|
||||
}
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx5121(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 6, 1, 20>>(C, A, B);
|
||||
}
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx5122(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 15, 1, 75>>(C, A, B);
|
||||
}
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx5123(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 15, 1, 100>>(C, A, B);
|
||||
}
|
||||
template<typename T>
|
||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx5124(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
_divide_and_conquer<multiplier_block_wise<T, AVX512, 15, 1, 150>>(C, A, B);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#endif //SMID_MATRIX_DEVIDEANDCONQUER_H
|
||||
|
14
src/main.cpp
14
src/main.cpp
@ -64,6 +64,11 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
|
||||
|
||||
#ifdef WITH_AVX512
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx5120, A, B)
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx5121, A, B)
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx5122, A, B)
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx5123, A, B)
|
||||
|
||||
TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B)
|
||||
#endif
|
||||
|
||||
@ -110,9 +115,12 @@ int main(int argc, char* argv[]) {
|
||||
po::store(po::command_line_parser(argc, argv).
|
||||
options(desc).positional(p).run(), vm);
|
||||
po::notify(vm);
|
||||
|
||||
std::string test_function_name = vm.count("algorithm") ? vm["algorithm"].as<std::string>() : DEFAULT_TEST_FUNCTION_NAME;
|
||||
|
||||
std::string test_function_name;
|
||||
if (vm.count("algorithm")) {
|
||||
test_function_name = vm["algorithm"].as<std::string>();
|
||||
} else {
|
||||
throw std::runtime_error("missing algorithm");
|
||||
}
|
||||
if(vm.count("double")) {
|
||||
return main_work<double>(test_function_name, vm["input-folder"].as<std::string>(), vm.count("validate"));
|
||||
} else {
|
||||
|
@ -56,6 +56,7 @@ struct __m512_block_wise_config {
|
||||
static constexpr auto LoadVector = _mm512_loadu_ps;
|
||||
static constexpr auto StoreVector = _mm512_storeu_ps;
|
||||
static constexpr auto BroadcastToVector = _mm512_set1_ps;
|
||||
static constexpr auto XOR = _mm512_xor_ps;
|
||||
static constexpr unsigned Registers = 32;
|
||||
};
|
||||
|
||||
@ -65,6 +66,7 @@ struct __m512d_block_wise_config {
|
||||
static constexpr auto LoadVector = _mm512_loadu_pd;
|
||||
static constexpr auto StoreVector = _mm512_storeu_pd;
|
||||
static constexpr auto BroadcastToVector = _mm512_set1_pd;
|
||||
static constexpr auto XOR = _mm512_xor_pd;
|
||||
static constexpr unsigned Registers = 32;
|
||||
};
|
||||
#endif
|
||||
|
@ -28,7 +28,7 @@ struct ExtendedBlockWiseConfig {
|
||||
// maximize = (R * C) / (R + C) for 1 + R + R * C < 16 => any fixed r -> largest C with < 16
|
||||
// C = floor ((16 - R) / R)
|
||||
|
||||
unsigned constexpr extra_registers = 2;
|
||||
unsigned constexpr extra_registers = 1;
|
||||
|
||||
|
||||
constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user