Generate implementations for specific block-sizes
This commit is contained in:
parent
fffd9ce319
commit
dff24a914c
@ -6,10 +6,14 @@ set(CMAKE_CXX_STANDARD 17)
|
|||||||
option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
|
option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
|
||||||
option(USE_CLANG "Build with clang instead of gcc" ON)
|
option(USE_CLANG "Build with clang instead of gcc" ON)
|
||||||
option(WITH_AVX512 "Enable AVX512" OFF)
|
option(WITH_AVX512 "Enable AVX512" OFF)
|
||||||
|
option(WITH_MANUAL "Enable MANUAL" ON)
|
||||||
|
|
||||||
if(WITH_AVX512)
|
if(WITH_AVX512)
|
||||||
add_compile_definitions(WITH_AVX512)
|
add_compile_definitions(WITH_AVX512)
|
||||||
endif()
|
endif()
|
||||||
|
if(WITH_MANUAL)
|
||||||
|
add_compile_definitions(WITH_MANUAL)
|
||||||
|
endif()
|
||||||
|
|
||||||
if(USE_CLANG)
|
if(USE_CLANG)
|
||||||
set(CMAKE_CXX_COMPILER "clang++")
|
set(CMAKE_CXX_COMPILER "clang++")
|
||||||
|
109
scripts/generate.py
Normal file
109
scripts/generate.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
import os
|
||||||
|
# Opening of the generated header: enters the `detail` namespace and declares
# the primary RegisterBlocking template that every generated kernel specializes.
code_prefix = """
namespace detail {

template<typename BitWiseConfig, unsigned _NumRows, unsigned _NumColumnVectors>
struct RegisterBlocking;
"""

# One RegisterBlocking specialization. Filled in by generate_rows() through
# %-formatting with a mapping, so the block size can be referenced more than
# once: NumRows / NumColumns now mirror the specialization arguments instead
# of the fixed 3 and 4 that every specialization used to report.
code_template = """
template<typename BitWiseConfig>
struct RegisterBlocking<BitWiseConfig, %(num_rows)d, %(num_columns)d> {

    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;

    static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
        bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
    };

    static constexpr auto NumRows = %(num_rows)d;
    static constexpr auto NumColumns = %(num_columns)d * bwc::VectorWidth;

    template<typename M1, typename M2, typename M3>
    static void __attribute__ ((noinline))
    handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {

%(variables)s

        for (int p = 0; p < k; p++) {
%(loop)s
        }

%(after_loop)s

    }
};
"""

# Closes the `detail` namespace opened by code_prefix.
code_postfix = "}"


def generate_rows(num_rows, num_columns):
    """Return the C++ source of one RegisterBlocking specialization.

    The generated ``handle_block`` keeps one accumulator vector per
    (row, column-vector) pair, accumulates ``A(row, p) * B(p, column)`` over
    ``p`` into them, and finally adds the accumulators onto ``C``.

    Args:
        num_rows: Number of rows of ``A`` handled by the kernel; first
            template argument of the specialization.
        num_columns: Number of column *vectors* of ``B`` handled by the
            kernel; second template argument of the specialization.

    Returns:
        ``code_template`` filled in for the requested block size.

    Side effects:
        Prints the number of vector variables the kernel declares
        (``1 + num_rows + num_rows * num_columns``) to stdout.
    """
    rows = range(num_rows)
    columns = range(num_columns)

    # `c` holds the currently loaded vector of B, `r<i>` the broadcast element
    # of A for row i, and `r<i>c<j>` the accumulator for row i / column vector j.
    broadcast_registers = ["r%d" % row for row in rows]
    accumulators = ["r%dc%d" % (row, column) for row in rows for column in columns]

    declarations = [
        "typename bwc::VectorType %s;" % name
        for name in ["c"] + broadcast_registers + accumulators
    ]
    # Accumulators are zeroed by XOR-ing each one with itself.
    declarations += [
        "%s = bwc::XOR(%s, %s);" % (name, name, name) for name in accumulators
    ]

    # Loop body: broadcast one element of A per row, then for every column
    # vector load B once and update that column's accumulator in every row.
    loop_lines = [
        "r%d = bwc::BroadcastToVector(A(aRowOffset + %d, p));" % (row, row)
        for row in rows
    ]
    for column in columns:
        loop_lines.append(
            "c = bwc::LoadVector(&B(p, bColOffset + %d * bwc::VectorWidth));" % column
        )
        loop_lines.extend(
            "r%dc%d += r%d * c;" % (row, column, row) for row in rows
        )

    # Write-back: walk C row by row and column vector by column vector,
    # advancing the running `row` / `column` indices only between stores.
    store_lines = ["size_t row = aRowOffset;", "size_t column;"]
    for row in rows:
        store_lines.append("column = bColOffset;")
        for column in columns:
            store_lines.append("AddAndStore(&C(row, column), r%dc%d);" % (row, column))
            if column != num_columns - 1:
                store_lines.append("column += bwc::VectorWidth;")
        if row != num_rows - 1:
            store_lines.append("++row;")

    # Progress/diagnostic output: vector variables used by this kernel.
    print(1 + len(broadcast_registers) + len(accumulators))

    # Indentation only affects the layout of the generated header.
    body_indent = " " * 8
    loop_indent = " " * 12

    return code_template % {
        "num_rows": num_rows,
        "num_columns": num_columns,
        "variables": "\n".join(body_indent + line for line in declarations),
        "loop": "\n".join(loop_indent + line for line in loop_lines),
        "after_loop": "\n".join(body_indent + line for line in store_lines),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# Total number of vector registers a generated kernel may use.
# NOTE(review): presumably the vector register count of the target ISA - confirm.
max_register_count = 32

# Registers reserved besides the per-row broadcast registers and the
# accumulators; generate_rows declares exactly one such vector (`c`).
extra_registers = 1


def get_initial_column_vectors(rows, registers):
    """Return the largest column-vector count that fits the register budget.

    A kernel with ``rows`` rows and ``columns`` column vectors needs
    ``extra_registers + rows + rows * columns`` vector registers; this
    returns the largest ``columns`` for which that total is at most
    ``registers``.

    The result is computed with integer floor division. The earlier float
    round-trip was inexact for large values and needed a ``+ 1`` correction.

    Args:
        rows: Number of rows handled by the kernel.
        registers: Total number of vector registers available.

    Returns:
        The maximal number of column vectors, or 0 when ``rows`` is not
        positive or even a kernel without accumulators would not fit.
    """
    if rows <= 0:
        return 0
    # Registers left for accumulators after the scratch and broadcast ones.
    available = registers - extra_registers - rows
    return max(0, available // rows)
|
||||||
|
|
||||||
|
|
||||||
|
def get_initial_rows(registers):
    """Return the largest row count whose square block fits the budget.

    Candidates are tried from ``registers - 1`` down to 0. The first one
    for which ``extra_registers + rows + rows * rows`` does not exceed
    ``registers`` is returned; 0 is returned when no candidate qualifies.
    """
    descending_candidates = range(registers - 1, -1, -1)
    return next(
        (
            candidate
            for candidate in descending_candidates
            if extra_registers + candidate + candidate * candidate <= registers
        ),
        0,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Emit the generated header: prefix, then one specialization for every
# (rows, column vectors) pair allowed by the register budget, then postfix.
fn = os.path.join(
    os.path.dirname(__file__),
    "..",
    "src/register_blocking/detail/manual.h",
)
with open(fn, "w") as f:
    f.write(code_prefix)
    row_limit = get_initial_rows(max_register_count)
    for rows in range(1, row_limit + 1):
        # The column-vector limit depends on the row count, so recompute it per row.
        column_limit = get_initial_column_vectors(rows, max_register_count)
        for columns in range(1, column_limit + 1):
            f.write(generate_rows(rows, columns))
    f.write(code_postfix)
|
@ -21,27 +21,20 @@ def check_call_quiet(*args, **kwargs):
|
|||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
|
||||||
def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
|
def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args):
|
||||||
flags = [
|
flags = [
|
||||||
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
||||||
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
||||||
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
||||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
||||||
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
||||||
]
|
"-DWITH_MANUAL=" + ("OFF" if no_manual else "ON"),
|
||||||
|
]
|
||||||
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
||||||
return build_path
|
return flags, build_path
|
||||||
|
|
||||||
|
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args):
|
||||||
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
|
flags, build_path = get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args)
|
||||||
flags = [
|
|
||||||
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
|
||||||
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
|
||||||
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
|
||||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
|
||||||
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
|
||||||
]
|
|
||||||
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
|
||||||
if not os.path.exists(build_path):
|
if not os.path.exists(build_path):
|
||||||
os.makedirs(build_path)
|
os.makedirs(build_path)
|
||||||
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
||||||
@ -58,7 +51,8 @@ very_slow_functions = ["naive_reordered"]
|
|||||||
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
|
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
|
||||||
normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
|
normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
|
||||||
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
||||||
avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
|
avx512_fast_functions = ["divide_and_conquer_block_avx512"]
|
||||||
|
avx512_normal_functions = ["block_wise_avx512"]
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
@ -78,11 +72,15 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument("--gcc", action="store_true")
|
parser.add_argument("--gcc", action="store_true")
|
||||||
parser.add_argument("--function", type=str, nargs="*")
|
parser.add_argument("--function", type=str, nargs="*")
|
||||||
parser.add_argument("--release", action="store_true")
|
parser.add_argument("--release", action="store_true")
|
||||||
|
parser.add_argument("--no_manual", action="store_true")
|
||||||
|
|
||||||
options = parser.parse_args()
|
options = parser.parse_args()
|
||||||
|
|
||||||
functions = fast_functions
|
functions = fast_functions
|
||||||
|
|
||||||
|
if options.avx512:
|
||||||
|
functions += avx512_fast_functions
|
||||||
|
|
||||||
if options.very_slow:
|
if options.very_slow:
|
||||||
functions += very_slow_functions
|
functions += very_slow_functions
|
||||||
|
|
||||||
@ -92,12 +90,13 @@ if __name__ == '__main__':
|
|||||||
if options.very_slow or options.slow or options.normal:
|
if options.very_slow or options.slow or options.normal:
|
||||||
functions += normal_functions
|
functions += normal_functions
|
||||||
|
|
||||||
if options.avx512:
|
if options.avx512 and (options.very_slow or options.slow or options.normal):
|
||||||
functions += avx512_functions
|
functions += avx512_normal_functions
|
||||||
|
|
||||||
|
|
||||||
if options.function:
|
if options.function:
|
||||||
functions = options.function
|
functions = options.function
|
||||||
functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"]
|
# functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"]
|
||||||
extra_args = []
|
extra_args = []
|
||||||
if options.validate:
|
if options.validate:
|
||||||
extra_args.append("--validate")
|
extra_args.append("--validate")
|
||||||
@ -119,22 +118,22 @@ if __name__ == '__main__':
|
|||||||
already_dumped = False
|
already_dumped = False
|
||||||
for sizes in matrix_combinations:
|
for sizes in matrix_combinations:
|
||||||
args = list(sizes)
|
args = list(sizes)
|
||||||
compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, args)
|
compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, options.no_manual, args)
|
||||||
folder = "x".join(sizes)
|
folder = "x".join(sizes)
|
||||||
for fidx, function in enumerate(functions):
|
for fidx, function in enumerate(functions):
|
||||||
arguments = [folder, "--algorithm", function]
|
arguments = [folder, "--algorithm", function]
|
||||||
if with_double:
|
if with_double:
|
||||||
arguments.append("--double")
|
arguments.append("--double")
|
||||||
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
|
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args)
|
||||||
ms = output.decode()[output.decode().find("multiply:") + 10:]
|
ms = output.decode()[output.decode().find("multiply:") + 10:]
|
||||||
|
|
||||||
if not already_dumped:
|
if not already_dumped:
|
||||||
build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args), "simd_multiply")
|
build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args)[1], "simd_multiply")
|
||||||
os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path)
|
# os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path)
|
||||||
os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path)
|
# os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path)
|
||||||
upp = os.path.expanduser("~/dup.sh")
|
upp = os.path.expanduser("~/dup.sh")
|
||||||
os.system('bash %s upload main_with_source.s .' % upp)
|
# os.system('bash %s upload main_with_source.s .' % upp)
|
||||||
os.system('bash %s upload main_wo_source.s .' % upp)
|
# os.system('bash %s upload main_wo_source.s .' % upp)
|
||||||
already_dumped = True
|
already_dumped = True
|
||||||
if "ms\n" in ms:
|
if "ms\n" in ms:
|
||||||
ms = float(ms.split("ms\n")[0])
|
ms = float(ms.split("ms\n")[0])
|
||||||
|
@ -8,6 +8,10 @@
|
|||||||
#include "../../Matrix.h"
|
#include "../../Matrix.h"
|
||||||
#include "ExtendedBlockWiseConfig.h"
|
#include "ExtendedBlockWiseConfig.h"
|
||||||
|
|
||||||
|
#ifdef WITH_MANUAL
|
||||||
|
#include "manual.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
template<
|
template<
|
||||||
|
7568
src/register_blocking/detail/manual.h
Normal file
7568
src/register_blocking/detail/manual.h
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user