159 lines
6.7 KiB
Python
Executable File
159 lines
6.7 KiB
Python
Executable File
#!/usr/bin/python
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime
|
|
from pprint import pprint
|
|
|
|
import numpy
|
|
|
|
|
|
def check_call_quiet(*args, **kwargs):
|
|
p = subprocess.Popen(*args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
|
|
output, err = p.communicate()
|
|
rc = p.returncode
|
|
if rc != 0:
|
|
print(output.decode("utf-8"))
|
|
print(err.decode("utf-8"))
|
|
exit(0)
|
|
|
|
|
|
def get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args):
|
|
flags = [
|
|
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
|
"-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
|
|
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
|
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
|
|
"-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
|
|
"-DWITH_MANUAL=" + ("OFF" if no_manual else "ON"),
|
|
]
|
|
build_path = os.path.join(build_path_prefix, " ".join(flags)).replace("-", " ").replace(" ", " ").replace(" ", "_")
|
|
return flags, build_path
|
|
|
|
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args):
|
|
flags, build_path = get_build_path(source_path, build_path_prefix, target, native, use_clang, avx512, release, no_manual, args)
|
|
if not os.path.exists(build_path):
|
|
os.makedirs(build_path)
|
|
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
|
check_call_quiet(["make", target], cwd=build_path)
|
|
if False:
|
|
subprocess.check_call(["perf", "stat", "-B", "-e", "cache-references,cache-misses,cycles,instructions,branches,faults,migrations"]
|
|
+ [os.path.join(build_path, target)] + args)
|
|
else:
|
|
print(" ".join([os.path.join(build_path, target)] + args))
|
|
output = subprocess.check_output([os.path.join(build_path, target)] + args)
|
|
return output
|
|
|
|
very_slow_functions = ["naive_reordered"]
|
|
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
|
|
normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
|
|
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
|
avx512_fast_functions = ["divide_and_conquer_block_avx512"]
|
|
avx512_normal_functions = ["block_wise_avx512"]
|
|
|
|
if __name__ == '__main__':
|
|
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
|
if not os.path.exists("data"):
|
|
os.makedirs("data")
|
|
os.chdir("data")
|
|
with_double = False
|
|
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("total_size", type=float)
|
|
parser.add_argument("--very_slow", action="store_true")
|
|
parser.add_argument("--slow", action="store_true")
|
|
parser.add_argument("--normal", action="store_true")
|
|
parser.add_argument("--avx512", action="store_true")
|
|
parser.add_argument("--validate", action="store_true")
|
|
parser.add_argument("--double", action="store_true")
|
|
parser.add_argument("--gcc", action="store_true")
|
|
parser.add_argument("--function", type=str, nargs="*")
|
|
parser.add_argument("--release", action="store_true")
|
|
parser.add_argument("--no_manual", action="store_true")
|
|
parser.add_argument("--verbose", action="store_true")
|
|
|
|
options = parser.parse_args()
|
|
|
|
functions = fast_functions
|
|
|
|
if options.avx512:
|
|
functions += avx512_fast_functions
|
|
|
|
if options.very_slow:
|
|
functions += very_slow_functions
|
|
|
|
if options.very_slow or options.slow:
|
|
functions += slow_functions
|
|
|
|
if options.very_slow or options.slow or options.normal:
|
|
functions += normal_functions
|
|
|
|
if options.avx512 and (options.very_slow or options.slow or options.normal):
|
|
functions += avx512_normal_functions
|
|
|
|
|
|
if options.function:
|
|
functions = options.function
|
|
# functions = ["divide_and_conquer_block_avx2", "divide_and_conquer_block_avx5120"]#, "divide_and_conquer_block_avx5121", "divide_and_conquer_block_avx5122", "divide_and_conquer_block_avx5123", "divide_and_conquer_block_avx512"]
|
|
extra_args = []
|
|
if options.validate:
|
|
extra_args.append("--validate")
|
|
if options.double:
|
|
extra_args.append("--double")
|
|
|
|
total = int(options.total_size)
|
|
matrix_combinations = []
|
|
i = pow(total, 1. / 3.)
|
|
for _ in range(20):
|
|
a = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
|
|
b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
|
|
c = int(round(total / a / b))
|
|
matrix_combinations.append([str(a), str(b), str(c)])
|
|
|
|
times = [[] for f in functions]
|
|
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
|
clang = not options.gcc
|
|
already_dumped = False
|
|
for sizes in matrix_combinations:
|
|
args = list(sizes)
|
|
compile_and_run("..", "builds", "generate_random", True, clang, options.avx512, options.release, options.no_manual, args)
|
|
folder = "x".join(sizes)
|
|
for fidx, function in enumerate(functions):
|
|
arguments = [folder, "--algorithm", function]
|
|
if with_double:
|
|
arguments.append("--double")
|
|
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args)
|
|
if options.verbose:
|
|
print(output.decode("utf-8"))
|
|
ms = output.decode()[output.decode("utf-8").find("multiply:") + 10:]
|
|
|
|
if not already_dumped:
|
|
build_path = os.path.join(get_build_path("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, options.no_manual, arguments + extra_args)[1], "simd_multiply")
|
|
# os.system('objdump -d -M intel -S "%s" -C > main_with_source.s' % build_path)
|
|
# os.system('objdump -d -M intel "%s" -C > main_wo_source.s' % build_path)
|
|
upp = os.path.expanduser("~/dup.sh")
|
|
# os.system('bash %s upload main_with_source.s .' % upp)
|
|
# os.system('bash %s upload main_wo_source.s .' % upp)
|
|
already_dumped = True
|
|
if "ms\n" in ms:
|
|
ms = float(ms.split("ms\n")[0])
|
|
else:
|
|
ms = float(ms.split("s\n")[0]) * 1000
|
|
times[fidx].append(ms)
|
|
|
|
shutil.rmtree(folder)
|
|
print(["%.3f" % numpy.mean(ts) for ts in times])
|
|
with open(output_file + ".cache", "w") as f:
|
|
json.dump({
|
|
"extr_args": extra_args,
|
|
"times": times,
|
|
"sizes": matrix_combinations,
|
|
"functions": functions,
|
|
"means": ["%.3f" % numpy.mean(ts) for ts in times],
|
|
"total_size": total
|
|
}, f)
|
|
os.rename(output_file + ".cache", output_file)
|