Add SSE
This commit is contained in:
parent
ad6c4ca996
commit
b697b44fb5
@ -17,7 +17,7 @@ def check_call_quiet(*args, **kwargs):
|
|||||||
rc = p.returncode
|
rc = p.returncode
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
print(output.decode())
|
print(output.decode())
|
||||||
print(err)
|
print(err.decode())
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
|
||||||
@ -44,7 +44,7 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
|
|||||||
|
|
||||||
very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
|
very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
|
||||||
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
|
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
|
||||||
normal_functions = ["block_wise_avx2"]
|
normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
|
||||||
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
||||||
avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
|
avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
|
||||||
|
|
||||||
@ -63,6 +63,7 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument("--avx512", action="store_true")
|
parser.add_argument("--avx512", action="store_true")
|
||||||
parser.add_argument("--validate", action="store_true")
|
parser.add_argument("--validate", action="store_true")
|
||||||
parser.add_argument("--double", action="store_true")
|
parser.add_argument("--double", action="store_true")
|
||||||
|
parser.add_argument("--gcc", action="store_true")
|
||||||
parser.add_argument("--function", type=str, nargs="*")
|
parser.add_argument("--function", type=str, nargs="*")
|
||||||
|
|
||||||
options = parser.parse_args()
|
options = parser.parse_args()
|
||||||
@ -101,31 +102,31 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
times = [[] for f in functions]
|
times = [[] for f in functions]
|
||||||
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
||||||
for clang in [True]:
|
clang = not options.gcc
|
||||||
for sizes in matrix_combinations:
|
for sizes in matrix_combinations:
|
||||||
args = list(sizes)
|
args = list(sizes)
|
||||||
compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
|
compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
|
||||||
folder = "x".join(sizes)
|
folder = "x".join(sizes)
|
||||||
for fidx, function in enumerate(functions):
|
for fidx, function in enumerate(functions):
|
||||||
arguments = [folder, "--algorithm", function]
|
arguments = [folder, "--algorithm", function]
|
||||||
if with_double:
|
if with_double:
|
||||||
arguments.append("--double")
|
arguments.append("--double")
|
||||||
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
|
output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
|
||||||
ms = output.decode()[output.decode().find("multiply:") + 10:]
|
ms = output.decode()[output.decode().find("multiply:") + 10:]
|
||||||
if "ms\n" in ms:
|
if "ms\n" in ms:
|
||||||
ms = float(ms.split("ms\n")[0])
|
ms = float(ms.split("ms\n")[0])
|
||||||
else:
|
else:
|
||||||
ms = float(ms.split("s\n")[0]) * 1000
|
ms = float(ms.split("s\n")[0]) * 1000
|
||||||
times[fidx].append(ms)
|
times[fidx].append(ms)
|
||||||
|
|
||||||
shutil.rmtree(folder)
|
shutil.rmtree(folder)
|
||||||
print(["%.3f" % numpy.mean(ts) for ts in times])
|
print(["%.3f" % numpy.mean(ts) for ts in times])
|
||||||
with open(output_file + ".cache", "w") as f:
|
with open(output_file + ".cache", "w") as f:
|
||||||
json.dump({
|
json.dump({
|
||||||
"extr_args": extra_args,
|
"extr_args": extra_args,
|
||||||
"times": times,
|
"times": times,
|
||||||
"sizes": matrix_combinations,
|
"sizes": matrix_combinations,
|
||||||
"functions": functions,
|
"functions": functions,
|
||||||
"means": ["%.3f" % numpy.mean(ts) for ts in times]
|
"means": ["%.3f" % numpy.mean(ts) for ts in times]
|
||||||
}, f)
|
}, f)
|
||||||
os.rename(output_file + ".cache", output_file)
|
os.rename(output_file + ".cache", output_file)
|
@ -166,6 +166,12 @@ void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const
|
|||||||
_divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
|
_divide_and_conquer<multiplier_block_wise<T, SSE>>(C, A, B);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
_divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
|
_divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include "RegisterBlocking.h"
|
#include "register_blocking/RegisterBlocking.h"
|
||||||
#include "Boost.h"
|
#include "Boost.h"
|
||||||
#include "Naive.h"
|
#include "Naive.h"
|
||||||
#include "DevideAndConquer.h"
|
#include "DevideAndConquer.h"
|
||||||
@ -52,6 +52,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
|
|||||||
double use_result = 0;
|
double use_result = 0;
|
||||||
|
|
||||||
TEST_IF(test_function_name, naive_reordered, A, B)
|
TEST_IF(test_function_name, naive_reordered, A, B)
|
||||||
|
TEST_IF(test_function_name, block_wise_sse, A, B)
|
||||||
TEST_IF(test_function_name, block_wise_avx2, A, B)
|
TEST_IF(test_function_name, block_wise_avx2, A, B)
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
#ifdef WITH_AVX512
|
||||||
@ -59,6 +60,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
TEST_IF(test_function_name, boost_axpy_mul, A, B)
|
TEST_IF(test_function_name, boost_axpy_mul, A, B)
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_block_sse, A, B)
|
||||||
TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
|
TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
#ifdef WITH_AVX512
|
||||||
|
@ -5,58 +5,11 @@
|
|||||||
#ifndef SMID_MATRIX_REGISTERBLOCKING_H
|
#ifndef SMID_MATRIX_REGISTERBLOCKING_H
|
||||||
#define SMID_MATRIX_REGISTERBLOCKING_H
|
#define SMID_MATRIX_REGISTERBLOCKING_H
|
||||||
|
|
||||||
#include "Matrix.h"
|
#include "../Matrix.h"
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
enum AvxVersion {
|
|
||||||
AVX2,
|
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
|
||||||
AVX512
|
|
||||||
#endif
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
struct __m256_block_wise_config {
|
|
||||||
using FloatType = float;
|
|
||||||
using VectorType = __m256;
|
|
||||||
static constexpr auto LoadVector = _mm256_loadu_ps;
|
|
||||||
static constexpr auto StoreVector = _mm256_storeu_ps;
|
|
||||||
static constexpr auto BroadcastToVector = _mm256_set1_ps;
|
|
||||||
static constexpr unsigned Registers = 16;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct __m256d_block_wise_config {
|
|
||||||
using FloatType = double;
|
|
||||||
using VectorType = __m256d;
|
|
||||||
static constexpr auto LoadVector = _mm256_loadu_pd;
|
|
||||||
static constexpr auto StoreVector = _mm256_storeu_pd;
|
|
||||||
static constexpr auto BroadcastToVector = _mm256_set1_pd;
|
|
||||||
static constexpr unsigned Registers = 16;
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
|
||||||
struct __m512_block_wise_config {
|
|
||||||
using FloatType = float;
|
|
||||||
using VectorType = __m512;
|
|
||||||
static constexpr auto LoadVector = _mm512_loadu_ps;
|
|
||||||
static constexpr auto StoreVector = _mm512_storeu_ps;
|
|
||||||
static constexpr auto BroadcastToVector = _mm512_set1_ps;
|
|
||||||
static constexpr unsigned Registers = 32;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct __m512d_block_wise_config {
|
|
||||||
using FloatType = double;
|
|
||||||
using VectorType = __m512d;
|
|
||||||
static constexpr auto LoadVector = _mm512_loadu_pd;
|
|
||||||
static constexpr auto StoreVector = _mm512_storeu_pd;
|
|
||||||
static constexpr auto BroadcastToVector = _mm512_set1_pd;
|
|
||||||
static constexpr unsigned Registers = 32;
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
|
// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
|
||||||
// C = floor ((16 - R) / R)
|
// C = floor ((16 - R) / R)
|
||||||
|
|
||||||
@ -139,7 +92,7 @@ namespace detail {
|
|||||||
|
|
||||||
template<
|
template<
|
||||||
// template parameter as struct: otherwise some warning about losing alignment information warning
|
// template parameter as struct: otherwise some warning about losing alignment information warning
|
||||||
typename BlockWiseConfig = __m256_block_wise_config
|
typename BlockWiseConfig
|
||||||
>
|
>
|
||||||
struct block_wise {
|
struct block_wise {
|
||||||
|
|
||||||
@ -298,23 +251,94 @@ namespace detail {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct __m128_block_wise_config {
|
||||||
|
using FloatType = float;
|
||||||
|
using VectorType = __m128;
|
||||||
|
static constexpr auto LoadVector = _mm_loadu_ps;
|
||||||
|
static constexpr auto StoreVector = _mm_storeu_ps;
|
||||||
|
static constexpr auto BroadcastToVector = _mm_set1_ps;
|
||||||
|
static constexpr unsigned Registers = 16;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __m128d_block_wise_config {
|
||||||
|
using FloatType = double;
|
||||||
|
using VectorType = __m128d;
|
||||||
|
static constexpr auto LoadVector = _mm_loadu_pd;
|
||||||
|
static constexpr auto StoreVector = _mm_storeu_pd;
|
||||||
|
static constexpr auto BroadcastToVector = _mm_set1_pd;
|
||||||
|
static constexpr unsigned Registers = 16;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __m256_block_wise_config {
|
||||||
|
using FloatType = float;
|
||||||
|
using VectorType = __m256;
|
||||||
|
static constexpr auto LoadVector = _mm256_loadu_ps;
|
||||||
|
static constexpr auto StoreVector = _mm256_storeu_ps;
|
||||||
|
static constexpr auto BroadcastToVector = _mm256_set1_ps;
|
||||||
|
static constexpr unsigned Registers = 16;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __m256d_block_wise_config {
|
||||||
|
using FloatType = double;
|
||||||
|
using VectorType = __m256d;
|
||||||
|
static constexpr auto LoadVector = _mm256_loadu_pd;
|
||||||
|
static constexpr auto StoreVector = _mm256_storeu_pd;
|
||||||
|
static constexpr auto BroadcastToVector = _mm256_set1_pd;
|
||||||
|
static constexpr unsigned Registers = 16;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
struct __m512_block_wise_config {
|
||||||
|
using FloatType = float;
|
||||||
|
using VectorType = __m512;
|
||||||
|
static constexpr auto LoadVector = _mm512_loadu_ps;
|
||||||
|
static constexpr auto StoreVector = _mm512_storeu_ps;
|
||||||
|
static constexpr auto BroadcastToVector = _mm512_set1_ps;
|
||||||
|
static constexpr unsigned Registers = 32;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct __m512d_block_wise_config {
|
||||||
|
using FloatType = double;
|
||||||
|
using VectorType = __m512d;
|
||||||
|
static constexpr auto LoadVector = _mm512_loadu_pd;
|
||||||
|
static constexpr auto StoreVector = _mm512_storeu_pd;
|
||||||
|
static constexpr auto BroadcastToVector = _mm512_set1_pd;
|
||||||
|
static constexpr unsigned Registers = 32;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
enum AvxVersion {
|
||||||
|
SSE,
|
||||||
|
AVX2,
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
AVX512
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
template<typename FloatType, AvxVersion avxVersion>
|
template<typename FloatType, AvxVersion avxVersion>
|
||||||
struct block_wise;
|
struct block_wise;
|
||||||
|
|
||||||
template<> struct block_wise<float, AVX2> : public detail::block_wise_base<detail::__m256_block_wise_config> {};
|
template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
|
||||||
template<> struct block_wise<double, AVX2> : public detail::block_wise_base<detail::__m256d_block_wise_config> {};
|
template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
|
||||||
|
|
||||||
|
template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
|
||||||
|
template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
#ifdef WITH_AVX512
|
||||||
template<> struct block_wise<float, AVX512> : public detail::block_wise_base<detail::__m512_block_wise_config> {};
|
template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
|
||||||
template<> struct block_wise<double, AVX512> : public detail::block_wise_base<detail::__m512d_block_wise_config> {};
|
template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
|
block_wise<T, SSE>::multiply(C, A, B);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
block_wise<T, AVX2>::multiply(C, A, B);
|
block_wise<T, AVX2>::multiply(C, A, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef WITH_AVX512
|
#ifdef WITH_AVX512
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
Loading…
x
Reference in New Issue
Block a user