Added avx512
This commit is contained in:
parent
520407e45d
commit
45ff460f4a
@ -45,6 +45,7 @@ find_package(Boost REQUIRED COMPONENTS filesystem program_options)
|
|||||||
|
|
||||||
include_directories("${Boost_INCLUDE_DIRS}")
|
include_directories("${Boost_INCLUDE_DIRS}")
|
||||||
|
|
||||||
|
message("${Boost_LIBRARIES} ${BLAS_LIBRARIES}")
|
||||||
add_library(simple_matrix src/Matrix.cpp)
|
add_library(simple_matrix src/Matrix.cpp)
|
||||||
target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
|
target_link_libraries(simple_matrix ${Boost_LIBRARIES} ${BLAS_LIBRARIES})
|
||||||
|
|
||||||
|
5
plot.py
5
plot.py
@ -21,9 +21,8 @@ a = [
|
|||||||
def get_i(x):
|
def get_i(x):
|
||||||
return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x + 1])
|
return np.array([(i[0], float(i[x + 1])) for i in a if len(i) > x + 1])
|
||||||
|
|
||||||
functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
|
[1e9, '172.500', '166.250', '163.900', '127.100', '117.100', '686.450', '35.100', '808.800'],
|
||||||
"divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
|
functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul", "divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3", "blas", "naive_reordered"]
|
||||||
"blas", "naive_reordered"]
|
|
||||||
|
|
||||||
for i, f in list(enumerate(functions)):
|
for i, f in list(enumerate(functions)):
|
||||||
xy = get_i(i)
|
xy = get_i(i)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
@ -28,7 +29,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
|
|||||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
|
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
|
||||||
]
|
]
|
||||||
build_path = os.path.join(build_path_prefix, " ".join(flags))
|
build_path = os.path.join(build_path_prefix, " ".join(flags))
|
||||||
os.makedirs(build_path, exist_ok=True)
|
if not os.path.exists(build_path):
|
||||||
|
os.makedirs(build_path)
|
||||||
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
||||||
check_call_quiet(["make", target], cwd=build_path)
|
check_call_quiet(["make", target], cwd=build_path)
|
||||||
if False:
|
if False:
|
||||||
@ -39,37 +41,68 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
|
|||||||
output = subprocess.check_output([os.path.join(build_path, target)] + args)
|
output = subprocess.check_output([os.path.join(build_path, target)] + args)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
|
||||||
|
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
|
||||||
|
normal_functions = ["block_wise_avx2"]
|
||||||
|
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
|
||||||
|
avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
os.makedirs("data", exist_ok=True)
|
if not os.path.exists("data"):
|
||||||
|
os.makedirs("data")
|
||||||
os.chdir("data")
|
os.chdir("data")
|
||||||
with_double = False
|
with_double = False
|
||||||
extra_args = []#["--validate"]
|
|
||||||
|
|
||||||
total = float(sys.argv[1] if len(sys.argv) > 1 else 1e8)
|
parser = argparse.ArgumentParser()
|
||||||
sss= []
|
parser.add_argument("total_size", type=float)
|
||||||
|
parser.add_argument("--very_slow", action="store_true")
|
||||||
|
parser.add_argument("--slow", action="store_true")
|
||||||
|
parser.add_argument("--normal", action="store_true")
|
||||||
|
parser.add_argument("--avx512", action="store_true")
|
||||||
|
parser.add_argument("--validate", action="store_true")
|
||||||
|
parser.add_argument("--double", action="store_true")
|
||||||
|
parser.add_argument("--function", type=str, nargs="*")
|
||||||
|
|
||||||
|
options = parser.parse_args()
|
||||||
|
|
||||||
|
functions = fast_functions
|
||||||
|
|
||||||
|
if options.very_slow:
|
||||||
|
functions += very_slow_functions
|
||||||
|
|
||||||
|
if options.very_slow or options.slow:
|
||||||
|
functions += slow_functions
|
||||||
|
|
||||||
|
if options.very_slow or options.slow or options.normal:
|
||||||
|
functions += normal_functions
|
||||||
|
|
||||||
|
if options.avx512:
|
||||||
|
functions += avx512_functions
|
||||||
|
|
||||||
|
if options.function:
|
||||||
|
functions = options.function
|
||||||
|
|
||||||
|
extra_args = []
|
||||||
|
if options.validate:
|
||||||
|
extra_args.append("--validate")
|
||||||
|
if options.double:
|
||||||
|
extra_args.append("--double")
|
||||||
|
|
||||||
|
total = options.total_size
|
||||||
|
matrix_combinations = []
|
||||||
i = pow(total, 1. / 3.)
|
i = pow(total, 1. / 3.)
|
||||||
for _ in range(20):
|
for _ in range(20):
|
||||||
a = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
|
a = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
|
||||||
b = round(max(0.01 * i + 1, numpy.random.normal(i, i / 2)))
|
b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
|
||||||
|
c = int(round(total / a / b))
|
||||||
|
matrix_combinations.append([str(a), str(b), str(c)])
|
||||||
|
|
||||||
c = round(total / a / b)
|
|
||||||
sss.append([str(a), str(b), str(c)])
|
|
||||||
|
|
||||||
functions = ["block_wise_256_f", "block_wise_256_f2", "boost_axpy_mul",
|
|
||||||
"divide_and_conquer_block1", "divide_and_conquer_block2", "divide_and_conquer_naive_r3",
|
|
||||||
"blas"]
|
|
||||||
if total < 1e10:
|
|
||||||
functions.append("naive_reordered")
|
|
||||||
times = [[] for f in functions]
|
times = [[] for f in functions]
|
||||||
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
|
||||||
for clang in [True]:
|
for clang in [True]:
|
||||||
for sizes in sss:
|
for sizes in matrix_combinations:
|
||||||
args = list(sizes)
|
args = list(sizes)
|
||||||
if with_double:
|
|
||||||
args.append("--double")
|
|
||||||
compile_and_run("..", "builds", "generate_random", True, True, args)
|
compile_and_run("..", "builds", "generate_random", True, True, args)
|
||||||
folder = "x".join(sizes)
|
folder = "x".join(sizes)
|
||||||
for fidx, function in enumerate(functions):
|
for fidx, function in enumerate(functions):
|
||||||
@ -88,8 +121,9 @@ if __name__ == '__main__':
|
|||||||
print(["%.3f" % numpy.mean(ts) for ts in times])
|
print(["%.3f" % numpy.mean(ts) for ts in times])
|
||||||
with open(output_file + ".cache", "w") as f:
|
with open(output_file + ".cache", "w") as f:
|
||||||
json.dump({
|
json.dump({
|
||||||
|
"extr_args": extra_args,
|
||||||
"times": times,
|
"times": times,
|
||||||
"sizes": sss,
|
"sizes": matrix_combinations,
|
||||||
"functions": functions,
|
"functions": functions,
|
||||||
"means": ["%.3f" % numpy.mean(ts) for ts in times]
|
"means": ["%.3f" % numpy.mean(ts) for ts in times]
|
||||||
}, f)
|
}, f)
|
||||||
|
@ -32,10 +32,8 @@ struct blas_functor<double> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> blas(const Matrix<T> &A, const Matrix<T> &B) {
|
void blas(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
blas_functor<T>::multiply(C, A, B);
|
blas_functor<T>::multiply(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif //SMID_MATRIX_BLASMUL_H
|
#endif //SMID_MATRIX_BLASMUL_H
|
||||||
|
@ -9,10 +9,8 @@
|
|||||||
#include <boost/numeric/ublas/operation.hpp>
|
#include <boost/numeric/ublas/operation.hpp>
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) boost_axpy_mul(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) boost_axpy_mul(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
boost::numeric::ublas::axpy_prod(A, B, C);
|
boost::numeric::ublas::axpy_prod(A, B, C);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
|
@ -102,17 +102,19 @@ struct multiplier_naive_functor {
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
_naive_reordered(C, A, B);
|
naive_reordered(C, A, B);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<typename FloatType>
|
template<typename FloatType, AvxVersion version>
|
||||||
struct multiplier_block_wise_256 : block_wise_256<FloatType> {
|
struct multiplier_block_wise : block_wise<FloatType, version> {
|
||||||
|
|
||||||
static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
|
typedef block_wise<FloatType, version> Base;
|
||||||
|
|
||||||
|
static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
|
||||||
static constexpr unsigned SplitMultipleP = 20;
|
static constexpr unsigned SplitMultipleP = 20;
|
||||||
static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
|
static constexpr unsigned SplitMultipleN = Base::NumColumns;
|
||||||
|
|
||||||
static SplitAction action(unsigned m, unsigned p, unsigned n) {
|
static SplitAction action(unsigned m, unsigned p, unsigned n) {
|
||||||
size_t m_p = m / SplitMultipleM;
|
size_t m_p = m / SplitMultipleM;
|
||||||
@ -138,89 +140,43 @@ struct multiplier_block_wise_256 : block_wise_256<FloatType> {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename FloatType>
|
|
||||||
struct multiplier_block_wise_256_alternative : block_wise_256_alternative<FloatType> {
|
|
||||||
|
|
||||||
static constexpr unsigned SplitMultipleM = 5 * block_wise_256<FloatType>::NumRows;
|
|
||||||
static constexpr unsigned SplitMultipleP = 20;
|
|
||||||
static constexpr unsigned SplitMultipleN = block_wise_256<FloatType>::NumColumns;
|
|
||||||
|
|
||||||
static SplitAction action(unsigned m, unsigned p, unsigned n) {
|
|
||||||
size_t m_p = m / SplitMultipleM;
|
|
||||||
size_t p_p = p / SplitMultipleP;
|
|
||||||
size_t n_p = n / SplitMultipleN;
|
|
||||||
unsigned _SplitMultipleM = SplitMultipleM;
|
|
||||||
unsigned _SplitMultipleP = 1;
|
|
||||||
unsigned _SplitMultipleN = SplitMultipleN;
|
|
||||||
|
|
||||||
auto max_dim = std::max(m_p, std::max(p_p, n_p));
|
|
||||||
if (m <= SplitMultipleM && n <= SplitMultipleN) {
|
|
||||||
return DoNotSplit;
|
|
||||||
} else if (max_dim == m_p) {
|
|
||||||
return SplitA;
|
|
||||||
} else if (max_dim == n_p) {
|
|
||||||
return SplitB;
|
|
||||||
} else if (p > SplitMultipleP){
|
|
||||||
return SplitBoth;
|
|
||||||
} else {
|
|
||||||
return DoNotSplit;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
template<>
|
|
||||||
struct block_wise_256<float>;
|
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r1(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r2(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r3(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r4(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_naive_r5(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
|
_divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block1(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
_divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
|
||||||
auto nr = block_wise_256<T>::NumRows;
|
|
||||||
auto nc = block_wise_256<T>::NumColumns;
|
|
||||||
_divide_and_conquer<multiplier_block_wise_256<T>>(C, A, B);
|
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) divide_and_conquer_block2(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
_divide_and_conquer<multiplier_block_wise<T, AVX512>>(C, A, B);
|
||||||
_divide_and_conquer<multiplier_block_wise_256_alternative<T>>(C, A, B);
|
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif //SMID_MATRIX_DEVIDEANDCONQUER_H
|
#endif //SMID_MATRIX_DEVIDEANDCONQUER_H
|
||||||
|
@ -21,7 +21,7 @@ Matrix<T> __attribute__ ((noinline)) naive(const Matrix<T> &A, const Matrix<T> &
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<typename M1, typename M2, typename M3>
|
||||||
void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B) {
|
void __attribute__ ((noinline)) naive_reordered(M1 &C, const M2 &A, const M3 &B) {
|
||||||
assert(A.size2() == B.size1());
|
assert(A.size2() == B.size1());
|
||||||
for (int i = 0; i < A.size1(); i++) {
|
for (int i = 0; i < A.size1(); i++) {
|
||||||
for (int p = 0; p < B.size1(); p++) {
|
for (int p = 0; p < B.size1(); p++) {
|
||||||
@ -31,11 +31,5 @@ void __attribute__ ((noinline)) _naive_reordered(M1 &C, const M2 &A, const M3 &B
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<typename T>
|
|
||||||
Matrix<T> naive_reordered(const Matrix<T> &A, const Matrix<T> &B) {
|
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
_naive_reordered(C, A, B);
|
|
||||||
return C;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif //SMID_MATRIX_NAIVE_H
|
#endif //SMID_MATRIX_NAIVE_H
|
||||||
|
@ -8,161 +8,160 @@
|
|||||||
#include "Matrix.h"
|
#include "Matrix.h"
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
struct __m256_block_wise_config {
|
enum AvxVersion {
|
||||||
using FloatType = float;
|
AVX2,
|
||||||
using VectorType = __m256;
|
|
||||||
static constexpr auto LoadVector = _mm256_loadu_ps;
|
|
||||||
static constexpr auto StoreVector = _mm256_storeu_ps;
|
|
||||||
static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct __m256d_block_wise_config {
|
#ifdef WITH_AVX512
|
||||||
using FloatType = double;
|
AVX512
|
||||||
using VectorType = __m256d;
|
#endif
|
||||||
static constexpr auto LoadVector = _mm256_loadu_pd;
|
|
||||||
static constexpr auto StoreVector = _mm256_storeu_pd;
|
|
||||||
static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename BlockWiseConfig>
|
|
||||||
struct ExtendedBlockWiseConfig {
|
|
||||||
typedef typename BlockWiseConfig::FloatType FloatType;
|
|
||||||
using VectorType = typename BlockWiseConfig::VectorType;
|
|
||||||
static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
|
|
||||||
static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
|
|
||||||
static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
|
|
||||||
static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
|
|
||||||
static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
|
|
||||||
};
|
|
||||||
template<
|
|
||||||
// template parameter as struct: otherwise some warning about losing alignment information warning
|
|
||||||
typename BitWiseConfig,
|
|
||||||
unsigned _NumRows, unsigned _NumColumnVectors
|
|
||||||
>
|
|
||||||
struct RegisterBlocking {
|
|
||||||
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
|
|
||||||
|
|
||||||
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
|
|
||||||
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
|
|
||||||
|
|
||||||
static constexpr auto NumRows = _NumRows;
|
|
||||||
static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
|
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
static void __attribute__ ((noinline)) handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
|
|
||||||
|
|
||||||
// AVX2 has 16 registers
|
|
||||||
// should be compiled as registers (total: regA * regB)
|
|
||||||
typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
|
|
||||||
// iterate over dot-product terms
|
|
||||||
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
|
|
||||||
// Perform the DOT product
|
|
||||||
for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
|
|
||||||
typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
|
|
||||||
for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
|
|
||||||
typename bwc::VectorType aa = bwc::BroadcastToVector(&A(aRowOffset + ai, p));
|
|
||||||
CReg[ai][bi] += aa * bb;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// total regA * regB + regB registers
|
|
||||||
|
|
||||||
// Accumulate the results into C.
|
|
||||||
for (int ai = 0; ai < _NumRows; ai++) {
|
|
||||||
for (int bi = 0; bi < _NumColumnVectors; bi++) {
|
|
||||||
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename BitWiseConfig, unsigned Rows>
|
namespace detail {
|
||||||
struct BestRowRegisterBlocking;
|
|
||||||
|
|
||||||
template<typename BitWiseConfig>
|
struct __m256_block_wise_config {
|
||||||
struct BestRowRegisterBlocking<BitWiseConfig, 1> : public RegisterBlocking<BitWiseConfig, 1, 15> {};
|
using FloatType = float;
|
||||||
template<typename BitWiseConfig>
|
using VectorType = __m256;
|
||||||
struct BestRowRegisterBlocking<BitWiseConfig, 2> : public RegisterBlocking<BitWiseConfig, 2, 7> {};
|
static constexpr auto LoadVector = _mm256_loadu_ps;
|
||||||
template<typename BitWiseConfig>
|
static constexpr auto StoreVector = _mm256_storeu_ps;
|
||||||
struct BestRowRegisterBlocking<BitWiseConfig, 3> : public RegisterBlocking<BitWiseConfig, 3, 4> {};
|
static constexpr auto BroadcastToVector = _mm256_set1_ps;
|
||||||
// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
|
};
|
||||||
// C = floor ((16 - R) / R)
|
|
||||||
|
|
||||||
template<
|
struct __m256d_block_wise_config {
|
||||||
// template parameter as struct: otherwise some warning about losing alignment information warning
|
using FloatType = double;
|
||||||
typename BlockWiseConfig = __m256_block_wise_config,
|
using VectorType = __m256d;
|
||||||
unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
|
static constexpr auto LoadVector = _mm256_loadu_pd;
|
||||||
>
|
static constexpr auto StoreVector = _mm256_storeu_pd;
|
||||||
struct block_wise {
|
static constexpr auto BroadcastToVector = _mm256_set1_pd;
|
||||||
|
};
|
||||||
|
|
||||||
typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
|
#ifdef WITH_AVX512
|
||||||
|
struct __m512_block_wise_config {
|
||||||
|
using FloatType = float;
|
||||||
|
using VectorType = __m512;
|
||||||
|
static constexpr auto LoadVector = _mm512_loadu_ps;
|
||||||
|
static constexpr auto StoreVector = _mm512_storeu_ps;
|
||||||
|
static constexpr auto BroadcastToVector = _mm512_set1_ps;
|
||||||
|
};
|
||||||
|
|
||||||
static constexpr unsigned NumRows = InitialNumRows;
|
struct __m512d_block_wise_config {
|
||||||
static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
|
using FloatType = double;
|
||||||
|
using VectorType = __m512d;
|
||||||
|
static constexpr auto LoadVector = _mm512_loadu_pd;
|
||||||
|
static constexpr auto StoreVector = _mm512_storeu_pd;
|
||||||
|
static constexpr auto BroadcastToVector = _mm512_set1_pd;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
|
template<typename BlockWiseConfig>
|
||||||
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector); };
|
struct ExtendedBlockWiseConfig {
|
||||||
|
typedef typename BlockWiseConfig::FloatType FloatType;
|
||||||
|
using VectorType = typename BlockWiseConfig::VectorType;
|
||||||
|
static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
|
||||||
|
static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
|
||||||
|
static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
|
||||||
|
static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
|
||||||
|
static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory,
|
||||||
|
LoadVector(memory) +
|
||||||
|
vector);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<
|
||||||
static void multiply(M1 &C, const M2 &A, const M3 &B) {
|
// template parameter as struct: otherwise some warning about losing alignment information warning
|
||||||
assert(C.size1() == A.size1() && C.size2() == B.size2());
|
typename BitWiseConfig,
|
||||||
int m = A.size1();
|
unsigned _NumRows, unsigned _NumColumnVectors
|
||||||
int k = A.size2();
|
>
|
||||||
int n = B.size2();
|
struct RegisterBlocking {
|
||||||
int m_i = 0;
|
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
|
||||||
for(; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
|
|
||||||
int n_i = 0;
|
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
|
||||||
for(; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors * bwc::VectorWidth) {
|
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
|
||||||
RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A, m_i, B, n_i);
|
};
|
||||||
}
|
|
||||||
// calculate remaining columns (like naive_reordered)
|
static constexpr auto NumRows = _NumRows;
|
||||||
if(n_i < n) {
|
static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
|
||||||
for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
|
|
||||||
for (int p = 0; p < k; ++p) {
|
template<typename M1, typename M2, typename M3>
|
||||||
for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
|
static void __attribute__ ((noinline))
|
||||||
C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
|
handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
|
||||||
}
|
|
||||||
|
// AVX2 has 16 registers
|
||||||
|
// should be compiled as registers (total: regA * regB)
|
||||||
|
typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
|
||||||
|
// iterate over dot-product terms
|
||||||
|
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
|
||||||
|
// Perform the DOT product
|
||||||
|
for (int bi = 0;
|
||||||
|
bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
|
||||||
|
typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
|
||||||
|
for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
|
||||||
|
typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
|
||||||
|
CReg[ai][bi] += aa * bb;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
// total regA * regB + regB registers
|
||||||
// calculate remaining rows (like naive_reordered)
|
|
||||||
for (; m_i < m; ++m_i) {
|
// Accumulate the results into C.
|
||||||
for (int p = 0; p < k; p++) {
|
for (int ai = 0; ai < _NumRows; ai++) {
|
||||||
for (int n_i = 0; n_i < n; ++n_i) {
|
for (int bi = 0; bi < _NumColumnVectors; bi++) {
|
||||||
C(m_i, n_i) += A(m_i, p) * B(p, n_i);
|
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
|
};
|
||||||
class iterate_columns {
|
|
||||||
/**
|
|
||||||
* consume all n_i's possible
|
|
||||||
*/
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
|
||||||
for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
|
|
||||||
n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
|
|
||||||
RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
|
|
||||||
::handle_block(k, C, A, m_i, B, n_i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
template<typename BitWiseConfig, unsigned Rows>
|
||||||
|
struct BestRowRegisterBlocking;
|
||||||
|
|
||||||
|
template<typename BitWiseConfig>
|
||||||
|
struct BestRowRegisterBlocking<BitWiseConfig, 1> : public RegisterBlocking<BitWiseConfig, 1, 15> {
|
||||||
|
};
|
||||||
|
template<typename BitWiseConfig>
|
||||||
|
struct BestRowRegisterBlocking<BitWiseConfig, 2> : public RegisterBlocking<BitWiseConfig, 2, 7> {
|
||||||
|
};
|
||||||
|
template<typename BitWiseConfig>
|
||||||
|
struct BestRowRegisterBlocking<BitWiseConfig, 3> : public RegisterBlocking<BitWiseConfig, 3, 4> {
|
||||||
|
};
|
||||||
|
// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
|
||||||
|
// C = floor ((16 - R) / R)
|
||||||
|
|
||||||
|
template<
|
||||||
|
// template parameter as struct: otherwise some warning about losing alignment information warning
|
||||||
|
typename BlockWiseConfig = __m256_block_wise_config,
|
||||||
|
unsigned InitialNumRows = 3, unsigned InitialNumColumnVectors = 4
|
||||||
|
>
|
||||||
|
struct block_wise {
|
||||||
|
|
||||||
|
typedef ExtendedBlockWiseConfig<BlockWiseConfig> bwc;
|
||||||
|
|
||||||
|
static constexpr unsigned NumRows = InitialNumRows;
|
||||||
|
static constexpr unsigned NumColumns = InitialNumColumnVectors * bwc::VectorWidth;
|
||||||
|
|
||||||
|
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
|
||||||
|
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
|
||||||
|
};
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<typename M1, typename M2, typename M3>
|
||||||
static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
static void multiply(M1 &C, const M2 &A, const M3 &B) {
|
||||||
consume(k, m_i, n_i, n, C, A, B);
|
assert(C.size1() == A.size1() && C.size2() == B.size2());
|
||||||
if(CurrentNumColumnVectors > 1) {
|
int m = A.size1();
|
||||||
// try with less column vectors
|
int k = A.size2();
|
||||||
iterate_columns<CurrentNumRows, CurrentNumColumnVectors - 1>::iterate(k, m_i, n_i, n, C, A, B);
|
int n = B.size2();
|
||||||
} else {
|
int m_i = 0;
|
||||||
// do rest of columns manually
|
for (; m_i + InitialNumRows <= m; m_i += InitialNumRows) { // row
|
||||||
if(n_i < n) {
|
int n_i = 0;
|
||||||
for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
|
for (; n_i + InitialNumColumnVectors * bwc::VectorWidth <= n; n_i += InitialNumColumnVectors *
|
||||||
|
bwc::VectorWidth) {
|
||||||
|
RegisterBlocking<BlockWiseConfig, InitialNumRows, InitialNumColumnVectors>::handle_block(k, C, A,
|
||||||
|
m_i, B,
|
||||||
|
n_i);
|
||||||
|
}
|
||||||
|
// calculate remaining columns (like naive_reordered)
|
||||||
|
if (n_i < n) {
|
||||||
|
for (auto m_i_o = 0; m_i_o < InitialNumRows; ++m_i_o) {
|
||||||
for (int p = 0; p < k; ++p) {
|
for (int p = 0; p < k; ++p) {
|
||||||
for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
|
for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
|
||||||
C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
|
C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
|
||||||
@ -170,121 +169,147 @@ struct block_wise {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n_i = n;
|
|
||||||
}
|
}
|
||||||
assert(n_i == n);
|
// calculate remaining rows (like naive_reordered)
|
||||||
}
|
for (; m_i < m; ++m_i) {
|
||||||
};
|
for (int p = 0; p < k; p++) {
|
||||||
|
for (int n_i = 0; n_i < n; ++n_i) {
|
||||||
template<unsigned CurrentNumRows>
|
C(m_i, n_i) += A(m_i, p) * B(p, n_i);
|
||||||
class iterate_columns<CurrentNumRows, 0> {
|
}
|
||||||
public:
|
}
|
||||||
template<typename M1, typename M2, typename M3>
|
}
|
||||||
static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
|
||||||
throw std::runtime_error("should not be called, this is a bug.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
|
||||||
|
class iterate_columns {
|
||||||
|
/**
|
||||||
|
* consume all n_i's possible
|
||||||
|
*/
|
||||||
|
template<typename M1, typename M2, typename M3>
|
||||||
|
static void consume(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
|
for (; n_i + CurrentNumColumnVectors * bwc::VectorWidth <= n;
|
||||||
|
n_i += CurrentNumColumnVectors * bwc::VectorWidth) {
|
||||||
|
RegisterBlocking<BlockWiseConfig, CurrentNumRows, CurrentNumColumnVectors>
|
||||||
|
::handle_block(k, C, A, m_i, B, n_i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
|
public:
|
||||||
class iterate_rows {
|
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<typename M1, typename M2, typename M3>
|
||||||
static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
|
consume(k, m_i, n_i, n, C, A, B);
|
||||||
size_t n_i = 0;
|
if (CurrentNumColumnVectors > 1) {
|
||||||
iterate_columns<CurrentNumRows, CurrentNumColumnVectors>::iterate(k, m_i, n_i, n, C, A, B);
|
// try with less column vectors
|
||||||
|
iterate_columns<CurrentNumRows, CurrentNumColumnVectors - 1>::iterate(k, m_i, n_i, n, C, A, B);
|
||||||
|
} else {
|
||||||
|
// do rest of columns manually
|
||||||
|
if (n_i < n) {
|
||||||
|
for (auto m_i_o = 0; m_i_o < CurrentNumRows; ++m_i_o) {
|
||||||
|
for (int p = 0; p < k; ++p) {
|
||||||
|
for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
|
||||||
|
C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
n_i = n;
|
||||||
|
}
|
||||||
assert(n_i == n);
|
assert(n_i == n);
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
public:
|
template<unsigned CurrentNumRows>
|
||||||
|
class iterate_columns<CurrentNumRows, 0> {
|
||||||
|
public:
|
||||||
|
template<typename M1, typename M2, typename M3>
|
||||||
|
static void iterate(size_t k, size_t m_i, size_t &n_i, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
|
throw std::runtime_error("should not be called, this is a bug.");
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
|
||||||
|
class iterate_rows {
|
||||||
|
|
||||||
|
template<typename M1, typename M2, typename M3>
|
||||||
|
static void consume(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
|
for (; m_i + CurrentNumRows <= m; m_i += CurrentNumRows) {
|
||||||
|
size_t n_i = 0;
|
||||||
|
iterate_columns<CurrentNumRows, CurrentNumColumnVectors>::iterate(k, m_i, n_i, n, C, A, B);
|
||||||
|
assert(n_i == n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
template<typename M1, typename M2, typename M3>
|
||||||
|
static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
|
consume(k, m_i, m, n, C, A, B);
|
||||||
|
if (CurrentNumRows > 1) {
|
||||||
|
// try with less num rows vectors
|
||||||
|
iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
|
||||||
|
}
|
||||||
|
assert(m_i == m);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<unsigned CurrentNumColumnVectors>
|
||||||
|
class iterate_rows<0, CurrentNumColumnVectors> {
|
||||||
|
|
||||||
|
public:
|
||||||
|
template<typename M1, typename M2, typename M3>
|
||||||
|
static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
||||||
|
throw std::runtime_error("should not be called, this is a bug.");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<typename M1, typename M2, typename M3>
|
||||||
static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
static void multiply2(M1 &C, const M2 &A, const M3 &B) {
|
||||||
consume(k, m_i, m, n, C, A, B);
|
assert(C.size1() == A.size1() && C.size2() == B.size2());
|
||||||
if(CurrentNumRows > 1) {
|
int m = A.size1();
|
||||||
// try with less num rows vectors
|
int k = A.size2();
|
||||||
iterate_rows<CurrentNumRows - 1, CurrentNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
|
int n = B.size2();
|
||||||
}
|
size_t m_i = 0;
|
||||||
|
iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
|
||||||
assert(m_i == m);
|
assert(m_i == m);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<unsigned CurrentNumColumnVectors>
|
template<typename BlockWiseConfig>
|
||||||
class iterate_rows<0, CurrentNumColumnVectors> {
|
struct block_wise_base : block_wise<BlockWiseConfig> {
|
||||||
|
|
||||||
public:
|
|
||||||
template<typename M1, typename M2, typename M3>
|
template<typename M1, typename M2, typename M3>
|
||||||
static void iterate(size_t k, size_t &m_i, size_t m, size_t n, M1 &C, const M2 &A, const M3 &B) {
|
void operator()(M1 &C, const M2 &A, const M3 &B) const {
|
||||||
throw std::runtime_error("should not be called, this is a bug.");
|
return block_wise<BlockWiseConfig>::multiply2(C, A, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
static void multiply2(M1 &C, const M2 &A, const M3 &B) {
|
|
||||||
assert(C.size1() == A.size1() && C.size2() == B.size2());
|
|
||||||
int m = A.size1();
|
|
||||||
int k = A.size2();
|
|
||||||
int n = B.size2();
|
|
||||||
size_t m_i = 0;
|
|
||||||
iterate_rows<InitialNumRows, InitialNumColumnVectors>::iterate(k, m_i, m, n, C, A, B);
|
|
||||||
assert(m_i == m);
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename FloatType>
|
|
||||||
struct block_wise_256;
|
|
||||||
|
|
||||||
template<>
|
|
||||||
struct block_wise_256<float> : public block_wise<__m256_block_wise_config> {
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
void operator()(M1 &C, const M2 &A, const M3 &B) const {
|
|
||||||
return multiply(C, A, B);
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
template<>
|
|
||||||
struct block_wise_256<double> : public block_wise<__m256d_block_wise_config> {
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
void operator()(M1 &C, const M2 &A, const M3 &B) const {
|
|
||||||
return multiply(C, A, B);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename FloatType>
|
|
||||||
struct block_wise_256_alternative;
|
|
||||||
|
|
||||||
template<>
|
|
||||||
struct block_wise_256_alternative<float> : public block_wise<__m256_block_wise_config> {
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
void operator()(M1 &C, const M2 &A, const M3 &B) const {
|
|
||||||
return multiply2(C, A, B);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<>
|
|
||||||
struct block_wise_256_alternative<double> : public block_wise<__m256d_block_wise_config> {
|
|
||||||
template<typename M1, typename M2, typename M3>
|
|
||||||
void operator()(M1 &C, const M2 &A, const M3 &B) const {
|
|
||||||
return multiply(C, A, B);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename T>
|
|
||||||
Matrix<T> __attribute__ ((noinline)) block_wise_256_f(const Matrix<T> &A, const Matrix<T> &B) {
|
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
|
||||||
block_wise_256<T>::multiply(C, A, B);
|
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename FloatType, AvxVersion avxVersion>
|
||||||
|
struct block_wise;
|
||||||
|
|
||||||
|
template<> struct block_wise<float, AVX2> : public detail::block_wise_base<detail::__m256_block_wise_config> {};
|
||||||
|
template<> struct block_wise<double, AVX2> : public detail::block_wise_base<detail::__m256d_block_wise_config> {};
|
||||||
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
template<> struct block_wise<float, AVX512> : public detail::block_wise_base<detail::__m512_block_wise_config> {};
|
||||||
|
template<> struct block_wise<double, AVX512> : public detail::block_wise_base<detail::__m512d_block_wise_config> {};
|
||||||
|
#endif
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> __attribute__ ((noinline)) block_wise_256_f2(const Matrix<T> &A, const Matrix<T> &B) {
|
void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
Matrix<T> C(A.size1(), B.size2(), 0);
|
block_wise<T, AVX2>::multiply(C, A, B);
|
||||||
block_wise_256<T>::multiply2(C, A, B);
|
|
||||||
return C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
template<typename T>
|
||||||
|
void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
|
block_wise<T, AVX512>::multiply(C, A, B);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif //SMID_MATRIX_REGISTERBLOCKING_H
|
#endif //SMID_MATRIX_REGISTERBLOCKING_H
|
||||||
|
36
src/main.cpp
36
src/main.cpp
@ -12,12 +12,13 @@ using namespace std::chrono;
|
|||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
using BinaryMatrixOp = Matrix<T> (*)(const Matrix<T> &A, const Matrix<T> &B);
|
using BinaryMatrixOp = void (*)(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B);
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
Matrix<T> run_function(BinaryMatrixOp<T> f, const Matrix<T> &A, const Matrix<T> &B) {
|
Matrix<T> run_function(BinaryMatrixOp<T> f, const Matrix<T> &A, const Matrix<T> &B) {
|
||||||
|
Matrix<T> C(A.size1(), B.size2(), 0);
|
||||||
auto a = steady_clock::now();
|
auto a = steady_clock::now();
|
||||||
auto C = f(A, B);
|
f(C, A, B);
|
||||||
auto b = steady_clock::now();
|
auto b = steady_clock::now();
|
||||||
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
|
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
|
||||||
if (ms > 1000) {
|
if (ms > 1000) {
|
||||||
@ -49,27 +50,32 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
|
|||||||
|
|
||||||
// use the result to prevent compiler to optimize...
|
// use the result to prevent compiler to optimize...
|
||||||
double use_result = 0;
|
double use_result = 0;
|
||||||
/*TEST_IF(test_function_name, naive, A, B)
|
|
||||||
TEST_IF(test_function_name, boost_blocked_mul_64, A, B)
|
|
||||||
TEST_IF(test_function_name, boost_blocked_mul_256, A, B)
|
|
||||||
TEST_IF(test_function_name, boost_mul, A, B)
|
|
||||||
TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
|
|
||||||
TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
|
|
||||||
TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
|
|
||||||
TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)*/
|
|
||||||
|
|
||||||
TEST_IF(test_function_name, naive_reordered, A, B)
|
TEST_IF(test_function_name, naive_reordered, A, B)
|
||||||
TEST_IF(test_function_name, block_wise_256_f, A, B)
|
TEST_IF(test_function_name, block_wise_avx2, A, B)
|
||||||
TEST_IF(test_function_name, block_wise_256_f2, A, B)
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
TEST_IF(test_function_name, block_wise_avx512, A, B)
|
||||||
|
#endif
|
||||||
|
|
||||||
TEST_IF(test_function_name, boost_axpy_mul, A, B)
|
TEST_IF(test_function_name, boost_axpy_mul, A, B)
|
||||||
TEST_IF(test_function_name, divide_and_conquer_block1, A, B)
|
TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
|
||||||
TEST_IF(test_function_name, divide_and_conquer_block2, A, B)
|
|
||||||
|
#ifdef WITH_AVX512
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_block_avx512, A, B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_naive_r1, A, B)
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_naive_r2, A, B)
|
||||||
TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B)
|
TEST_IF(test_function_name, divide_and_conquer_naive_r3, A, B)
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_naive_r4, A, B)
|
||||||
|
TEST_IF(test_function_name, divide_and_conquer_naive_r5, A, B)
|
||||||
TEST_IF(test_function_name, blas, A, B)
|
TEST_IF(test_function_name, blas, A, B)
|
||||||
|
|
||||||
if(validate) {
|
if(validate) {
|
||||||
std::cout << "Validating matrix" << std::endl;
|
std::cout << "Validating matrix" << std::endl;
|
||||||
auto C2 = boost_axpy_mul(A, B);
|
Matrix<FloatType> C2(A.size1(), B.size2(), 0);
|
||||||
|
boost_axpy_mul(C2, A, B);
|
||||||
if(C.size1() != C2.size1() || C.size2() != C2.size2())
|
if(C.size1() != C2.size1() || C.size2() != C2.size2())
|
||||||
throw std::runtime_error("Result matrix has invalid size.");
|
throw std::runtime_error("Result matrix has invalid size.");
|
||||||
for(auto i = 0; i < C2.size1(); ++i) {
|
for(auto i = 0; i < C2.size1(); ++i) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user