Reorganize register blocking

parent b697b44fb5
commit 8634a597ba
@@ -20,7 +20,8 @@ else()
     set(CMAKE_CXX_COMPILER "g++")
 endif()
 
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
 
 if(OPTIMIZE_FOR_NATIVE)
     CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
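Note: appending to CMAKE_CXX_FLAGS_RELEASE rather than overwriting it keeps whatever the chosen build type already set. The -mfma flag matters for the kernels further down: it lets the compiler contract their a * b + c updates into fused multiply-adds. A minimal illustration of the instruction it unlocks (names here are illustrative, not part of the diff):

    #include <immintrin.h>

    // With -mfma the compiler may contract "acc += a * b" on __m256 values into
    // a single vfmadd instruction; this is the explicit intrinsic spelling:
    __m256 fused_update(__m256 acc, __m256 a, __m256 b) {
        return _mm256_fmadd_ps(a, b, acc);  // computes a * b + acc elementwise
    }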
plot.py (51 changed lines)
@@ -1,6 +1,57 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import os
 import json
 from pprint import pprint
 
 folder = os.path.join(os.path.dirname(__file__), "data")
 
 files = list(reversed(sorted([os.path.join(folder, fn) for fn in os.listdir(folder) if fn.endswith(".json")])))
 
 each_size = {
 }
 functions = set()
 for fn in files:
     with open(fn) as f:
         data = json.load(f)
     if data["total_size"] not in each_size:
         each_size[data["total_size"]] = {v: k for k, v in zip(data["means"], data["functions"])}
     functions = functions.union(set(data["functions"]))
 pprint(each_size)
 pprint(functions)
 
 for function in functions:
     #if function.startswith("divide") and "naive" in function and not (function.endswith("r2") or function.endswith("r3")):
     #    continue
     #if not function.startswith("divide_and_conquer_naive"):
     #    continue
     if function.startswith("block_wise"):
         continue
     x = []
     y = []
     for size in sorted(each_size.keys()):
         values = each_size[size]
         if function in values:
             x.append(pow(size, 1.))
             y.append(float(values[function]) / 1000.)
     print(x, y)
     plt.plot(x, y, label=function)
 
 from matplotlib import rc
 
 plt.legend()
 plt.xscale("log")
 plt.grid()
 plt.xlim((1e10, 5.3e11))
 plt.ylim((0, 160))
 plt.savefig("test.pdf")
 plt.xlabel("n * p * m")
 plt.ylabel("seconds")
 plt.show()
 exit(0)
 a = [
     [1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'],
scripts/dumpall.py (new executable file, 10 lines)
@@ -0,0 +1,10 @@
+#!/usr/bin/python
+import os
+import sys
+import subprocess
+for e in range(8, 13):
+    for p in [1, 5]:
+        number = str(float(str(p) + "e" + str(e)))
+        args = [os.path.join(os.path.dirname(__file__), "test.py"), number] + sys.argv[1:]
+        print(" ".join(args))
+        subprocess.check_call(args)
@@ -21,10 +21,11 @@ def check_call_quiet(*args, **kwargs):
 exit(0)
 
 
-def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, args):
+def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
     flags = [
         "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
-        "-DCMAKE_BUILD_TYPE=RelWithDebInfo",
+        "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
         "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
         "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
         "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
@@ -42,8 +43,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
     output = subprocess.check_output([os.path.join(build_path, target)] + args)
     return output
 
-very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
-slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
+very_slow_functions = ["naive_reordered"]
+slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
 normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
 fast_functions = ["divide_and_conquer_block_avx2", "blas"]
 avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
@@ -65,6 +66,7 @@ if __name__ == '__main__':
     parser.add_argument("--double", action="store_true")
     parser.add_argument("--gcc", action="store_true")
     parser.add_argument("--function", type=str, nargs="*")
+    parser.add_argument("--release", action="store_true")
 
     options = parser.parse_args()
 
@@ -84,14 +86,14 @@ if __name__ == '__main__':
 
     if options.function:
         functions = options.function
 
     # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"]
     extra_args = []
     if options.validate:
         extra_args.append("--validate")
     if options.double:
         extra_args.append("--double")
 
-    total = options.total_size
+    total = int(options.total_size)
     matrix_combinations = []
     i = pow(total, 1. / 3.)
     for _ in range(20):
@@ -99,19 +101,19 @@ if __name__ == '__main__':
         b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
        c = int(round(total / a / b))
         matrix_combinations.append([str(a), str(b), str(c)])
 
 
     times = [[] for f in functions]
     output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
     clang = not options.gcc
     for sizes in matrix_combinations:
         args = list(sizes)
-        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
+        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, options.release, args)
         folder = "x".join(sizes)
         for fidx, function in enumerate(functions):
             arguments = [folder, "--algorithm", function]
             if with_double:
                 arguments.append("--double")
-            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
+            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
             ms = output.decode()[output.decode().find("multiply:") + 10:]
             if "ms\n" in ms:
                 ms = float(ms.split("ms\n")[0])
@@ -127,6 +129,7 @@ if __name__ == '__main__':
             "times": times,
             "sizes": matrix_combinations,
             "functions": functions,
-            "means": ["%.3f" % numpy.mean(ts) for ts in times]
+            "means": ["%.3f" % numpy.mean(ts) for ts in times],
+            "total_size": total
         }, f)
     os.rename(output_file + ".cache", output_file)
@@ -4,6 +4,8 @@
 
 #ifndef SMID_MATRIX_DEVIDEANDCONQUER_H
 #define SMID_MATRIX_DEVIDEANDCONQUER_H
 
+#include "register_blocking/BlockWise.h"
 #include "Boost.h"
 #include "Naive.h"
 
@@ -107,14 +109,14 @@ struct multiplier_naive_functor {
 };
 
 
-template<typename FloatType, AvxVersion version>
+template<typename FloatType, AvxVersion version, unsigned RowMultiplier = 5, unsigned ColumnMultiplier = 1, unsigned PMultiplier = 20>
 struct multiplier_block_wise : block_wise<FloatType, version> {
 
     typedef block_wise<FloatType, version> Base;
 
-    static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
-    static constexpr unsigned SplitMultipleP = 20;
-    static constexpr unsigned SplitMultipleN = Base::NumColumns;
+    static constexpr unsigned SplitMultipleM = RowMultiplier * Base::NumRows;
+    static constexpr unsigned SplitMultipleP = PMultiplier;
+    static constexpr unsigned SplitMultipleN = ColumnMultiplier * Base::NumColumns;
 
     static SplitAction action(unsigned m, unsigned p, unsigned n) {
         size_t m_p = m / SplitMultipleM;
@@ -143,27 +145,27 @@ struct multiplier_block_wise : block_wise<FloatType, version> {
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
+    _divide_and_conquer<multiplier_naive_functor<40, 40>>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
+    _divide_and_conquer<multiplier_naive_functor<45, 45>>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
+    _divide_and_conquer<multiplier_naive_functor<35, 35>>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
+    _divide_and_conquer<multiplier_naive_functor<30, 30>>(C, A, B);
 }
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
+    _divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
 }
 
 template<typename T>
@@ -174,10 +176,9 @@ void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix<T> &C, const
 
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    _divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
+    _divide_and_conquer<multiplier_block_wise<T, AVX2, 10, 1, 75>>(C, A, B);
 }
 
 
 #ifdef WITH_AVX512
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
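Note: multiplier_block_wise now exposes its split granularities as template parameters (the defaults <5, 1, 20> reproduce the previously hard-coded values), and divide_and_conquer_block_avx2 opts into <10, 1, 75>. A small self-contained sketch of the arithmetic, with an assumed 3x8 base tile standing in for block_wise<T, AVX2> (the real NumRows/NumColumns come from the register-blocking headers):

    #include <cstdio>

    struct FakeBase {               // stand-in for block_wise<float, AVX2>; 3x8 tile assumed
        static constexpr unsigned NumRows = 3;
        static constexpr unsigned NumColumns = 8;
    };

    template<typename Base, unsigned RowMul, unsigned ColMul, unsigned PMul>
    struct split_multiples {        // same arithmetic as the patched multiplier_block_wise
        static constexpr unsigned M = RowMul * Base::NumRows;
        static constexpr unsigned P = PMul;
        static constexpr unsigned N = ColMul * Base::NumColumns;
    };

    int main() {
        using Old = split_multiples<FakeBase, 5, 1, 20>;   // old defaults: M=15, P=20, N=8
        using New = split_multiples<FakeBase, 10, 1, 75>;  // new AVX2 choice: M=30, P=75, N=8
        std::printf("old M=%u P=%u N=%u, new M=%u P=%u N=%u\n",
                    Old::M, Old::P, Old::N, New::M, New::P, New::N);
    }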
@@ -3,7 +3,7 @@
 #include <iostream>
 #include <cassert>
 #include <boost/program_options.hpp>
-#include "register_blocking/RegisterBlocking.h"
+#include "register_blocking/BlockWise.h"
 #include "Boost.h"
 #include "Naive.h"
 #include "DevideAndConquer.h"
src/register_blocking/BlockWise.h (new file, 111 lines)
@@ -0,0 +1,111 @@
+//
+// Created by oke on 04.07.20.
+//
+
+#ifndef SMID_MATRIX_BLOCKWISE_H
+#define SMID_MATRIX_BLOCKWISE_H
+
+#include "detail/BlockWiseImpl.h"
+
+#include <immintrin.h>
+
+struct __m128_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m128;
+    static constexpr auto LoadVector = _mm_loadu_ps;
+    static constexpr auto StoreVector = _mm_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm_set1_ps;
+    static constexpr auto XOR = _mm_xor_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m128d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m128d;
+    static constexpr auto LoadVector = _mm_loadu_pd;
+    static constexpr auto StoreVector = _mm_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm_set1_pd;
+    static constexpr auto XOR = _mm_xor_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m256;
+    static constexpr auto LoadVector = _mm256_loadu_ps;
+    static constexpr auto StoreVector = _mm256_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm256_set1_ps;
+    static constexpr auto XOR = _mm256_xor_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m256d;
+    static constexpr auto LoadVector = _mm256_loadu_pd;
+    static constexpr auto StoreVector = _mm256_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm256_set1_pd;
+    static constexpr auto XOR = _mm256_xor_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+#ifdef WITH_AVX512
+struct __m512_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m512;
+    static constexpr auto LoadVector = _mm512_loadu_ps;
+    static constexpr auto StoreVector = _mm512_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm512_set1_ps;
+    static constexpr unsigned Registers = 32;
+};
+
+struct __m512d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m512d;
+    static constexpr auto LoadVector = _mm512_loadu_pd;
+    static constexpr auto StoreVector = _mm512_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm512_set1_pd;
+    static constexpr unsigned Registers = 32;
+};
+#endif
+
+enum AvxVersion {
+    SSE,
+    AVX2,
+#ifdef WITH_AVX512
+    AVX512
+#endif
+};
+
+template<typename FloatType, AvxVersion avxVersion>
+struct block_wise;
+
+template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
+template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
+
+template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
+template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
+
+#ifdef WITH_AVX512
+template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
+template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
+#endif
+
+template<typename T>
+void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
+    block_wise<T, SSE>::multiply2(C, A, B);
+}
+
+template<typename T>
+void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
+    block_wise<T, AVX2>::multiply2(C, A, B);
+}
+
+#ifdef WITH_AVX512
+template<typename T>
+void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
+    block_wise<T, AVX512>::multiply2(C, A, B);
+}
+#endif
+
+#endif //SMID_MATRIX_BLOCKWISE_H
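Note: each *_block_wise_config bundles a vector type with its matching load/store/broadcast intrinsics; the per-config vector width is derived later as sizeof(VectorType) / sizeof(FloatType). A compile-time spot check of what that yields for the SSE and AVX2 configs (AVX-512 omitted so the snippet builds everywhere):

    #include <immintrin.h>

    // VectorWidth as ExtendedBlockWiseConfig derives it:
    static_assert(sizeof(__m128)  / sizeof(float)  == 4, "SSE holds 4 floats per vector");
    static_assert(sizeof(__m128d) / sizeof(double) == 2, "SSE holds 2 doubles per vector");
    static_assert(sizeof(__m256)  / sizeof(float)  == 8, "AVX2 holds 8 floats per vector");
    static_assert(sizeof(__m256d) / sizeof(double) == 4, "AVX2 holds 4 doubles per vector");

    int main() { return 0; }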
@@ -1,95 +1,14 @@
 //
-// Created by oke on 01.07.20.
+// Created by oke on 04.07.20.
 //
 
-#ifndef SMID_MATRIX_REGISTERBLOCKING_H
-#define SMID_MATRIX_REGISTERBLOCKING_H
+#ifndef SMID_MATRIX_BLOCKWISE_IMPL_H
+#define SMID_MATRIX_BLOCKWISE_IMPL_H
 
 #include "../Matrix.h"
-#include <immintrin.h>
+#include "RegisterBlocking.h"
 
 namespace detail {
 
-    // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
-    // C = floor ((16 - R) / R)
-
-    constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
-        if (R == 0) return 0;
-        return (unsigned) ((double) (Registers - R) / (double) R);
-    }
-
-    constexpr unsigned GetInitialRows(unsigned Registers) {
-        for(unsigned R = Registers; R > 0; --R) {
-            if(R + R * (R + 1) < Registers) {
-                return R;
-            }
-        }
-        return 0;
-    }
-
-    template<typename BlockWiseConfig>
-    struct ExtendedBlockWiseConfig {
-        typedef typename BlockWiseConfig::FloatType FloatType;
-        using VectorType = typename BlockWiseConfig::VectorType;
-
-        static constexpr auto Registers = BlockWiseConfig::Registers;
-        static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
-        static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
-        static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
-        static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
-        static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) {
-            StoreVector(memory, LoadVector(memory) + vector);
-        };
-    };
-
-    template<
-            // template parameter as struct: otherwise some warning about losing alignment information warning
-            typename BitWiseConfig,
-            unsigned _NumRows, unsigned _NumColumnVectors
-    >
-    struct RegisterBlocking {
-        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
-
-        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
-            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
-        };
-
-        static constexpr auto NumRows = _NumRows;
-        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
-
-        template<typename M1, typename M2, typename M3>
-        static void __attribute__ ((noinline))
-        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
-
-            // AVX2 has 16 registers
-            // should be compiled as registers (total: regA * regB)
-            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
-            // iterate over dot-product terms
-            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
-                // Perform the DOT product
-                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
-                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
-                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
-                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
-                        CReg[ai][bi] += aa * bb;
-                    }
-                }
-            }
-            // total regA * regB + regB registers
-
-            // Accumulate the results into C.
-            for (int ai = 0; ai < _NumRows; ai++) {
-                for (int bi = 0; bi < _NumColumnVectors; bi++) {
-                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
-                }
-            }
-        }
-    };
-
     template<
             // template parameter as struct: otherwise some warning about losing alignment information warning
             typename BlockWiseConfig
@@ -107,6 +26,7 @@ namespace detail {
             bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
         };
 
+        /*
         template<typename M1, typename M2, typename M3>
         static void multiply(M1 &C, const M2 &A, const M3 &B) {
             assert(C.size1() == A.size1() && C.size2() == B.size2());
@@ -142,6 +62,7 @@ namespace detail {
                 }
             }
         }
+        */
 
         template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
         class iterate_columns {
@@ -250,100 +171,4 @@ namespace detail {
 
     };
 }
 
-struct __m128_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m128;
-    static constexpr auto LoadVector = _mm_loadu_ps;
-    static constexpr auto StoreVector = _mm_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm_set1_ps;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m128d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m128d;
-    static constexpr auto LoadVector = _mm_loadu_pd;
-    static constexpr auto StoreVector = _mm_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm_set1_pd;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m256_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m256;
-    static constexpr auto LoadVector = _mm256_loadu_ps;
-    static constexpr auto StoreVector = _mm256_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm256_set1_ps;
-    static constexpr unsigned Registers = 16;
-};
-
-struct __m256d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m256d;
-    static constexpr auto LoadVector = _mm256_loadu_pd;
-    static constexpr auto StoreVector = _mm256_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm256_set1_pd;
-    static constexpr unsigned Registers = 16;
-};
-
-#ifdef WITH_AVX512
-struct __m512_block_wise_config {
-    using FloatType = float;
-    using VectorType = __m512;
-    static constexpr auto LoadVector = _mm512_loadu_ps;
-    static constexpr auto StoreVector = _mm512_storeu_ps;
-    static constexpr auto BroadcastToVector = _mm512_set1_ps;
-    static constexpr unsigned Registers = 32;
-};
-
-struct __m512d_block_wise_config {
-    using FloatType = double;
-    using VectorType = __m512d;
-    static constexpr auto LoadVector = _mm512_loadu_pd;
-    static constexpr auto StoreVector = _mm512_storeu_pd;
-    static constexpr auto BroadcastToVector = _mm512_set1_pd;
-    static constexpr unsigned Registers = 32;
-};
-#endif
-
-enum AvxVersion {
-    SSE,
-    AVX2,
-#ifdef WITH_AVX512
-    AVX512
-#endif
-};
-
-template<typename FloatType, AvxVersion avxVersion>
-struct block_wise;
-
-template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
-template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
-
-template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
-template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
-
-#ifdef WITH_AVX512
-template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
-template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
-#endif
-
-template<typename T>
-void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    block_wise<T, SSE>::multiply(C, A, B);
-}
-
-template<typename T>
-void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    block_wise<T, AVX2>::multiply(C, A, B);
-}
-
-#ifdef WITH_AVX512
-template<typename T>
-void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
-    block_wise<T, AVX512>::multiply(C, A, B);
-}
-#endif
-
-#endif //SMID_MATRIX_REGISTERBLOCKING_H
+#endif // SMID_MATRIX_BLOCKWISE_IMPL_H
src/register_blocking/detail/ExtendedBlockWiseConfig.h (new file, 51 lines)
@@ -0,0 +1,51 @@
+//
+// Created by oke on 04.07.20.
+//
+
+#ifndef SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
+#define SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
+
+template<typename BlockWiseConfig>
+struct ExtendedBlockWiseConfig {
+
+    typedef typename BlockWiseConfig::FloatType FloatType;
+    using VectorType = typename BlockWiseConfig::VectorType;
+
+    static constexpr auto Registers = BlockWiseConfig::Registers;
+    static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
+    static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
+    static constexpr auto XOR = BlockWiseConfig::XOR;
+    static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
+    static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
+    static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) {
+        StoreVector(memory, LoadVector(memory) + vector);
+    };
+};
+
+// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
+// C = floor ((16 - R) / R)
+
+// maximize = (R * C) / (R + C) for 1 + R + R * C < 16 => any fixed r -> largest C with < 16
+// C = floor ((16 - R) / R)
+
+unsigned constexpr extra_registers = 2;
+
+constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
+    if (R == 0) return 0;
+    auto result = (unsigned) ((double) (Registers - extra_registers - R) / (double) R);
+    if (extra_registers + R + R * (result + 1) <= Registers)
+        return result + 1;
+    return result;
+}
+
+constexpr unsigned GetInitialRows(unsigned Registers) {
+    for(unsigned R = Registers; R > 0; --R) {
+        if(extra_registers + R + R * R <= Registers) {
+            return R;
+        }
+    }
+    return 0;
+}
+
+#endif //SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
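Note: the reworked budget reserves extra_registers = 2, presumably for the broadcast A value and the loaded B vector that live alongside the R x C accumulators, and accepts a tile only while extra_registers + R + R * C <= Registers. Working it through by hand: with 16 registers the largest R with 2 + R + R * R <= 16 is R = 3 (2 + 3 + 9 = 14), and GetInitialColumnVectors(3, 16) = floor(11 / 3) = 3; with 32 registers R = 5 and C = 5 (2 + 5 + 25 = 32, exactly the budget). A compile-time check, assuming this header is on the include path:

    #include <cstddef>  // for size_t, used by the header
    #include "register_blocking/detail/ExtendedBlockWiseConfig.h"  // include path assumed

    // 16 vector registers (SSE/AVX2): 3x3 tile of accumulators, 2 + 3 + 3*3 = 14 <= 16.
    static_assert(GetInitialRows(16) == 3, "3 rows for 16 registers");
    static_assert(GetInitialColumnVectors(GetInitialRows(16), 16) == 3, "3 column vectors");

    // 32 vector registers (AVX-512): 5x5 tile, 2 + 5 + 5*5 = 32 <= 32.
    static_assert(GetInitialRows(32) == 5, "5 rows for 32 registers");
    static_assert(GetInitialColumnVectors(GetInitialRows(32), 32) == 5, "5 column vectors");

    int main() { return 0; }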
src/register_blocking/detail/RegisterBlocking.h (new file, 155 lines)
@@ -0,0 +1,155 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_REGISTERBLOCKING_H
+#define SMID_MATRIX_REGISTERBLOCKING_H
+
+#include "../../Matrix.h"
+#include "ExtendedBlockWiseConfig.h"
+
+namespace detail {
+
+    template<
+            // template parameter as struct: otherwise some warning about losing alignment information warning
+            typename BitWiseConfig,
+            unsigned _NumRows, unsigned _NumColumnVectors
+    >
+    struct RegisterBlocking {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = _NumRows;
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+                        CReg[ai][bi] += aa * bb;
+                    }
+                }
+            }
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            for (int ai = 0; ai < _NumRows; ai++) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+                }
+            }
+        }
+    };
+
+    /*
+    template<typename BitWiseConfig>
+    struct RegisterBlocking<BitWiseConfig, 3, 4> {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = 3;
+        static constexpr auto NumColumns = 4 * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType r1c1;
+            typename bwc::VectorType r1c2;
+            typename bwc::VectorType r1c3;
+            typename bwc::VectorType r1c4;
+            typename bwc::VectorType r2c1;
+            typename bwc::VectorType r2c2;
+            typename bwc::VectorType r2c3;
+            typename bwc::VectorType r2c4;
+            typename bwc::VectorType r3c1;
+            typename bwc::VectorType r3c2;
+            typename bwc::VectorType r3c3;
+            typename bwc::VectorType r3c4;
+            typename bwc::VectorType r1;
+            typename bwc::VectorType r2;
+            typename bwc::VectorType r3;
+            typename bwc::VectorType c;
+
+            r1c1 = bwc::XOR(r1c1, r1c1);
+            r1c2 = bwc::XOR(r1c2, r1c2);
+            r1c3 = bwc::XOR(r1c3, r1c3);
+            r1c4 = bwc::XOR(r1c4, r1c4);
+            r2c1 = bwc::XOR(r2c1, r2c1);
+            r2c2 = bwc::XOR(r2c2, r2c2);
+            r2c3 = bwc::XOR(r2c3, r2c3);
+            r2c4 = bwc::XOR(r2c4, r2c4);
+            r3c1 = bwc::XOR(r3c1, r3c1);
+            r3c2 = bwc::XOR(r3c2, r3c2);
+            r3c3 = bwc::XOR(r3c3, r3c3);
+            r3c4 = bwc::XOR(r3c4, r3c4);
+
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                r1 = bwc::BroadcastToVector(A(aRowOffset + 0, p));
+                r2 = bwc::BroadcastToVector(A(aRowOffset + 1, p));
+                r3 = bwc::BroadcastToVector(A(aRowOffset + 2, p));
+
+                c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth));
+                r1c1 += r1 * c;
+                r2c1 += r2 * c;
+                r3c1 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth));
+                r1c2 += r1 * c;
+                r2c2 += r2 * c;
+                r3c2 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth));
+                r1c3 += r1 * c;
+                r2c3 += r2 * c;
+                r3c3 += r3 * c;
+
+                c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth));
+                r1c4 += r1 * c;
+                r2c4 += r2 * c;
+                r3c4 += r3 * c;
+            }
+
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 0 * bwc::VectorWidth), r1c1);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 0 * bwc::VectorWidth), r2c1);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 0 * bwc::VectorWidth), r3c1);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 1 * bwc::VectorWidth), r1c2);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 1 * bwc::VectorWidth), r2c2);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 1 * bwc::VectorWidth), r3c2);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 2 * bwc::VectorWidth), r1c3);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 2 * bwc::VectorWidth), r2c3);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 2 * bwc::VectorWidth), r3c3);
+            AddAndStore(&C(aRowOffset + 0, bColOffset + 3 * bwc::VectorWidth), r1c4);
+            AddAndStore(&C(aRowOffset + 1, bColOffset + 3 * bwc::VectorWidth), r2c4);
+            AddAndStore(&C(aRowOffset + 2, bColOffset + 3 * bwc::VectorWidth), r3c4);
+        }
+    };
+    */
+
+}
+
+#endif //SMID_MATRIX_REGISTERBLOCKING_H
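Note: handle_block computes one NumRows x (NumColumnVectors * VectorWidth) tile of C += A * B while keeping every accumulator in a vector register. The real outer loop lives in block_wise_base / iterate_columns over in BlockWiseImpl.h (largely elided by this diff); purely for orientation, a hypothetical driver stepping the kernel across C might look like the sketch below, ignoring the edge tiles the real code handles:

    // Hypothetical driver. RB would be e.g. detail::RegisterBlocking<__m256_block_wise_config, 3, 3>;
    // MatC/MatA/MatB are any matrix types offering (row, col) element access, like Matrix<T> here.
    template<typename RB, typename MatC, typename MatA, typename MatB>
    void tile_multiply(MatC &C, const MatA &A, const MatB &B, int m, int k, int n) {
        for (int i = 0; i + (int) RB::NumRows <= m; i += RB::NumRows)
            for (int j = 0; j + (int) RB::NumColumns <= n; j += RB::NumColumns)
                RB::handle_block(k, C, A, i, B, j);   // accumulate one register tile of C
        // rows m % NumRows and columns n % NumColumns remain: the real
        // implementation falls back to smaller tiles for those edges.
    }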
src/register_blocking/detail/RegisterBlockingManual.h (new file, 80 lines)
@@ -0,0 +1,80 @@
+//
+// Created by oke on 01.07.20.
+//
+
+#ifndef SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
+#define SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
+
+#include "../../Matrix.h"
+#include "ExtendedBlockWiseConfig.h"
+
+template<typename BitWiseConfig, size_t Rows, size_t ColumnVectors>
+struct a {
+    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+    template<typename M1, typename M2, typename M3>
+    void do_stuff(int p, int bi, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+        typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+
+        for (int ai = 0; ai < Rows; ai++) { // row index in A (handling regsA rows)
+            typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+            CReg[ai][bi] += aa * bb;
+        }
+    }
+};
+
+namespace detail {
+
+    template<
+            // template parameter as struct: otherwise some warning about losing alignment information warning
+            typename BitWiseConfig,
+            unsigned _NumRows, unsigned _NumColumnVectors
+    >
+    struct RegisterBlockingManual {
+        typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
+
+        static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
+            bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
+        };
+
+        static constexpr auto NumRows = _NumRows;
+        static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
+
+        template<typename M1, typename M2, typename M3>
+        static void __attribute__ ((noinline))
+        handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
+
+            // AVX2 has 16 registers
+            // should be compiled as registers (total: regA * regB)
+            typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
+            // iterate over dot-product terms
+            for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
+                // Perform the DOT product
+                for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
+                    typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
+                    for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
+                        typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
+                        CReg[ai][bi] += aa * bb;
+                    }
+                }
+            }
+            // total regA * regB + regB registers
+
+            // Accumulate the results into C.
+            for (int ai = 0; ai < _NumRows; ai++) {
+                for (int bi = 0; bi < _NumColumnVectors; bi++) {
+                    AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
+                }
+            }
+        }
+    };
+
+}
+
+#endif //SMID_MATRIX_REGISTERBLOCKINGMANUAL_H