Reorganize register blocking

har0ke 2020-07-05 02:28:29 +02:00
parent b697b44fb5
commit 8634a597ba
13 changed files with 1688 additions and 205 deletions


@@ -20,7 +20,8 @@ else()
    set(CMAKE_CXX_COMPILER "g++")
endif()
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
if(OPTIMIZE_FOR_NATIVE)
    CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)

plot.py

@@ -1,6 +1,57 @@
import matplotlib.pyplot as plt
import numpy as np
import os
import json
from pprint import pprint
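# aggregate benchmark dumps (newest first): for each total problem size, keep the
# per-function mean runtimes from the most recent run that measured it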
folder = os.path.join(os.path.dirname(__file__), "data")
files = list(reversed(sorted([os.path.join(folder, fn) for fn in os.listdir(folder) if fn.endswith(".json")])))
each_size = {}
functions = set()
for fn in files:
    with open(fn) as f:
        data = json.load(f)
    if data["total_size"] not in each_size:
        each_size[data["total_size"]] = {v: k for k, v in zip(data["means"], data["functions"])}
    functions = functions.union(set(data["functions"]))
pprint(each_size)
pprint(functions)
for function in functions:
    # if function.startswith("divide") and "naive" in function and not (function.endswith("r2") or function.endswith("r3")):
    #     continue
    # if not function.startswith("divide_and_conquer_naive"):
    #     continue
    if function.startswith("block_wise"):
        continue
    x = []
    y = []
    for size in sorted(each_size.keys()):
        values = each_size[size]
        if function in values:
            x.append(pow(size, 1.))
            y.append(float(values[function]) / 1000.)  # mean milliseconds -> seconds
    print(x, y)
    plt.plot(x, y, label=function)
plt.legend()
plt.xscale("log")
plt.grid()
plt.xlim((1e10, 5.3e11))
plt.ylim((0, 160))
plt.xlabel("n * p * m")
plt.ylabel("seconds")
plt.savefig("test.pdf")  # save after the labels are set so they end up in the PDF
plt.show()
exit(0)
a = [
    [1e6, '0.000', '0.000', '0.000', '0.300', '0.200', '0.250', '0.000', '0.000'],

scripts/dumpall.py (new executable file)

@@ -0,0 +1,10 @@
#!/usr/bin/python
import os
import sys
import subprocess
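# sweep total problem sizes n*p*m = 1e8, 5e8, 1e9, ..., 1e12, 5e12 and run test.py on each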
for e in range(8, 13):
    for p in [1, 5]:
        number = str(float(str(p) + "e" + str(e)))
        args = [os.path.join(os.path.dirname(__file__), "test.py"), number] + sys.argv[1:]
        print(" ".join(args))
        subprocess.check_call(args)


@@ -21,10 +21,11 @@ def check_call_quiet(*args, **kwargs):
        exit(0)

def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, args):
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, avx512, release, args):
    flags = [
        "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
        "-DCMAKE_BUILD_TYPE=RelWithDebInfo",
        "-DCMAKE_BUILD_TYPE=" + ("Release" if release else "RelWithDebInfo"),
        "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
        "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON",
        "-DWITH_AVX512=" + ("ON" if avx512 else "OFF"),
@@ -42,8 +43,8 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
    output = subprocess.check_output([os.path.join(build_path, target)] + args)
    return output

very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
very_slow_functions = ["naive_reordered"]
slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r1"]
normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
fast_functions = ["divide_and_conquer_block_avx2", "blas"]
avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
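# the lists above group benchmark kernels by expected runtime, slowest to fastest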
@@ -65,6 +66,7 @@ if __name__ == '__main__':
    parser.add_argument("--double", action="store_true")
    parser.add_argument("--gcc", action="store_true")
    parser.add_argument("--function", type=str, nargs="*")
    parser.add_argument("--release", action="store_true")
    options = parser.parse_args()
@@ -84,14 +86,14 @@ if __name__ == '__main__':
    if options.function:
        functions = options.function
    # functions = ["divide_and_conquer_naive_r1", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r5"]
    extra_args = []
    if options.validate:
        extra_args.append("--validate")
    if options.double:
        extra_args.append("--double")
    total = options.total_size
    total = int(options.total_size)
    matrix_combinations = []
    i = pow(total, 1. / 3.)
    for _ in range(20):
@@ -99,19 +101,19 @@ if __name__ == '__main__':
        b = int(round(max(0.01 * i + 1, numpy.random.normal(i, i / 2))))
        c = int(round(total / a / b))
        matrix_combinations.append([str(a), str(b), str(c)])
    times = [[] for f in functions]
    output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
    clang = not options.gcc
    for sizes in matrix_combinations:
        args = list(sizes)
        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, options.release, args)
        folder = "x".join(sizes)
        for fidx, function in enumerate(functions):
            arguments = [folder, "--algorithm", function]
            if with_double:
                arguments.append("--double")
            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, options.release, arguments + extra_args)
            ms = output.decode()[output.decode().find("multiply:") + 10:]
            if "ms\n" in ms:
                ms = float(ms.split("ms\n")[0])
@@ -127,6 +129,7 @@ if __name__ == '__main__':
            "times": times,
            "sizes": matrix_combinations,
            "functions": functions,
            "means": ["%.3f" % numpy.mean(ts) for ts in times]
            "means": ["%.3f" % numpy.mean(ts) for ts in times],
            "total_size": total
        }, f)
    os.rename(output_file + ".cache", output_file)


@@ -4,6 +4,8 @@
#ifndef SMID_MATRIX_DEVIDEANDCONQUER_H
#define SMID_MATRIX_DEVIDEANDCONQUER_H
#include "register_blocking/BlockWise.h"
#include "Boost.h"
#include "Naive.h"
@@ -107,14 +109,14 @@ struct multiplier_naive_functor {
};
template<typename FloatType, AvxVersion version>
template<typename FloatType, AvxVersion version, unsigned RowMultiplier = 5, unsigned ColumnMultiplier = 1, unsigned PMultiplier = 20>
struct multiplier_block_wise : block_wise<FloatType, version> {
typedef block_wise<FloatType, version> Base;
static constexpr unsigned SplitMultipleM = 5 * Base::NumRows;
static constexpr unsigned SplitMultipleP = 20;
static constexpr unsigned SplitMultipleN = Base::NumColumns;
static constexpr unsigned SplitMultipleM = RowMultiplier * Base::NumRows;
static constexpr unsigned SplitMultipleP = PMultiplier;
static constexpr unsigned SplitMultipleN = ColumnMultiplier * Base::NumColumns;
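// split sizes are multiples of the register kernel: M in units of the kernel's rows,
// N in units of its column width, and P tiled by PMultiplier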
static SplitAction action(unsigned m, unsigned p, unsigned n) {
size_t m_p = m / SplitMultipleM;
@@ -143,27 +145,27 @@ struct multiplier_block_wise : block_wise<FloatType, version> {
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_naive_r1(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_naive_functor<5, 5>>(C, A, B);
_divide_and_conquer<multiplier_naive_functor<40, 40>>(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_naive_r2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
_divide_and_conquer<multiplier_naive_functor<45, 45>>(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_naive_r3(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_naive_functor<200, 200>>(C, A, B);
_divide_and_conquer<multiplier_naive_functor<35, 35>>(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_naive_r4(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_naive_functor<500, 500>>(C, A, B);
_divide_and_conquer<multiplier_naive_functor<30, 30>>(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
_divide_and_conquer<multiplier_naive_functor<50, 50>>(C, A, B);
}
template<typename T>
@@ -174,10 +176,9 @@ void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix<T> &C, const
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
_divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
_divide_and_conquer<multiplier_block_wise<T, AVX2, 10, 1, 75>>(C, A, B);
}
#ifdef WITH_AVX512
template<typename T>
void __attribute__ ((noinline)) divide_and_conquer_block_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {


@@ -3,7 +3,7 @@
#include <iostream>
#include <cassert>
#include <boost/program_options.hpp>
#include "register_blocking/RegisterBlocking.h"
#include "register_blocking/BlockWise.h"
#include "Boost.h"
#include "Naive.h"
#include "DevideAndConquer.h"


@@ -0,0 +1,111 @@
//
// Created by oke on 04.07.20.
//
#ifndef SMID_MATRIX_BLOCKWISE_H
#define SMID_MATRIX_BLOCKWISE_H
#include "detail/BlockWiseImpl.h"
#include <immintrin.h>
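// one configuration struct per vector ISA: element type, unaligned load/store,
// broadcast, and the architectural vector register count; XOR (SSE/AVX2 only)
// allows zeroing accumulator registers without a memory access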
struct __m128_block_wise_config {
using FloatType = float;
using VectorType = __m128;
static constexpr auto LoadVector = _mm_loadu_ps;
static constexpr auto StoreVector = _mm_storeu_ps;
static constexpr auto BroadcastToVector = _mm_set1_ps;
static constexpr auto XOR = _mm_xor_ps;
static constexpr unsigned Registers = 16;
};
struct __m128d_block_wise_config {
using FloatType = double;
using VectorType = __m128d;
static constexpr auto LoadVector = _mm_loadu_pd;
static constexpr auto StoreVector = _mm_storeu_pd;
static constexpr auto BroadcastToVector = _mm_set1_pd;
static constexpr auto XOR = _mm_xor_pd;
static constexpr unsigned Registers = 16;
};
struct __m256_block_wise_config {
using FloatType = float;
using VectorType = __m256;
static constexpr auto LoadVector = _mm256_loadu_ps;
static constexpr auto StoreVector = _mm256_storeu_ps;
static constexpr auto BroadcastToVector = _mm256_set1_ps;
static constexpr auto XOR = _mm256_xor_ps;
static constexpr unsigned Registers = 16;
};
struct __m256d_block_wise_config {
using FloatType = double;
using VectorType = __m256d;
static constexpr auto LoadVector = _mm256_loadu_pd;
static constexpr auto StoreVector = _mm256_storeu_pd;
static constexpr auto BroadcastToVector = _mm256_set1_pd;
static constexpr auto XOR = _mm256_xor_pd;
static constexpr unsigned Registers = 16;
};
#ifdef WITH_AVX512
struct __m512_block_wise_config {
using FloatType = float;
using VectorType = __m512;
static constexpr auto LoadVector = _mm512_loadu_ps;
static constexpr auto StoreVector = _mm512_storeu_ps;
static constexpr auto BroadcastToVector = _mm512_set1_ps;
static constexpr unsigned Registers = 32;
};
struct __m512d_block_wise_config {
using FloatType = double;
using VectorType = __m512d;
static constexpr auto LoadVector = _mm512_loadu_pd;
static constexpr auto StoreVector = _mm512_storeu_pd;
static constexpr auto BroadcastToVector = _mm512_set1_pd;
static constexpr unsigned Registers = 32;
};
#endif
enum AvxVersion {
SSE,
AVX2,
#ifdef WITH_AVX512
AVX512
#endif
};
template<typename FloatType, AvxVersion avxVersion>
struct block_wise;
template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
#ifdef WITH_AVX512
template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
#endif
template<typename T>
void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, SSE>::multiply2(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, AVX2>::multiply2(C, A, B);
}
#ifdef WITH_AVX512
template<typename T>
void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, AVX512>::multiply2(C, A, B);
}
#endif
#endif //SMID_MATRIX_BLOCKWISE_H


@@ -1,95 +1,14 @@
//
// Created by oke on 01.07.20.
// Created by oke on 04.07.20.
//
#ifndef SMID_MATRIX_REGISTERBLOCKING_H
#define SMID_MATRIX_REGISTERBLOCKING_H
#ifndef SMID_MATRIX_BLOCKWISE_IMPL_H
#define SMID_MATRIX_BLOCKWISE_IMPL_H
#include "../Matrix.h"
#include <immintrin.h>
#include "RegisterBlocking.h"
namespace detail {
// maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
// C = floor ((16 - R) / R)
constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
if (R == 0) return 0;
return (unsigned) ((double) (Registers - R) / (double) R);
}
constexpr unsigned GetInitialRows(unsigned Registers) {
for(unsigned R = Registers; R > 0; --R) {
if(R + R * (R + 1) < Registers) {
return R;
}
}
return 0;
}
template<typename BlockWiseConfig>
struct ExtendedBlockWiseConfig {
typedef typename BlockWiseConfig::FloatType FloatType;
using VectorType = typename BlockWiseConfig::VectorType;
static constexpr auto Registers = BlockWiseConfig::Registers;
static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory,
LoadVector(memory) +
vector);
};
};
template<
// template parameter as struct: otherwise some warning about losing alignment information
typename BitWiseConfig,
unsigned _NumRows, unsigned _NumColumnVectors
>
struct RegisterBlocking {
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
};
static constexpr auto NumRows = _NumRows;
static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
template<typename M1, typename M2, typename M3>
static void __attribute__ ((noinline))
handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
// AVX2 has 16 registers
// should be compiled as registers (total: regA * regB)
typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
// iterate over dot-product terms
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
// Perform the DOT product
for (int bi = 0;
bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
CReg[ai][bi] += aa * bb;
}
}
}
// total regA * regB + regB registers
// Accumulate the results into C.
for (int ai = 0; ai < _NumRows; ai++) {
for (int bi = 0; bi < _NumColumnVectors; bi++) {
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
}
}
}
};
template<
// template parameter as struct: otherwise some warning about losing alignment information
typename BlockWiseConfig
@@ -107,6 +26,7 @@ namespace detail {
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
};
/*
template<typename M1, typename M2, typename M3>
static void multiply(M1 &C, const M2 &A, const M3 &B) {
assert(C.size1() == A.size1() && C.size2() == B.size2());
@@ -142,6 +62,7 @@ namespace detail {
}
}
}
*/
template<unsigned CurrentNumRows, unsigned CurrentNumColumnVectors>
class iterate_columns {
@@ -250,100 +171,4 @@
};
}
struct __m128_block_wise_config {
using FloatType = float;
using VectorType = __m128;
static constexpr auto LoadVector = _mm_loadu_ps;
static constexpr auto StoreVector = _mm_storeu_ps;
static constexpr auto BroadcastToVector = _mm_set1_ps;
static constexpr unsigned Registers = 16;
};
struct __m128d_block_wise_config {
using FloatType = double;
using VectorType = __m128d;
static constexpr auto LoadVector = _mm_loadu_pd;
static constexpr auto StoreVector = _mm_storeu_pd;
static constexpr auto BroadcastToVector = _mm_set1_pd;
static constexpr unsigned Registers = 16;
};
struct __m256_block_wise_config {
using FloatType = float;
using VectorType = __m256;
static constexpr auto LoadVector = _mm256_loadu_ps;
static constexpr auto StoreVector = _mm256_storeu_ps;
static constexpr auto BroadcastToVector = _mm256_set1_ps;
static constexpr unsigned Registers = 16;
};
struct __m256d_block_wise_config {
using FloatType = double;
using VectorType = __m256d;
static constexpr auto LoadVector = _mm256_loadu_pd;
static constexpr auto StoreVector = _mm256_storeu_pd;
static constexpr auto BroadcastToVector = _mm256_set1_pd;
static constexpr unsigned Registers = 16;
};
#ifdef WITH_AVX512
struct __m512_block_wise_config {
using FloatType = float;
using VectorType = __m512;
static constexpr auto LoadVector = _mm512_loadu_ps;
static constexpr auto StoreVector = _mm512_storeu_ps;
static constexpr auto BroadcastToVector = _mm512_set1_ps;
static constexpr unsigned Registers = 32;
};
struct __m512d_block_wise_config {
using FloatType = double;
using VectorType = __m512d;
static constexpr auto LoadVector = _mm512_loadu_pd;
static constexpr auto StoreVector = _mm512_storeu_pd;
static constexpr auto BroadcastToVector = _mm512_set1_pd;
static constexpr unsigned Registers = 32;
};
#endif
enum AvxVersion {
SSE,
AVX2,
#ifdef WITH_AVX512
AVX512
#endif
};
template<typename FloatType, AvxVersion avxVersion>
struct block_wise;
template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
#ifdef WITH_AVX512
template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
#endif
template<typename T>
void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, SSE>::multiply(C, A, B);
}
template<typename T>
void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, AVX2>::multiply(C, A, B);
}
#ifdef WITH_AVX512
template<typename T>
void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
block_wise<T, AVX512>::multiply(C, A, B);
}
#endif
#endif //SMID_MATRIX_REGISTERBLOCKING_H
#endif // SMID_MATRIX_BLOCKWISE_IMPL_H


@@ -0,0 +1,51 @@
//
// Created by oke on 04.07.20.
//
#ifndef SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
#define SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H
template<typename BlockWiseConfig>
struct ExtendedBlockWiseConfig {
typedef typename BlockWiseConfig::FloatType FloatType;
using VectorType = typename BlockWiseConfig::VectorType;
static constexpr auto Registers = BlockWiseConfig::Registers;
static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
static constexpr auto XOR = BlockWiseConfig::XOR;
static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) {
StoreVector(memory, LoadVector(memory) + vector);
};
};
// register budget: extra_registers (scratch, e.g. the loaded B vector) + R broadcast
// A values + R * C accumulators must fit into the architectural register count.
// maximize (R * C) / (R + C) => for any fixed R, the largest C is
// C = floor((Registers - extra_registers - R) / R)
unsigned constexpr extra_registers = 2;
constexpr unsigned GetInitialColumnVectors(unsigned R, unsigned Registers) {
if (R == 0) return 0;
auto result = (unsigned) ((double) (Registers - extra_registers - R) / (double) R);
if (extra_registers + R + R * (result + 1) <= Registers)
return result + 1;
return result;
}
constexpr unsigned GetInitialRows(unsigned Registers) {
for(unsigned R = Registers; R > 0; --R) {
if(extra_registers + R + R * R <= Registers) {
return R;
}
}
return 0;
}
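// e.g. Registers = 16 (SSE/AVX2) with extra_registers = 2: GetInitialRows -> 3
// (2 + 3 + 9 <= 16) and GetInitialColumnVectors(3, 16) -> 3, i.e. a 3-row x
// 3-column-vector accumulator tile; Registers = 32 (AVX512) yields a 5 x 5 tile.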
#endif //SMID_MATRIX_EXTENDEDBLOCKWISECONFIG_H


@@ -0,0 +1,155 @@
//
// Created by oke on 01.07.20.
//
#ifndef SMID_MATRIX_REGISTERBLOCKING_H
#define SMID_MATRIX_REGISTERBLOCKING_H
#include "../../Matrix.h"
#include "ExtendedBlockWiseConfig.h"
namespace detail {
template<
// template parameter as struct: otherwise some warning about losing alignment information
typename BitWiseConfig,
unsigned _NumRows, unsigned _NumColumnVectors
>
struct RegisterBlocking {
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
};
static constexpr auto NumRows = _NumRows;
static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
template<typename M1, typename M2, typename M3>
static void __attribute__ ((noinline))
handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
// AVX2 has 16 registers
// should be compiled as registers (total: regA * regB)
typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
// iterate over dot-product terms
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
// Perform the DOT product
for (int bi = 0;
bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
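// with -mfma (added to CMAKE_CXX_FLAGS in this commit) the compiler can contract
// the multiply-add below into a single FMA instruction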
CReg[ai][bi] += aa * bb;
}
}
}
// total regA * regB + regB registers
// Accumulate the results into C.
for (int ai = 0; ai < _NumRows; ai++) {
for (int bi = 0; bi < _NumColumnVectors; bi++) {
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
}
}
}
};
/*
template<typename BitWiseConfig>
struct RegisterBlocking<BitWiseConfig, 3, 4> {
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
};
static constexpr auto NumRows = 3;
static constexpr auto NumColumns = 4 * bwc::VectorWidth;
template<typename M1, typename M2, typename M3>
static void __attribute__ ((noinline))
handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
// AVX2 has 16 registers
// should be compiled as registers (total: regA * regB)
typename bwc::VectorType r1c1;
typename bwc::VectorType r1c2;
typename bwc::VectorType r1c3;
typename bwc::VectorType r1c4;
typename bwc::VectorType r2c1;
typename bwc::VectorType r2c2;
typename bwc::VectorType r2c3;
typename bwc::VectorType r2c4;
typename bwc::VectorType r3c1;
typename bwc::VectorType r3c2;
typename bwc::VectorType r3c3;
typename bwc::VectorType r3c4;
typename bwc::VectorType r1;
typename bwc::VectorType r2;
typename bwc::VectorType r3;
typename bwc::VectorType c;
r1c1 = bwc::XOR(r1c1, r1c1);
r1c2 = bwc::XOR(r1c2, r1c2);
r1c3 = bwc::XOR(r1c3, r1c3);
r1c4 = bwc::XOR(r1c4, r1c4);
r2c1 = bwc::XOR(r2c1, r2c1);
r2c2 = bwc::XOR(r2c2, r2c2);
r2c3 = bwc::XOR(r2c3, r2c3);
r2c4 = bwc::XOR(r2c4, r2c4);
r3c1 = bwc::XOR(r3c1, r3c1);
r3c2 = bwc::XOR(r3c2, r3c2);
r3c3 = bwc::XOR(r3c3, r3c3);
r3c4 = bwc::XOR(r3c4, r3c4);
// iterate over dot-product terms
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
// Perform the DOT product
r1 = bwc::BroadcastToVector(A(aRowOffset + 0, p));
r2 = bwc::BroadcastToVector(A(aRowOffset + 1, p));
r3 = bwc::BroadcastToVector(A(aRowOffset + 2, p));
c = bwc::LoadVector(&B(p, bColOffset + 0 * bwc::VectorWidth));
r1c1 += r1 * c;
r2c1 += r2 * c;
r3c1 += r3 * c;
c = bwc::LoadVector(&B(p, bColOffset + 1 * bwc::VectorWidth));
r1c2 += r1 * c;
r2c2 += r2 * c;
r3c2 += r3 * c;
c = bwc::LoadVector(&B(p, bColOffset + 2 * bwc::VectorWidth));
r1c3 += r1 * c;
r2c3 += r2 * c;
r3c3 += r3 * c;
c = bwc::LoadVector(&B(p, bColOffset + 3 * bwc::VectorWidth));
r1c4 += r1 * c;
r2c4 += r2 * c;
r3c4 += r3 * c;
}
// total regA * regB + regB registers
// Accumulate the results into C.
AddAndStore(&C(aRowOffset + 0, bColOffset + 0 * bwc::VectorWidth), r1c1);
AddAndStore(&C(aRowOffset + 1, bColOffset + 0 * bwc::VectorWidth), r2c1);
AddAndStore(&C(aRowOffset + 2, bColOffset + 0 * bwc::VectorWidth), r3c1);
AddAndStore(&C(aRowOffset + 0, bColOffset + 1 * bwc::VectorWidth), r1c2);
AddAndStore(&C(aRowOffset + 1, bColOffset + 1 * bwc::VectorWidth), r2c2);
AddAndStore(&C(aRowOffset + 2, bColOffset + 1 * bwc::VectorWidth), r3c2);
AddAndStore(&C(aRowOffset + 0, bColOffset + 2 * bwc::VectorWidth), r1c3);
AddAndStore(&C(aRowOffset + 1, bColOffset + 2 * bwc::VectorWidth), r2c3);
AddAndStore(&C(aRowOffset + 2, bColOffset + 2 * bwc::VectorWidth), r3c3);
AddAndStore(&C(aRowOffset + 0, bColOffset + 3 * bwc::VectorWidth), r1c4);
AddAndStore(&C(aRowOffset + 1, bColOffset + 3 * bwc::VectorWidth), r2c4);
AddAndStore(&C(aRowOffset + 2, bColOffset + 3 * bwc::VectorWidth), r3c4);
}
};
*/
}
#endif //SMID_MATRIX_REGISTERBLOCKING_H


@@ -0,0 +1,80 @@
//
// Created by oke on 01.07.20.
//
#ifndef SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
#define SMID_MATRIX_REGISTERBLOCKINGMANUAL_H
#include "../../Matrix.h"
#include "ExtendedBlockWiseConfig.h"
template<typename BitWiseConfig, size_t Rows, size_t ColumnVectors>
struct a {
    typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
    // accumulator tile: one vector register per (row, column-vector) pair
    typename bwc::VectorType CReg[Rows][ColumnVectors] = {};
    template<typename M1, typename M2, typename M3>
    void do_stuff(int p, int bi, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
        typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
        for (size_t ai = 0; ai < Rows; ai++) { // row index in A (handling regsA rows)
            typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
            CReg[ai][bi] += aa * bb;
        }
    }
};
namespace detail {
template<
// template parameter as struct: otherwise some warning about losing alignment information
typename BitWiseConfig,
unsigned _NumRows, unsigned _NumColumnVectors
>
struct RegisterBlockingManual {
typedef ExtendedBlockWiseConfig<BitWiseConfig> bwc;
static constexpr auto AddAndStore = [](typename bwc::FloatType *memory, typename bwc::VectorType vector) {
bwc::StoreVector(memory, bwc::LoadVector(memory) + vector);
};
static constexpr auto NumRows = _NumRows;
static constexpr auto NumColumns = _NumColumnVectors * bwc::VectorWidth;
template<typename M1, typename M2, typename M3>
static void __attribute__ ((noinline))
handle_block(int k, M1 &C, const M2 &A, int aRowOffset, const M3 &B, int bColOffset) {
// AVX2 has 16 registers
// should be compiled as registers (total: regA * regB)
typename bwc::VectorType CReg[_NumRows][_NumColumnVectors] = {{0.0}};
// iterate over dot-product terms
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
// Perform the DOT product
for (int bi = 0; bi < _NumColumnVectors; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
typename bwc::VectorType bb = bwc::LoadVector(&B(p, bColOffset + bi * bwc::VectorWidth));
for (int ai = 0; ai < _NumRows; ai++) { // row index in A (handling regsA rows)
typename bwc::VectorType aa = bwc::BroadcastToVector(A(aRowOffset + ai, p));
CReg[ai][bi] += aa * bb;
}
}
}
// total regA * regB + regB registers
// Accumulate the results into C.
for (int ai = 0; ai < _NumRows; ai++) {
for (int bi = 0; bi < _NumColumnVectors; bi++) {
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * bwc::VectorWidth), CReg[ai][bi]);
}
}
}
};
}
#endif //SMID_MATRIX_REGISTERBLOCKINGMANUAL_H

test.pdf (new binary file, not shown)

test.svg (new file; diff suppressed because it is too large)