Initial Commit
This commit is contained in:
commit
a9a3fbbd11
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
.idea/*
|
||||
data/*
|
||||
*.s
|
||||
cmake-build-*
|
45
CMakeLists.txt
Normal file
45
CMakeLists.txt
Normal file
@ -0,0 +1,45 @@
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
project(smid_matrix CXX)
|
||||
|
||||
include(CheckCXXCompilerFlag)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON)
|
||||
option(USE_CLANG "Build with clang instead of gcc" OFF)
|
||||
set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run")
|
||||
|
||||
add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}")
|
||||
|
||||
if(USE_CLANG)
|
||||
set(CMAKE_CXX_COMPILER "clang++")
|
||||
else()
|
||||
set(CMAKE_CXX_COMPILER "g++")
|
||||
endif()
|
||||
|
||||
if(OPTIMIZE_FOR_NATIVE)
|
||||
CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
if(COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
|
||||
else()
|
||||
error(FATAL_ERROR "-march=native not supported")
|
||||
endif()
|
||||
else()
|
||||
CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
if(COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
|
||||
else()
|
||||
error(FATAL_ERROR "-march=native not supported")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_package(Boost REQUIRED COMPONENTS filesystem program_options)
|
||||
|
||||
include_directories("${Boost_INCLUDE_DIRS}")
|
||||
|
||||
add_library(simple_matrix src/Matrix.cpp)
|
||||
target_link_libraries(simple_matrix ${Boost_LIBRARIES})
|
||||
|
||||
add_executable(simd_multiply src/main.cpp)
|
||||
target_link_libraries(simd_multiply simple_matrix)
|
||||
|
||||
add_executable(generate_random src/generate_random.cpp)
|
||||
target_link_libraries(generate_random simple_matrix)
|
1
dump.sh
Normal file
1
dump.sh
Normal file
@ -0,0 +1 @@
|
||||
g++ -g -c -march=native -O2 main.cpp; objdump -d -M intel -S main.o > main.s
|
4
ressrouces.txt
Normal file
4
ressrouces.txt
Normal file
@ -0,0 +1,4 @@
|
||||
https://gist.github.com/nadavrot/5b35d44e8ba3dd718e595e40184d03f0
|
||||
https://www.cs.cornell.edu/~bindel/class/cs5220-s10/slides/lec03.pdf
|
||||
http://www.netlib.org/utk/papers/autoblock/node2.html
|
||||
https://software.intel.com/content/www/us/en/develop/articles/performance-of-classic-matrix-multiplication-algorithm-on-intel-xeon-phi-processor-system.html
|
52
scripts/test.py
Executable file
52
scripts/test.py
Executable file
@ -0,0 +1,52 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
|
||||
def check_call_quiet(*args, **kwargs):
|
||||
p = subprocess.Popen(*args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
|
||||
output, err = p.communicate()
|
||||
rc = p.returncode
|
||||
if rc != 0:
|
||||
print(output)
|
||||
print(err)
|
||||
exit(0)
|
||||
|
||||
|
||||
def compile_and_run(source_path, build_path_prefix, target, native, use_clang, args):
|
||||
flags = [
|
||||
"-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"),
|
||||
"-DCMAKE_BUILD_TYPE=Release",
|
||||
"-DUSE_CLANG=" + ("ON" if use_clang else "OFF"),
|
||||
"-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON"
|
||||
]
|
||||
build_path = os.path.join(build_path_prefix, " ".join(flags))
|
||||
os.makedirs(build_path, exist_ok=True)
|
||||
check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags)
|
||||
check_call_quiet(["make", target], cwd=build_path)
|
||||
if False:
|
||||
subprocess.check_call(["perf", "stat", "-B", "-e", "cache-references,cache-misses,cycles,instructions,branches,faults,migrations"]
|
||||
+ [os.path.join(build_path, target)] + args)
|
||||
else:
|
||||
print(" ".join([os.path.join(build_path, target)] + args))
|
||||
subprocess.check_call([os.path.join(build_path, target)] + args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), ".."))
|
||||
os.makedirs("data", exist_ok=True)
|
||||
os.chdir("data")
|
||||
with_double = False
|
||||
extra_args = ["--validate"]
|
||||
for clang in [True]:
|
||||
for sizes in [["2048", "2048", "999"]]: # [["873", "1456", "999"]]:
|
||||
args = list(sizes)
|
||||
if with_double:
|
||||
args.append("--double")
|
||||
compile_and_run("..", "builds", "generate_random", True, True, args)
|
||||
folder = "x".join(sizes)
|
||||
for function in ["block_wise_256_f", "boost_axpy_mul"]: # ["naive_reordered", "block_wise_256_f", "boost_blocked_mul_64", "boost_blocked_mul_256", "boost_mul", "boost_axpy_mul"]:
|
||||
arguments = [folder, "--algorithm", function]
|
||||
if with_double:
|
||||
arguments.append("--double")
|
||||
compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args)
|
91
src/Matrix.cpp
Normal file
91
src/Matrix.cpp
Normal file
@ -0,0 +1,91 @@
|
||||
//
|
||||
// Created by oke on 05.06.20.
|
||||
//
|
||||
#include "Matrix.h"
|
||||
#include <fstream>
|
||||
|
||||
template<typename T>
|
||||
struct value_prefix;
|
||||
|
||||
template<>
|
||||
struct value_prefix<double> {
|
||||
static constexpr auto prefix = "double";
|
||||
};
|
||||
|
||||
template<>
|
||||
struct value_prefix<float> {
|
||||
static constexpr auto prefix = "float";
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
void matrix_io<T>::save(const Matrix<T> &matrix, const boost::filesystem::path &folder) {
|
||||
auto fn = folder / (value_prefix<T>::prefix + std::to_string(matrix.size1()) + "x" + std::to_string(matrix.size2()));
|
||||
std::ofstream f(fn.native());
|
||||
f.write((char * )&matrix(0, 0), sizeof(T) * matrix.size2() * matrix.size1());
|
||||
f.close();
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> matrix_io<T>::load(const boost::filesystem::path & fn) {
|
||||
if(!boost::filesystem::is_regular_file(fn)) {
|
||||
std::cerr << fn << std::endl;
|
||||
throw std::runtime_error("Is not a file");
|
||||
}
|
||||
|
||||
std::string fileName = fn.filename().native();
|
||||
|
||||
size_t startN = 0;
|
||||
while((fileName[startN] < 0 || fileName[startN] > '9') && startN < fn.size()) ++startN;
|
||||
if(startN == fileName.size()) throw;
|
||||
|
||||
auto prefix = fileName.substr(0, startN);
|
||||
if(prefix != value_prefix<T>::prefix) throw;
|
||||
|
||||
size_t endN = startN;
|
||||
while((fileName[endN] != 'x') && endN < fileName.size()) ++endN;
|
||||
if(endN == fileName.size()) throw;
|
||||
|
||||
size_t m = std::stoul(fileName.substr(startN, endN - startN));
|
||||
size_t n = std::stoul(fileName.substr(endN + 1));
|
||||
|
||||
Matrix<T> matrix(m, n);
|
||||
std::ifstream f(fn.native());
|
||||
f.read((char * )&matrix(0, 0), sizeof(T) * m * n);
|
||||
f.close();
|
||||
return matrix;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void matrix_io<T>::loadAB(Matrix<T> &A, Matrix<T> &B, boost::filesystem::path folder) {
|
||||
if(!boost::filesystem::is_directory(folder))
|
||||
throw std::runtime_error("Is not a directory");
|
||||
|
||||
std::string folderName = folder.filename().string();
|
||||
auto last_i = 0;
|
||||
size_t i;
|
||||
size_t sizes_i = 0;
|
||||
size_t sizes[3];
|
||||
while((i = folderName.find("x", last_i)) != std::string::npos) {
|
||||
if(sizes_i >= 2) throw;
|
||||
sizes[sizes_i] = std::stoul(folderName.substr(last_i, i - last_i));
|
||||
last_i = i + 1;
|
||||
++sizes_i;
|
||||
}
|
||||
if(sizes_i != 2) throw;
|
||||
sizes[sizes_i] = std::stoul(folderName.substr(last_i));
|
||||
A = matrix_io<T>::load(folder / (value_prefix<T>::prefix + std::to_string(sizes[0]) + "x" + std::to_string(sizes[1])));
|
||||
B = matrix_io<T>::load(folder / (value_prefix<T>::prefix + std::to_string(sizes[1]) + "x" + std::to_string(sizes[2])));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
boost::filesystem::path matrix_io<T>::saveAB(const Matrix<T> &A, const Matrix<T> &B, boost::filesystem::path base_folder) {
|
||||
auto folder = base_folder / (std::to_string(A.size1()) + "x" + std::to_string(A.size2()) + "x" + std::to_string(B.size2()));
|
||||
boost::filesystem::create_directories(folder);
|
||||
matrix_io::save(A, folder);
|
||||
matrix_io::save(B, folder);
|
||||
return folder;
|
||||
}
|
||||
|
||||
|
||||
template class matrix_io<float>;
|
||||
template class matrix_io<double>;
|
34
src/Matrix.h
Normal file
34
src/Matrix.h
Normal file
@ -0,0 +1,34 @@
|
||||
//
|
||||
// Created by oke on 05.06.20.
|
||||
//
|
||||
|
||||
#ifndef SMID_MATRIX_Matrix_H
|
||||
#define SMID_MATRIX_Matrix_H
|
||||
|
||||
#include <iostream>
|
||||
#include <boost/numeric/ublas/matrix.hpp>
|
||||
#include <boost/numeric/ublas/storage.hpp>
|
||||
#include <boost/align/aligned_allocator.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
template<typename T, size_t Alignment=64> // byte (512 bit)
|
||||
using AlignedBoostMatrix = boost::numeric::ublas::matrix<T, boost::numeric::ublas::row_major,
|
||||
boost::numeric::ublas::unbounded_array<T, boost::alignment::aligned_allocator<T, Alignment>>>;
|
||||
|
||||
template<typename T, size_t Alignment=512 / sizeof(T)>
|
||||
using Matrix = AlignedBoostMatrix<T, Alignment>;
|
||||
|
||||
template<typename T>
|
||||
class matrix_io {
|
||||
public:
|
||||
matrix_io() = delete;
|
||||
static void save(const Matrix<T> &matrix, const boost::filesystem::path &folder);
|
||||
static Matrix<T> load(const boost::filesystem::path & fn);
|
||||
static void loadAB(Matrix<T> &A, Matrix<T> &B, boost::filesystem::path folder);
|
||||
static boost::filesystem::path saveAB(const Matrix<T> &A, const Matrix<T> &B, boost::filesystem::path base_folder);
|
||||
};
|
||||
|
||||
extern template class matrix_io<float>;
|
||||
extern template class matrix_io<double>;
|
||||
|
||||
#endif //SMID_MATRIX_Matrix_H
|
57
src/generate_random.cpp
Normal file
57
src/generate_random.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
//
|
||||
// Created by oke on 05.06.20.
|
||||
//
|
||||
|
||||
#include "Matrix.h"
|
||||
#include <iostream>
|
||||
#include <boost/filesystem.hpp>
|
||||
|
||||
template<typename FloatType>
|
||||
Matrix<FloatType> random_matrix(size_t m, size_t n) {
|
||||
Matrix<FloatType> matrix(m, n);
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < n; j++) {
|
||||
matrix(i, j) = ((FloatType) std::rand()) / RAND_MAX;
|
||||
}
|
||||
}
|
||||
return matrix;
|
||||
}
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
template<typename T>
|
||||
void generate(size_t m, size_t k, size_t n) {
|
||||
auto A = random_matrix<T>(m, k);
|
||||
auto B = random_matrix<T>(k, n);
|
||||
|
||||
Matrix<T> A_;
|
||||
Matrix<T> B_;
|
||||
|
||||
matrix_io<T>::saveAB(A, B, ".");
|
||||
}
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
po::options_description desc("Generate matrices");
|
||||
desc.add_options()
|
||||
("m", po::value<size_t>(), "Size n")
|
||||
("k", po::value<size_t>(), "Size k")
|
||||
("n", po::value<size_t>(), "Size m")
|
||||
("double", "use_double instead of float");
|
||||
po::positional_options_description p;
|
||||
p.add("m", 1);
|
||||
p.add("k", 1);
|
||||
p.add("n", 1);
|
||||
|
||||
po::variables_map vm;
|
||||
po::store(po::command_line_parser(argc, argv).
|
||||
options(desc).positional(p).run(), vm);
|
||||
po::notify(vm);
|
||||
|
||||
if(vm.count("double"))
|
||||
generate<double>(vm["m"].as<size_t>(), vm["k"].as<size_t>(), vm["n"].as<size_t>());
|
||||
else
|
||||
generate<float>(vm["m"].as<size_t>(), vm["k"].as<size_t>(), vm["n"].as<size_t>());
|
||||
return 0;
|
||||
}
|
266
src/main.cpp
Normal file
266
src/main.cpp
Normal file
@ -0,0 +1,266 @@
|
||||
#include "Matrix.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <boost/numeric/ublas/operation_blocked.hpp>
|
||||
#include <boost/numeric/ublas/operation.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <immintrin.h>
|
||||
|
||||
using namespace std::chrono;
|
||||
namespace po = boost::program_options;
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) naive(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
Matrix<T> C(A.size1(), B.size2());
|
||||
assert(A.size2() == B.size1());
|
||||
|
||||
for (int i = 0; i < A.size1(); i++) {
|
||||
for (int j = 0; j < B.size2(); j++) {
|
||||
for (int p = 0; p < B.size1(); p++) {
|
||||
C(i, j) += A(i, p) * B(p, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) naive_reordered(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
Matrix<T> C(A.size1(), B.size2());
|
||||
assert(A.size2() == B.size1());
|
||||
for (int i = 0; i < A.size1(); i++) {
|
||||
for (int p = 0; p < B.size1(); p++) {
|
||||
for (int j = 0; j < B.size2(); j++) {
|
||||
C(i, j) += A(i, p) * B(p, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
return C;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) register_blocking(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
Matrix<T> C(A.size1(), B.size2());
|
||||
assert(A.size2() == B.size1());
|
||||
|
||||
for (int i = 0; i < A.size1(); i++) {
|
||||
for (int p = 0; p < B.size1(); p++) {
|
||||
for (int j = 0; j < B.size2(); j++) {
|
||||
C(i, j) += A(i, p) * B(p, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
using BinaryMatrixOp = Matrix<T> (*)(const Matrix<T> &A, const Matrix<T> &B);
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> run_function(BinaryMatrixOp<T> f, const Matrix<T> &A, const Matrix<T> &B) {
|
||||
auto a = steady_clock::now();
|
||||
auto C = f(A, B);
|
||||
auto b = steady_clock::now();
|
||||
std::cout << "multiply: " << std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count() << "ms" << std::endl;
|
||||
return C;
|
||||
}
|
||||
|
||||
struct __m256_block_wise_config {
|
||||
using FloatType = float;
|
||||
using VectorType = __m256;
|
||||
static constexpr auto LoadVector = _mm256_loadu_ps;
|
||||
static constexpr auto StoreVector = _mm256_storeu_ps;
|
||||
static constexpr auto BroadcastToVector = _mm256_broadcast_ss;
|
||||
};
|
||||
|
||||
struct __m256d_block_wise_config {
|
||||
using FloatType = double;
|
||||
using VectorType = __m256d;
|
||||
static constexpr auto LoadVector = _mm256_loadu_pd;
|
||||
static constexpr auto StoreVector = _mm256_storeu_pd;
|
||||
static constexpr auto BroadcastToVector = _mm256_broadcast_sd;
|
||||
};
|
||||
|
||||
template<
|
||||
// template parameter as struct: otherwise some warning about losing alignment information warning
|
||||
typename BlockWiseConfig = __m256_block_wise_config,
|
||||
unsigned numRows = 3, unsigned numColumns_width = 4
|
||||
>
|
||||
struct block_wise {
|
||||
using FloatType = typename BlockWiseConfig::FloatType;
|
||||
using VectorType = typename BlockWiseConfig::VectorType;
|
||||
static constexpr auto LoadVector = BlockWiseConfig::LoadVector;
|
||||
static constexpr auto StoreVector = BlockWiseConfig::StoreVector;
|
||||
static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector;
|
||||
static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType);
|
||||
static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); };
|
||||
|
||||
Matrix<FloatType> operator()(const Matrix<FloatType> &A, const Matrix<FloatType> &B) {
|
||||
Matrix<FloatType> C(A.size1(), B.size2());
|
||||
int m = A.size1();
|
||||
int k = A.size2();
|
||||
int n = B.size2();
|
||||
int m_i = 0;
|
||||
for(; m_i + numRows <= m; m_i += numRows) { // row
|
||||
int n_i = 0;
|
||||
for(; n_i + numColumns_width * VectorWidth <= n; n_i += numColumns_width * VectorWidth) {
|
||||
handle_block(k, C, A, m_i, B, n_i);
|
||||
}
|
||||
// calculate remaining columns (like naive_reordered)
|
||||
if(n_i < n) {
|
||||
for (auto m_i_o = 0; m_i_o < numRows; ++m_i_o) {
|
||||
for (int p = 0; p < k; ++p) {
|
||||
for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) {
|
||||
C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// calculate remaining rows (like naive_reordered)
|
||||
for (; m_i < m; ++m_i) {
|
||||
for (int p = 0; p < k; p++) {
|
||||
for (int n_i = 0; n_i < n; ++n_i) {
|
||||
C(m_i, n_i) += A(m_i, p) * B(p, n_i);
|
||||
}
|
||||
}
|
||||
}
|
||||
return C;
|
||||
|
||||
}
|
||||
void __attribute__ ((noinline)) handle_block(int k, Matrix<FloatType> &C, const Matrix<FloatType> &A, int aRowOffset, const Matrix<FloatType> &B, int bColOffset) {
|
||||
|
||||
// AVX2 has 16 registers
|
||||
// should be compiled as registers (total: regA * regB)
|
||||
VectorType CReg[numRows][numColumns_width] = {{0.0}};
|
||||
// iterate over dot-product terms
|
||||
for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns)
|
||||
// Perform the DOT product
|
||||
for (int bi = 0; bi < numColumns_width; bi++) { // column index in B (handling regsB * 'VectorWidth' columns)
|
||||
VectorType bb = LoadVector(&B(p, bColOffset + bi * VectorWidth));
|
||||
for (int ai = 0; ai < numRows; ai++) { // row index in A (handling regsA rows)
|
||||
VectorType aa = BroadcastToVector(&A(aRowOffset + ai, p));
|
||||
CReg[ai][bi] += aa * bb;
|
||||
}
|
||||
}
|
||||
}
|
||||
// total regA * regB + regB registers
|
||||
|
||||
// Accumulate the results into C.
|
||||
for (int ai = 0; ai < numRows; ai++) {
|
||||
for (int bi = 0; bi < numColumns_width; bi++) {
|
||||
AddAndStore(&C(aRowOffset + ai, bColOffset + bi * VectorWidth), CReg[ai][bi]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<typename FloatType>
|
||||
struct block_wise_256;
|
||||
|
||||
template<>
|
||||
struct block_wise_256<float> :
|
||||
public block_wise<__m256_block_wise_config> {};
|
||||
template<>
|
||||
struct block_wise_256<double> :
|
||||
public block_wise<__m256d_block_wise_config> {};
|
||||
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) block_wise_256_f(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
return block_wise_256<T>()(A, B);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) boost_axpy_mul(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
Matrix<T> C(A.size1(), B.size2());
|
||||
boost::numeric::ublas::axpy_prod(A, B, C);
|
||||
return C;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
return boost::numeric::ublas::block_prod<Matrix<T>, 64>(A, B);
|
||||
}
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
return boost::numeric::ublas::block_prod<Matrix<T>, 256>(A, B);
|
||||
}
|
||||
template<typename T>
|
||||
Matrix<T> __attribute__ ((noinline)) boost_mul(const Matrix<T> &A, const Matrix<T> &B) {
|
||||
return boost::numeric::ublas::prod(A, B);
|
||||
}
|
||||
|
||||
#define TEST_IF(test_function_name, function, A, B) {\
|
||||
if(test_function_name == #function) {\
|
||||
C = run_function(function, A, B);\
|
||||
use_result += C(0, 0);\
|
||||
}\
|
||||
}
|
||||
|
||||
template<typename FloatType>
|
||||
int main_work(const std::string &test_function_name, const std::string &input_folder, bool validate) {
|
||||
std::cout << "Running function '" << test_function_name << "'" << std::endl;
|
||||
std::srand(0);
|
||||
Matrix<FloatType> A;
|
||||
Matrix<FloatType> B;
|
||||
Matrix<FloatType> C;
|
||||
auto a = steady_clock::now();
|
||||
matrix_io<FloatType>::loadAB(A, B, input_folder);
|
||||
auto b = steady_clock::now();
|
||||
std::cout << "loading from file: " << std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count() << "ms" << std::endl;
|
||||
|
||||
// use the result to prevent compiler to optimize...
|
||||
double use_result = 0;
|
||||
TEST_IF(test_function_name, naive, A, B)
|
||||
TEST_IF(test_function_name, naive_reordered, A, B)
|
||||
TEST_IF(test_function_name, block_wise_256_f, A, B)
|
||||
TEST_IF(test_function_name, boost_axpy_mul, A, B)
|
||||
TEST_IF(test_function_name, boost_blocked_mul_64, A, B)
|
||||
TEST_IF(test_function_name, boost_blocked_mul_256, A, B)
|
||||
TEST_IF(test_function_name, boost_mul, A, B)
|
||||
|
||||
if(validate) {
|
||||
std::cout << "Validating matrix" << std::endl;
|
||||
auto C2 = boost_axpy_mul(A, B);
|
||||
if(C.size1() != C2.size1() || C.size2() != C2.size2())
|
||||
throw std::runtime_error("Result matrix has invalid size.");
|
||||
for(auto i = 0; i < C2.size1(); ++i) {
|
||||
for(auto j = 0; j < C2.size2(); ++j) {
|
||||
if(abs(C(i, j) - C2(i, j)) > 1e-3)
|
||||
throw std::runtime_error("Result matrix is invalid.");
|
||||
}
|
||||
}
|
||||
std::cout << "Matrix seems fine" << std::endl;
|
||||
}
|
||||
std::cout << use_result << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b" << std::endl;
|
||||
return use_result == 0 ? -1 : 0;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
po::options_description desc("Multiply two matrices");
|
||||
desc.add_options()
|
||||
("input-folder", "folder containing matrices, following naming conventions; folder name:<n>x<k>x<m>; file name: <float|double><m>x<n>")
|
||||
("validate", "validate matrix with boost")
|
||||
("algorithm", po::value<std::string>(), "algorithm to execute")
|
||||
("double", "use_double instead of float");
|
||||
po::positional_options_description p;
|
||||
p.add("input-folder", -1);
|
||||
|
||||
po::variables_map vm;
|
||||
po::store(po::command_line_parser(argc, argv).
|
||||
options(desc).positional(p).run(), vm);
|
||||
po::notify(vm);
|
||||
|
||||
std::string test_function_name = vm.count("algorithm") ? vm["algorithm"].as<std::string>() : DEFAULT_TEST_FUNCTION_NAME;
|
||||
|
||||
if(vm.count("double")) {
|
||||
return main_work<double>(test_function_name, vm["input-folder"].as<std::string>(), vm.count("validate"));
|
||||
} else {
|
||||
return main_work<float>(test_function_name, vm["input-folder"].as<std::string>(), vm.count("validate"));
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user