commit a9a3fbbd1123fdfb4bc783c43771bb10018cf7ea Author: har0ke Date: Thu Jun 11 11:30:38 2020 +0200 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3842203 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.idea/* +data/* +*.s +cmake-build-* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5f4fa8d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.16) +project(smid_matrix CXX) + +include(CheckCXXCompilerFlag) +set(CMAKE_CXX_STANDARD 17) +option(OPTIMIZE_FOR_NATIVE "Build with -march=native" ON) +option(USE_CLANG "Build with clang instead of gcc" OFF) +set(DEFAULT_TEST_FUNCTION_NAME "native_reordered" CACHE STRING "default function to run") + +add_compile_definitions(DEFAULT_TEST_FUNCTION_NAME="${DEFAULT_TEST_FUNCTION_NAME}") + +if(USE_CLANG) + set(CMAKE_CXX_COMPILER "clang++") +else() + set(CMAKE_CXX_COMPILER "g++") +endif() + +if(OPTIMIZE_FOR_NATIVE) + CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + else() + error(FATAL_ERROR "-march=native not supported") + endif() +else() + CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) + if(COMPILER_SUPPORTS_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + else() + error(FATAL_ERROR "-march=native not supported") + endif() +endif() + +find_package(Boost REQUIRED COMPONENTS filesystem program_options) + +include_directories("${Boost_INCLUDE_DIRS}") + +add_library(simple_matrix src/Matrix.cpp) +target_link_libraries(simple_matrix ${Boost_LIBRARIES}) + +add_executable(simd_multiply src/main.cpp) +target_link_libraries(simd_multiply simple_matrix) + +add_executable(generate_random src/generate_random.cpp) +target_link_libraries(generate_random simple_matrix) \ No newline at end of file diff --git a/dump.sh b/dump.sh new file mode 100644 index 0000000..653219b --- /dev/null +++ b/dump.sh @@ -0,0 +1 @@ +g++ -g -c -march=native -O2 main.cpp; objdump -d -M intel -S main.o > main.s diff --git a/ressrouces.txt b/ressrouces.txt new file mode 100644 index 0000000..8b8e177 --- /dev/null +++ b/ressrouces.txt @@ -0,0 +1,4 @@ +https://gist.github.com/nadavrot/5b35d44e8ba3dd718e595e40184d03f0 +https://www.cs.cornell.edu/~bindel/class/cs5220-s10/slides/lec03.pdf +http://www.netlib.org/utk/papers/autoblock/node2.html +https://software.intel.com/content/www/us/en/develop/articles/performance-of-classic-matrix-multiplication-algorithm-on-intel-xeon-phi-processor-system.html \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py new file mode 100755 index 0000000..3a4a6ca --- /dev/null +++ b/scripts/test.py @@ -0,0 +1,52 @@ +#!/usr/bin/python +import os +import subprocess + + +def check_call_quiet(*args, **kwargs): + p = subprocess.Popen(*args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs) + output, err = p.communicate() + rc = p.returncode + if rc != 0: + print(output) + print(err) + exit(0) + + +def compile_and_run(source_path, build_path_prefix, target, native, use_clang, args): + flags = [ + "-DOPTIMIZE_FOR_NATIVE=" + ("ON" if native else "OFF"), + "-DCMAKE_BUILD_TYPE=Release", + "-DUSE_CLANG=" + ("ON" if use_clang else "OFF"), + "-DNDEBUG=ON", "-DBOOST_UBLAS_NDEBUG=ON" + ] + build_path = os.path.join(build_path_prefix, " ".join(flags)) + os.makedirs(build_path, exist_ok=True) + check_call_quiet(["cmake", "-B", build_path, "-S", source_path] + flags) + check_call_quiet(["make", target], cwd=build_path) + if False: + subprocess.check_call(["perf", "stat", "-B", "-e", "cache-references,cache-misses,cycles,instructions,branches,faults,migrations"] + + [os.path.join(build_path, target)] + args) + else: + print(" ".join([os.path.join(build_path, target)] + args)) + subprocess.check_call([os.path.join(build_path, target)] + args) + + +if __name__ == '__main__': + os.chdir(os.path.join(os.path.dirname(__file__), "..")) + os.makedirs("data", exist_ok=True) + os.chdir("data") + with_double = False + extra_args = ["--validate"] + for clang in [True]: + for sizes in [["2048", "2048", "999"]]: # [["873", "1456", "999"]]: + args = list(sizes) + if with_double: + args.append("--double") + compile_and_run("..", "builds", "generate_random", True, True, args) + folder = "x".join(sizes) + for function in ["block_wise_256_f", "boost_axpy_mul"]: # ["naive_reordered", "block_wise_256_f", "boost_blocked_mul_64", "boost_blocked_mul_256", "boost_mul", "boost_axpy_mul"]: + arguments = [folder, "--algorithm", function] + if with_double: + arguments.append("--double") + compile_and_run("..", "builds", "simd_multiply", True, clang, arguments + extra_args) diff --git a/src/Matrix.cpp b/src/Matrix.cpp new file mode 100644 index 0000000..97b5ea0 --- /dev/null +++ b/src/Matrix.cpp @@ -0,0 +1,91 @@ +// +// Created by oke on 05.06.20. +// +#include "Matrix.h" +#include + +template +struct value_prefix; + +template<> +struct value_prefix { + static constexpr auto prefix = "double"; +}; + +template<> +struct value_prefix { + static constexpr auto prefix = "float"; +}; + +template +void matrix_io::save(const Matrix &matrix, const boost::filesystem::path &folder) { + auto fn = folder / (value_prefix::prefix + std::to_string(matrix.size1()) + "x" + std::to_string(matrix.size2())); + std::ofstream f(fn.native()); + f.write((char * )&matrix(0, 0), sizeof(T) * matrix.size2() * matrix.size1()); + f.close(); +} + +template +Matrix matrix_io::load(const boost::filesystem::path & fn) { + if(!boost::filesystem::is_regular_file(fn)) { + std::cerr << fn << std::endl; + throw std::runtime_error("Is not a file"); + } + + std::string fileName = fn.filename().native(); + + size_t startN = 0; + while((fileName[startN] < 0 || fileName[startN] > '9') && startN < fn.size()) ++startN; + if(startN == fileName.size()) throw; + + auto prefix = fileName.substr(0, startN); + if(prefix != value_prefix::prefix) throw; + + size_t endN = startN; + while((fileName[endN] != 'x') && endN < fileName.size()) ++endN; + if(endN == fileName.size()) throw; + + size_t m = std::stoul(fileName.substr(startN, endN - startN)); + size_t n = std::stoul(fileName.substr(endN + 1)); + + Matrix matrix(m, n); + std::ifstream f(fn.native()); + f.read((char * )&matrix(0, 0), sizeof(T) * m * n); + f.close(); + return matrix; +} + +template +void matrix_io::loadAB(Matrix &A, Matrix &B, boost::filesystem::path folder) { + if(!boost::filesystem::is_directory(folder)) + throw std::runtime_error("Is not a directory"); + + std::string folderName = folder.filename().string(); + auto last_i = 0; + size_t i; + size_t sizes_i = 0; + size_t sizes[3]; + while((i = folderName.find("x", last_i)) != std::string::npos) { + if(sizes_i >= 2) throw; + sizes[sizes_i] = std::stoul(folderName.substr(last_i, i - last_i)); + last_i = i + 1; + ++sizes_i; + } + if(sizes_i != 2) throw; + sizes[sizes_i] = std::stoul(folderName.substr(last_i)); + A = matrix_io::load(folder / (value_prefix::prefix + std::to_string(sizes[0]) + "x" + std::to_string(sizes[1]))); + B = matrix_io::load(folder / (value_prefix::prefix + std::to_string(sizes[1]) + "x" + std::to_string(sizes[2]))); +} + +template +boost::filesystem::path matrix_io::saveAB(const Matrix &A, const Matrix &B, boost::filesystem::path base_folder) { + auto folder = base_folder / (std::to_string(A.size1()) + "x" + std::to_string(A.size2()) + "x" + std::to_string(B.size2())); + boost::filesystem::create_directories(folder); + matrix_io::save(A, folder); + matrix_io::save(B, folder); + return folder; +} + + +template class matrix_io; +template class matrix_io; \ No newline at end of file diff --git a/src/Matrix.h b/src/Matrix.h new file mode 100644 index 0000000..ddb8c82 --- /dev/null +++ b/src/Matrix.h @@ -0,0 +1,34 @@ +// +// Created by oke on 05.06.20. +// + +#ifndef SMID_MATRIX_Matrix_H +#define SMID_MATRIX_Matrix_H + +#include +#include +#include +#include +#include + +template // byte (512 bit) +using AlignedBoostMatrix = boost::numeric::ublas::matrix>>; + +template +using Matrix = AlignedBoostMatrix; + +template +class matrix_io { +public: + matrix_io() = delete; + static void save(const Matrix &matrix, const boost::filesystem::path &folder); + static Matrix load(const boost::filesystem::path & fn); + static void loadAB(Matrix &A, Matrix &B, boost::filesystem::path folder); + static boost::filesystem::path saveAB(const Matrix &A, const Matrix &B, boost::filesystem::path base_folder); +}; + +extern template class matrix_io; +extern template class matrix_io; + +#endif //SMID_MATRIX_Matrix_H diff --git a/src/generate_random.cpp b/src/generate_random.cpp new file mode 100644 index 0000000..9cc6d3a --- /dev/null +++ b/src/generate_random.cpp @@ -0,0 +1,57 @@ +// +// Created by oke on 05.06.20. +// + +#include "Matrix.h" +#include +#include + +template +Matrix random_matrix(size_t m, size_t n) { + Matrix matrix(m, n); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + matrix(i, j) = ((FloatType) std::rand()) / RAND_MAX; + } + } + return matrix; +} + +#include + +namespace po = boost::program_options; + +template +void generate(size_t m, size_t k, size_t n) { + auto A = random_matrix(m, k); + auto B = random_matrix(k, n); + + Matrix A_; + Matrix B_; + + matrix_io::saveAB(A, B, "."); +} +int main(int argc, char *argv[]) { + + po::options_description desc("Generate matrices"); + desc.add_options() + ("m", po::value(), "Size n") + ("k", po::value(), "Size k") + ("n", po::value(), "Size m") + ("double", "use_double instead of float"); + po::positional_options_description p; + p.add("m", 1); + p.add("k", 1); + p.add("n", 1); + + po::variables_map vm; + po::store(po::command_line_parser(argc, argv). + options(desc).positional(p).run(), vm); + po::notify(vm); + + if(vm.count("double")) + generate(vm["m"].as(), vm["k"].as(), vm["n"].as()); + else + generate(vm["m"].as(), vm["k"].as(), vm["n"].as()); + return 0; +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..04a2af7 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,266 @@ +#include "Matrix.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std::chrono; +namespace po = boost::program_options; + +template +Matrix __attribute__ ((noinline)) naive(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2()); + assert(A.size2() == B.size1()); + + for (int i = 0; i < A.size1(); i++) { + for (int j = 0; j < B.size2(); j++) { + for (int p = 0; p < B.size1(); p++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } + + return C; +} + +template +Matrix __attribute__ ((noinline)) naive_reordered(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2()); + assert(A.size2() == B.size1()); + for (int i = 0; i < A.size1(); i++) { + for (int p = 0; p < B.size1(); p++) { + for (int j = 0; j < B.size2(); j++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } + return C; +} + +template +Matrix __attribute__ ((noinline)) register_blocking(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2()); + assert(A.size2() == B.size1()); + + for (int i = 0; i < A.size1(); i++) { + for (int p = 0; p < B.size1(); p++) { + for (int j = 0; j < B.size2(); j++) { + C(i, j) += A(i, p) * B(p, j); + } + } + } + + return C; +} + +template +using BinaryMatrixOp = Matrix (*)(const Matrix &A, const Matrix &B); + +template +Matrix run_function(BinaryMatrixOp f, const Matrix &A, const Matrix &B) { + auto a = steady_clock::now(); + auto C = f(A, B); + auto b = steady_clock::now(); + std::cout << "multiply: " << std::chrono::duration_cast(b - a).count() << "ms" << std::endl; + return C; +} + +struct __m256_block_wise_config { + using FloatType = float; + using VectorType = __m256; + static constexpr auto LoadVector = _mm256_loadu_ps; + static constexpr auto StoreVector = _mm256_storeu_ps; + static constexpr auto BroadcastToVector = _mm256_broadcast_ss; +}; + +struct __m256d_block_wise_config { + using FloatType = double; + using VectorType = __m256d; + static constexpr auto LoadVector = _mm256_loadu_pd; + static constexpr auto StoreVector = _mm256_storeu_pd; + static constexpr auto BroadcastToVector = _mm256_broadcast_sd; +}; + +template< + // template parameter as struct: otherwise some warning about losing alignment information warning + typename BlockWiseConfig = __m256_block_wise_config, + unsigned numRows = 3, unsigned numColumns_width = 4 +> +struct block_wise { + using FloatType = typename BlockWiseConfig::FloatType; + using VectorType = typename BlockWiseConfig::VectorType; + static constexpr auto LoadVector = BlockWiseConfig::LoadVector; + static constexpr auto StoreVector = BlockWiseConfig::StoreVector; + static constexpr auto BroadcastToVector = BlockWiseConfig::BroadcastToVector; + static constexpr size_t VectorWidth = sizeof(VectorType) / sizeof(FloatType); + static constexpr auto AddAndStore = [](FloatType *memory, VectorType vector) { StoreVector(memory, LoadVector(memory) + vector); }; + + Matrix operator()(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2()); + int m = A.size1(); + int k = A.size2(); + int n = B.size2(); + int m_i = 0; + for(; m_i + numRows <= m; m_i += numRows) { // row + int n_i = 0; + for(; n_i + numColumns_width * VectorWidth <= n; n_i += numColumns_width * VectorWidth) { + handle_block(k, C, A, m_i, B, n_i); + } + // calculate remaining columns (like naive_reordered) + if(n_i < n) { + for (auto m_i_o = 0; m_i_o < numRows; ++m_i_o) { + for (int p = 0; p < k; ++p) { + for (auto n_i_rest = n_i; n_i_rest < n; ++n_i_rest) { + C(m_i + m_i_o, n_i_rest) += A(m_i + m_i_o, p) * B(p, n_i_rest); + } + } + } + } + } + // calculate remaining rows (like naive_reordered) + for (; m_i < m; ++m_i) { + for (int p = 0; p < k; p++) { + for (int n_i = 0; n_i < n; ++n_i) { + C(m_i, n_i) += A(m_i, p) * B(p, n_i); + } + } + } + return C; + + } + void __attribute__ ((noinline)) handle_block(int k, Matrix &C, const Matrix &A, int aRowOffset, const Matrix &B, int bColOffset) { + + // AVX2 has 16 registers + // should be compiled as registers (total: regA * regB) + VectorType CReg[numRows][numColumns_width] = {{0.0}}; + // iterate over dot-product terms + for (int p = 0; p < k; p++) { // row index in B and column index in A (handling all rows/columns) + // Perform the DOT product + for (int bi = 0; bi < numColumns_width; bi++) { // column index in B (handling regsB * 'VectorWidth' columns) + VectorType bb = LoadVector(&B(p, bColOffset + bi * VectorWidth)); + for (int ai = 0; ai < numRows; ai++) { // row index in A (handling regsA rows) + VectorType aa = BroadcastToVector(&A(aRowOffset + ai, p)); + CReg[ai][bi] += aa * bb; + } + } + } + // total regA * regB + regB registers + + // Accumulate the results into C. + for (int ai = 0; ai < numRows; ai++) { + for (int bi = 0; bi < numColumns_width; bi++) { + AddAndStore(&C(aRowOffset + ai, bColOffset + bi * VectorWidth), CReg[ai][bi]); + } + } + } + +}; + +template +struct block_wise_256; + +template<> +struct block_wise_256 : + public block_wise<__m256_block_wise_config> {}; +template<> +struct block_wise_256 : + public block_wise<__m256d_block_wise_config> {}; + + +template +Matrix __attribute__ ((noinline)) block_wise_256_f(const Matrix &A, const Matrix &B) { + return block_wise_256()(A, B); +} + +template +Matrix __attribute__ ((noinline)) boost_axpy_mul(const Matrix &A, const Matrix &B) { + Matrix C(A.size1(), B.size2()); + boost::numeric::ublas::axpy_prod(A, B, C); + return C; +} + +template +Matrix __attribute__ ((noinline)) boost_blocked_mul_64(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::block_prod, 64>(A, B); +} +template +Matrix __attribute__ ((noinline)) boost_blocked_mul_256(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::block_prod, 256>(A, B); +} +template +Matrix __attribute__ ((noinline)) boost_mul(const Matrix &A, const Matrix &B) { + return boost::numeric::ublas::prod(A, B); +} + +#define TEST_IF(test_function_name, function, A, B) {\ + if(test_function_name == #function) {\ + C = run_function(function, A, B);\ + use_result += C(0, 0);\ + }\ +} + +template +int main_work(const std::string &test_function_name, const std::string &input_folder, bool validate) { + std::cout << "Running function '" << test_function_name << "'" << std::endl; + std::srand(0); + Matrix A; + Matrix B; + Matrix C; + auto a = steady_clock::now(); + matrix_io::loadAB(A, B, input_folder); + auto b = steady_clock::now(); + std::cout << "loading from file: " << std::chrono::duration_cast(b - a).count() << "ms" << std::endl; + + // use the result to prevent compiler to optimize... + double use_result = 0; + TEST_IF(test_function_name, naive, A, B) + TEST_IF(test_function_name, naive_reordered, A, B) + TEST_IF(test_function_name, block_wise_256_f, A, B) + TEST_IF(test_function_name, boost_axpy_mul, A, B) + TEST_IF(test_function_name, boost_blocked_mul_64, A, B) + TEST_IF(test_function_name, boost_blocked_mul_256, A, B) + TEST_IF(test_function_name, boost_mul, A, B) + + if(validate) { + std::cout << "Validating matrix" << std::endl; + auto C2 = boost_axpy_mul(A, B); + if(C.size1() != C2.size1() || C.size2() != C2.size2()) + throw std::runtime_error("Result matrix has invalid size."); + for(auto i = 0; i < C2.size1(); ++i) { + for(auto j = 0; j < C2.size2(); ++j) { + if(abs(C(i, j) - C2(i, j)) > 1e-3) + throw std::runtime_error("Result matrix is invalid."); + } + } + std::cout << "Matrix seems fine" << std::endl; + } + std::cout << use_result << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b" << std::endl; + return use_result == 0 ? -1 : 0; +} + +int main(int argc, char* argv[]) { + po::options_description desc("Multiply two matrices"); + desc.add_options() + ("input-folder", "folder containing matrices, following naming conventions; folder name:xx; file name: x") + ("validate", "validate matrix with boost") + ("algorithm", po::value(), "algorithm to execute") + ("double", "use_double instead of float"); + po::positional_options_description p; + p.add("input-folder", -1); + + po::variables_map vm; + po::store(po::command_line_parser(argc, argv). + options(desc).positional(p).run(), vm); + po::notify(vm); + + std::string test_function_name = vm.count("algorithm") ? vm["algorithm"].as() : DEFAULT_TEST_FUNCTION_NAME; + + if(vm.count("double")) { + return main_work(test_function_name, vm["input-folder"].as(), vm.count("validate")); + } else { + return main_work(test_function_name, vm["input-folder"].as(), vm.count("validate")); + } +}