diff --git a/scripts/test.py b/scripts/test.py
index 40b59f0..ef480f2 100755
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -17,7 +17,7 @@ def check_call_quiet(*args, **kwargs):
     rc = p.returncode
     if rc != 0:
         print(output.decode())
-        print(err)
+        print(err.decode())
         exit(0)
 
 
@@ -44,7 +44,7 @@ def compile_and_run(source_path, build_path_prefix, target, native, use_clang, a
 
 very_slow_functions = ["naive_reordered", "divide_and_conquer_naive_r4", "divide_and_conquer_naive_r2", "divide_and_conquer_naive_r1"]
 slow_functions = ["boost_axpy_mul", "divide_and_conquer_naive_r3", "divide_and_conquer_naive_r5"]
-normal_functions = ["block_wise_avx2"]
+normal_functions = ["block_wise_sse", "block_wise_avx2", "divide_and_conquer_block_sse"]
 fast_functions = ["divide_and_conquer_block_avx2", "blas"]
 avx512_functions = ["block_wise_avx512", "divide_and_conquer_block_avx512"]
 
@@ -63,6 +63,7 @@ if __name__ == '__main__':
     parser.add_argument("--avx512", action="store_true")
     parser.add_argument("--validate", action="store_true")
     parser.add_argument("--double", action="store_true")
+    parser.add_argument("--gcc", action="store_true")
     parser.add_argument("--function", type=str, nargs="*")
 
     options = parser.parse_args()
@@ -101,31 +102,31 @@ if __name__ == '__main__':
 
     times = [[] for f in functions]
     output_file = datetime.now().strftime("%Y.%m.%d_%H-%M-%S.json")
-    for clang in [True]:
-        for sizes in matrix_combinations:
-            args = list(sizes)
-            compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
-            folder = "x".join(sizes)
-            for fidx, function in enumerate(functions):
-                arguments = [folder, "--algorithm", function]
-                if with_double:
-                    arguments.append("--double")
-                output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
-                ms = output.decode()[output.decode().find("multiply:") + 10:]
-                if "ms\n" in ms:
-                    ms = float(ms.split("ms\n")[0])
-                else:
-                    ms = float(ms.split("s\n")[0]) * 1000
-                times[fidx].append(ms)
+    clang = not options.gcc
+    for sizes in matrix_combinations:
+        args = list(sizes)
+        compile_and_run("..", "builds", "generate_random", True, True, options.avx512, args)
+        folder = "x".join(sizes)
+        for fidx, function in enumerate(functions):
+            arguments = [folder, "--algorithm", function]
+            if with_double:
+                arguments.append("--double")
+            output = compile_and_run("..", "builds", "simd_multiply", True, clang, options.avx512, arguments + extra_args)
+            ms = output.decode()[output.decode().find("multiply:") + 10:]
+            if "ms\n" in ms:
+                ms = float(ms.split("ms\n")[0])
+            else:
+                ms = float(ms.split("s\n")[0]) * 1000
+            times[fidx].append(ms)
 
-            shutil.rmtree(folder)
-            print(["%.3f" % numpy.mean(ts) for ts in times])
-            with open(output_file + ".cache", "w") as f:
-                json.dump({
-                    "extr_args": extra_args,
-                    "times": times,
-                    "sizes": matrix_combinations,
-                    "functions": functions,
-                    "means": ["%.3f" % numpy.mean(ts) for ts in times]
-                }, f)
-            os.rename(output_file + ".cache", output_file)
\ No newline at end of file
+        shutil.rmtree(folder)
+        print(["%.3f" % numpy.mean(ts) for ts in times])
+        with open(output_file + ".cache", "w") as f:
+            json.dump({
+                "extr_args": extra_args,
+                "times": times,
+                "sizes": matrix_combinations,
+                "functions": functions,
+                "means": ["%.3f" % numpy.mean(ts) for ts in times]
+            }, f)
+        os.rename(output_file + ".cache", output_file)
\ No newline at end of file
diff --git a/src/DevideAndConquer.h b/src/DevideAndConquer.h
index 7fc7a25..425db9d 100644
--- a/src/DevideAndConquer.h
+++ b/src/DevideAndConquer.h
@@ -166,6 +166,12 @@ void __attribute__ ((noinline)) divide_and_conquer_naive_r5(Matrix<T> &C, const
     _divide_and_conquer<multiplier_naive_functor<1000, 1000>>(C, A, B);
 }
 
+template<typename T>
+void __attribute__ ((noinline)) divide_and_conquer_block_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
+    _divide_and_conquer<multiplier_block_wise<T, SSE>>(C, A, B);
+}
+
+
 template<typename T>
 void __attribute__ ((noinline)) divide_and_conquer_block_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
     _divide_and_conquer<multiplier_block_wise<T, AVX2>>(C, A, B);
diff --git a/src/main.cpp b/src/main.cpp
index 037fb07..acec4cc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
 #include <cassert>
 #include <boost/program_options.hpp>
-#include "RegisterBlocking.h"
+#include "register_blocking/RegisterBlocking.h"
 #include "Boost.h"
 #include "Naive.h"
 #include "DevideAndConquer.h"
@@ -52,6 +52,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
     double use_result = 0;
 
     TEST_IF(test_function_name, naive_reordered, A, B)
+    TEST_IF(test_function_name, block_wise_sse, A, B)
     TEST_IF(test_function_name, block_wise_avx2, A, B)
 
 #ifdef WITH_AVX512
@@ -59,6 +60,7 @@ int main_work(const std::string &test_function_name, const std::string &input_fo
 #endif
 
     TEST_IF(test_function_name, boost_axpy_mul, A, B)
+    TEST_IF(test_function_name, divide_and_conquer_block_sse, A, B)
     TEST_IF(test_function_name, divide_and_conquer_block_avx2, A, B)
 
 #ifdef WITH_AVX512
diff --git a/src/RegisterBlocking.h b/src/register_blocking/RegisterBlocking.h
similarity index 82%
rename from src/RegisterBlocking.h
rename to src/register_blocking/RegisterBlocking.h
index 2dbfa6b..4d50148 100644
--- a/src/RegisterBlocking.h
+++ b/src/register_blocking/RegisterBlocking.h
@@ -5,58 +5,11 @@
 #ifndef SMID_MATRIX_REGISTERBLOCKING_H
 #define SMID_MATRIX_REGISTERBLOCKING_H
 
-#include "Matrix.h"
+#include "../Matrix.h"
 #include <immintrin.h>
 
-enum AvxVersion {
-    AVX2,
-
-#ifdef WITH_AVX512
-    AVX512
-#endif
-
-};
-
 namespace detail {
 
-    struct __m256_block_wise_config {
-        using FloatType = float;
-        using VectorType = __m256;
-        static constexpr auto LoadVector = _mm256_loadu_ps;
-        static constexpr auto StoreVector = _mm256_storeu_ps;
-        static constexpr auto BroadcastToVector = _mm256_set1_ps;
-        static constexpr unsigned Registers = 16;
-    };
-
-    struct __m256d_block_wise_config {
-        using FloatType = double;
-        using VectorType = __m256d;
-        static constexpr auto LoadVector = _mm256_loadu_pd;
-        static constexpr auto StoreVector = _mm256_storeu_pd;
-        static constexpr auto BroadcastToVector = _mm256_set1_pd;
-        static constexpr unsigned Registers = 16;
-    };
-
-#ifdef WITH_AVX512
-    struct __m512_block_wise_config {
-        using FloatType = float;
-        using VectorType = __m512;
-        static constexpr auto LoadVector = _mm512_loadu_ps;
-        static constexpr auto StoreVector = _mm512_storeu_ps;
-        static constexpr auto BroadcastToVector = _mm512_set1_ps;
-        static constexpr unsigned Registers = 32;
-    };
-
-    struct __m512d_block_wise_config {
-        using FloatType = double;
-        using VectorType = __m512d;
-        static constexpr auto LoadVector = _mm512_loadu_pd;
-        static constexpr auto StoreVector = _mm512_storeu_pd;
-        static constexpr auto BroadcastToVector = _mm512_set1_pd;
-        static constexpr unsigned Registers = 32;
-    };
-#endif
-
     // maximize = (R * C) / (R + C) for R + R * C < 16 => any fixed r -> largest C with < 16
     // C = floor ((16 - R) / R)
 
@@ -139,7 +92,7 @@ namespace detail {
 
     template<
             // template parameter as struct: otherwise some warning about losing alignment information warning
-            typename BlockWiseConfig = __m256_block_wise_config
+            typename BlockWiseConfig
     >
     struct block_wise {
 
@@ -298,23 +251,94 @@ namespace detail {
     };
 }
 
+struct __m128_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m128;
+    static constexpr auto LoadVector = _mm_loadu_ps;
+    static constexpr auto StoreVector = _mm_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm_set1_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m128d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m128d;
+    static constexpr auto LoadVector = _mm_loadu_pd;
+    static constexpr auto StoreVector = _mm_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm_set1_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m256;
+    static constexpr auto LoadVector = _mm256_loadu_ps;
+    static constexpr auto StoreVector = _mm256_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm256_set1_ps;
+    static constexpr unsigned Registers = 16;
+};
+
+struct __m256d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m256d;
+    static constexpr auto LoadVector = _mm256_loadu_pd;
+    static constexpr auto StoreVector = _mm256_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm256_set1_pd;
+    static constexpr unsigned Registers = 16;
+};
+
+#ifdef WITH_AVX512
+struct __m512_block_wise_config {
+    using FloatType = float;
+    using VectorType = __m512;
+    static constexpr auto LoadVector = _mm512_loadu_ps;
+    static constexpr auto StoreVector = _mm512_storeu_ps;
+    static constexpr auto BroadcastToVector = _mm512_set1_ps;
+    static constexpr unsigned Registers = 32;
+};
+
+struct __m512d_block_wise_config {
+    using FloatType = double;
+    using VectorType = __m512d;
+    static constexpr auto LoadVector = _mm512_loadu_pd;
+    static constexpr auto StoreVector = _mm512_storeu_pd;
+    static constexpr auto BroadcastToVector = _mm512_set1_pd;
+    static constexpr unsigned Registers = 32;
+};
+#endif
+
+enum AvxVersion {
+    SSE,
+    AVX2,
+#ifdef WITH_AVX512
+    AVX512
+#endif
+};
+
 template<typename FloatType, AvxVersion avxVersion>
 struct block_wise;
 
-template<> struct block_wise<float, AVX2> : public detail::block_wise_base<detail::__m256_block_wise_config> {};
-template<> struct block_wise<double, AVX2> : public detail::block_wise_base<detail::__m256d_block_wise_config> {};
+template<> struct block_wise<float, SSE> : public detail::block_wise_base<__m128_block_wise_config> {};
+template<> struct block_wise<double, SSE> : public detail::block_wise_base<__m128d_block_wise_config> {};
+
+template<> struct block_wise<float, AVX2> : public detail::block_wise_base<__m256_block_wise_config> {};
+template<> struct block_wise<double, AVX2> : public detail::block_wise_base<__m256d_block_wise_config> {};
 
 #ifdef WITH_AVX512
-    template<> struct block_wise<float, AVX512> : public detail::block_wise_base<detail::__m512_block_wise_config> {};
-    template<> struct block_wise<double, AVX512> : public detail::block_wise_base<detail::__m512d_block_wise_config> {};
+    template<> struct block_wise<float, AVX512> : public detail::block_wise_base<__m512_block_wise_config> {};
+    template<> struct block_wise<double, AVX512> : public detail::block_wise_base<__m512d_block_wise_config> {};
 #endif
 
+template<typename T>
+void __attribute__ ((noinline)) block_wise_sse(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
+    block_wise<T, SSE>::multiply(C, A, B);
+}
+
 template<typename T>
 void __attribute__ ((noinline)) block_wise_avx2(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {
     block_wise<T, AVX2>::multiply(C, A, B);
 }
 
-
 #ifdef WITH_AVX512
 template<typename T>
 void __attribute__ ((noinline)) block_wise_avx512(Matrix<T> &C, const Matrix<T> &A, const Matrix<T> &B) {