Remove SSE-only code and convolve5x5 (#12109)

Summary:
Performance-oriented code will use AVX/AVX2, so we no longer need SSE-specific code. This also reduces the probability of running into an error on legacy CPUs.
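
The legacy-CPU point rests on the runtime dispatch tables visible in the THVector hunks below: each table keeps a DEFAULT scalar entry, so a CPU without AVX falls through to it even with the SSE rows gone. A minimal, hypothetical C++ sketch of that pattern (fill_AVX, fill_DEFAULT, and cpu_supports are stand-ins, not the real TH symbols):

    #include <cstddef>
    #include <cstdio>

    enum SIMDExtension { SIMDExtension_DEFAULT = 0, SIMDExtension_AVX = 1 };

    // Stand-in kernels; the real AVX body would use 256-bit intrinsics.
    static void fill_DEFAULT(float *x, float c, std::ptrdiff_t n) {
      for (std::ptrdiff_t i = 0; i < n; ++i) x[i] = c;
    }
    static void fill_AVX(float *x, float c, std::ptrdiff_t n) { fill_DEFAULT(x, c, n); }

    struct FunctionDescription {
      void (*fn)(float *, float, std::ptrdiff_t);
      SIMDExtension required;
    };

    // Fastest entry first, DEFAULT last: dropping the SSE rows leaves the
    // scalar fallback intact for CPUs without AVX.
    static FunctionDescription fill_DISPATCHTABLE[] = {
      {fill_AVX, SIMDExtension_AVX},
      {fill_DEFAULT, SIMDExtension_DEFAULT},
    };

    static bool cpu_supports(SIMDExtension ext) {
      return ext == SIMDExtension_DEFAULT;  // stand-in for a cpuid-based check
    }

    int main() {
      float buf[8];
      for (const FunctionDescription &d : fill_DISPATCHTABLE) {
        if (cpu_supports(d.required)) { d.fn(buf, 1.0f, 8); break; }
      }
      std::printf("%f\n", buf[0]);
    }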

On top of this, convolution is covered by modern libraries such as MKLDNN, which are much more performant and which we now build against by default (even for builds from source).
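
For illustration only (this is not code from this PR, and the ATen calls shown are an assumption about the public API rather than something this diff touches): a CPU convolution issued through ATen goes through its convolution dispatcher, which can pick an MKL-DNN implementation at runtime, so the hand-written convolve5x5 kernels deleted below sit off the fast path.

    #include <ATen/ATen.h>

    int main() {
      // Hypothetical shapes, chosen to mirror the removed 5x5 kernels.
      at::Tensor input  = at::randn({1, 3, 64, 64});  // NCHW
      at::Tensor weight = at::randn({8, 3, 5, 5});    // eight 5x5 filters
      // ATen's convolution dispatcher selects a backend at runtime
      // (MKL-DNN when built with it); the TH convolve_5x5 path is not involved.
      at::Tensor out = at::conv2d(input, weight);
      return out.defined() ? 0 : 1;
    }
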
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12109

Differential Revision: D10055134

Pulled By: colesbury

fbshipit-source-id: 789b8a34d5936d9c144bcde410c30f7eb1c776fa
Authored by Christian Puhrsch on 2018-10-09 10:46:50 -07:00; committed by Facebook Github Bot
Parent: 11c31aef04
Commit: f564163951
14 changed files with 63 additions and 1552 deletions

View File

@@ -1,19 +1,8 @@
set(extra_src)
# IF ANY SIMD FOUND
IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve.cpp)
ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
# IF SSE4 FOUND
IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_sse.cpp)
ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
# IF AVX FOUND
IF(C_AVX_FOUND)
LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX.cpp)
LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_avx.cpp)
ENDIF(C_AVX_FOUND)
IF(C_AVX2_FOUND)

View File

@@ -1,6 +1,6 @@
#include "THVector.h"
#include "generic/simd/simd.h"
#include "vector/simd.h"
#ifdef __NEON__
#include "vector/NEON.cpp"
@@ -10,11 +10,6 @@
#include "vector/VSX.cpp"
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#include "vector/SSE.cpp"
#endif
#if defined(USE_AVX)
#include "vector/AVX.h"
#endif

View File

@@ -32,12 +32,6 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(fill_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(fill)(scalar_t *x, const scalar_t c, const ptrdiff_t n) {
@@ -64,13 +58,6 @@ static FunctionDescription THVector_(cadd_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(cadd_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(cadd_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(cadd)(scalar_t *z, const scalar_t *x, const scalar_t *y, const scalar_t c, const ptrdiff_t n) {
@@ -97,13 +84,6 @@ static FunctionDescription THVector_(adds_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(adds_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(adds_DEFAULT), SIMDExtension_DEFAULT)
};
// Dispatch stubs that just call the pointers
@@ -125,13 +105,6 @@ static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(cmul_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(cmul_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(cmul)(scalar_t *z, const scalar_t *x, const scalar_t *y, const ptrdiff_t n) {
@@ -158,13 +131,6 @@ static FunctionDescription THVector_(muls_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(muls_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(muls_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(muls)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n) {
@@ -185,13 +151,6 @@ static FunctionDescription THVector_(cdiv_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(cdiv_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(cdiv_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(cdiv)(scalar_t *z, const scalar_t *x, const scalar_t *y, const ptrdiff_t n) {
@@ -212,13 +171,6 @@ static FunctionDescription THVector_(divs_DISPATCHTABLE)[] = {
#endif
#endif
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
FUNCTION_IMPL(THVector_(divs_SSE), SIMDExtension_SSE),
#endif
#endif
FUNCTION_IMPL(THVector_(divs_DEFAULT), SIMDExtension_DEFAULT)
};
void THVector_(divs)(scalar_t *y, const scalar_t *x, const scalar_t c, const ptrdiff_t n) {

View File

@@ -1,395 +0,0 @@
#ifndef COMMON_SIMD_H
#define COMMON_SIMD_H
/* Weights */
#define LOAD_WEIGHT(q, simd_type, inst_var) _m ## simd_type ## inst_var(*(q))
#define DECLARE_WEIGHTS(simd_type) \
__ ## simd_type weight0; \
__ ## simd_type weight1; \
__ ## simd_type weight2; \
__ ## simd_type weight3; \
__ ## simd_type weight4;
#define LOAD_WEIGHTS(k, simd_type, inst_var) \
weight0 = LOAD_WEIGHT(weight + 5 * 0 + k, simd_type, inst_var); \
weight1 = LOAD_WEIGHT(weight + 5 * 1 + k, simd_type, inst_var); \
weight2 = LOAD_WEIGHT(weight + 5 * 2 + k, simd_type, inst_var); \
weight3 = LOAD_WEIGHT(weight + 5 * 3 + k, simd_type, inst_var); \
weight4 = LOAD_WEIGHT(weight + 5 * 4 + k, simd_type, inst_var);
/* Inputs declare */
#define DECLARE_INPUT_0(i) \
float* input0 = image + i; \
#define DECLARE_INPUT_1() \
float* input1 = input0 + inputStride; \
float* input2 = input1 + inputStride; \
float* input3 = input2 + inputStride; \
float* input4 = input3 + inputStride;
#define DECLARE_INPUT_2() \
DECLARE_INPUT_1() \
float* input5 = input4 + inputStride;
#define DECLARE_INPUT_4() \
DECLARE_INPUT_2() \
float* input6 = input5 + inputStride; \
float* input7 = input6 + inputStride;
#define DECLARE_INPUT_5() \
DECLARE_INPUT_4() \
float* input8 = input7 + inputStride;
#define DECLARE_INPUT_6() \
DECLARE_INPUT_5() \
float* input9 = input8 + inputStride;
#define DECLARE_INPUT_7() \
DECLARE_INPUT_6() \
float* inputA = input9 + inputStride;
#define DECLARE_INPUT_8() \
DECLARE_INPUT_7() \
float* inputB = inputA + inputStride;
/* Inputs increment */
#define INC_INPUT_1()\
input0++; \
input1++; \
input2++; \
input3++; \
input4++; \
#define INC_INPUT_2()\
INC_INPUT_1() \
input5++;
#define INC_INPUT_4()\
INC_INPUT_2() \
input6++; \
input7++;
#define INC_INPUT_5()\
INC_INPUT_4() \
input8++;
#define INC_INPUT_6()\
INC_INPUT_5() \
input9++;
#define INC_INPUT_7()\
INC_INPUT_6() \
inputA++;
#define INC_INPUT_8()\
INC_INPUT_7() \
inputB++;
/* Outputs declare */
#define DECLARE_OUTPUT_1() \
float* output0 = output;
#define DECLARE_OUTPUT_2() \
DECLARE_OUTPUT_1() \
float* output1 = output0 + outputStride;
#define DECLARE_OUTPUT_4() \
DECLARE_OUTPUT_2() \
float* output2 = output1 + outputStride; \
float* output3 = output2 + outputStride;
#define DECLARE_OUTPUT_5() \
DECLARE_OUTPUT_4() \
float* output4 = output3 + outputStride;
#define DECLARE_OUTPUT_6() \
DECLARE_OUTPUT_5() \
float* output5 = output4 + outputStride;
#define DECLARE_OUTPUT_7() \
DECLARE_OUTPUT_6() \
float* output6 = output5 + outputStride;
#define DECLARE_OUTPUT_8() \
DECLARE_OUTPUT_7() \
float* output7 = output6 + outputStride;
/* Outputs increment */
#define INC_OUTPUT_1(x) \
output0 += x;
#define INC_OUTPUT_2(x) \
INC_OUTPUT_1(x) \
output1 += x;
#define INC_OUTPUT_4(x) \
INC_OUTPUT_2(x) \
output2 += x; \
output3 += x;
#define INC_OUTPUT_5(x) \
INC_OUTPUT_4(x) \
output4 += x;
#define INC_OUTPUT_6(x) \
INC_OUTPUT_5(x) \
output5 += x;
#define INC_OUTPUT_7(x) \
INC_OUTPUT_6(x) \
output6 += x;
#define INC_OUTPUT_8(x) \
INC_OUTPUT_7(x) \
output7 += x;
/* Image declare */
#define DECLARE_IMAGE_1(simd_type) \
__ ## simd_type image0; \
__ ## simd_type image1; \
__ ## simd_type image2; \
__ ## simd_type image3; \
__ ## simd_type image4;
#define DECLARE_IMAGE_2(simd_type) \
DECLARE_IMAGE_1(simd_type) \
__ ## simd_type image5;
#define DECLARE_IMAGE_4(simd_type) \
DECLARE_IMAGE_2(simd_type) \
__ ## simd_type image6; \
__ ## simd_type image7;
#define DECLARE_IMAGE_5(simd_type) \
DECLARE_IMAGE_4(simd_type) \
__ ## simd_type image8;
#define DECLARE_IMAGE_6(simd_type) \
DECLARE_IMAGE_5(simd_type) \
__ ## simd_type image9;
#define DECLARE_IMAGE_7(simd_type) \
DECLARE_IMAGE_6(simd_type) \
__ ## simd_type imageA;
#define DECLARE_IMAGE_8(simd_type) \
DECLARE_IMAGE_7(simd_type) \
__ ## simd_type imageB;
/* Sums declare */
#define DECLARE_SUM_1(simd_type) \
__ ## simd_type sum0;
#define DECLARE_SUM_2(simd_type) \
DECLARE_SUM_1(simd_type) \
__ ## simd_type sum1;
#define DECLARE_SUM_4(simd_type) \
DECLARE_SUM_2(simd_type) \
__ ## simd_type sum2; \
__ ## simd_type sum3;
#define DECLARE_SUM_5(simd_type) \
DECLARE_SUM_4(simd_type) \
__ ## simd_type sum4;
#define DECLARE_SUM_6(simd_type) \
DECLARE_SUM_5(simd_type) \
__ ## simd_type sum5;
#define DECLARE_SUM_7(simd_type) \
DECLARE_SUM_6(simd_type) \
__ ## simd_type sum6;
#define DECLARE_SUM_8(simd_type) \
DECLARE_SUM_7(simd_type) \
__ ## simd_type sum7;
/* Sums load */
#define LOAD_SUM_1(simd_type) \
sum0 = _m ## simd_type ## _loadu_ps(output0);
#define LOAD_SUM_2(simd_type) \
LOAD_SUM_1(simd_type) \
sum1 = _m ## simd_type ## _loadu_ps(output1);
#define LOAD_SUM_4(simd_type) \
LOAD_SUM_2(simd_type) \
sum2 = _m ## simd_type ## _loadu_ps(output2); \
sum3 = _m ## simd_type ## _loadu_ps(output3);
#define LOAD_SUM_5(simd_type) \
LOAD_SUM_4(simd_type) \
sum4 = _m ## simd_type ## _loadu_ps(output4);
#define LOAD_SUM_6(simd_type) \
LOAD_SUM_5(simd_type) \
sum5 = _m ## simd_type ## _loadu_ps(output5);
#define LOAD_SUM_7(simd_type) \
LOAD_SUM_6(simd_type) \
sum6 = _m ## simd_type ## _loadu_ps(output6);
#define LOAD_SUM_8(simd_type) \
LOAD_SUM_7(simd_type) \
sum7 = _m ## simd_type ## _loadu_ps(output7);
/* Sums store */
#define STORE_SUM_1(simd_type) \
_m ## simd_type ## _storeu_ps(output0, sum0);
#define STORE_SUM_2(simd_type) \
STORE_SUM_1(simd_type) \
_m ## simd_type ## _storeu_ps(output1, sum1);
#define STORE_SUM_4(simd_type) \
STORE_SUM_2(simd_type) \
_m ## simd_type ## _storeu_ps(output2, sum2); \
_m ## simd_type ## _storeu_ps(output3, sum3);
#define STORE_SUM_5(simd_type) \
STORE_SUM_4(simd_type) \
_m ## simd_type ## _storeu_ps(output4, sum4);
#define STORE_SUM_6(simd_type) \
STORE_SUM_5(simd_type) \
_m ## simd_type ## _storeu_ps(output5, sum5);
#define STORE_SUM_7(simd_type) \
STORE_SUM_6(simd_type) \
_m ## simd_type ## _storeu_ps(output6, sum6);
#define STORE_SUM_8(simd_type) \
STORE_SUM_7(simd_type) \
_m ## simd_type ## _storeu_ps(output7, sum7);
/* Convolution */
#define CONVOLVE_1ROWS(simd_type) \
image0 = _m ## simd_type ## _loadu_ps(input0); \
image1 = _m ## simd_type ## _loadu_ps(input1); \
image2 = _m ## simd_type ## _loadu_ps(input2); \
image3 = _m ## simd_type ## _loadu_ps(input3); \
image4 = _m ## simd_type ## _loadu_ps(input4); \
\
sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight0, image0)); \
sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight1, image1)); \
sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight2, image2)); \
sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight3, image3)); \
sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight4, image4));
#define CONVOLVE_2ROWS(simd_type) \
CONVOLVE_1ROWS(simd_type) \
image5 = _m ## simd_type ## _loadu_ps(input5); \
sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight0, image1)); \
sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight1, image2)); \
sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight2, image3)); \
sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight3, image4)); \
sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight4, image5));
#define CONVOLVE_4ROWS(simd_type) \
CONVOLVE_2ROWS(simd_type) \
image6 = _m ## simd_type ## _loadu_ps(input6); \
sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight0, image2)); \
sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight1, image3)); \
sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight2, image4)); \
sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight3, image5)); \
sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight4, image6)); \
\
image7 = _m ## simd_type ## _loadu_ps(input7); \
sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight0, image3)); \
sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight1, image4)); \
sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight2, image5)); \
sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight3, image6)); \
sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight4, image7));
#define CONVOLVE_5ROWS(simd_type) \
CONVOLVE_4ROWS(simd_type) \
image8 = _m ## simd_type ## _loadu_ps(input8); \
sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight0, image4)); \
sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight1, image5)); \
sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight2, image6)); \
sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight3, image7)); \
sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight4, image8));
#define CONVOLVE_6ROWS(simd_type) \
CONVOLVE_5ROWS(simd_type) \
image9 = _m ## simd_type ## _loadu_ps(input9); \
sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight0, image5)); \
sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight1, image6)); \
sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight2, image7)); \
sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight3, image8)); \
sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight4, image9));
#define CONVOLVE_7ROWS(simd_type) \
CONVOLVE_6ROWS(simd_type) \
imageA = _m ## simd_type ## _loadu_ps(inputA); \
sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight0, image6)); \
sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight1, image7)); \
sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight2, image8)); \
sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight3, image9)); \
sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight4, imageA));
#define CONVOLVE_8ROWS(simd_type) \
CONVOLVE_7ROWS(simd_type) \
imageB = _m ## simd_type ## _loadu_ps(inputB); \
sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight0, image7)); \
sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight1, image8)); \
sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight2, image9)); \
sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight3, imageA)); \
sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight4, imageB));
/* Convolution MEGA macro */
#define DECLARE_SUMX(rows) DECLARE_SUM_ ## rows
#define LOAD_SUMX(rows) LOAD_SUM_ ## rows
#define DECLARE_INPUTX(rows) DECLARE_INPUT_ ## rows
#define DECLARE_IMAGEX(rows) DECLARE_IMAGE_ ## rows
#define CONVOLVEX(rows) CONVOLVE_ ## rows ## ROWS
#define INC_INPUTX(rows) INC_INPUT_ ## rows
#define STORE_SUMX(rows) STORE_SUM_ ## rows
#define INC_OUTPUTX(rows) INC_OUTPUT_ ## rows
#define CONVOLUTION_LOOP(rows, simd_type, simd_inst_prefex, simd_set, i) \
DECLARE_SUMX(rows)(simd_type) \
LOAD_SUMX(rows)(simd_inst_prefex) \
DECLARE_WEIGHTS(simd_type) \
DECLARE_INPUT_0(i) \
DECLARE_INPUTX(rows)() \
DECLARE_IMAGEX(rows)(simd_type) \
\
LOAD_WEIGHTS(0, simd_inst_prefex, simd_set) \
CONVOLVEX(rows)(simd_inst_prefex) \
INC_INPUTX(rows)() \
\
LOAD_WEIGHTS(1, simd_inst_prefex, simd_set) \
CONVOLVEX(rows)(simd_inst_prefex) \
INC_INPUTX(rows)() \
\
LOAD_WEIGHTS(2, simd_inst_prefex, simd_set) \
CONVOLVEX(rows)(simd_inst_prefex) \
INC_INPUTX(rows)() \
\
LOAD_WEIGHTS(3, simd_inst_prefex, simd_set) \
CONVOLVEX(rows)(simd_inst_prefex) \
INC_INPUTX(rows)() \
\
LOAD_WEIGHTS(4, simd_inst_prefex, simd_set) \
CONVOLVEX(rows)(simd_inst_prefex) \
\
STORE_SUMX(rows)(simd_inst_prefex) \
\
INC_OUTPUTX(rows)(sizeof(__ ## simd_type) / sizeof(float))
#define CONVOLVE_8COLS_XROWS(rows, i) \
{ \
CONVOLUTION_LOOP(rows, m256, m256, _set1_ps, i) \
}
#define CONVOLVE_4COLS_XROWS(rows, i) \
{ \
CONVOLUTION_LOOP(rows, m128, m, _set_ps1, i) \
}
#endif

View File

@@ -1,129 +0,0 @@
#if defined(__AVX__)
#ifdef _MSC_VER
#include <intrin.h>
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx) {
unsigned int cpui[4];
__cpuid(cpui, __level);
*__eax = cpui[0]; *__ebx = cpui[1]; *__ecx = cpui[2]; *__edx = cpui[3];
return 1;
}
static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
*eax = 0; *edx = 0;
if (op == 0)
*eax = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
}
#else
#if __i386__
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__asm(" pushl %%ebx\n" \
" cpuid\n" \
" mov %%ebx,%1\n" \
" popl %%ebx" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
#else
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
#endif
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx) {
__cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
enum ECPUFeature
{
kCPUFeature_SSE = 0x01,
kCPUFeature_SSE2 = 0x02,
kCPUFeature_SSE3 = 0x04,
kCPUFeature_SSE3_S = 0x08,
kCPUFeature_SSE4_1 = 0x10,
kCPUFeature_SSE4_2 = 0x20,
kCPUFeature_AVX = 0x40
};
static unsigned int checkCPUFeatures() {
unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
unsigned int features = 0;
__get_cpuid(1, &eax, &ebx, &ecx, &edx);
if( (edx & (1 << 25)) != 0 ) {
features |= kCPUFeature_SSE;
}
if( (edx & (1 << 26)) != 0 ) {
features |= kCPUFeature_SSE2;
}
if( (ecx & (1 << 0)) != 0 ) {
features |= kCPUFeature_SSE3;
}
if( (ecx & (1 << 9)) != 0 ) {
features |= kCPUFeature_SSE3_S;
}
if( (ecx & (1 << 19)) != 0 ) {
features |= kCPUFeature_SSE4_1;
}
if( (ecx & (1 << 20)) != 0 ) {
features |= kCPUFeature_SSE4_2;
}
if( (ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0 ) {
xgetbv(0, &eax, &edx);
if( (eax & 6) == 6 ) {
features |= kCPUFeature_AVX;
}
}
return features;
}
#include <stdio.h>
static int haveCPUFeature(unsigned int feature) {
static unsigned int sCPUFeatures = 0;
static int sDetectedCPUFeatures = 0;
if (!sDetectedCPUFeatures) {
sDetectedCPUFeatures = 1;
sCPUFeatures = checkCPUFeatures();
if ((sCPUFeatures & kCPUFeature_AVX) != 0) {
printf("torch running avx\n");
} else {
printf("torch running sse \n");
}
}
return (sCPUFeatures & feature) != 0;
}
#endif
#include <stdint.h>
void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols);
void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols);
void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols) {
#if defined(__AVX__)
int avx = haveCPUFeature(kCPUFeature_AVX);
if (avx)
{
convolve_5x5_avx(output, input, kernel, outRows, outCols, outCols, inCols);
}
else
#endif
{
convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols);
}
}

View File

@@ -1 +0,0 @@
void convolve_5x5(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t inCols);

View File

@@ -1,214 +0,0 @@
#include <immintrin.h>
#include "common_simd.h"
#include <stdint.h>
#define CLEAR_AVX() _mm256_zeroupper()
void convolve_5x5_1_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_1()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(1, i)
}
}
void convolve_5x5_2_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_2()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(2, i)
}
}
void convolve_5x5_4_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_4()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(4, i)
}
}
void convolve_5x5_5_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_5()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(5, i)
}
}
void convolve_5x5_6_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_6()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(6, i)
}
}
void convolve_5x5_7_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_7()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(7, i)
}
}
void convolve_5x5_8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount = count & 0xFFFFFFF8;
DECLARE_OUTPUT_8()
for (; i < alignedCount; i+=8) {
CONVOLVE_8COLS_XROWS(8, i)
}
}
void convolve_5x5_64x64_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 60; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_8COLS_XROWS(6, 0)
CONVOLVE_8COLS_XROWS(6, 8)
CONVOLVE_8COLS_XROWS(6, 16)
CONVOLVE_8COLS_XROWS(6, 24)
CONVOLVE_8COLS_XROWS(6, 32)
CONVOLVE_8COLS_XROWS(6, 40)
CONVOLVE_8COLS_XROWS(6, 48)
CONVOLVE_8COLS_XROWS(6, 56)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_4()
CONVOLVE_8COLS_XROWS(4, 0)
CONVOLVE_8COLS_XROWS(4, 8)
CONVOLVE_8COLS_XROWS(4, 16)
CONVOLVE_8COLS_XROWS(4, 24)
CONVOLVE_8COLS_XROWS(4, 32)
CONVOLVE_8COLS_XROWS(4, 40)
CONVOLVE_8COLS_XROWS(4, 48)
CONVOLVE_8COLS_XROWS(4, 56)
}
void convolve_5x5_32x32_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 30; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_8COLS_XROWS(6, 0)
CONVOLVE_8COLS_XROWS(6, 8)
CONVOLVE_8COLS_XROWS(6, 16)
CONVOLVE_8COLS_XROWS(6, 24)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_2()
CONVOLVE_8COLS_XROWS(2, 0)
CONVOLVE_8COLS_XROWS(2, 8)
CONVOLVE_8COLS_XROWS(2, 16)
CONVOLVE_8COLS_XROWS(2, 24)
}
void convolve_5x5_16x16_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 12; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_8COLS_XROWS(6, 0)
CONVOLVE_8COLS_XROWS(6, 8)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_4()
CONVOLVE_8COLS_XROWS(4, 0)
CONVOLVE_8COLS_XROWS(4, 8)
}
void convolve_5x5_8x8_avx(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
DECLARE_OUTPUT_8()
CONVOLVE_8COLS_XROWS(8, 0)
}
void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols);
void convolve_5x5_avx(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) {
int64_t ic = inCols;
int64_t yy = 0;
float* t_ = input;
float* r_ = output;
float* k_ = kernel;
if((outRows == 64) && (outCols == 64)) {
convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 32) && (outCols == 32)) {
convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 16) && (outCols == 16)) {
convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 8) && (outCols == 8)) {
convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols);
return;
}
for(; yy < (outRows / 6 ) * 6; yy += 6) {
float *pi_ = t_ + yy*ic;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_6_avx(r_, pis_, pw_, outCols, outStride, ic);
r_ += (outStride * 6);
}
// more than 2 rows left to process and we ended up on a non-multiple of 4
if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) {
// process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop)
float *pi_ = t_ + yy*ic;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic);
r_ += (outStride * 2);
yy += 2;
}
for(; yy < (outRows & 0xFFFFFFFC); yy += 4) {
float *pi_ = t_ + yy*ic;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_4_avx(r_, pis_, pw_, outCols, outStride, ic);
r_ += (outStride * 4);
}
for(; yy < (outRows & 0xFFFFFFFE); yy += 2) {
float *pi_ = t_ + yy*ic;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic);
r_ += (outStride * 2);
}
for(; yy < outRows; yy += 1) {
float *pi_ = t_ + yy*ic;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_1_avx(r_, pis_, pw_, outCols, outStride, ic);
r_ += (outStride * 1);
}
int64_t procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time
int64_t remCols = outCols - procCols;
//process the rest using sse
if( remCols > 0) {
CLEAR_AVX();
convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols);
}
}

View File

@@ -1,321 +0,0 @@
#include <smmintrin.h>
#include "common_simd.h"
#include <stdint.h>
/* SSE variants */
void convolve_5x5_1_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount4 = count & 0xFFFFFFFC;
DECLARE_OUTPUT_1()
for (; i < alignedCount4; i+=4) {
CONVOLVE_4COLS_XROWS(1, i)
}
for (; i < (count); i++) {
float output0 = output[i + outputStride * 0];
int row;
for (row = 0; row < 5; row++) {
int col;
for (col = 0; col < 5; col++) {
output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
}
}
output[i + outputStride * 0] = output0;
}
}
void convolve_5x5_2_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount4 = count & 0xFFFFFFFC;
DECLARE_OUTPUT_2()
for (; i < alignedCount4; i+=4) {
CONVOLVE_4COLS_XROWS(2, i)
}
for (; i < (count); i++) {
float output0 = output[i + outputStride * 0];
float output1 = output[i + outputStride * 1];
int row;
for (row = 0; row < 5; row++) {
int col;
for (col = 0; col < 5; col++) {
output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
}
}
output[i + outputStride * 0] = output0;
output[i + outputStride * 1] = output1;
}
}
void convolve_5x5_4_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount4 = count & 0xFFFFFFFC;
DECLARE_OUTPUT_4()
for (; i < alignedCount4; i+=4) {
CONVOLVE_4COLS_XROWS(4, i)
}
for (; i < (count); i++) {
float output0 = output[i + outputStride * 0];
float output1 = output[i + outputStride * 1];
float output2 = output[i + outputStride * 2];
float output3 = output[i + outputStride * 3];
int row;
for (row = 0; row < 5; row++) {
int col;
for (col = 0; col < 5; col++) {
output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
}
}
output[i + outputStride * 0] = output0;
output[i + outputStride * 1] = output1;
output[i + outputStride * 2] = output2;
output[i + outputStride * 3] = output3;
}
}
void convolve_5x5_6_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount4 = count & 0xFFFFFFFC;
DECLARE_OUTPUT_6()
for (; i < alignedCount4; i+=4) {
CONVOLVE_4COLS_XROWS(6, i)
}
for (; i<(count); i++) {
float output0 = output[i + outputStride * 0];
float output1 = output[i + outputStride * 1];
float output2 = output[i + outputStride * 2];
float output3 = output[i + outputStride * 3];
float output4 = output[i + outputStride * 4];
float output5 = output[i + outputStride * 5];
int row;
for (row = 0; row < 5; row++) {
int col;
for (col = 0; col < 5; col++) {
output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col];
output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col];
}
}
output[i + outputStride * 0] = output0;
output[i + outputStride * 1] = output1;
output[i + outputStride * 2] = output2;
output[i + outputStride * 3] = output3;
output[i + outputStride * 4] = output4;
output[i + outputStride * 5] = output5;
}
}
void convolve_5x5_8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
int64_t i = 0;
int64_t alignedCount4 = count & 0xFFFFFFFC;
DECLARE_OUTPUT_8()
for (; i < alignedCount4; i+=4) {
CONVOLVE_4COLS_XROWS(8, i)
}
for (; i<(count); i++) {
float output0 = output[i + outputStride * 0];
float output1 = output[i + outputStride * 1];
float output2 = output[i + outputStride * 2];
float output3 = output[i + outputStride * 3];
float output4 = output[i + outputStride * 4];
float output5 = output[i + outputStride * 5];
float output6 = output[i + outputStride * 6];
float output7 = output[i + outputStride * 7];
int row;
for (row = 0; row < 5; row++) {
int col;
for (col = 0; col < 5; col++) {
output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col];
output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col];
output6 += weight[5 * row + col] * image[i + (row + 6) * inputStride + col];
output7 += weight[5 * row + col] * image[i + (row + 7) * inputStride + col];
}
}
output[i + outputStride * 0] = output0;
output[i + outputStride * 1] = output1;
output[i + outputStride * 2] = output2;
output[i + outputStride * 3] = output3;
output[i + outputStride * 4] = output4;
output[i + outputStride * 5] = output5;
output[i + outputStride * 6] = output6;
output[i + outputStride * 7] = output7;
}
}
#define UNROLL_SSE_CONVOLUTION 0
#if (UNROLL_SSE_CONVOLUTION)
void convolve_5x5_64x64_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 60; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_4COLS_XROWS(6, 0)
CONVOLVE_4COLS_XROWS(6, 4)
CONVOLVE_4COLS_XROWS(6, 8)
CONVOLVE_4COLS_XROWS(6, 12)
CONVOLVE_4COLS_XROWS(6, 16)
CONVOLVE_4COLS_XROWS(6, 20)
CONVOLVE_4COLS_XROWS(6, 24)
CONVOLVE_4COLS_XROWS(6, 28)
CONVOLVE_4COLS_XROWS(6, 32)
CONVOLVE_4COLS_XROWS(6, 36)
CONVOLVE_4COLS_XROWS(6, 40)
CONVOLVE_4COLS_XROWS(6, 44)
CONVOLVE_4COLS_XROWS(6, 48)
CONVOLVE_4COLS_XROWS(6, 52)
CONVOLVE_4COLS_XROWS(6, 56)
CONVOLVE_4COLS_XROWS(6, 60)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_4()
CONVOLVE_4COLS_XROWS(4, 0)
CONVOLVE_4COLS_XROWS(4, 4)
CONVOLVE_4COLS_XROWS(4, 8)
CONVOLVE_4COLS_XROWS(4, 12)
CONVOLVE_4COLS_XROWS(4, 16)
CONVOLVE_4COLS_XROWS(4, 20)
CONVOLVE_4COLS_XROWS(4, 24)
CONVOLVE_4COLS_XROWS(4, 28)
CONVOLVE_4COLS_XROWS(4, 32)
CONVOLVE_4COLS_XROWS(4, 36)
CONVOLVE_4COLS_XROWS(4, 40)
CONVOLVE_4COLS_XROWS(4, 44)
CONVOLVE_4COLS_XROWS(4, 48)
CONVOLVE_4COLS_XROWS(4, 52)
CONVOLVE_4COLS_XROWS(4, 56)
CONVOLVE_4COLS_XROWS(4, 60)
}
void convolve_5x5_32x32_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 30; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_4COLS_XROWS(6, 0)
CONVOLVE_4COLS_XROWS(6, 4)
CONVOLVE_4COLS_XROWS(6, 8)
CONVOLVE_4COLS_XROWS(6, 12)
CONVOLVE_4COLS_XROWS(6, 16)
CONVOLVE_4COLS_XROWS(6, 20)
CONVOLVE_4COLS_XROWS(6, 24)
CONVOLVE_4COLS_XROWS(6, 28)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_2()
CONVOLVE_4COLS_XROWS(2, 0)
CONVOLVE_4COLS_XROWS(2, 4)
CONVOLVE_4COLS_XROWS(2, 8)
CONVOLVE_4COLS_XROWS(2, 12)
CONVOLVE_4COLS_XROWS(2, 16)
CONVOLVE_4COLS_XROWS(2, 20)
CONVOLVE_4COLS_XROWS(2, 24)
CONVOLVE_4COLS_XROWS(2, 28)
}
void convolve_5x5_16x16_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
for(int i = 0; i < 12; i+=6)
{
DECLARE_OUTPUT_6()
CONVOLVE_4COLS_XROWS(6, 0)
CONVOLVE_4COLS_XROWS(6, 4)
CONVOLVE_4COLS_XROWS(6, 8)
CONVOLVE_4COLS_XROWS(6, 12)
output += outputStride * 6;
image += inputStride * 6;
}
DECLARE_OUTPUT_4()
CONVOLVE_4COLS_XROWS(4, 0)
CONVOLVE_4COLS_XROWS(4, 4)
CONVOLVE_4COLS_XROWS(4, 8)
CONVOLVE_4COLS_XROWS(4, 12)
}
void convolve_5x5_8x8_sse(float* output, float* image, float* weight, int64_t count, int64_t outputStride, int64_t inputStride) {
DECLARE_OUTPUT_8()
CONVOLVE_4COLS_XROWS(8, 0)
CONVOLVE_4COLS_XROWS(8, 4)
}
#endif
void convolve_5x5_sse(float* output, float* input, float* kernel, int64_t outRows, int64_t outCols, int64_t outStride, int64_t inCols) {
int64_t yy = 0;
float* t_ = input;
float* r_ = output;
float* k_ = kernel;
#if (UNROLL_SSE_CONVOLUTION)
if((outRows == 64) && (outCols == 64)) {
convolve_5x5_64x64_sse(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 32) && (outCols == 32)) {
convolve_5x5_32x32_sse(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 16) && (outCols == 16)) {
convolve_5x5_16x16_sse(output, input, kernel, outRows, outStride, inCols);
return;
}
if((outRows == 8) && (outCols == 8)) {
convolve_5x5_8x8_sse(output, input, kernel, outRows, outStride, inCols);
return;
}
#endif
for(; yy < (outRows / 6 ) * 6; yy += 6) {
float *pi_ = t_ + yy*inCols;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_6_sse(r_, pis_, pw_, outCols, outStride, inCols);
r_ += (outStride * 6);
}
// more than 2 rows left to process and we ended up on a non-multiple of 4
if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) {
// process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop)
float *pi_ = t_ + yy*inCols;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols);
r_ += (outStride * 2);
yy += 2;
}
for(; yy < (outRows & 0xFFFFFFFC); yy += 4) {
float *pi_ = t_ + yy*inCols;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_4_sse(r_, pis_, pw_, outCols, outStride, inCols);
r_ += (outStride * 4);
}
for(; yy < (outRows & 0xFFFFFFFE); yy += 2) {
float *pi_ = t_ + yy*inCols;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols);
r_ += (outStride * 2);
}
for(; yy < outRows; yy += 1) {
float *pi_ = t_ + yy*inCols;
float *pw_ = k_;
float *pis_ = pi_;
convolve_5x5_1_sse(r_, pis_, pw_, outCols, outStride, inCols);
r_ += (outStride * 1);
}
}

View File

@@ -1,269 +0,0 @@
#ifndef _MSC_VER
#include <x86intrin.h>
#else
#include <intrin.h>
#endif
static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
ptrdiff_t off;
__m128d XMM0 = _mm_set1_pd(c);
for (i=0; i<=((n)-8); i+=8) {
_mm_storeu_pd((x)+i , XMM0);
_mm_storeu_pd((x)+i+2, XMM0);
_mm_storeu_pd((x)+i+4, XMM0);
_mm_storeu_pd((x)+i+6, XMM0);
}
off = (n) - ((n)%8);
for (i=0; i<((n)%8); i++) {
x[off+i] = c;
}
}
static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM7 = _mm_set1_pd(c);
__m128d XMM0, XMM2;
for (i=0; i<=((n)-2); i+=2) {
XMM0 = _mm_loadu_pd((x)+i);
XMM2 = _mm_loadu_pd((y)+i);
XMM2 = _mm_mul_pd(XMM2, XMM7);
XMM2 = _mm_add_pd(XMM0, XMM2);
_mm_storeu_pd((z)+i, XMM2);
}
for (; i<(n); i++) {
z[i] = x[i] + c * y[i];
}
}
static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM7 = _mm_set1_pd(c);
__m128d XMM0, XMM2;
for (i=0; i<=((n)-4); i+=4) {
XMM0 = _mm_loadu_pd((x)+i);
XMM2 = _mm_loadu_pd((x)+i+2);
XMM0 = _mm_add_pd(XMM0, XMM7);
XMM2 = _mm_add_pd(XMM2, XMM7);
_mm_storeu_pd((y)+i, XMM0);
_mm_storeu_pd((y)+i+2, XMM2);
}
for (; i<(n); i++) {
y[i] = x[i] + c;
}
}
static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
ptrdiff_t i;
for (i=0; i<=((n)-8); i+=8) {
__m128d XMM0 = _mm_loadu_pd((x)+i );
__m128d XMM1 = _mm_loadu_pd((x)+i+2);
__m128d XMM2 = _mm_loadu_pd((x)+i+4);
__m128d XMM3 = _mm_loadu_pd((x)+i+6);
__m128d XMM4 = _mm_loadu_pd((y)+i );
__m128d XMM5 = _mm_loadu_pd((y)+i+2);
__m128d XMM6 = _mm_loadu_pd((y)+i+4);
__m128d XMM7 = _mm_loadu_pd((y)+i+6);
XMM4 = _mm_mul_pd(XMM4, XMM0);
XMM5 = _mm_mul_pd(XMM5, XMM1);
XMM6 = _mm_mul_pd(XMM6, XMM2);
XMM7 = _mm_mul_pd(XMM7, XMM3);
_mm_storeu_pd((z)+i , XMM4);
_mm_storeu_pd((z)+i+2, XMM5);
_mm_storeu_pd((z)+i+4, XMM6);
_mm_storeu_pd((z)+i+6, XMM7);
}
for (; i<(n); i++) {
z[i] = x[i] * y[i];
}
}
static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM15 = _mm_set1_pd(c);
for (i=0; i<=((n)-8); i+=8) {
__m128d XMM0 = _mm_loadu_pd((x)+i );
__m128d XMM1 = _mm_loadu_pd((x)+i+2);
__m128d XMM2 = _mm_loadu_pd((x)+i+4);
__m128d XMM3 = _mm_loadu_pd((x)+i+6);
__m128d XMM4 = _mm_mul_pd(XMM15, XMM0);
__m128d XMM5 = _mm_mul_pd(XMM15, XMM1);
__m128d XMM6 = _mm_mul_pd(XMM15, XMM2);
__m128d XMM7 = _mm_mul_pd(XMM15, XMM3);
_mm_storeu_pd((y)+i , XMM4);
_mm_storeu_pd((y)+i+2, XMM5);
_mm_storeu_pd((y)+i+4, XMM6);
_mm_storeu_pd((y)+i+6, XMM7);
}
for (; i<(n); i++) {
y[i] = x[i] * c;
}
}
static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM0, XMM1, XMM2, XMM3;
for (i=0; i<=((n)-4); i+=4) {
XMM0 = _mm_loadu_pd(x+i);
XMM1 = _mm_loadu_pd(x+i+2);
XMM2 = _mm_loadu_pd(y+i);
XMM3 = _mm_loadu_pd(y+i+2);
XMM2 = _mm_div_pd(XMM0, XMM2);
XMM3 = _mm_div_pd(XMM1, XMM3);
_mm_storeu_pd(z+i, XMM2);
_mm_storeu_pd(z+i+2, XMM3);
}
for (; i<(n); i++) {
z[i] = x[i] / y[i];
}
}
static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) {
ptrdiff_t i;
__m128d XMM7 = _mm_set1_pd(c);
__m128d XMM0, XMM1;
for (i=0; i<=((n)-4); i+=4) {
XMM0 = _mm_loadu_pd(x+i);
XMM1 = _mm_loadu_pd(x+i+2);
XMM0 = _mm_div_pd(XMM0, XMM7);
XMM1 = _mm_div_pd(XMM1, XMM7);
_mm_storeu_pd(y+i, XMM0);
_mm_storeu_pd(y+i+2, XMM1);
}
for (; i<(n); i++) {
y[i] = x[i] / c;
}
}
static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM0 = _mm_set_ps1(c);
ptrdiff_t off;
for (i=0; i<=((n)-16); i+=16) {
_mm_storeu_ps((x)+i , XMM0);
_mm_storeu_ps((x)+i+4, XMM0);
_mm_storeu_ps((x)+i+8, XMM0);
_mm_storeu_ps((x)+i+12, XMM0);
}
off = (n) - ((n)%16);
for (i=0; i<((n)%16); i++) {
x[off+i] = c;
}
}
static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM7 = _mm_set_ps1(c);
__m128 XMM0, XMM2;
for (i=0; i<=((n)-4); i+=4) {
XMM0 = _mm_loadu_ps((x)+i);
XMM2 = _mm_loadu_ps((y)+i);
XMM2 = _mm_mul_ps(XMM2, XMM7);
XMM2 = _mm_add_ps(XMM0, XMM2);
_mm_storeu_ps((z)+i, XMM2);
}
for (; i<(n); i++) {
z[i] = x[i] + c * y[i];
}
}
static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM7 = _mm_set1_ps(c);
__m128 XMM0, XMM2;
for (i=0; i<=((n)-8); i+=8) {
XMM0 = _mm_loadu_ps((x)+i);
XMM2 = _mm_loadu_ps((x)+i+4);
XMM0 = _mm_add_ps(XMM0, XMM7);
XMM2 = _mm_add_ps(XMM2, XMM7);
_mm_storeu_ps((y)+i, XMM0);
_mm_storeu_ps((y)+i+4, XMM2);
}
for (; i<(n); i++) {
y[i] = x[i] + c;
}
}
static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
ptrdiff_t i;
for (i=0; i<=((n)-16); i+=16) {
__m128 XMM0 = _mm_loadu_ps((x)+i );
__m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
__m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
__m128 XMM3 = _mm_loadu_ps((x)+i+12);
__m128 XMM4 = _mm_loadu_ps((y)+i );
__m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
__m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
__m128 XMM7 = _mm_loadu_ps((y)+i+12);
XMM4 = _mm_mul_ps(XMM4, XMM0);
XMM5 = _mm_mul_ps(XMM5, XMM1);
XMM6 = _mm_mul_ps(XMM6, XMM2);
XMM7 = _mm_mul_ps(XMM7, XMM3);
_mm_storeu_ps((z)+i , XMM4);
_mm_storeu_ps((z)+i+ 4, XMM5);
_mm_storeu_ps((z)+i+ 8, XMM6);
_mm_storeu_ps((z)+i+12, XMM7);
}
for (; i<(n); i++) {
z[i] = x[i] * y[i];
}
}
static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM15 = _mm_set_ps1(c);
for (i=0; i<=((n)-16); i+=16) {
__m128 XMM0 = _mm_loadu_ps((x)+i );
__m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
__m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
__m128 XMM3 = _mm_loadu_ps((x)+i+12);
__m128 XMM4 = _mm_mul_ps(XMM15, XMM0);
__m128 XMM5 = _mm_mul_ps(XMM15, XMM1);
__m128 XMM6 = _mm_mul_ps(XMM15, XMM2);
__m128 XMM7 = _mm_mul_ps(XMM15, XMM3);
_mm_storeu_ps((y)+i , XMM4);
_mm_storeu_ps((y)+i+ 4, XMM5);
_mm_storeu_ps((y)+i+ 8, XMM6);
_mm_storeu_ps((y)+i+12, XMM7);
}
for (; i<(n); i++) {
y[i] = x[i] * c;
}
}
static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM0, XMM1, XMM2, XMM3;
for (i=0; i<=((n)-8); i+=8) {
XMM0 = _mm_loadu_ps(x+i);
XMM1 = _mm_loadu_ps(x+i+4);
XMM2 = _mm_loadu_ps(y+i);
XMM3 = _mm_loadu_ps(y+i+4);
XMM2 = _mm_div_ps(XMM0, XMM2);
XMM3 = _mm_div_ps(XMM1, XMM3);
_mm_storeu_ps(z+i, XMM2);
_mm_storeu_ps(z+i+4, XMM3);
}
for (; i<(n); i++) {
z[i] = x[i] / y[i];
}
}
static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) {
ptrdiff_t i;
__m128 XMM7 = _mm_set1_ps(c);
__m128 XMM0, XMM1;
for (i=0; i<=((n)-8); i+=8) {
XMM0 = _mm_loadu_ps(x+i);
XMM1 = _mm_loadu_ps(x+i+4);
XMM0 = _mm_div_ps(XMM0, XMM7);
XMM1 = _mm_div_ps(XMM1, XMM7);
_mm_storeu_ps(y+i, XMM0);
_mm_storeu_ps(y+i+4, XMM1);
}
for (; i<(n); i++) {
y[i] = x[i] / c;
}
}

View File

@@ -59,21 +59,10 @@ if (NOT BUILD_ATEN_MOBILE)
SET(VCOMP_LIB "vcompd")
ENDIF()
# SET_SOURCE_FILES_PROPERTIES must be in the same CMakeLists.txt file as the target that includes the file
# so we need to set these commands here rather than in src/TH
IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
IF(MSVC)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/generic/simd/convolve5x5_sse.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/fp:fast")
ELSE(MSVC)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/generic/simd/convolve5x5_sse.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} -ffast-math")
ENDIF(MSVC)
ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
IF(C_AVX_FOUND)
IF(MSVC)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/generic/simd/convolve5x5_avx.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/fp:fast ${CXX_AVX_FLAGS}")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX ${CXX_AVX_FLAGS}")
ELSE(MSVC)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/generic/simd/convolve5x5_avx.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} -ffast-math ${CXX_AVX_FLAGS}")
SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX_FLAGS}")
ENDIF(MSVC)
ENDIF(C_AVX_FOUND)

View File

@@ -1085,36 +1085,17 @@ if (NOT BUILD_ATEN_MOBILE)
add_compile_options(-DUSE_GCC_GET_CPUID)
ENDIF()
FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2
IF (C_SSE2_FOUND)
MESSAGE(STATUS "SSE2 Found")
# TODO: Work out correct way to do this. Note that C_SSE2_FLAGS is often
# empty, in which case it expands to " " flag which is bad
SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} ${CMAKE_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "${C_SSE2_FLAGS} ${CMAKE_CXX_FLAGS}")
add_compile_options(-DUSE_SSE2)
ENDIF()
IF (C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} ${C_SSE4_2_FLAGS} ${CMAKE_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "${C_SSE4_1_FLAGS} ${C_SSE4_2_FLAGS} ${CMAKE_CXX_FLAGS}")
add_compile_options(-DUSE_SSE4_1 -DUSE_SSE4_2)
ENDIF()
IF (C_SSE3_FOUND)
MESSAGE(STATUS "SSE3 Found")
SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} ${CMAKE_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "${C_SSE3_FLAGS} ${CMAKE_CXX_FLAGS}")
add_compile_options(-DUSE_SSE3)
ENDIF()
FIND_PACKAGE(AVX) # checks AVX and AVX2
# we don't set -mavx and -mavx2 flags globally, but only for specific files
# however, we want to enable the AVX codepaths, so we still need to
# add USE_AVX and USE_AVX2 macro defines
IF (C_AVX_FOUND)
MESSAGE(STATUS "AVX Found")
MESSAGE(STATUS "AVX compiler support found")
add_compile_options(-DUSE_AVX)
ENDIF()
IF (C_AVX2_FOUND)
MESSAGE(STATUS "AVX2 Found")
MESSAGE(STATUS "AVX2 compiler support found")
add_compile_options(-DUSE_AVX2)
ENDIF()

View File

@@ -0,0 +1,59 @@
INCLUDE(CheckCSourceCompiles)
INCLUDE(CheckCXXSourceCompiles)
SET(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
SET(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
return 0;
}
")
MACRO(CHECK_SSE lang type flags)
SET(__FLAG_I 1)
SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
FOREACH(__FLAG ${flags})
IF(NOT ${lang}_${type}_FOUND)
SET(CMAKE_REQUIRED_FLAGS ${__FLAG})
IF(lang STREQUAL "CXX")
CHECK_CXX_SOURCE_COMPILES("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
ELSE()
CHECK_C_SOURCE_COMPILES("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
ENDIF()
IF(${lang}_HAS_${type}_${__FLAG_I})
SET(${lang}_${type}_FOUND TRUE CACHE BOOL "${lang} ${type} support")
SET(${lang}_${type}_FLAGS "${__FLAG}" CACHE STRING "${lang} ${type} flags")
ENDIF()
MATH(EXPR __FLAG_I "${__FLAG_I}+1")
ENDIF()
ENDFOREACH()
SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
IF(NOT ${lang}_${type}_FOUND)
SET(${lang}_${type}_FOUND FALSE CACHE BOOL "${lang} ${type} support")
SET(${lang}_${type}_FLAGS "" CACHE STRING "${lang} ${type} flags")
ENDIF()
MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS)
ENDMACRO()
CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX")
CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2")

View File

@@ -1,125 +0,0 @@
INCLUDE(CheckCSourceRuns)
INCLUDE(CheckCXXSourceRuns)
SET(SSE1_CODE "
#include <xmmintrin.h>
int main()
{
__m128 a;
float vals[4] = {0,0,0,0};
a = _mm_loadu_ps(vals);
return 0;
}")
SET(SSE2_CODE "
#include <emmintrin.h>
int main()
{
__m128d a;
double vals[2] = {0,0};
a = _mm_loadu_pd(vals);
return 0;
}")
SET(SSE3_CODE "
#include <pmmintrin.h>
int main( )
{
const int vals[4] = {0,0,0,0};
__m128i a;
a = _mm_lddqu_si128( (const __m128i*)vals );
return 0;
}")
SET(SSE4_1_CODE "
#include <smmintrin.h>
int main ()
{
__m128i a = {0,0,0,0}, b = {0,0,0,0};
__m128i res = _mm_max_epi8(a, b);
return 0;
}
")
SET(SSE4_2_CODE "
#include <nmmintrin.h>
int main()
{
__m128i a = {0,0,0,0}, b = {0,0,0,0}, c = {0,0,0,0};
c = _mm_cmpgt_epi64(a, b);
return 0;
}
")
SET(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
SET(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
return 0;
}
")
MACRO(CHECK_SSE lang type flags)
SET(__FLAG_I 1)
SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
FOREACH(__FLAG ${flags})
IF(NOT ${lang}_${type}_FOUND)
SET(CMAKE_REQUIRED_FLAGS ${__FLAG})
IF(lang STREQUAL "CXX")
CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
ELSE()
CHECK_C_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
ENDIF()
IF(${lang}_HAS_${type}_${__FLAG_I})
SET(${lang}_${type}_FOUND TRUE CACHE BOOL "${lang} ${type} support")
SET(${lang}_${type}_FLAGS "${__FLAG}" CACHE STRING "${lang} ${type} flags")
ENDIF()
MATH(EXPR __FLAG_I "${__FLAG_I}+1")
ENDIF()
ENDFOREACH()
SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
IF(NOT ${lang}_${type}_FOUND)
SET(${lang}_${type}_FOUND FALSE CACHE BOOL "${lang} ${type} support")
SET(${lang}_${type}_FLAGS "" CACHE STRING "${lang} ${type} flags")
ENDIF()
MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS)
ENDMACRO()
CHECK_SSE(C "SSE1" " ;-msse;/arch:SSE")
CHECK_SSE(C "SSE2" " ;-msse2;/arch:SSE2")
CHECK_SSE(C "SSE3" " ;-msse3;/arch:SSE3")
CHECK_SSE(C "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
CHECK_SSE(C "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2")
CHECK_SSE(CXX "SSE1" " ;-msse;/arch:SSE")
CHECK_SSE(CXX "SSE2" " ;-msse2;/arch:SSE2")
CHECK_SSE(CXX "SSE3" " ;-msse3;/arch:SSE3")
CHECK_SSE(CXX "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
CHECK_SSE(CXX "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX")
CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2")