From 1423a171088c09006e5886ca859f2c7a0ac39666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Fri, 11 Dec 2015 23:47:10 +0100 Subject: [PATCH 001/101] Add THNN/ffi conversion of Abs --- CMakeLists.txt | 10 +++++++ README.md | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ THNN.h | 14 ++++++++++ generic/Abs.c | 20 ++++++++++++++ generic/THNN.h | 19 +++++++++++++ init.c | 8 ++++++ 6 files changed, 143 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 THNN.h create mode 100644 generic/Abs.c create mode 100644 generic/THNN.h create mode 100644 init.c diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000000..e94fca0c5fd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,10 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) + +SET(src init.c) +ADD_LIBRARY(THNN SHARED init.c) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +TARGET_LINK_LIBRARIES(THNN TH) + +INSTALL(TARGETS THNN + RUNTIME DESTINATION ${Torch_INSTALL_LIB_SUBDIR} + LIBRARY DESTINATION ${Torch_INSTALL_LIB_SUBDIR}) diff --git a/README.md b/README.md new file mode 100644 index 00000000000..dec5ffc64b4 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +## API design guidelines + +All functions should accept arguments in the following order. Dots represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. They should follow the order +``` +[weight], [bias], [any buffers], [additional arguments], [optional arugments] +``` + +### Modules +``` +updateOutput: state, input, output, ... +updateGradInput: state, input, gradOutput, gradInput, ... +accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... +``` + +e.g. +```C +void THNN_(HardShrink_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) +``` + +### Criterions +``` +updateOutput: state, input, target, output, ... 
+updateGradInput: state, input, target, gradInput, ... +``` + +e.g. + +```C +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState* state, + THTensor *input, + THLongTensor *target, + THTensor *output, + THTensor *weights, + THTensor *total_weight, + bool sizeAverage) +``` + +## Code style guide + +```C +void THNN_Linear_updateOutput( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +//<- 10 -> +``` + +All arguments should start on a new line after function name, and they should be indented using 10 spaces. + +Use 2 spaces for block indentation. + + +### Conversion Steps + +1. copy old .c file to lib/THNN/generic + - replace static int nn_ -> void THNN_ + - replace lua_State \*L with 'actual' parameters (+ add THNNState\* state) + - remove any numeric values from return statements, remove the return at the end of the function body + - remove old luaL_Reg & _init function +2. add forward declarations to generic/THNN.h +3. include the generic/xyz.c file in init.c +4. add functions to ffi.lua +5. copy & adapt lua file: specify module THNN for torch.class(), use THNN.errcheck +6. include module lua file in init.lua +7. 
add & run unit test to lua/tests/test.lua diff --git a/THNN.h b/THNN.h new file mode 100644 index 00000000000..3968d2b3250 --- /dev/null +++ b/THNN.h @@ -0,0 +1,14 @@ +#ifndef THNN_H +#define THNN_H + +#include +#include + +#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME) + +typedef void THNNState; + +#include "generic/THNN.h" +#include + +#endif \ No newline at end of file diff --git a/generic/Abs.c b/generic/Abs.c new file mode 100644 index 00000000000..cc96d5d4409 --- /dev/null +++ b/generic/Abs.c @@ -0,0 +1,20 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Abs.c" +#else + +void THNN_(Abs_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +{ + THTensor_(resizeAs)(output, input); + THTensor_(abs)(output, input); +} + +void THNN_(Abs_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) +{ + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + real z = *input_data; + *gradInput_data = *gradOutput_data * (z >= 0 ? 
1 : -1); + ); +} + +#endif diff --git a/generic/THNN.h b/generic/THNN.h new file mode 100644 index 00000000000..8d74ae14bbd --- /dev/null +++ b/generic/THNN.h @@ -0,0 +1,19 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THNN.h" +#else + +#ifndef THIndexTensor +#define THIndexTensor THLongTensor +#endif + +TH_API void THNN_(Abs_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Abs_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +#endif diff --git a/init.c b/init.c new file mode 100644 index 00000000000..4488afcc489 --- /dev/null +++ b/init.c @@ -0,0 +1,8 @@ +#include "TH.h" +#include "THNN.h" + +#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) +#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME) + +#include "generic/Abs.c" +#include "THGenerateFloatTypes.h" From af34140313b5a8aba7486eff29df65a3dadedbd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 12 Dec 2015 22:51:26 +0100 Subject: [PATCH 002/101] moved AbsCriterion.c --- generic/AbsCriterion.c | 54 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 generic/AbsCriterion.c diff --git a/generic/AbsCriterion.c b/generic/AbsCriterion.c new file mode 100644 index 00000000000..397e9ddd421 --- /dev/null +++ b/generic/AbsCriterion.c @@ -0,0 +1,54 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/AbsCriterion.c" +#else + +static int nn_(AbsCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + sum += fabs(*input_data - *target_data);) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; 
+} + +static int nn_(AbsCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = ( (*input_data - *target_data) >= 0 ? norm : -norm);) + + return 1; +} + +static const struct luaL_Reg nn_(AbsCriterion__) [] = { + {"AbsCriterion_updateOutput", nn_(AbsCriterion_updateOutput)}, + {"AbsCriterion_updateGradInput", nn_(AbsCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(AbsCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(AbsCriterion__), "nn"); + lua_pop(L,1); +} + +#endif From 97c5021b683765b1bf63954c8198a03098e18179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 12 Dec 2015 22:52:35 +0100 Subject: [PATCH 003/101] Add functional version of AbsCriterion using metatable call THNN state is now passed implicitely. 
--- generic/AbsCriterion.c | 43 +++++++++--------------------------------- generic/THNN.h | 13 +++++++++++++ init.c | 3 +++ 3 files changed, 25 insertions(+), 34 deletions(-) diff --git a/generic/AbsCriterion.c b/generic/AbsCriterion.c index 397e9ddd421..f14181cca40 100644 --- a/generic/AbsCriterion.c +++ b/generic/AbsCriterion.c @@ -2,53 +2,28 @@ #define TH_GENERIC_FILE "generic/AbsCriterion.c" #else -static int nn_(AbsCriterion_updateOutput)(lua_State *L) +void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, real *output, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - real sum; + real sum = 0; - sum = 0; TH_TENSOR_APPLY2(real, input, real, target, - sum += fabs(*input_data - *target_data);) + sum += fabs(*input_data - *target_data); + ); - if(sizeAverage) + if (sizeAverage) sum /= THTensor_(nElement)(input); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + *output = sum; } -static int nn_(AbsCriterion_updateGradInput)(lua_State *L) +void THNN_(AbsCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, - *gradInput_data = ( (*input_data - *target_data) >= 0 ? 
norm : -norm);) - - return 1; -} - -static const struct luaL_Reg nn_(AbsCriterion__) [] = { - {"AbsCriterion_updateOutput", nn_(AbsCriterion_updateOutput)}, - {"AbsCriterion_updateGradInput", nn_(AbsCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(AbsCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(AbsCriterion__), "nn"); - lua_pop(L,1); + *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm; + ); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 8d74ae14bbd..00f5fed3020 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -16,4 +16,17 @@ TH_API void THNN_(Abs_updateGradInput)( THTensor *gradOutput, THTensor *gradInput); +TH_API void THNN_(AbsCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + real *output, + bool sizeAverage); +TH_API void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + #endif diff --git a/init.c b/init.c index 4488afcc489..fbd9c5053f0 100644 --- a/init.c +++ b/init.c @@ -6,3 +6,6 @@ #include "generic/Abs.c" #include "THGenerateFloatTypes.h" + +#include "generic/AbsCriterion.c" +#include "THGenerateFloatTypes.h" From 81f0fb213cc10199ac8f91c3ae15ac847221521a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 17 Dec 2015 00:31:50 +0100 Subject: [PATCH 004/101] Install THNN into ${Torch_INSTALL_LUA_CPATH_SUBDIR} --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e94fca0c5fd..339b2f48cec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,15 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) +CMAKE_POLICY(VERSION 2.6) + +FIND_PACKAGE(Torch REQUIRED) + +IF(NOT THNN_INSTALL_LIB_SUBDIR) + SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory") +ENDIF() SET(src init.c) ADD_LIBRARY(THNN SHARED init.c) 
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) TARGET_LINK_LIBRARIES(THNN TH) -INSTALL(TARGETS THNN - RUNTIME DESTINATION ${Torch_INSTALL_LIB_SUBDIR} - LIBRARY DESTINATION ${Torch_INSTALL_LIB_SUBDIR}) +INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR}) From 6288c5498ebdebfa3d36632a21ff114c89b80874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 17 Dec 2015 10:19:51 +0100 Subject: [PATCH 005/101] Change THNN library type to MODULE (to create libTHNN.so on OSX) --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 339b2f48cec..b3bf40595ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,16 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) CMAKE_POLICY(VERSION 2.6) -FIND_PACKAGE(Torch REQUIRED) +IF(NOT Torch_FOUND) + FIND_PACKAGE(Torch REQUIRED) +ENDIF() IF(NOT THNN_INSTALL_LIB_SUBDIR) SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory") ENDIF() SET(src init.c) -ADD_LIBRARY(THNN SHARED init.c) +ADD_LIBRARY(THNN MODULE init.c) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) TARGET_LINK_LIBRARIES(THNN TH) From 3b8dea4c7f69cd52b9417153424d411f03bc5375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Wed, 30 Dec 2015 23:29:27 +0100 Subject: [PATCH 006/101] Move ClassNLLCriterion.c to lib/THNN/generic --- generic/ClassNLLCriterion.c | 163 ++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 generic/ClassNLLCriterion.c diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c new file mode 100644 index 00000000000..d8efef76f12 --- /dev/null +++ b/generic/ClassNLLCriterion.c @@ -0,0 +1,163 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" +#else + + +static int nn_(ClassNLLCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 1, torch_Tensor); + THLongTensor *target = luaT_checkudata(L, 2, 
"torch.LongTensor"); + THTensor *weights = NULL; + if (!lua_isnil(L, 3)) { + weights = luaT_checkudata(L, 3, torch_Tensor); + } + int n_dims = THTensor_(nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + + int sizeAverage = lua_toboolean(L, 4); + THTensor *output = luaT_checkudata(L, 5, torch_Tensor); + THTensor *total_weight = luaT_checkudata(L, 6, torch_Tensor); + + if (THLongTensor_nDimension(target) > 1) { + THError("multi-target not supported"); + } + if (THTensor_(nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + + input = THTensor_(newContiguous)(input); + target = THLongTensor_newContiguous(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + long *target_data = THLongTensor_data(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + output_data[0] = total_weight_data[0] = 0.0; + + if (THTensor_(nDimension)(input) == 1) { + int cur_target = target_data[0] - 1; + THAssert(cur_target >= 0 && cur_target < n_classes); + total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; + output_data[0] = -input_data[cur_target] * total_weight_data[0]; + } else if (THTensor_(nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++) { + int cur_target = target_data[i] - 1; + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? 
weights_data[cur_target] : 1.0f; + total_weight_data[0] += cur_weight; + output_data[0] -= input_data[i * n_target + cur_target] * cur_weight; + } + } + + if (sizeAverage && total_weight_data[0]) { + output_data[0] /= total_weight_data[0]; + } + + if (weights) { + THTensor_(free)(weights); + } + THTensor_(free)(input); + THLongTensor_free(target); + + return 0; +} + +static int nn_(ClassNLLCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 1, torch_Tensor); + THLongTensor *target = luaT_checkudata(L, 2, "torch.LongTensor"); + THTensor *weights = NULL; + if (!lua_isnil(L, 3)) { + weights = luaT_checkudata(L, 3, torch_Tensor); + } + + int n_dims = THTensor_(nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + + int sizeAverage = lua_toboolean(L, 4); + THTensor *total_weight = luaT_checkudata(L, 5, torch_Tensor); + THTensor *gradInput = luaT_checkudata(L, 6, torch_Tensor); + luaL_argcheck( + L, + THTensor_(isContiguous)(gradInput), + 6, + "gradInput must be contiguous" + ); + + real* total_weight_data = THTensor_(data)(total_weight); + + if (!(*total_weight_data > 0)) { + return 0; + } + + if (THLongTensor_nDimension(target) > 1) { + THError("multi-target not supported"); + } + + if (THTensor_(nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + + target = THLongTensor_newContiguous(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + long *target_data = THLongTensor_data(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + if (THTensor_(nDimension)(input) == 1) { + int cur_target = target_data[0] - 1; + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[cur_target] = + (!sizeAverage && weights) ? 
-weights_data[cur_target] : -1; + + } else if (THTensor_(nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + int n_target = THTensor_(size)(input, 1); + + int i; + for(i = 0; i < batch_size; i++){ + int cur_target = target_data[i] - 1; + + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[i * n_target + cur_target] = + -(weights ? weights_data[cur_target] : 1.0f); + + if (sizeAverage && *total_weight_data) { + gradInput_data[i * n_target + cur_target] /= *total_weight_data; + } + } + } + + THLongTensor_free(target); + if (weights) { + THTensor_(free)(weights); + } + + return 0; +} + +static const struct luaL_Reg nn_(ClassNLLCriterion__) [] = { + {"ClassNLLCriterion_updateOutput", nn_(ClassNLLCriterion_updateOutput)}, + {"ClassNLLCriterion_updateGradInput", nn_(ClassNLLCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(ClassNLLCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(ClassNLLCriterion__), "nn"); + lua_pop(L,1); +} + +#endif From 8c0041bde287d2b6597dca28ca01677e70f5aa07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Wed, 30 Dec 2015 23:43:06 +0100 Subject: [PATCH 007/101] Add functional conversion of ClassNLLCriterion Fixed indentation in THNN.lua to 3-spaces. 
--- THNN.h | 4 ++ generic/ClassNLLCriterion.c | 79 +++++++++---------------------------- generic/THNN.h | 21 ++++++++-- init.c | 3 ++ 4 files changed, 43 insertions(+), 64 deletions(-) diff --git a/THNN.h b/THNN.h index 3968d2b3250..f731e14cd82 100644 --- a/THNN.h +++ b/THNN.h @@ -6,6 +6,10 @@ #define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME) +#define THIndexTensor THLongTensor +#define THIndexTensor_(NAME) THLongTensor_ ## NAME + +typedef long TH_index_t; typedef void THNNState; #include "generic/THNN.h" diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index d8efef76f12..d8270dcf645 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -2,23 +2,12 @@ #define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" #else - -static int nn_(ClassNLLCriterion_updateOutput)(lua_State *L) +void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, THIndexTensor *target, THTensor *output, bool sizeAverage, THTensor *weights, THTensor *total_weight) { - THTensor *input = luaT_checkudata(L, 1, torch_Tensor); - THLongTensor *target = luaT_checkudata(L, 2, "torch.LongTensor"); - THTensor *weights = NULL; - if (!lua_isnil(L, 3)) { - weights = luaT_checkudata(L, 3, torch_Tensor); - } int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); - int sizeAverage = lua_toboolean(L, 4); - THTensor *output = luaT_checkudata(L, 5, torch_Tensor); - THTensor *total_weight = luaT_checkudata(L, 6, torch_Tensor); - - if (THLongTensor_nDimension(target) > 1) { + if (THIndexTensor_(nDimension)(target) > 1) { THError("multi-target not supported"); } if (THTensor_(nDimension)(input) > 2) { @@ -26,11 +15,11 @@ static int nn_(ClassNLLCriterion_updateOutput)(lua_State *L) } input = THTensor_(newContiguous)(input); - target = THLongTensor_newContiguous(target); + target = THIndexTensor_(newContiguous)(target); weights = weights ? 
THTensor_(newContiguous)(weights) : NULL; real *input_data = THTensor_(data)(input); - long *target_data = THLongTensor_data(target); + TH_index_t *target_data = THIndexTensor_(data)(target); real *weights_data = weights ? THTensor_(data)(weights) : NULL; real *output_data = THTensor_(data)(output); real *total_weight_data = THTensor_(data)(total_weight); @@ -65,40 +54,25 @@ static int nn_(ClassNLLCriterion_updateOutput)(lua_State *L) THTensor_(free)(weights); } THTensor_(free)(input); - THLongTensor_free(target); - - return 0; + THIndexTensor_(free)(target); } -static int nn_(ClassNLLCriterion_updateGradInput)(lua_State *L) +void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, THIndexTensor *target, THTensor *gradInput, bool sizeAverage, THTensor *weights, THTensor *total_weight) { - THTensor *input = luaT_checkudata(L, 1, torch_Tensor); - THLongTensor *target = luaT_checkudata(L, 2, "torch.LongTensor"); - THTensor *weights = NULL; - if (!lua_isnil(L, 3)) { - weights = luaT_checkudata(L, 3, torch_Tensor); - } - int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); - int sizeAverage = lua_toboolean(L, 4); - THTensor *total_weight = luaT_checkudata(L, 5, torch_Tensor); - THTensor *gradInput = luaT_checkudata(L, 6, torch_Tensor); - luaL_argcheck( - L, - THTensor_(isContiguous)(gradInput), - 6, - "gradInput must be contiguous" - ); - - real* total_weight_data = THTensor_(data)(total_weight); - - if (!(*total_weight_data > 0)) { - return 0; + if (!THTensor_(isContiguous)(gradInput)) { + THError("gradInput must be contiguous"); } - if (THLongTensor_nDimension(target) > 1) { + real *total_weight_data = THTensor_(data)(total_weight); + + if (!(*total_weight_data > 0)) { + return; + } + + if (THIndexTensor_(nDimension)(target) > 1) { THError("multi-target not supported"); } @@ -106,10 +80,10 @@ static int nn_(ClassNLLCriterion_updateGradInput)(lua_State *L) THError("input tensor should be 1D or 2D"); } - 
target = THLongTensor_newContiguous(target); + target = THIndexTensor_(newContiguous)(target); weights = weights ? THTensor_(newContiguous)(weights) : NULL; - long *target_data = THLongTensor_data(target); + TH_index_t *target_data = THIndexTensor_(data)(target); real *weights_data = weights ? THTensor_(data)(weights) : NULL; real *gradInput_data = THTensor_(data)(gradInput); @@ -125,7 +99,7 @@ static int nn_(ClassNLLCriterion_updateGradInput)(lua_State *L) int n_target = THTensor_(size)(input, 1); int i; - for(i = 0; i < batch_size; i++){ + for (i = 0; i < batch_size; i++){ int cur_target = target_data[i] - 1; THAssert(cur_target >= 0 && cur_target < n_classes); @@ -139,25 +113,10 @@ static int nn_(ClassNLLCriterion_updateGradInput)(lua_State *L) } } - THLongTensor_free(target); + THIndexTensor_(free)(target); if (weights) { THTensor_(free)(weights); } - - return 0; -} - -static const struct luaL_Reg nn_(ClassNLLCriterion__) [] = { - {"ClassNLLCriterion_updateOutput", nn_(ClassNLLCriterion_updateOutput)}, - {"ClassNLLCriterion_updateGradInput", nn_(ClassNLLCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(ClassNLLCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(ClassNLLCriterion__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 00f5fed3020..c4bd9eb09d3 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -2,10 +2,6 @@ #define TH_GENERIC_FILE "generic/THNN.h" #else -#ifndef THIndexTensor -#define THIndexTensor THLongTensor -#endif - TH_API void THNN_(Abs_updateOutput)( THNNState *state, THTensor *input, @@ -29,4 +25,21 @@ TH_API void THNN_(AbsCriterion_updateGradInput)( THTensor *gradInput, bool sizeAverage); +TH_API void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight); +TH_API void 
THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight); + #endif diff --git a/init.c b/init.c index fbd9c5053f0..455b917a822 100644 --- a/init.c +++ b/init.c @@ -9,3 +9,6 @@ #include "generic/AbsCriterion.c" #include "THGenerateFloatTypes.h" + +#include "generic/ClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" From 939f9341fa7b498749af6af4df7739e64937263c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 13:24:51 +0100 Subject: [PATCH 008/101] Move DistKLDivCriterion.c -> lib/THNN/generic --- generic/DistKLDivCriterion.c | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 generic/DistKLDivCriterion.c diff --git a/generic/DistKLDivCriterion.c b/generic/DistKLDivCriterion.c new file mode 100644 index 00000000000..1e433c238da --- /dev/null +++ b/generic/DistKLDivCriterion.c @@ -0,0 +1,53 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" +#else + +static int nn_(DistKLDivCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + sum += *target_data > 0 ? 
*target_data * (log(*target_data) - *input_data) : 0;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(DistKLDivCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;) + return 1; +} + +static const struct luaL_Reg nn_(DistKLDivCriterion__) [] = { + {"DistKLDivCriterion_updateOutput", nn_(DistKLDivCriterion_updateOutput)}, + {"DistKLDivCriterion_updateGradInput", nn_(DistKLDivCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(DistKLDivCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(DistKLDivCriterion__), "nn"); + lua_pop(L,1); +} + +#endif From 90cbf8f3c3997c22d3bef4b3b4d9c58f0b0aeb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 13:38:18 +0100 Subject: [PATCH 009/101] Add functional conversion of DistKLDivCriterion --- generic/DistKLDivCriterion.c | 42 ++++++++---------------------------- generic/THNN.h | 13 +++++++++++ init.c | 3 +++ 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/generic/DistKLDivCriterion.c b/generic/DistKLDivCriterion.c index 1e433c238da..74cd43fc101 100644 --- a/generic/DistKLDivCriterion.c +++ b/generic/DistKLDivCriterion.c @@ -2,52 +2,28 @@ #define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" #else -static int nn_(DistKLDivCriterion_updateOutput)(lua_State *L) +void 
THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, real *output, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - real sum; + real sum = 0; - sum = 0; TH_TENSOR_APPLY2(real, input, real, target, - sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;) + sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0; + ); - if(sizeAverage) + if (sizeAverage) sum /= THTensor_(nElement)(input); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + *output = sum; } -static int nn_(DistKLDivCriterion_updateGradInput)(lua_State *L) +void THNN_(DistKLDivCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, - *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;) - return 1; -} - -static const struct luaL_Reg nn_(DistKLDivCriterion__) [] = { - {"DistKLDivCriterion_updateOutput", nn_(DistKLDivCriterion_updateOutput)}, - {"DistKLDivCriterion_updateGradInput", nn_(DistKLDivCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(DistKLDivCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(DistKLDivCriterion__), "nn"); - lua_pop(L,1); + *gradInput_data = *target_data > 0 ? 
norm * (-*target_data) : 0; + ); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index c4bd9eb09d3..d217827533e 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -42,4 +42,17 @@ TH_API void THNN_(ClassNLLCriterion_updateGradInput)( THTensor *weights, THTensor *total_weight); +TH_API void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + real *output, + bool sizeAverage); +TH_API void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + #endif diff --git a/init.c b/init.c index 455b917a822..bd7afe00424 100644 --- a/init.c +++ b/init.c @@ -12,3 +12,6 @@ #include "generic/ClassNLLCriterion.c" #include "THGenerateFloatTypes.h" + +#include "generic/DistKLDivCriterion.c" +#include "THGenerateFloatTypes.h" From 582c66ff365384b8e2d6b8534f320bb024726793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 14:39:45 +0100 Subject: [PATCH 010/101] Move HardShrink.c -> lib/THNN/generic --- generic/HardShrink.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 generic/HardShrink.c diff --git a/generic/HardShrink.c b/generic/HardShrink.c new file mode 100644 index 00000000000..67600366044 --- /dev/null +++ b/generic/HardShrink.c @@ -0,0 +1,50 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardShrink.c" +#else + +static int nn_(HardShrink_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, \ + if ((*input_data) > lambda) *output_data = *input_data; \ + else if ((*input_data) < -lambda) *output_data = *input_data; \ + else *output_data = 0;); + return 1; +} + +static int 
nn_(HardShrink_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if ((*input_data) > lambda || (*input_data) < -lambda) \ + *gradInput_data = (*gradOutput_data); \ + else \ + *gradInput_data = 0; \ + ); + return 1; +} + +static const struct luaL_Reg nn_(HardShrink__) [] = { + {"HardShrink_updateOutput", nn_(HardShrink_updateOutput)}, + {"HardShrink_updateGradInput", nn_(HardShrink_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(HardShrink_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(HardShrink__), "nn"); + lua_pop(L,1); +} + +#endif From bf62bd7e06798342406e26dcf12d913a15d602cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 14:49:17 +0100 Subject: [PATCH 011/101] Add functional conversion of HardShrink --- generic/HardShrink.c | 54 ++++++++++++++------------------------------ generic/THNN.h | 12 ++++++++++ init.c | 3 +++ 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/generic/HardShrink.c b/generic/HardShrink.c index 67600366044..682534e349f 100644 --- a/generic/HardShrink.c +++ b/generic/HardShrink.c @@ -2,49 +2,29 @@ #define TH_GENERIC_FILE "generic/HardShrink.c" #else -static int nn_(HardShrink_updateOutput)(lua_State *L) +void THNN_(HardShrink_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real lambda) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); - - TH_TENSOR_APPLY2(real, output, real, input, \ - if 
((*input_data) > lambda) *output_data = *input_data; \ - else if ((*input_data) < -lambda) *output_data = *input_data; \ - else *output_data = 0;); - return 1; + + TH_TENSOR_APPLY2(real, output, real, input, + if ((*input_data) > lambda) + *output_data = *input_data; + else if ((*input_data) < -lambda) + *output_data = *input_data; + else + *output_data = 0; + ); } -static int nn_(HardShrink_updateGradInput)(lua_State *L) +void THNN_(HardShrink_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real lambda) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if ((*input_data) > lambda || (*input_data) < -lambda) \ - *gradInput_data = (*gradOutput_data); \ - else \ - *gradInput_data = 0; \ - ); - return 1; -} - -static const struct luaL_Reg nn_(HardShrink__) [] = { - {"HardShrink_updateOutput", nn_(HardShrink_updateOutput)}, - {"HardShrink_updateGradInput", nn_(HardShrink_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(HardShrink_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(HardShrink__), "nn"); - lua_pop(L,1); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > lambda || (*input_data) < -lambda) + *gradInput_data = (*gradOutput_data); + else + *gradInput_data = 0; + ); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index d217827533e..344cd26dbd4 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -55,4 +55,16 @@ TH_API void THNN_(DistKLDivCriterion_updateGradInput)( THTensor *gradInput, bool sizeAverage); +TH_API void THNN_(HardShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + 
real lambda); +TH_API void THNN_(HardShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda); + #endif diff --git a/init.c b/init.c index bd7afe00424..649f5e190ff 100644 --- a/init.c +++ b/init.c @@ -15,3 +15,6 @@ #include "generic/DistKLDivCriterion.c" #include "THGenerateFloatTypes.h" + +#include "generic/HardShrink.c" +#include "THGenerateFloatTypes.h" From a2cf03f12f754a00eaacbdf16497b9a94acdc64b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 14:50:29 +0100 Subject: [PATCH 012/101] Move HardTanh.c -> lib/THNN/generic --- generic/HardTanh.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 generic/HardTanh.c diff --git a/generic/HardTanh.c b/generic/HardTanh.c new file mode 100644 index 00000000000..70d0e812453 --- /dev/null +++ b/generic/HardTanh.c @@ -0,0 +1,97 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardTanh.c" +#else + +static int nn_(HardTanh_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real min_val = luaT_getfieldchecknumber(L, 1, "min_val"); + real max_val = luaT_getfieldchecknumber(L, 1, "max_val"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + if(*input_data < min_val) \ + *output_data = min_val; \ + else if(*input_data <= max_val) \ + *output_data = *input_data; \ + else \ + *output_data = max_val;); + } + else + { + real* ptr_output = THTensor_(data)(output); + real* ptr_input = THTensor_(data)(input); + long i; + +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(input); i++) + { + if(ptr_input[i] < min_val) + ptr_output[i] = min_val; + else if (ptr_input[i] <= max_val) + 
ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = max_val; + } + } + return 1; +} + +static int nn_(HardTanh_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real min_val = luaT_getfieldchecknumber(L, 1, "min_val"); + real max_val = luaT_getfieldchecknumber(L, 1, "max_val"); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if(*input_data < min_val || *input_data > max_val) \ + *gradInput_data = 0; \ + else \ + *gradInput_data = *gradOutput_data;); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_input = THTensor_(data)(input); + long i; + +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(input); i++) + { + if(ptr_input[i] < min_val || ptr_input[i] > max_val) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } + } + return 1; +} + +static const struct luaL_Reg nn_(HardTanh__) [] = { + {"HardTanh_updateOutput", nn_(HardTanh_updateOutput)}, + {"HardTanh_updateGradInput", nn_(HardTanh_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(HardTanh_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(HardTanh__), "nn"); + lua_pop(L,1); +} + +#endif From 754a5aaebd4b6e0c5162795d1558bfdee5129cd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 15:05:07 +0100 Subject: [PATCH 013/101] Add functional conversion of HardTanh --- generic/HardTanh.c | 60 ++++++++++++++-------------------------------- generic/THNN.h | 14 +++++++++++ init.c | 3 +++ 3 files changed, 35 
insertions(+), 42 deletions(-) diff --git a/generic/HardTanh.c b/generic/HardTanh.c index 70d0e812453..6251e1b48fd 100644 --- a/generic/HardTanh.c +++ b/generic/HardTanh.c @@ -2,24 +2,20 @@ #define TH_GENERIC_FILE "generic/HardTanh.c" #else -static int nn_(HardTanh_updateOutput)(lua_State *L) +void THNN_(HardTanh_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real min_val, real max_val) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real min_val = luaT_getfieldchecknumber(L, 1, "min_val"); - real max_val = luaT_getfieldchecknumber(L, 1, "max_val"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) { - TH_TENSOR_APPLY2(real, output, real, input, \ - if(*input_data < min_val) \ - *output_data = min_val; \ - else if(*input_data <= max_val) \ - *output_data = *input_data; \ - else \ - *output_data = max_val;); + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data < min_val) + *output_data = min_val; + else if (*input_data <= max_val) + *output_data = *input_data; + else + *output_data = max_val; + ); } else { @@ -30,7 +26,7 @@ static int nn_(HardTanh_updateOutput)(lua_State *L) #pragma omp parallel for private(i) for (i = 0; i < THTensor_(nElement)(input); i++) { - if(ptr_input[i] < min_val) + if (ptr_input[i] < min_val) ptr_output[i] = min_val; else if (ptr_input[i] <= max_val) ptr_output[i] = ptr_input[i]; @@ -38,17 +34,10 @@ static int nn_(HardTanh_updateOutput)(lua_State *L) ptr_output[i] = max_val; } } - return 1; } -static int nn_(HardTanh_updateGradInput)(lua_State *L) +void THNN_(HardTanh_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real min_val, real max_val) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real min_val = luaT_getfieldchecknumber(L, 1, "min_val"); - real max_val = 
luaT_getfieldchecknumber(L, 1, "max_val"); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); if (input->nDimension == 1 || @@ -56,11 +45,12 @@ static int nn_(HardTanh_updateGradInput)(lua_State *L) !THTensor_(isContiguous)(gradOutput) || !THTensor_(isContiguous)(gradInput)) { - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if(*input_data < min_val || *input_data > max_val) \ - *gradInput_data = 0; \ - else \ - *gradInput_data = *gradOutput_data;); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data < min_val || *input_data > max_val) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; + ); } else { @@ -72,26 +62,12 @@ static int nn_(HardTanh_updateGradInput)(lua_State *L) #pragma omp parallel for private(i) for (i = 0; i < THTensor_(nElement)(input); i++) { - if(ptr_input[i] < min_val || ptr_input[i] > max_val) + if (ptr_input[i] < min_val || ptr_input[i] > max_val) ptr_gradInput[i] = 0; else ptr_gradInput[i] = ptr_gradOutput[i]; } } - return 1; -} - -static const struct luaL_Reg nn_(HardTanh__) [] = { - {"HardTanh_updateOutput", nn_(HardTanh_updateOutput)}, - {"HardTanh_updateGradInput", nn_(HardTanh_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(HardTanh_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(HardTanh__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 344cd26dbd4..958fd5d021e 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -67,4 +67,18 @@ TH_API void THNN_(HardShrink_updateGradInput)( THTensor *gradInput, real lambda); +TH_API void THNN_(HardTanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real min_val, + real max_val); +TH_API void THNN_(HardTanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + 
THTensor *gradInput, + real min_val, + real max_val); + #endif diff --git a/init.c b/init.c index 649f5e190ff..9209296d8a2 100644 --- a/init.c +++ b/init.c @@ -18,3 +18,6 @@ #include "generic/HardShrink.c" #include "THGenerateFloatTypes.h" + +#include "generic/HardTanh.c" +#include "THGenerateFloatTypes.h" From 73e9676ceda4d50469eca119a6eb0537cb58ed5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 15:06:51 +0100 Subject: [PATCH 014/101] Move L1Cost.c -> lib/THNN/generic --- generic/L1Cost.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 generic/L1Cost.c diff --git a/generic/L1Cost.c b/generic/L1Cost.c new file mode 100644 index 00000000000..a450e06e117 --- /dev/null +++ b/generic/L1Cost.c @@ -0,0 +1,49 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/L1Cost.c" +#else + +static int nn_(L1Cost_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + accreal sum; + + sum = 0; + TH_TENSOR_APPLY(real, input, sum += fabs(*input_data);); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(L1Cost_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY2(real, gradInput, real, input, + if (*input_data > 0) + *gradInput_data = 1; + else if (*input_data < 0) + *gradInput_data = -1; + else + *gradInput_data = 0;); + return 1; +} + +static const struct luaL_Reg nn_(L1Cost__) [] = { + {"L1Cost_updateOutput", nn_(L1Cost_updateOutput)}, + {"L1Cost_updateGradInput", nn_(L1Cost_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(L1Cost_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(L1Cost__), "nn"); + lua_pop(L,1); +} + +#endif From 
1256875d667c3b09e77a0c76a7364a6031329263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 15:58:45 +0100 Subject: [PATCH 015/101] Add functional conversion of L1Cost --- generic/L1Cost.c | 48 ++++++++++++++---------------------------------- generic/THNN.h | 9 +++++++++ init.c | 3 +++ 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/generic/L1Cost.c b/generic/L1Cost.c index a450e06e117..d95f3d4c5ae 100644 --- a/generic/L1Cost.c +++ b/generic/L1Cost.c @@ -2,48 +2,28 @@ #define TH_GENERIC_FILE "generic/L1Cost.c" #else -static int nn_(L1Cost_updateOutput)(lua_State *L) +void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, real *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - accreal sum; + accreal sum = 0; - sum = 0; - TH_TENSOR_APPLY(real, input, sum += fabs(*input_data);); + TH_TENSOR_APPLY(real, input, + sum += fabs(*input_data); + ); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + *output = sum; } -static int nn_(L1Cost_updateGradInput)(lua_State *L) +void THNN_(L1Cost_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY2(real, gradInput, real, input, - if (*input_data > 0) - *gradInput_data = 1; - else if (*input_data < 0) - *gradInput_data = -1; - else - *gradInput_data = 0;); - return 1; -} - -static const struct luaL_Reg nn_(L1Cost__) [] = { - {"L1Cost_updateOutput", nn_(L1Cost_updateOutput)}, - {"L1Cost_updateGradInput", nn_(L1Cost_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(L1Cost_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(L1Cost__), "nn"); - lua_pop(L,1); + if (*input_data > 0) + *gradInput_data = 1; + else if (*input_data < 0) 
+ *gradInput_data = -1; + else + *gradInput_data = 0; + ); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 958fd5d021e..3e0ab04feaa 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -81,4 +81,13 @@ TH_API void THNN_(HardTanh_updateGradInput)( real min_val, real max_val); +TH_API void THNN_(L1Cost_updateOutput)( + THNNState *state, + THTensor *input, + real *output); +TH_API void THNN_(L1Cost_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); #endif diff --git a/init.c b/init.c index 9209296d8a2..e0accfbc1cb 100644 --- a/init.c +++ b/init.c @@ -21,3 +21,6 @@ #include "generic/HardTanh.c" #include "THGenerateFloatTypes.h" + +#include "generic/L1Cost.c" +#include "THGenerateFloatTypes.h" From b5bf8113b2711c008bfc0a9061eb3fdda32685cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sat, 2 Jan 2016 22:04:29 +0100 Subject: [PATCH 016/101] Use tensor for THNN functions even for single element outputs --- generic/AbsCriterion.c | 4 ++-- generic/DistKLDivCriterion.c | 4 ++-- generic/L1Cost.c | 4 ++-- generic/THNN.h | 7 ++++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/generic/AbsCriterion.c b/generic/AbsCriterion.c index f14181cca40..8469ac5cc31 100644 --- a/generic/AbsCriterion.c +++ b/generic/AbsCriterion.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/AbsCriterion.c" #else -void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, real *output, bool sizeAverage) +void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) { real sum = 0; @@ -13,7 +13,7 @@ void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTenso if (sizeAverage) sum /= THTensor_(nElement)(input); - *output = sum; + THTensor_(set1d)(output, 0, sum); } void THNN_(AbsCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool 
sizeAverage) diff --git a/generic/DistKLDivCriterion.c b/generic/DistKLDivCriterion.c index 74cd43fc101..62b10faaa04 100644 --- a/generic/DistKLDivCriterion.c +++ b/generic/DistKLDivCriterion.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" #else -void THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, real *output, bool sizeAverage) +void THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) { real sum = 0; @@ -13,7 +13,7 @@ void THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, T if (sizeAverage) sum /= THTensor_(nElement)(input); - *output = sum; + THTensor_(set1d)(output, 0, sum); } void THNN_(DistKLDivCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) diff --git a/generic/L1Cost.c b/generic/L1Cost.c index d95f3d4c5ae..2d8d39e71d7 100644 --- a/generic/L1Cost.c +++ b/generic/L1Cost.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/L1Cost.c" #else -void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, real *output) +void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { accreal sum = 0; @@ -10,7 +10,7 @@ void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, real *output) sum += fabs(*input_data); ); - *output = sum; + THTensor_(set1d)(output, 0, sum); } void THNN_(L1Cost_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) diff --git a/generic/THNN.h b/generic/THNN.h index 3e0ab04feaa..cc3d650c978 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -16,7 +16,7 @@ TH_API void THNN_(AbsCriterion_updateOutput)( THNNState *state, THTensor *input, THTensor *target, - real *output, + THTensor *output, bool sizeAverage); TH_API void THNN_(AbsCriterion_updateGradInput)( THNNState *state, @@ -46,7 +46,7 @@ TH_API void 
THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, THTensor *input, THTensor *target, - real *output, + THTensor *output, bool sizeAverage); TH_API void THNN_(DistKLDivCriterion_updateGradInput)( THNNState *state, @@ -84,10 +84,11 @@ TH_API void THNN_(HardTanh_updateGradInput)( TH_API void THNN_(L1Cost_updateOutput)( THNNState *state, THTensor *input, - real *output); + THTensor *output); TH_API void THNN_(L1Cost_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput); + #endif From 06395abc005ab6080375af15201a8580c46cc172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Tue, 5 Jan 2016 15:19:16 +0100 Subject: [PATCH 017/101] Move { ELU, LeakyReLU, LogSigmoid, LogSoftMax, LookupTable }.c -> lib/THNN/generic --- generic/ELU.c | 48 +++++++++++++++++ generic/LeakyReLU.c | 65 +++++++++++++++++++++++ generic/LogSigmoid.c | 49 +++++++++++++++++ generic/LogSoftMax.c | 118 +++++++++++++++++++++++++++++++++++++++++ generic/LookupTable.c | 119 ++++++++++++++++++++++++++++++++++++++++++ init.c | 15 ++++++ 6 files changed, 414 insertions(+) create mode 100644 generic/ELU.c create mode 100644 generic/LeakyReLU.c create mode 100644 generic/LogSigmoid.c create mode 100644 generic/LogSoftMax.c create mode 100644 generic/LookupTable.c diff --git a/generic/ELU.c b/generic/ELU.c new file mode 100644 index 00000000000..07d6fe4aa21 --- /dev/null +++ b/generic/ELU.c @@ -0,0 +1,48 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ELU.c" +#else + +static int nn_(ELU_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + real alpha = luaT_getfieldchecknumber(L, 1, "alpha"); + + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, \ + *output_data = *input_data <= 0 ? 
(exp(*input_data)-1)*alpha : *input_data; \ + ); + + + return 1; +} + +static int nn_(ELU_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real alpha = luaT_getfieldchecknumber(L, 1, "alpha"); + + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + *gradInput_data = (*output_data) <= 0 ? (*gradOutput_data * (*output_data + alpha)) : (*gradOutput_data); \ + ); + + return 1; +} + +static const struct luaL_Reg nn_(ELU__) [] = { + { "ELU_updateOutput", nn_(ELU_updateOutput) }, + { "ELU_updateGradInput", nn_(ELU_updateGradInput) }, + { NULL, NULL } +}; + +static void nn_(ELU_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(ELU__), "nn"); + lua_pop(L, 1); +} + +#endif diff --git a/generic/LeakyReLU.c b/generic/LeakyReLU.c new file mode 100644 index 00000000000..03754760bb4 --- /dev/null +++ b/generic/LeakyReLU.c @@ -0,0 +1,65 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LeakyReLU.c" +#else + +static int nn_(LeakyReLU_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real negval = luaT_getfieldchecknumber(L, 1, "negval"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (inPlace) { + TH_TENSOR_APPLY(real, input, \ + if (*input_data <= 0) { \ + *input_data *= negval ; \ + }); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data > 0) ? 
*input_data : negval * (*input_data);); + + } + + return 1; +} + +static int nn_(LeakyReLU_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real negval = luaT_getfieldchecknumber(L, 1, "negval"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (inPlace) { + TH_TENSOR_APPLY2(real, gradOutput, real, input, \ + if ((*input_data) <= 0) { \ + *gradOutput_data *= negval; \ + }); + THTensor_(set)(gradInput, gradOutput); + } else { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if ((*input_data) > 0) *gradInput_data = *gradOutput_data; \ + else *gradInput_data = *gradOutput_data * negval;); \ + } + + return 1; +} + +static const struct luaL_Reg nn_(LeakyReLU__) [] = { + {"LeakyReLU_updateOutput", nn_(LeakyReLU_updateOutput)}, + {"LeakyReLU_updateGradInput", nn_(LeakyReLU_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(LeakyReLU_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(LeakyReLU__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/LogSigmoid.c b/generic/LogSigmoid.c new file mode 100644 index 00000000000..9b47a324058 --- /dev/null +++ b/generic/LogSigmoid.c @@ -0,0 +1,49 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSigmoid.c" +#else + +static int nn_(LogSigmoid_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *buffer = luaT_getfieldcheckudata(L, 1, "buffer", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + THTensor_(resizeAs)(buffer, input); + + TH_TENSOR_APPLY3(real, output, real, input, real, buffer, \ + real z = exp(-*input_data); \ + *buffer_data = z; \ + *output_data = -log(1. 
+ z);) + + return 1; +} + +static int nn_(LogSigmoid_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *buffer = luaT_getfieldcheckudata(L, 1, "buffer", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, buffer); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer, \ + real z = *buffer_data; \ + *gradInput_data = *gradOutput_data * z / (1. + z);) + + return 1; +} + +static const struct luaL_Reg nn_(LogSigmoid__) [] = { + {"LogSigmoid_updateOutput", nn_(LogSigmoid_updateOutput)}, + {"LogSigmoid_updateGradInput", nn_(LogSigmoid_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(LogSigmoid_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(LogSigmoid__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/LogSoftMax.c b/generic/LogSoftMax.c new file mode 100644 index 00000000000..75b8587d80c --- /dev/null +++ b/generic/LogSoftMax.c @@ -0,0 +1,118 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSoftMax.c" +#else + +static int nn_(LogSoftMax_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + real *input_data, *output_data; + long nframe = 0, dim = 0; + long t, d; + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + } + else if(input->nDimension == 2) + { + nframe = input->size[0]; + dim = input->size[1]; + } + else + THArgCheck(0, 2, "vector or matrix expected"); + + input = THTensor_(newContiguous)(input); + THTensor_(resizeAs)(output, input); + + real* input_data0 = THTensor_(data)(input); + real* output_data0 = THTensor_(data)(output); + + accreal logsum; + real maxInput; +#pragma omp parallel for private(t, d, maxInput, logsum, input_data, \ + output_data) + for(t = 0; t < nframe; t++) + { + logsum = 0; + maxInput 
= -THInf; + input_data = input_data0 + dim*t; + output_data = output_data0 + dim*t; + + for(d = 0; d < dim; d++) + maxInput = THMax(maxInput, input_data[d]); + + for(d = 0; d < dim; d++) + logsum += THExpMinusApprox(maxInput-input_data[d]); + logsum = maxInput + log(logsum); + + for(d = 0; d < dim; d++) + output_data[d] = input_data[d] - logsum; + } + + THTensor_(free)(input); + + return 1; +} + +static int nn_(LogSoftMax_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real *gradInput_data, *gradOutput_data, *output_data; + long nframe = 0, dim = 0; + long t, d; + + if(output->nDimension == 1) + { + nframe = 1; + dim = output->size[0]; + } + else if(output->nDimension == 2) + { + nframe = output->size[0]; + dim = output->size[1]; + } + else + THError("vector or matrix expected"); + + THTensor_(resizeAs)(gradInput, output); + real* gradInput_data0 = THTensor_(data)(gradInput); + real* output_data0 = THTensor_(data)(output); + real* gradOutput_data0 = THTensor_(data)(gradOutput); + accreal sum; +#pragma omp parallel for private(t, sum, d, gradInput_data, output_data, \ + gradOutput_data) + for(t = 0; t < nframe; t++) + { + sum = 0; + gradInput_data = gradInput_data0 + dim*t; + output_data = output_data0 + dim*t; + gradOutput_data = gradOutput_data0 + dim*t; + + for(d = 0; d < dim; d++) + sum += gradOutput_data[d]; + + for(d = 0; d < dim; d++) + gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum; + } + + return 1; +} + +static const struct luaL_Reg nn_(LogSoftMax__) [] = { + {"LogSoftMax_updateOutput", nn_(LogSoftMax_updateOutput)}, + {"LogSoftMax_updateGradInput", nn_(LogSoftMax_updateGradInput)}, + {NULL, NULL} +}; + +void nn_(LogSoftMax_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(LogSoftMax__), "nn"); 
+ lua_pop(L,1); +} + +#endif diff --git a/generic/LookupTable.c b/generic/LookupTable.c new file mode 100644 index 00000000000..c47a929359c --- /dev/null +++ b/generic/LookupTable.c @@ -0,0 +1,119 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LookupTable.c" +#else + +static void nn_(LookupTable_resetCount)(int *count_data, THLongTensor *input) +{ + int i; + long *input_data = THLongTensor_data(input); + long numel = THLongTensor_nElement(input); + + for (i = 0; isize[0]); + count_data = THIntTensor_data(count); + } + + if (!THTensor_(isContiguous)(gradWeight)) + luaL_error(L, "gradWeight must be contiguous"); + if (!THLongTensor_isContiguous(input)) + luaL_error(L, "input must be contiguous"); + if (input->nDimension != 1 && input->nDimension != 2) + luaL_error(L, "input must be a vector or matrix"); + + long *input_data = THLongTensor_data(input); + long numel = THLongTensor_nElement(input); + long numw = THTensor_(size)(gradWeight, 0); + + // check that inputs are all within range + for (i=0; i numw) + THError("input out of range"); + + gradOutput = THTensor_(newContiguous)(gradOutput); + + real *gw = THTensor_(data)(gradWeight); + real *go = THTensor_(data)(gradOutput); + long stride = THTensor_(stride)(gradWeight, 0); + + if (count_data) + nn_(LookupTable_resetCount)(count_data, input); + +#ifdef _OPENMP + if (numel > 1000) + { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. 
+ #pragma omp parallel private(i) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + + long start = tid * (numw/nthreads + 1); + long end = start + (numw/nthreads + 1); + for (i=0; i= start && k < end) + { + real scale = lr; + if (count_data) scale /= count_data[k]; + THBlas_(axpy)(stride, scale, go + i*stride, 1, gw + k*stride, 1); + } + } + } + + THTensor_(free)(gradOutput); + return 0; + } +#endif + + for (i=0; i Date: Tue, 5 Jan 2016 16:23:34 +0100 Subject: [PATCH 018/101] Add THNN conversion of {ELU, LeakyReLU, LogSigmoid, LogSoftMax, LookupTable} --- generic/ELU.c | 45 ++++-------------- generic/LeakyReLU.c | 108 +++++++++++++++++------------------------- generic/LogSigmoid.c | 45 +++++------------- generic/LogSoftMax.c | 68 ++++++++++---------------- generic/LookupTable.c | 48 +++++++------------ generic/THNN.h | 59 +++++++++++++++++++++++ 6 files changed, 162 insertions(+), 211 deletions(-) diff --git a/generic/ELU.c b/generic/ELU.c index 07d6fe4aa21..f748ee95782 100644 --- a/generic/ELU.c +++ b/generic/ELU.c @@ -2,47 +2,20 @@ #define TH_GENERIC_FILE "generic/ELU.c" #else -static int nn_(ELU_updateOutput)(lua_State *L) +void THNN_(ELU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real alpha) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - real alpha = luaT_getfieldchecknumber(L, 1, "alpha"); - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, input, real, output, \ - *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data; \ - ); - - - return 1; + TH_TENSOR_APPLY2(real, input, real, output, + *output_data = *input_data <= 0 ? 
(exp(*input_data)-1)*alpha : *input_data; + ); } -static int nn_(ELU_updateGradInput)(lua_State *L) -{ - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - real alpha = luaT_getfieldchecknumber(L, 1, "alpha"); - +void THNN_(ELU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output, real alpha) +{ THTensor_(resizeAs)(gradInput, output); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - *gradInput_data = (*output_data) <= 0 ? (*gradOutput_data * (*output_data + alpha)) : (*gradOutput_data); \ - ); - - return 1; -} - -static const struct luaL_Reg nn_(ELU__) [] = { - { "ELU_updateOutput", nn_(ELU_updateOutput) }, - { "ELU_updateGradInput", nn_(ELU_updateGradInput) }, - { NULL, NULL } -}; - -static void nn_(ELU_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(ELU__), "nn"); - lua_pop(L, 1); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = *output_data <= 0 ? 
*gradOutput_data * (*output_data + alpha) : *gradOutput_data; + ); } #endif diff --git a/generic/LeakyReLU.c b/generic/LeakyReLU.c index 03754760bb4..2fc533b9d28 100644 --- a/generic/LeakyReLU.c +++ b/generic/LeakyReLU.c @@ -1,65 +1,43 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/LeakyReLU.c" -#else - -static int nn_(LeakyReLU_updateOutput)(lua_State *L) -{ - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real negval = luaT_getfieldchecknumber(L, 1, "negval"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); - - if (inPlace) { - TH_TENSOR_APPLY(real, input, \ - if (*input_data <= 0) { \ - *input_data *= negval ; \ - }); - THTensor_(set)(output, input); - } else { - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = (*input_data > 0) ? *input_data : negval * (*input_data);); - - } - - return 1; -} - -static int nn_(LeakyReLU_updateGradInput)(lua_State *L) -{ - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real negval = luaT_getfieldchecknumber(L, 1, "negval"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); - - if (inPlace) { - TH_TENSOR_APPLY2(real, gradOutput, real, input, \ - if ((*input_data) <= 0) { \ - *gradOutput_data *= negval; \ - }); - THTensor_(set)(gradInput, gradOutput); - } else { - THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if ((*input_data) > 0) *gradInput_data = *gradOutput_data; \ - else *gradInput_data = *gradOutput_data * negval;); \ - } - - return 1; -} - -static const struct luaL_Reg nn_(LeakyReLU__) [] = { - {"LeakyReLU_updateOutput", nn_(LeakyReLU_updateOutput)}, - {"LeakyReLU_updateGradInput", nn_(LeakyReLU_updateGradInput)}, - {NULL, NULL} -}; - 
-static void nn_(LeakyReLU_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(LeakyReLU__), "nn"); - lua_pop(L,1); -} - -#endif +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LeakyReLU.c" +#else + +void THNN_(LeakyReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real negval, bool inplace) +{ + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + *input_data *= negval; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = *input_data > 0 ? *input_data : *input_data * negval; + ); + } +} + +void THNN_(LeakyReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real negval, bool inplace) +{ + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + *gradOutput_data *= negval; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = *input_data > 0 ? 
*gradOutput_data : *gradOutput_data * negval; + ); + } +} + +#endif diff --git a/generic/LogSigmoid.c b/generic/LogSigmoid.c index 9b47a324058..c0510377e1f 100644 --- a/generic/LogSigmoid.c +++ b/generic/LogSigmoid.c @@ -2,48 +2,25 @@ #define TH_GENERIC_FILE "generic/LogSigmoid.c" #else -static int nn_(LogSigmoid_updateOutput)(lua_State *L) +void THNN_(LogSigmoid_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *buffer) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *buffer = luaT_getfieldcheckudata(L, 1, "buffer", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); THTensor_(resizeAs)(buffer, input); - TH_TENSOR_APPLY3(real, output, real, input, real, buffer, \ - real z = exp(-*input_data); \ - *buffer_data = z; \ - *output_data = -log(1. + z);) - - return 1; + TH_TENSOR_APPLY3(real, output, real, input, real, buffer, + real z = exp(-*input_data); + *buffer_data = z; + *output_data = -log(1. + z); + ); } -static int nn_(LogSigmoid_updateGradInput)(lua_State *L) +void THNN_(LogSigmoid_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *buffer) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *buffer = luaT_getfieldcheckudata(L, 1, "buffer", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, buffer); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer, \ - real z = *buffer_data; \ - *gradInput_data = *gradOutput_data * z / (1. 
+ z);) - - return 1; -} - -static const struct luaL_Reg nn_(LogSigmoid__) [] = { - {"LogSigmoid_updateOutput", nn_(LogSigmoid_updateOutput)}, - {"LogSigmoid_updateGradInput", nn_(LogSigmoid_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(LogSigmoid_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(LogSigmoid__), "nn"); - lua_pop(L,1); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer, + real z = *buffer_data; + *gradInput_data = *gradOutput_data * z / (1. + z); + ); } #endif diff --git a/generic/LogSoftMax.c b/generic/LogSoftMax.c index 75b8587d80c..f23622f7e56 100644 --- a/generic/LogSoftMax.c +++ b/generic/LogSoftMax.c @@ -2,117 +2,97 @@ #define TH_GENERIC_FILE "generic/LogSoftMax.c" #else -static int nn_(LogSoftMax_updateOutput)(lua_State *L) +void THNN_(LogSoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); real *input_data, *output_data; long nframe = 0, dim = 0; long t, d; - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; } - else if(input->nDimension == 2) + else if (input->nDimension == 2) { nframe = input->size[0]; dim = input->size[1]; } else + { THArgCheck(0, 2, "vector or matrix expected"); + } input = THTensor_(newContiguous)(input); THTensor_(resizeAs)(output, input); - real* input_data0 = THTensor_(data)(input); - real* output_data0 = THTensor_(data)(output); + real *input_data0 = THTensor_(data)(input); + real *output_data0 = THTensor_(data)(output); accreal logsum; real maxInput; -#pragma omp parallel for private(t, d, maxInput, logsum, input_data, \ - output_data) - for(t = 0; t < nframe; t++) + #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data) + for (t = 0; t < nframe; t++) { logsum = 0; maxInput = -THInf; input_data = input_data0 + dim*t; 
output_data = output_data0 + dim*t; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) maxInput = THMax(maxInput, input_data[d]); - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) logsum += THExpMinusApprox(maxInput-input_data[d]); logsum = maxInput + log(logsum); - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) output_data[d] = input_data[d] - logsum; } THTensor_(free)(input); - - return 1; } -static int nn_(LogSoftMax_updateGradInput)(lua_State *L) +void THNN_(LogSoftMax_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real *gradInput_data, *gradOutput_data, *output_data; long nframe = 0, dim = 0; long t, d; - if(output->nDimension == 1) + if (output->nDimension == 1) { nframe = 1; dim = output->size[0]; } - else if(output->nDimension == 2) + else if (output->nDimension == 2) { nframe = output->size[0]; dim = output->size[1]; } else + { THError("vector or matrix expected"); + } THTensor_(resizeAs)(gradInput, output); - real* gradInput_data0 = THTensor_(data)(gradInput); - real* output_data0 = THTensor_(data)(output); - real* gradOutput_data0 = THTensor_(data)(gradOutput); + real *gradInput_data0 = THTensor_(data)(gradInput); + real *output_data0 = THTensor_(data)(output); + real *gradOutput_data0 = THTensor_(data)(gradOutput); accreal sum; -#pragma omp parallel for private(t, sum, d, gradInput_data, output_data, \ - gradOutput_data) - for(t = 0; t < nframe; t++) + #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data) + for (t = 0; t < nframe; t++) { sum = 0; gradInput_data = gradInput_data0 + dim*t; output_data = output_data0 + dim*t; gradOutput_data = gradOutput_data0 + dim*t; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) 
sum += gradOutput_data[d]; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum; } - - return 1; -} - -static const struct luaL_Reg nn_(LogSoftMax__) [] = { - {"LogSoftMax_updateOutput", nn_(LogSoftMax_updateOutput)}, - {"LogSoftMax_updateGradInput", nn_(LogSoftMax_updateGradInput)}, - {NULL, NULL} -}; - -void nn_(LogSoftMax_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(LogSoftMax__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/LookupTable.c b/generic/LookupTable.c index c47a929359c..f498bc8a34a 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/LookupTable.c" #else -static void nn_(LookupTable_resetCount)(int *count_data, THLongTensor *input) +void THNN_(LookupTable_resetCount)(long *count_data, THLongTensor *input) { int i; long *input_data = THLongTensor_data(input); @@ -20,28 +20,23 @@ static void nn_(LookupTable_resetCount)(int *count_data, THLongTensor *input) } } -static int nn_(LookupTable_accGradParameters)(lua_State *L) +void THNN_(LookupTable_accGradParameters)(THNNState *state, THLongTensor *input, THTensor *gradOutput, THTensor *gradWeight, real lr, bool shouldScaleGradByFreq, THLongTensor* count) { long i; - THLongTensor *input = luaT_checkudata(L, 2, "torch.LongTensor"); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real lr = luaL_optnumber(L, 4, 1); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - int *count_data = NULL; - - if (luaT_getfieldcheckboolean(L, 1, "shouldScaleGradByFreq")) + long *count_data = NULL; + + if (shouldScaleGradByFreq) { - THIntTensor *count = luaT_getfieldcheckudata(L, 1, "_count", "torch.IntTensor"); - THIntTensor_resize1d(count, gradWeight->size[0]); - count_data = THIntTensor_data(count); + THLongTensor_resize1d(count, gradWeight->size[0]); + count_data = THLongTensor_data(count); } if 
(!THTensor_(isContiguous)(gradWeight)) - luaL_error(L, "gradWeight must be contiguous"); + THError("gradWeight must be contiguous"); if (!THLongTensor_isContiguous(input)) - luaL_error(L, "input must be contiguous"); + THError("input must be contiguous"); if (input->nDimension != 1 && input->nDimension != 2) - luaL_error(L, "input must be a vector or matrix"); + THError("input must be a vector or matrix"); long *input_data = THLongTensor_data(input); long numel = THLongTensor_nElement(input); @@ -59,7 +54,7 @@ static int nn_(LookupTable_accGradParameters)(lua_State *L) long stride = THTensor_(stride)(gradWeight, 0); if (count_data) - nn_(LookupTable_resetCount)(count_data, input); + THNN_(LookupTable_resetCount)(count_data, input); #ifdef _OPENMP if (numel > 1000) @@ -81,14 +76,15 @@ static int nn_(LookupTable_accGradParameters)(lua_State *L) if (k >= start && k < end) { real scale = lr; - if (count_data) scale /= count_data[k]; + if (count_data) + scale /= count_data[k]; THBlas_(axpy)(stride, scale, go + i*stride, 1, gw + k*stride, 1); } } } THTensor_(free)(gradOutput); - return 0; + return; } #endif @@ -96,24 +92,12 @@ static int nn_(LookupTable_accGradParameters)(lua_State *L) { long k = input_data[i] - 1; real scale = lr; - if (count_data) scale /= count_data[k]; + if (count_data) + scale /= count_data[k]; THBlas_(axpy)(stride, scale, go + i*stride, 1, gw + k*stride, 1); } THTensor_(free)(gradOutput); - return 0; -} - -static const struct luaL_Reg nn_(LookupTable__) [] = { - {"LookupTable_accGradParameters", nn_(LookupTable_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(LookupTable_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(LookupTable__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index cc3d650c978..10f81d24ede 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -42,6 +42,19 @@ TH_API void THNN_(ClassNLLCriterion_updateGradInput)( THTensor *weights, THTensor 
*total_weight); +TH_API void THNN_(ELU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real alpha); +TH_API void THNN_(ELU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real alpha); + TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, THTensor *input, @@ -91,4 +104,50 @@ TH_API void THNN_(L1Cost_updateGradInput)( THTensor *gradOutput, THTensor *gradInput); +TH_API void THNN_(LeakyReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real negval, + bool inplace); +TH_API void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real negval, + bool inplace); + +TH_API void THNN_(LogSigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer); +TH_API void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer); + +TH_API void THNN_(LogSoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(LogSoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THLongTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + real lr, + bool shouldScaleGradByFreq, + THLongTensor* count); + #endif From 4aea5c746a5d9ad359a3ddcd8f8b6c07ebc29a4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Wed, 6 Jan 2016 01:40:52 +0100 Subject: [PATCH 019/101] Revert to use IntTensor for LookupTable counts --- THNN.h | 6 +++++- generic/ClassNLLCriterion.c | 4 ++-- generic/HardShrink.c | 8 ++++---- generic/HardTanh.c | 8 ++++---- generic/LookupTable.c | 22 +++++++++++----------- generic/THNN.h | 4 ++-- 6 files changed, 28 insertions(+), 
24 deletions(-) diff --git a/THNN.h b/THNN.h index f731e14cd82..4d694061165 100644 --- a/THNN.h +++ b/THNN.h @@ -9,7 +9,11 @@ #define THIndexTensor THLongTensor #define THIndexTensor_(NAME) THLongTensor_ ## NAME -typedef long TH_index_t; +#define THIntegerTensor THIntTensor +#define THIntegerTensor_(NAME) THIntTensor_ ## NAME + +typedef long THIndex_t; +typedef int THInteger_t; typedef void THNNState; #include "generic/THNN.h" diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index d8270dcf645..de8a82e8a35 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -19,7 +19,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, TH weights = weights ? THTensor_(newContiguous)(weights) : NULL; real *input_data = THTensor_(data)(input); - TH_index_t *target_data = THIndexTensor_(data)(target); + THIndex_t *target_data = THIndexTensor_(data)(target); real *weights_data = weights ? THTensor_(data)(weights) : NULL; real *output_data = THTensor_(data)(output); real *total_weight_data = THTensor_(data)(total_weight); @@ -83,7 +83,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, target = THIndexTensor_(newContiguous)(target); weights = weights ? THTensor_(newContiguous)(weights) : NULL; - TH_index_t *target_data = THIndexTensor_(data)(target); + THIndex_t *target_data = THIndexTensor_(data)(target); real *weights_data = weights ? 
THTensor_(data)(weights) : NULL; real *gradInput_data = THTensor_(data)(gradInput); diff --git a/generic/HardShrink.c b/generic/HardShrink.c index 682534e349f..9abee6b1b17 100644 --- a/generic/HardShrink.c +++ b/generic/HardShrink.c @@ -7,9 +7,9 @@ void THNN_(HardShrink_updateOutput)(THNNState *state, THTensor *input, THTensor THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, output, real, input, - if ((*input_data) > lambda) + if (*input_data > lambda) *output_data = *input_data; - else if ((*input_data) < -lambda) + else if (*input_data < -lambda) *output_data = *input_data; else *output_data = 0; @@ -20,8 +20,8 @@ void THNN_(HardShrink_updateGradInput)(THNNState *state, THTensor *input, THTens { THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, - if ((*input_data) > lambda || (*input_data) < -lambda) - *gradInput_data = (*gradOutput_data); + if (*input_data > lambda || *input_data < -lambda) + *gradInput_data = *gradOutput_data; else *gradInput_data = 0; ); diff --git a/generic/HardTanh.c b/generic/HardTanh.c index 6251e1b48fd..1fe54df8016 100644 --- a/generic/HardTanh.c +++ b/generic/HardTanh.c @@ -40,10 +40,10 @@ void THNN_(HardTanh_updateGradInput)(THNNState *state, THTensor *input, THTensor { THTensor_(resizeAs)(gradInput, input); - if (input->nDimension == 1 || - !THTensor_(isContiguous)(input) || - !THTensor_(isContiguous)(gradOutput) || - !THTensor_(isContiguous)(gradInput)) + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) { TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, if (*input_data < min_val || *input_data > max_val) diff --git a/generic/LookupTable.c b/generic/LookupTable.c index f498bc8a34a..270fb4e7bb7 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -2,11 +2,11 @@ #define TH_GENERIC_FILE "generic/LookupTable.c" #else -void 
THNN_(LookupTable_resetCount)(long *count_data, THLongTensor *input) +static void THNN_(LookupTable_resetCount)(THInteger_t *count_data, THIndexTensor *input) { int i; - long *input_data = THLongTensor_data(input); - long numel = THLongTensor_nElement(input); + THIndex_t *input_data = THIndexTensor_(data)(input); + long numel = THIndexTensor_(nElement)(input); for (i = 0; isize[0]); - count_data = THLongTensor_data(count); + THIntegerTensor_(resize1d)(count, gradWeight->size[0]); + count_data = THIntegerTensor_(data)(count); } if (!THTensor_(isContiguous)(gradWeight)) THError("gradWeight must be contiguous"); - if (!THLongTensor_isContiguous(input)) + if (!THIndexTensor_(isContiguous)(input)) THError("input must be contiguous"); - if (input->nDimension != 1 && input->nDimension != 2) + if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) THError("input must be a vector or matrix"); - long *input_data = THLongTensor_data(input); - long numel = THLongTensor_nElement(input); + THIndex_t *input_data = THIndexTensor_(data)(input); + long numel = THIndexTensor_(nElement)(input); long numw = THTensor_(size)(gradWeight, 0); // check that inputs are all within range diff --git a/generic/THNN.h b/generic/THNN.h index 10f81d24ede..4b465455849 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -143,11 +143,11 @@ TH_API void THNN_(LogSoftMax_updateGradInput)( TH_API void THNN_(LookupTable_accGradParameters)( THNNState *state, - THLongTensor *input, + THIndexTensor *input, THTensor *gradOutput, THTensor *gradWeight, real lr, bool shouldScaleGradByFreq, - THLongTensor* count); + THIntegerTensor *count); #endif From bf12a0663348891ec4585f2c22fa75dd14c8b021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Fri, 8 Jan 2016 00:49:42 +0100 Subject: [PATCH 020/101] Harmonize LookupTable signature with cunn impl --- generic/LookupTable.c | 18 ++++++++++-------- generic/THNN.h | 8 +++++--- 2 files changed, 15 insertions(+), 11 
deletions(-) diff --git a/generic/LookupTable.c b/generic/LookupTable.c index 270fb4e7bb7..66a09a8854b 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -20,12 +20,14 @@ static void THNN_(LookupTable_resetCount)(THInteger_t *count_data, THIndexTensor } } -void THNN_(LookupTable_accGradParameters)(THNNState *state, THIndexTensor *input, THTensor *gradOutput, THTensor *gradWeight, real lr, bool shouldScaleGradByFreq, THIntegerTensor *count) +void THNN_(LookupTable_accGradParameters)(THNNState *state, THIndexTensor *input, THTensor *gradOutput, + THTensor *gradWeight, real scale, bool scaleGradByFreq, THIntegerTensor *count, + THTensor *sorted, THTensor *indices) { long i; THInteger_t *count_data = NULL; - if (shouldScaleGradByFreq) + if (scaleGradByFreq) { THIntegerTensor_(resize1d)(count, gradWeight->size[0]); count_data = THIntegerTensor_(data)(count); @@ -75,10 +77,10 @@ void THNN_(LookupTable_accGradParameters)(THNNState *state, THIndexTensor *input long k = input_data[i] - 1; if (k >= start && k < end) { - real scale = lr; + real lr = scale; if (count_data) - scale /= count_data[k]; - THBlas_(axpy)(stride, scale, go + i*stride, 1, gw + k*stride, 1); + lr /= count_data[k]; + THBlas_(axpy)(stride, lr, go + i*stride, 1, gw + k*stride, 1); } } } @@ -91,10 +93,10 @@ void THNN_(LookupTable_accGradParameters)(THNNState *state, THIndexTensor *input for (i=0; i Date: Mon, 11 Jan 2016 10:06:35 +0100 Subject: [PATCH 021/101] THNN: add missing OpenMP include --- THNN.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/THNN.h b/THNN.h index 4d694061165..9efcd46d716 100644 --- a/THNN.h +++ b/THNN.h @@ -3,6 +3,9 @@ #include #include +#ifdef _OPENMP +#include +#endif #define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME) From 130ed2c27c2e79f1c0ae2e805ccc22310a3fc9e5 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Mon, 4 Jan 2016 07:58:05 +0100 Subject: [PATCH 022/101] Move SpatialConvolutionMM.c -> lib/THNN/generic --- generic/SpatialConvolutionMM.c | 265 
+++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 generic/SpatialConvolutionMM.c diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c new file mode 100644 index 00000000000..0554b9b3cda --- /dev/null +++ b/generic/SpatialConvolutionMM.c @@ -0,0 +1,265 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" +#else + +#ifdef _WIN32 +# include +#endif + +#include "unfold.h" + + +static void nn_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + long i; + THTensor *output2d; + + nn_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + + for(i = 0; i < nOutputPlane; i++) + THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + 
THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + int dimf = 0; + int dimw = 2; + int dimh = 1; + + long nInputPlane; + long inputWidth; + long inputHeight; + long nOutputPlane; + long outputWidth; + long outputHeight; + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + + + if (input->nDimension == 4) { + dimf++; + dimw++; + dimh++; + } + + nInputPlane = input->size[dimf]; + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nOutputPlane = weight->size[0]; + outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + if (nInputPlane*kW*kH != weight->size[1]) + THError("Wrong number of input channels! Input has %d channels, expected %d",nInputPlane,weight->size[1]/(kW*kH)); + + if(input->nDimension == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + nn_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + 
kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + return 1; +} + + +static void nn_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH, int padW, int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + nn_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]); +} + +static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + THTensor_(transpose)(weight, weight, 0, 1); + + if(input->nDimension == 3) + { + nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH); + } + else + { + long T = input->size[0]; + long t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(transpose)(weight, weight, 0, 1); + + return 1; +} + +static void nn_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, + real scale) +{ + long i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + + THTensor_(transpose)(finput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); + THTensor_(transpose)(finput, finput, 0, 1); + + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + } + + THTensor_(free)(gradOutput2d); +} + +static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor 
*gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real scale = luaL_optnumber(L, 4, 1); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + + if(input->nDimension == 3) + { + nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + } + else + { + long T = input->size[0]; + long t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + return 0; +} + +static const struct luaL_Reg nn_(SpatialConvolutionMM__) [] = { + {"SpatialConvolutionMM_updateOutput", nn_(SpatialConvolutionMM_updateOutput)}, + {"SpatialConvolutionMM_updateGradInput", nn_(SpatialConvolutionMM_updateGradInput)}, + {"SpatialConvolutionMM_accGradParameters", nn_(SpatialConvolutionMM_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialConvolutionMM_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialConvolutionMM__), "nn"); + lua_pop(L,1); +} + +#endif From b5f91d560274fc47030cb76de603b9b75c3c1ea4 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 17 Jan 2016 12:46:43 +0100 Subject: [PATCH 023/101] Add functional convertion of SpatialConvolutionMM --- generic/SpatialConvolutionMM.c | 225 ++++++++++++++++++++++----------- generic/THNN.h | 30 +++++ init.c | 3 + 3 files changed, 185 insertions(+), 73 deletions(-) 
diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index 0554b9b3cda..fc7dd0c51da 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -6,10 +6,144 @@ # include #endif -#include "unfold.h" -static void nn_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ +static void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ +#ifdef _WIN32 + LONG_PTR nip; +#else + size_t nip; +#endif + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + size_t kw, kh, y, x; + long long ix = 0, iy = 0; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *dst = input_data + nip*(inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + size_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH - padH + kh); + if (iy < 0 || iy >= inputHeight) { + } else { + if (dW==1){ + ix = (long long)(0 - padW + kw); + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ + } + else{ + for (x=0; x= inputWidth){ + }else + THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1); + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH + kh); + ix = (long long)(0 + kw); + if (dW == 1 ) + 
THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */ + else{ + for(x = 0; x < outputWidth; x++) + THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1); + } + } + } + } + } + } +} + +static void THNN_(unfolded_copy)(THTensor *finput, THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane*kH*kW; k++) { + size_t nip = k / (kH*kW); + size_t rest = k % (kH*kW); + size_t kh = rest / kW; + size_t kw = rest % kW; + size_t x,y; + long long ix,iy; + real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *src = input_data + nip*(inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + size_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH - padH + kh); + if (iy < 0 || iy >= inputHeight) { + memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (dW==1){ + ix = (long long)(0 - padW + kw); + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + if (outputWidth-rpad-lpad <= 0) { + memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth); + } else { + if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad); + memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad)); + if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); + } + } + else{ + for (x=0; x= inputWidth) + memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1); + else + memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1)); + } + } + } + } + } else { 
+ for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH + kh); + ix = (long long)(0 + kw); + if (dW == 1) + memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth); + else{ + for (x=0; xstorage, output->storageOffset, nOutputPlane, -1, @@ -31,21 +165,8 @@ static void nn_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTens THTensor_(free)(output2d); } -static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) +void THNN_(SpatialConvolutionMM_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor* finput, int kW, int kH, int dW, int dH, int padW, int padH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int dimf = 0; int dimw = 2; int dimh = 1; @@ -57,8 +178,7 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) long outputWidth; long outputHeight; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); - + THArgCheck( input->nDimension == 3 || input->nDimension == 4, 1, "3D or 4D (batch mode) tensor expected"); if (input->nDimension == 4) { dimf++; @@ -85,7 +205,7 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); - 
nn_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, + THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -105,7 +225,7 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) THTensor *output_t = THTensor_(newSelect)(output, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -115,12 +235,10 @@ static int nn_(SpatialConvolutionMM_updateOutput)(lua_State *L) THTensor_(free)(finput_t); } } - - return 1; } -static void nn_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, +static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, @@ -131,25 +249,12 @@ static void nn_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor_(zero)(gradInput); - nn_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]); + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]); } -static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) +void THNN_(SpatialConvolutionMM_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, 
THTensor *gradInput, THTensor *weight, THTensor *bias, THTensor *finput, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + long nOutputPlane = weight->size[0]; THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); @@ -159,7 +264,7 @@ static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) if(input->nDimension == 3) { - nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH); + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH); } else { @@ -173,7 +278,7 @@ static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); - nn_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH); + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH); THTensor_(free)(gradInput_t); THTensor_(free)(gradOutput_t); @@ -182,11 +287,9 @@ static int nn_(SpatialConvolutionMM_updateGradInput)(lua_State *L) } THTensor_(transpose)(weight, weight, 0, 1); - - return 1; } -static void nn_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, +static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale) { long i; @@ -211,22 +314,14 @@ static void nn_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutp THTensor_(free)(gradOutput2d); } -static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) +void THNN_(SpatialConvolutionMM_accGradParameters)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - int 
nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - + long nOutputPlane = gradWeight->size[0]; THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); if(input->nDimension == 3) { - nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); } else { @@ -238,28 +333,12 @@ static int nn_(SpatialConvolutionMM_accGradParameters)(lua_State *L) THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); THTensor_(free)(gradOutput_t); THTensor_(free)(finput_t); } } - - return 0; -} - -static const struct luaL_Reg nn_(SpatialConvolutionMM__) [] = { - {"SpatialConvolutionMM_updateOutput", nn_(SpatialConvolutionMM_updateOutput)}, - {"SpatialConvolutionMM_updateGradInput", nn_(SpatialConvolutionMM_updateGradInput)}, - {"SpatialConvolutionMM_accGradParameters", nn_(SpatialConvolutionMM_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialConvolutionMM_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialConvolutionMM__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 69e91bc687a..2371062d871 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -152,4 +152,34 @@ TH_API void THNN_(LookupTable_accGradParameters)( THTensor *sorted, THTensor 
*indices); +TH_API void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor* finput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale); #endif diff --git a/init.c b/init.c index 8742ef5e6ba..a073450b34b 100644 --- a/init.c +++ b/init.c @@ -39,3 +39,6 @@ #include "generic/LookupTable.c" #include "THGenerateFloatTypes.h" + +#include "generic/SpatialConvolutionMM.c" +#include "THGenerateFloatTypes.h" \ No newline at end of file From ce427e3e6ee8c47b4c5a4464cf170a3cbaca9c3b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Wed, 6 Jan 2016 07:46:56 +0100 Subject: [PATCH 024/101] Move {Spatial(Adaptive,Average,Max)Pooling.c} to lib/THNN/generic --- generic/SpatialAdaptiveMaxPooling.c | 280 ++++++++++++++++++++++++++ generic/SpatialAveragePooling.c | 276 ++++++++++++++++++++++++++ generic/SpatialMaxPooling.c | 296 ++++++++++++++++++++++++++++ 3 files changed, 852 insertions(+) create mode 100644 generic/SpatialAdaptiveMaxPooling.c create mode 100644 generic/SpatialAveragePooling.c create mode 100644 generic/SpatialMaxPooling.c diff --git a/generic/SpatialAdaptiveMaxPooling.c b/generic/SpatialAdaptiveMaxPooling.c new file mode 100644 index 00000000000..85f728b7b9d --- /dev/null +++ b/generic/SpatialAdaptiveMaxPooling.c @@ -0,0 +1,280 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" +#else + +static void 
nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,real *output_p, + real *indx_p, real *indy_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + long stridew,long strideh, + long strided) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = (int)floor((float)i / oheight * iheight); + int y_end = (int)ceil((float)(i + 1) / oheight * iheight); + int kH = y_end-y_start; + + for(j = 0; j < owidth; j++) + { + + int x_start = (int)floor((float)j / owidth * iwidth); + int x_end = (int)ceil((float)(j + 1) / owidth * iwidth); + int kW = x_end-x_start; + + /* local pointers */ + real *ip = input_p + k*strided + y_start*strideh + x_start*stridew; + real *op = output_p + k*owidth*oheight + i*owidth + j; + real *indyp = indy_p + k*owidth*oheight + i*owidth + j; + real *indxp = indx_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + long maxindex = -1; + real maxval = -FLT_MAX; + long tcntr = 0; + int x,y; + for(y = 0; y < kH; y++) + { + for(x = 0; x < kW; x++) + { + real val = *(ip + y*strideh + x*stridew); + if (val > maxval) + { + maxval = val; + maxindex = tcntr; + } + tcntr++; + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max (x,y) */ + *indyp = (int)(maxindex / kW)+1; + *indxp = (maxindex % kW) +1; + } + } + } +} + +static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + long oheight = luaT_getfieldcheckint(L, 1, "H"); + long owidth = luaT_getfieldcheckint(L, 1, "W"); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int dimw = 2; + int dimh = 1; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + + long istride_d; + long istride_h; + long istride_w; + long 
istride_b; + + real *input_data; + real *output_data; + real *indices_data; + + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + + if (input->nDimension == 4) + { + istride_b = input->stride[0]; + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + /* strides */ + istride_d = input->stride[dimh-1]; + istride_h = input->stride[dimh]; + istride_w = input->stride[dimw]; + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + /* indices will contain i,j locations for each output point */ + THTensor_(resize4d)(indices, 2, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data+nslices*owidth*oheight, indices_data, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + /* indices will contain i,j locations for each output point */ + THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight, + indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + } + + return 1; +} + + + +static void nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + real *indx_p, 
real *indy_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + real *indx_p_k = indx_p + k*owidth*oheight; + real *indy_p_k = indy_p + k*owidth*oheight; + + /* calculate max points */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = (int)floor((float) i / oheight * iheight); + for(j = 0; j < owidth; j++) + { + int x_start = (int)floor((float) j / owidth * iwidth); + /* retrieve position of max */ + long maxi = indy_p_k[i*owidth + j] - 1 + y_start; + long maxj = indx_p_k[i*owidth + j] - 1 + x_start; + + /* update gradient */ + gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j]; + } + } + } +} + +static int nn_(SpatialAdaptiveMaxPooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int dimw = 2; + int dimh = 1; + long nbatch = 1; + int nslices; + int iheight; + int iwidth; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = 
THTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data+nslices*owidth*oheight, indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + + return 1; +} + +static const struct luaL_Reg nn_(SpatialAdaptiveMaxPooling__) [] = { + {"SpatialAdaptiveMaxPooling_updateOutput", nn_(SpatialAdaptiveMaxPooling_updateOutput)}, + {"SpatialAdaptiveMaxPooling_updateGradInput", nn_(SpatialAdaptiveMaxPooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SpatialAdaptiveMaxPooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialAdaptiveMaxPooling__), "nn"); + lua_pop(L,1); +} + +#endif + diff --git a/generic/SpatialAveragePooling.c b/generic/SpatialAveragePooling.c new file mode 100644 index 00000000000..b56962d67ca --- /dev/null +++ b/generic/SpatialAveragePooling.c @@ -0,0 +1,276 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" +#else + +static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); + int count_include_pad = 
luaT_getfieldcheckboolean(L,1,"count_include_pad"); + + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int dimc = 0; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + long nInputPlane; // number of channels (or colors) + + long k; + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + luaL_argcheck(L, inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size"); + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + luaL_argcheck(L, THTensor_(isContiguous)(output), 1, ""); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + 
{ + long p; + for(p = 0; p < nbatch; p++) + { + long xx, yy; + /* For all output pixels... */ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + long i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = 0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... */ + long hstart = yy * dH - padH; + long wstart = xx * dW - padW; + long hend = fminf(hstart + kH, inputHeight + padH); + long wend = fminf(wstart + kW, inputWidth + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + hend = fminf(hend, inputHeight); + wend = fminf(wend, inputWidth); + + real sum = 0; + + int divide_factor; + if(count_include_pad) + divide_factor = pool_size; + else + divide_factor = (hend - hstart) * (wend - wstart); + + long kx, ky; + + for(ky = hstart; ky < hend; ky++) + { + for(kx = wstart; kx < wend; kx++) + sum += ptr_input[ky*inputWidth + kx]; + } + /* Update output */ + *ptr_output++ += sum/divide_factor; + } + } + } + } + THTensor_(free)(input); + + return 1; +} + +static int nn_(SpatialAveragePooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); + int count_include_pad = luaT_getfieldcheckboolean(L,1,"count_include_pad"); + + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + int dimw = 2; + int 
dimh = 1; + int dimc = 0; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + long nInputPlane; // number of channels (or colors) + + real *gradOutput_data; + real *input_data, *gradInput_data; + + long k; + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + luaL_argcheck(L, THTensor_(isContiguous)(gradInput), 1, ""); + + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + long xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + + long i; + for(i=0; i maxval) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; 
+ + /* store location of max */ + *indp = maxindex + 1; + } + } + } +} + +static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int dimw = 2; + int dimh = 1; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + real *input_data; + real *output_data; + real *indices_data; + + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + luaL_argcheck(L, input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size"); + + luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceil_mode) + { + oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (long)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + else + { + oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1; + owidth = (long)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + /* get contiguous input */ + input = 
THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + /* indices will contain the locations for each output point */ + THTensor_(resize3d)(indices, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + nn_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight, + kW, kH, dW, dH, + padW, padH); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + /* indices will contain the locations for each output point */ + THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, + indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + kW, kH, dW, dH, + padW, padH); + } + } + + /* cleanup */ + THTensor_(free)(input); + return 1; +} + +static void nn_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + real *ind_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int dW, int dH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + real *ind_p_k = ind_p + k*owidth*oheight; + + /* calculate max points */ + long i, j; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + /* retrieve position of max */ + long maxp = ind_p_k[i*owidth + j] - 1; + /* update gradient */ + gradInput_p_k[maxp] += 
gradOutput_p_k[i*owidth + j]; + } + } + } +} + +static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int dimw = 2; + int dimh = 1; + long nbatch = 1; + int nslices; + int iheight; + int iwidth; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + nn_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight, + dW, dH); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + + return 1; +} + +static const struct luaL_Reg nn_(SpatialMaxPooling__) [] = { + 
{"SpatialMaxPooling_updateOutput", nn_(SpatialMaxPooling_updateOutput)}, + {"SpatialMaxPooling_updateGradInput", nn_(SpatialMaxPooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SpatialMaxPooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialMaxPooling__), "nn"); + lua_pop(L,1); +} + +#endif From e62899bcb3b300fed2bce71d14b5095b6979ed39 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 17 Jan 2016 12:57:55 +0100 Subject: [PATCH 025/101] Add THNN conversion of {Spatial(AdaptiveMax,Average,Max)Pooling} --- generic/SpatialAdaptiveMaxPooling.c | 44 +++++------------------ generic/SpatialAveragePooling.c | 55 ++++------------------------- generic/SpatialMaxPooling.c | 54 ++++++---------------------- generic/THNN.h | 50 ++++++++++++++++++++++++++ init.c | 11 +++++- 5 files changed, 87 insertions(+), 127 deletions(-) diff --git a/generic/SpatialAdaptiveMaxPooling.c b/generic/SpatialAdaptiveMaxPooling.c index 85f728b7b9d..46ef6548b58 100644 --- a/generic/SpatialAdaptiveMaxPooling.c +++ b/generic/SpatialAdaptiveMaxPooling.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" #else -static void nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,real *output_p, +static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,real *output_p, real *indx_p, real *indy_p, long nslices, long iwidth, long iheight, @@ -65,13 +65,8 @@ static void nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,real } } -static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int owidth, int oheight, THTensor *indices) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - long oheight = luaT_getfieldcheckint(L, 1, "H"); - long owidth = luaT_getfieldcheckint(L, 1, "W"); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", 
torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); int dimw = 2; int dimh = 1; long nbatch = 1; @@ -89,7 +84,7 @@ static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) real *indices_data; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); if (input->nDimension == 4) { @@ -119,7 +114,7 @@ static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) output_data = THTensor_(data)(output); indices_data = THTensor_(data)(indices); - nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, indices_data+nslices*owidth*oheight, indices_data, nslices, iwidth, iheight, @@ -142,7 +137,7 @@ static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight, + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight, indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, nslices, iwidth, iheight, @@ -151,13 +146,11 @@ static int nn_(SpatialAdaptiveMaxPooling_updateOutput)(lua_State *L) istride_d); } } - - return 1; } -static void nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, +static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, real *indx_p, real *indy_p, long nslices, long iwidth, long iheight, @@ -191,12 +184,8 @@ static void nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInput } } -static int nn_(SpatialAdaptiveMaxPooling_updateGradInput)(lua_State *L) +void 
THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); int dimw = 2; int dimh = 1; long nbatch = 1; @@ -237,7 +226,7 @@ static int nn_(SpatialAdaptiveMaxPooling_updateGradInput)(lua_State *L) /* backprop */ if (input->nDimension == 3) { - nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, indices_data+nslices*owidth*oheight, indices_data, nslices, iwidth, iheight, @@ -249,7 +238,7 @@ static int nn_(SpatialAdaptiveMaxPooling_updateGradInput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, nslices, iwidth, iheight, @@ -259,21 +248,6 @@ static int nn_(SpatialAdaptiveMaxPooling_updateGradInput)(lua_State *L) /* cleanup */ THTensor_(free)(gradOutput); - - return 1; -} - -static const struct luaL_Reg nn_(SpatialAdaptiveMaxPooling__) [] = { - {"SpatialAdaptiveMaxPooling_updateOutput", nn_(SpatialAdaptiveMaxPooling_updateOutput)}, - {"SpatialAdaptiveMaxPooling_updateGradInput", nn_(SpatialAdaptiveMaxPooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialAdaptiveMaxPooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, 
nn_(SpatialAdaptiveMaxPooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialAveragePooling.c b/generic/SpatialAveragePooling.c index b56962d67ca..29386abe107 100644 --- a/generic/SpatialAveragePooling.c +++ b/generic/SpatialAveragePooling.c @@ -2,20 +2,8 @@ #define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" #else -static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) +void THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, int count_include_pad) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); - int count_include_pad = luaT_getfieldcheckboolean(L,1,"count_include_pad"); - - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - real *output_data; real *input_data; @@ -32,8 +20,8 @@ static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) long k; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); - luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); if (input->nDimension == 4) { nbatch = input->size[0]; @@ -66,7 +54,7 @@ static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) --outputWidth; } - luaL_argcheck(L, inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size"); + 
THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size"); if (input->nDimension == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); @@ -74,7 +62,7 @@ static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); - luaL_argcheck(L, THTensor_(isContiguous)(output), 1, ""); + THArgCheck(THTensor_(isContiguous)(output), 1, ""); input_data = THTensor_(data)(input); output_data = THTensor_(data)(output); @@ -129,25 +117,10 @@ static int nn_(SpatialAveragePooling_updateOutput)(lua_State *L) } } THTensor_(free)(input); - - return 1; } -static int nn_(SpatialAveragePooling_updateGradInput)(lua_State *L) +void THNN_(SpatialAveragePooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, int count_include_pad) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); - int count_include_pad = luaT_getfieldcheckboolean(L,1,"count_include_pad"); - - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - int dimw = 2; int dimh = 1; int dimc = 0; @@ -201,7 +174,7 @@ static int nn_(SpatialAveragePooling_updateGradInput)(lua_State *L) input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); - luaL_argcheck(L, THTensor_(isContiguous)(gradInput), 1, ""); + THArgCheck(THTensor_(isContiguous)(gradInput), 1, 
""); gradInput_data = THTensor_(data)(gradInput); gradOutput_data = THTensor_(data)(gradOutput); @@ -257,20 +230,6 @@ static int nn_(SpatialAveragePooling_updateGradInput)(lua_State *L) THTensor_(free)(input); THTensor_(free)(gradOutput); - return 1; -} - -static const struct luaL_Reg nn_(SpatialAveragePooling__) [] = { - {"SpatialAveragePooling_updateOutput", nn_(SpatialAveragePooling_updateOutput)}, - {"SpatialAveragePooling_updateGradInput", nn_(SpatialAveragePooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialAveragePooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialAveragePooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c index ef6f5542b1c..7de370c9155 100644 --- a/generic/SpatialMaxPooling.c +++ b/generic/SpatialMaxPooling.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/SpatialMaxPooling.c" #else -static void nn_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *output_p, +static void THNN_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *output_p, real *ind_p, long nslices, long iwidth, long iheight, @@ -61,18 +61,8 @@ static void nn_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *outpu } } -static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) +void THNN_(SpatialMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, THTensor *indices) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); - THTensor *indices = luaT_getfieldcheckudata(L, 1, 
"indices", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); int dimw = 2; int dimh = 1; long nbatch = 1; @@ -86,7 +76,7 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) real *indices_data; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); if (input->nDimension == 4) { @@ -94,9 +84,9 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) dimw++; dimh++; } - luaL_argcheck(L, input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size"); + THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size"); - luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); /* sizes */ nslices = input->size[dimh-1]; @@ -136,7 +126,7 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) output_data = THTensor_(data)(output); indices_data = THTensor_(data)(indices); - nn_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data, + THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data, indices_data, nslices, iwidth, iheight, @@ -159,7 +149,7 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, + THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, nslices, iwidth, iheight, @@ -171,10 +161,9 @@ static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L) /* cleanup */ 
THTensor_(free)(input); - return 1; } -static void nn_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, +static void THNN_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, real *ind_p, long nslices, long iwidth, long iheight, @@ -204,14 +193,8 @@ static void nn_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, real } } -static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L) +void THNN_(SpatialMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int dW, int dH, THTensor *indices) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); int dimw = 2; int dimh = 1; long nbatch = 1; @@ -252,7 +235,7 @@ static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L) /* backprop */ if (input->nDimension == 3) { - nn_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, indices_data, nslices, iwidth, iheight, @@ -265,7 +248,7 @@ static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, nslices, iwidth, iheight, @@ -276,21 +259,6 @@ static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L) /* cleanup */ THTensor_(free)(gradOutput); - - return 1; -} - 
-static const struct luaL_Reg nn_(SpatialMaxPooling__) [] = { - {"SpatialMaxPooling_updateOutput", nn_(SpatialMaxPooling_updateOutput)}, - {"SpatialMaxPooling_updateGradInput", nn_(SpatialMaxPooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialMaxPooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialMaxPooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 2371062d871..3190c7acb60 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -182,4 +182,54 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( THTensor *gradBias, THTensor *finput, real scale); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int owidth, int oheight, + THTensor *indices); +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices); + +TH_API void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int ceil_mode, + int count_include_pad); +TH_API void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int ceil_mode, + int count_include_pad); + +TH_API void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int ceil_mode, + THTensor *indices); +TH_API void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int dW, int dH, + THTensor *indices); #endif diff --git a/init.c b/init.c index a073450b34b..4ce4288d2f6 100644 --- a/init.c +++ b/init.c @@ -41,4 +41,13 @@ #include 
"THGenerateFloatTypes.h" #include "generic/SpatialConvolutionMM.c" -#include "THGenerateFloatTypes.h" \ No newline at end of file +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAdaptiveMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialMaxPooling.c" +#include "THGenerateFloatTypes.h" From 95b39cd43a63e082a0978677b635cad53c5c5b0b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Tue, 26 Jan 2016 23:10:52 +0100 Subject: [PATCH 026/101] Unify C/Cuda signatures for SpatialConvolutionMM and Spatial(AdaptiveMax,Max)Pooling unfold has it's own file --- generic/SpatialAdaptiveMaxPooling.c | 2 +- generic/SpatialAveragePooling.c | 8 +- generic/SpatialConvolutionMM.c | 145 +--------------------------- generic/SpatialMaxPooling.c | 4 +- generic/THNN.h | 46 +++++++-- generic/unfold.c | 139 ++++++++++++++++++++++++++ init.c | 3 + 7 files changed, 188 insertions(+), 159 deletions(-) create mode 100644 generic/unfold.c diff --git a/generic/SpatialAdaptiveMaxPooling.c b/generic/SpatialAdaptiveMaxPooling.c index 46ef6548b58..c34a5e779cc 100644 --- a/generic/SpatialAdaptiveMaxPooling.c +++ b/generic/SpatialAdaptiveMaxPooling.c @@ -65,7 +65,7 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,re } } -void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int owidth, int oheight, THTensor *indices) +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *indices, int owidth, int oheight) { int dimw = 2; int dimh = 1; diff --git a/generic/SpatialAveragePooling.c b/generic/SpatialAveragePooling.c index 29386abe107..1bd297a12ad 100644 --- a/generic/SpatialAveragePooling.c +++ b/generic/SpatialAveragePooling.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" #else -void 
THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, int count_include_pad) +void THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) { real *output_data; real *input_data; @@ -62,7 +62,7 @@ void THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); - THArgCheck(THTensor_(isContiguous)(output), 1, ""); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); input_data = THTensor_(data)(input); output_data = THTensor_(data)(output); @@ -119,7 +119,7 @@ void THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input THTensor_(free)(input); } -void THNN_(SpatialAveragePooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, int count_include_pad) +void THNN_(SpatialAveragePooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) { int dimw = 2; int dimh = 1; @@ -174,7 +174,7 @@ void THNN_(SpatialAveragePooling_updateGradInput)(THNNState *state, THTensor *in input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); - THArgCheck(THTensor_(isContiguous)(gradInput), 1, ""); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous"); gradInput_data = THTensor_(data)(gradInput); gradOutput_data = THTensor_(data)(gradOutput); diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index fc7dd0c51da..e13037df557 100644 --- 
a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -2,147 +2,6 @@ #define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" #else -#ifdef _WIN32 -# include -#endif - - - -/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ -static void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int nInputPlane, - int inputWidth, int inputHeight, - int outputWidth, int outputHeight) -{ -#ifdef _WIN32 - LONG_PTR nip; -#else - size_t nip; -#endif - - real *input_data = THTensor_(data)(input); - real *finput_data = THTensor_(data)(finput); - -#pragma omp parallel for private(nip) - for(nip = 0; nip < nInputPlane; nip++) - { - size_t kw, kh, y, x; - long long ix = 0, iy = 0; - for(kh = 0; kh < kH; kh++) - { - for(kw = 0; kw < kW; kw++) - { - real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); - real *dst = input_data + nip*(inputHeight*inputWidth); - if (padW > 0 || padH > 0) { - size_t lpad,rpad; - for(y = 0; y < outputHeight; y++) { - iy = (long long)(y*dH - padH + kh); - if (iy < 0 || iy >= inputHeight) { - } else { - if (dW==1){ - ix = (long long)(0 - padW + kw); - lpad = fmaxf(0,padW-kw); - rpad = fmaxf(0,padW-(kW-kw-1)); - THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ - } - else{ - for (x=0; x= inputWidth){ - }else - THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1); - } - } - } - } - } else { - for(y = 0; y < outputHeight; y++) { - iy = (long long)(y*dH + kh); - ix = (long long)(0 + kw); - if (dW == 1 ) - THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */ - else{ - for(x = 0; x < outputWidth; x++) - 
THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1); - } - } - } - } - } - } -} - -static void THNN_(unfolded_copy)(THTensor *finput, THTensor *input, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int nInputPlane, - int inputWidth, int inputHeight, - int outputWidth, int outputHeight) -{ - long k; - real *input_data = THTensor_(data)(input); - real *finput_data = THTensor_(data)(finput); - -#pragma omp parallel for private(k) - for(k = 0; k < nInputPlane*kH*kW; k++) { - size_t nip = k / (kH*kW); - size_t rest = k % (kH*kW); - size_t kh = rest / kW; - size_t kw = rest % kW; - size_t x,y; - long long ix,iy; - real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); - real *src = input_data + nip*(inputHeight*inputWidth); - if (padW > 0 || padH > 0) { - size_t lpad,rpad; - for(y = 0; y < outputHeight; y++) { - iy = (long long)(y*dH - padH + kh); - if (iy < 0 || iy >= inputHeight) { - memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth); - } else { - if (dW==1){ - ix = (long long)(0 - padW + kw); - lpad = fmaxf(0,padW-kw); - rpad = fmaxf(0,padW-(kW-kw-1)); - if (outputWidth-rpad-lpad <= 0) { - memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth); - } else { - if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad); - memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad)); - if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); - } - } - else{ - for (x=0; x= inputWidth) - memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1); - else - memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1)); - } - } - } - } - } else { - for(y = 0; y < outputHeight; y++) { - iy = (long long)(y*dH + kh); - ix = (long long)(0 + kw); - if (dW == 1) - memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), 
sizeof(real)*outputWidth); - else{ - for (x=0; xsize[0]; THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c index 7de370c9155..30352822dff 100644 --- a/generic/SpatialMaxPooling.c +++ b/generic/SpatialMaxPooling.c @@ -61,7 +61,7 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *out } } -void THNN_(SpatialMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, int ceil_mode, THTensor *indices) +void THNN_(SpatialMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) { int dimw = 2; int dimh = 1; @@ -193,7 +193,7 @@ static void THNN_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, re } } -void THNN_(SpatialMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int dW, int dH, THTensor *indices) +void THNN_(SpatialMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) { int dimw = 2; int dimh = 1; diff --git a/generic/THNN.h b/generic/THNN.h index 3190c7acb60..e0f5cefd55f 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -159,6 +159,7 @@ TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THTensor *weight, THTensor *bias, THTensor* finput, + THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH); @@ -181,14 +182,18 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( THTensor *gradWeight, THTensor *gradBias, THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, real scale); TH_API void 
THNN_(SpatialAdaptiveMaxPooling_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int owidth, int oheight, - THTensor *indices); + THTensor *indices, + int owidth, int oheight); TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( THNNState *state, THTensor *input, @@ -203,8 +208,8 @@ TH_API void THNN_(SpatialAveragePooling_updateOutput)( int kW, int kH, int dW, int dH, int padW, int padH, - int ceil_mode, - int count_include_pad); + bool ceil_mode, + bool count_include_pad); TH_API void THNN_(SpatialAveragePooling_updateGradInput)( THNNState *state, THTensor *input, @@ -213,23 +218,46 @@ TH_API void THNN_(SpatialAveragePooling_updateGradInput)( int kW, int kH, int dW, int dH, int padW, int padH, - int ceil_mode, - int count_include_pad); + bool ceil_mode, + bool count_include_pad); TH_API void THNN_(SpatialMaxPooling_updateOutput)( THNNState *state, THTensor *input, THTensor *output, + THTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, - int ceil_mode, - THTensor *indices); + bool ceil_mode); TH_API void THNN_(SpatialMaxPooling_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, + THTensor *indices, + int kW, int kH, int dW, int dH, - THTensor *indices); + int padW, int padH, + bool ceil_mode); + +TH_API void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); +TH_API void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); + #endif diff --git a/generic/unfold.c b/generic/unfold.c new file mode 100644 index 00000000000..89a0759354e --- /dev/null +++ b/generic/unfold.c @@ -0,0 +1,139 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/unfold.c" +#else + 
+#ifdef _WIN32 +# include +#endif + +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ +void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, + int kW, int kH, int dW, int dH, int padW, int padH, + int nInputPlane, int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ +#ifdef _WIN32 + LONG_PTR nip; +#else + size_t nip; +#endif + + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(nip) + for(nip = 0; nip < nInputPlane; nip++) + { + size_t kw, kh, y, x; + long long ix = 0, iy = 0; + for(kh = 0; kh < kH; kh++) + { + for(kw = 0; kw < kW; kw++) + { + real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *dst = input_data + nip*(inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + size_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH - padH + kh); + if (iy < 0 || iy >= inputHeight) { + } else { + if (dW==1){ + ix = (long long)(0 - padW + kw); + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ + } + else{ + for (x=0; x= inputWidth){ + }else + THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1); + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH + kh); + ix = (long long)(0 + kw); + if (dW == 1 ) + THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */ + else{ + for(x = 0; x < outputWidth; x++) + THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1); + } + } + } + } + } + } +} + + +void THNN_(unfolded_copy)(THTensor *finput, THTensor *input, + int kW, 
int kH, int dW, int dH, int padW, int padH, + int nInputPlane, int inputWidth, int inputHeight, + int outputWidth, int outputHeight) +{ + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane*kH*kW; k++) { + size_t nip = k / (kH*kW); + size_t rest = k % (kH*kW); + size_t kh = rest / kW; + size_t kw = rest % kW; + size_t x,y; + long long ix,iy; + real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); + real *src = input_data + nip*(inputHeight*inputWidth); + if (padW > 0 || padH > 0) { + size_t lpad,rpad; + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH - padH + kh); + if (iy < 0 || iy >= inputHeight) { + memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth); + } else { + if (dW==1){ + ix = (long long)(0 - padW + kw); + lpad = fmaxf(0,padW-kw); + rpad = fmaxf(0,padW-(kW-kw-1)); + if (outputWidth-rpad-lpad <= 0) { + memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth); + } else { + if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad); + memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad)); + if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); + } + } + else{ + for (x=0; x= inputWidth) + memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1); + else + memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1)); + } + } + } + } + } else { + for(y = 0; y < outputHeight; y++) { + iy = (long long)(y*dH + kh); + ix = (long long)(0 + kw); + if (dW == 1) + memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth); + else{ + for (x=0; x Date: Fri, 22 Jan 2016 22:04:58 +0100 Subject: [PATCH 027/101] Move { MSECriterion, MarginCriterion, MultiLabelMarginCriterion, MultiMarginCriterion, PReLU }.c -> 
lib/THNN/generic --- generic/MSECriterion.c | 54 +++++++ generic/MarginCriterion.c | 56 +++++++ generic/MultiLabelMarginCriterion.c | 185 +++++++++++++++++++++ generic/MultiMarginCriterion.c | 164 +++++++++++++++++++ generic/PReLU.c | 238 ++++++++++++++++++++++++++++ init.c | 15 ++ 6 files changed, 712 insertions(+) create mode 100644 generic/MSECriterion.c create mode 100644 generic/MarginCriterion.c create mode 100644 generic/MultiLabelMarginCriterion.c create mode 100644 generic/MultiMarginCriterion.c create mode 100644 generic/PReLU.c diff --git a/generic/MSECriterion.c b/generic/MSECriterion.c new file mode 100644 index 00000000000..e46bb63fb1e --- /dev/null +++ b/generic/MSECriterion.c @@ -0,0 +1,54 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MSECriterion.c" +#else + +static int nn_(MSECriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = (*input_data - *target_data); + sum += z*z;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(MSECriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real norm = (sizeAverage ? 
2./((real)THTensor_(nElement)(input)) : 2.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = norm * (*input_data - *target_data);) + return 1; +} + +static const struct luaL_Reg nn_(MSECriterion__) [] = { + {"MSECriterion_updateOutput", nn_(MSECriterion_updateOutput)}, + {"MSECriterion_updateGradInput", nn_(MSECriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(MSECriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(MSECriterion__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/MarginCriterion.c b/generic/MarginCriterion.c new file mode 100644 index 00000000000..7269046664d --- /dev/null +++ b/generic/MarginCriterion.c @@ -0,0 +1,56 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MarginCriterion.c" +#else + +static int nn_(MarginCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real margin = luaT_getfieldchecknumber(L, 1, "margin"); + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = (margin - *input_data* *target_data); + sum += z>0 ? z : 0;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(MarginCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real margin = luaT_getfieldchecknumber(L, 1, "margin"); + real norm = (sizeAverage ? 
1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data * *target_data) < margin ? -norm* *target_data : 0;) + return 1; +} + +static const struct luaL_Reg nn_(MarginCriterion__) [] = { + {"MarginCriterion_updateOutput", nn_(MarginCriterion_updateOutput)}, + {"MarginCriterion_updateGradInput", nn_(MarginCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(MarginCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(MarginCriterion__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c new file mode 100644 index 00000000000..6812b22eb10 --- /dev/null +++ b/generic/MultiLabelMarginCriterion.c @@ -0,0 +1,185 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" +#else + +static int nn_(MultiLabelMarginCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real *input_data, *target_data; + long nframe, dim; + long t, d, dt, ddt; + THTensor *target; + real sum; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); + } + + THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); + THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); 
+ + target = THTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THTensor_(data)(target); + + sum = 0; + for(t = 0; t < nframe; t++) + { + for(dt = 0; dt < dim; dt++) + { + long target_idx = (long)target_data[dt]-1; + real input_target; + if(target_idx < 0) + break; + + input_target = input_data[target_idx]; + for(d = 0; d < dim; d++) + { + int istarget = 0; + for(ddt = 0; ddt < dim; ddt++) + { + if(!target_data[ddt]) + break; + if(((long)target_data[ddt])-1 == d) + istarget = 1; + } + + if(!istarget) + { + real z = 1 - input_target + input_data[d]; + if(z > 0) + sum += z; + } + } + } + input_data += dim; + target_data += dim; + } + + if(sizeAverage) + sum /= dim; + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + THTensor_(free)(input); + THTensor_(free)(target); + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(MultiLabelMarginCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real *input_data; + real *gradInput_data; + real *target_data; + long nframe, dim; + long t, d, dt, ddt; + THTensor *target; + real g; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); + } + + THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); + 
THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); + + target = THTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THTensor_(data)(target); + + g = (sizeAverage ? 1./((real)dim) : 1.); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + gradInput_data = THTensor_(data)(gradInput); + + for(t = 0; t < nframe; t++) + { + for(dt = 0; dt < dim; dt++) + { + long target_idx = (long)target_data[dt]-1; + real input_target; + if(target_idx < 0) + break; + + input_target = input_data[target_idx]; + for(d = 0; d < dim; d++) + { + int istarget = 0; + for(ddt = 0; ddt < dim; ddt++) + { + if(!target_data[ddt]) + break; + if(((long)target_data[ddt])-1 == d) + istarget = 1; + } + + if(!istarget) + { + real z = 1 - input_target + input_data[d]; + if(z > 0) + { + gradInput_data[target_idx] -= g; + gradInput_data[d] += g; + } + } + } + } + input_data += dim; + target_data += dim; + gradInput_data += dim; + } + + THTensor_(free)(input); + THTensor_(free)(target); + return 1; +} + +static const struct luaL_Reg nn_(MultiLabelMarginCriterion__) [] = { + {"MultiLabelMarginCriterion_updateOutput", nn_(MultiLabelMarginCriterion_updateOutput)}, + {"MultiLabelMarginCriterion_updateGradInput", nn_(MultiLabelMarginCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(MultiLabelMarginCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(MultiLabelMarginCriterion__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c new file mode 100644 index 00000000000..df7fc256f4d --- /dev/null +++ b/generic/MultiMarginCriterion.c @@ -0,0 +1,164 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" +#else + +static int nn_(MultiMarginCriterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int 
sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + int p = luaT_getfieldchecknumber(L, 1, "p"); + real *input_data, *target_data; + long nframe, dim; + long t, d; + real target_; + THTensor *target; + real sum; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + target_ = luaL_checknumber(L, 3); + target = THTensor_(newWithSize1d)(1); + THTensor_(fill)(target, target_); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size"); + target = THTensor_(newContiguous)(target); + } + + for(t = 0; t < nframe; t++) + { + real idx = THTensor_(get1d)(target, t); + THArgCheck((idx >= 1) && (idx <= dim), 3, "target out of range"); + } + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THTensor_(data)(target); + + sum = 0; + for(t = 0; t < nframe; t++) + { + long target_idx = (long)(target_data[t]-1); + real input_target = input_data[target_idx]; + for(d = 0; d < dim; d++) + { + real z = 1 - input_target + input_data[d]; + if(d == target_idx) + continue; + + if(z > 0) + sum += (p==1) ? 
z : z*z; + } + input_data += dim; + } + + if(sizeAverage) + sum /= dim; + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + THTensor_(free)(input); + THTensor_(free)(target); + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(MultiMarginCriterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + int p = luaT_getfieldchecknumber(L, 1, "p"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real *input_data; + real *gradInput_data; + real *target_data; + THTensor *target; + long nframe, dim; + long t, d; + real target_; + real g; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + target_ = luaL_checknumber(L, 3); + target = THTensor_(newWithSize1d)(1); + THTensor_(fill)(target, target_); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + target = luaT_checkudata(L, 3, torch_Tensor); + THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size"); + target = THTensor_(newContiguous)(target); + } + + g = (sizeAverage ? 1./((real)dim) : 1.); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + + target_data = THTensor_(data)(target); + + for(t = 0; t < nframe; t++) + { + long target_idx = (long)(target_data[t])-1; + real input_target = input_data[target_idx]; + real gradInput_target = 0; + for(d = 0; d < dim; d++) + { + real z = 1 - input_target + input_data[d]; + if(d == target_idx) + continue; + + if(z > 0) + { + real h = (p == 1) ? 
g : 2*g*z; + gradInput_target -= h; + gradInput_data[d] = h; + } + else + gradInput_data[d] = 0; + } + gradInput_data[target_idx] = gradInput_target; + + input_data += dim; + gradInput_data += dim; + } + + + THTensor_(free)(input); + THTensor_(free)(target); + return 1; +} + +static const struct luaL_Reg nn_(MultiMarginCriterion__) [] = { + {"MultiMarginCriterion_updateOutput", nn_(MultiMarginCriterion_updateOutput)}, + {"MultiMarginCriterion_updateGradInput", nn_(MultiMarginCriterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(MultiMarginCriterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(MultiMarginCriterion__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/PReLU.c b/generic/PReLU.c new file mode 100644 index 00000000000..0862c284e40 --- /dev/null +++ b/generic/PReLU.c @@ -0,0 +1,238 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/PReLU.c" +#else + + +static int nn_(PReLU_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + long nOutputPlane = luaT_getfieldchecknumber(L, 1, "nOutputPlane"); + + THTensor_(resizeAs)(output, input); + + if (nOutputPlane == 0) + { + // handle shared parameter case + real w = *THTensor_(data)(weight); + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data > 0) ? 
*input_data : w*(*input_data);); + } + else + { + long bs, ks; + { + long input_ndim = THTensor_(nDimension)(input); + switch (input_ndim) + { + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; + } + + if(input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); + } + + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + real* weight_data = THTensor_(data)(weight); + long i,j,k; +#pragma omp parallel for private(j,k) + for (i=0; i < bs; ++i) + { + real* n_input_data = input_data + i*nOutputPlane*ks; + real* n_output_data = output_data + i*nOutputPlane*ks; + for (j=0; j < nOutputPlane; ++j) + { + for (k=0; k < ks; ++k) + n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k]; + n_input_data += ks; + n_output_data += ks; + } + } + } + + return 1; +} + +static int nn_(PReLU_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + long nOutputPlane = luaT_getfieldchecknumber(L, 1, "nOutputPlane"); + + THTensor_(resizeAs)(gradInput, input); + + if (nOutputPlane == 0) + { + real w = THTensor_(data)(weight)[0]; + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if ((*input_data) > 0) *gradInput_data = *gradOutput_data; \ + else *gradInput_data = w* *gradOutput_data;); \ + } + else + { + const real* input_data = THTensor_(data)(input); + const real* gradOutput_data = THTensor_(data)(gradOutput); + const real* weight_data = THTensor_(data)(weight); + real* gradInput_data = THTensor_(data)(gradInput); + 
+ long bs, ks; + { + long input_ndim = THTensor_(nDimension)(input); + switch (input_ndim) + { + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; + } + + if(input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); + } + + long i,j,k; +#pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + const real* n_input_data = input_data + i*nOutputPlane*ks; + const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + real* n_gradInput_data = gradInput_data + i*nOutputPlane*ks; + + for (j=0; j < nOutputPlane; ++j) + { + real w = weight_data[j]; + for (k=0; k < ks; ++k) + if (n_input_data[k] > 0) + n_gradInput_data[k] = n_gradOutput_data[k]; + else + n_gradInput_data[k] = n_gradOutput_data[k] * w; + n_input_data += ks; + n_gradInput_data += ks; + n_gradOutput_data += ks; + } + } + } + + return 1; +} + +static int nn_(PReLU_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + long nOutputPlane = luaT_getfieldchecknumber(L, 1, "nOutputPlane"); + real scale = luaL_optnumber(L, 4, 1); + + real* gradWeight_data = THTensor_(data)(gradWeight); + + if (nOutputPlane == 0) + { + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, gradOutput, \ + if ((*input_data) <= 0) sum += *input_data* *gradOutput_data;); + gradWeight_data[0] += scale*sum; + } + else + { + long bs, ks; + { + long input_ndim = THTensor_(nDimension)(input); + switch (input_ndim) + { + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = 
input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; + } + + if(input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); + } + + const real* input_data = THTensor_(data)(input); + const real* gradOutput_data = THTensor_(data)(gradOutput); + const real* weight_data = THTensor_(data)(weight); + real* gradWeight_data = THTensor_(data)(gradWeight); + + long i,j,k; + for (i = 0; i < bs; ++i) + { + const real* n_input_data = input_data + i*nOutputPlane*ks; + const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + + for (j=0; j < nOutputPlane; ++j) + { + real sum = 0; + for (k=0; k < ks; ++k) + if (n_input_data[k] <= 0) + sum += n_gradOutput_data[k] * n_input_data[k]; + gradWeight_data[j] += scale * sum; + n_input_data += ks; + n_gradOutput_data += ks; + } + } + } + return 1; +} + + +static const struct luaL_Reg nn_(PReLU__) [] = { + {"PReLU_updateOutput", nn_(PReLU_updateOutput)}, + {"PReLU_updateGradInput", nn_(PReLU_updateGradInput)}, + {"PReLU_accGradParameters", nn_(PReLU_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(PReLU_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(PReLU__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/init.c b/init.c index 80a4a9f043a..e1a847edaf9 100644 --- a/init.c +++ b/init.c @@ -40,6 +40,21 @@ #include "generic/LookupTable.c" #include "THGenerateFloatTypes.h" +#include "generic/MSECriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiLabelMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/MultiMarginCriterion.c" +#include "THGenerateFloatTypes.h" + +#include "generic/PReLU.c" +#include "THGenerateFloatTypes.h" + #include "generic/unfold.c" #include "THGenerateFloatTypes.h" From 35dcbf02e3029e016b469bd6bbae2cd272fea533 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Fri, 22 Jan 2016 23:11:32 +0100 Subject: [PATCH 028/101] Add THNN conversion of {MarginCriterion, MSECriterion, MultiLabelMarginCriterion, MultiMarginCriterion, PReLU} --- generic/MSECriterion.c | 44 ++--- generic/MarginCriterion.c | 46 ++---- generic/MultiLabelMarginCriterion.c | 76 +++------ generic/MultiMarginCriterion.c | 73 ++------- generic/PReLU.c | 245 ++++++++++++---------------- generic/THNN.h | 80 +++++++++ 6 files changed, 246 insertions(+), 318 deletions(-) diff --git a/generic/MSECriterion.c b/generic/MSECriterion.c index e46bb63fb1e..048829581c4 100644 --- a/generic/MSECriterion.c +++ b/generic/MSECriterion.c @@ -2,53 +2,29 @@ #define TH_GENERIC_FILE "generic/MSECriterion.c" #else -static int nn_(MSECriterion_updateOutput)(lua_State *L) +void THNN_(MSECriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - real sum; + real sum = 0; - sum = 0; TH_TENSOR_APPLY2(real, input, real, target, - real z = (*input_data - *target_data); - sum += z*z;) + real z = (*input_data - *target_data); + sum += z*z; + ); - if(sizeAverage) + if (sizeAverage) sum /= THTensor_(nElement)(input); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + THTensor_(set1d)(output, 0, sum); } -static int nn_(MSECriterion_updateGradInput)(lua_State *L) +void THNN_(MSECriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real norm = 
(sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.); THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, - *gradInput_data = norm * (*input_data - *target_data);) - return 1; -} - -static const struct luaL_Reg nn_(MSECriterion__) [] = { - {"MSECriterion_updateOutput", nn_(MSECriterion_updateOutput)}, - {"MSECriterion_updateGradInput", nn_(MSECriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(MSECriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(MSECriterion__), "nn"); - lua_pop(L,1); + *gradInput_data = norm * (*input_data - *target_data); + ); } #endif diff --git a/generic/MarginCriterion.c b/generic/MarginCriterion.c index 7269046664d..340ef8031ca 100644 --- a/generic/MarginCriterion.c +++ b/generic/MarginCriterion.c @@ -2,55 +2,29 @@ #define TH_GENERIC_FILE "generic/MarginCriterion.c" #else -static int nn_(MarginCriterion_updateOutput)(lua_State *L) +void THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, real margin, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - real margin = luaT_getfieldchecknumber(L, 1, "margin"); - real sum; + real sum = 0; - sum = 0; TH_TENSOR_APPLY2(real, input, real, target, - real z = (margin - *input_data* *target_data); - sum += z>0 ? z : 0;) + real z = (margin - *input_data * *target_data); + sum += z>0 ? 
z : 0; + ); - if(sizeAverage) + if (sizeAverage) sum /= THTensor_(nElement)(input); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + THTensor_(set1d)(output, 0, sum); } -static int nn_(MarginCriterion_updateGradInput)(lua_State *L) +void THNN_(MarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, real margin, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - real margin = luaT_getfieldchecknumber(L, 1, "margin"); real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, - *gradInput_data = (*input_data * *target_data) < margin ? -norm* *target_data : 0;) - return 1; -} - -static const struct luaL_Reg nn_(MarginCriterion__) [] = { - {"MarginCriterion_updateOutput", nn_(MarginCriterion_updateOutput)}, - {"MarginCriterion_updateGradInput", nn_(MarginCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(MarginCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(MarginCriterion__), "nn"); - lua_pop(L,1); + *gradInput_data = (*input_data * *target_data) < margin ? 
-norm * *target_data : 0; + ); } #endif diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index 6812b22eb10..cc2e52f0f3d 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -2,30 +2,25 @@ #define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" #else -static int nn_(MultiLabelMarginCriterion_updateOutput)(lua_State *L) +void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); real *input_data, *target_data; long nframe, dim; long t, d, dt, ddt; - THTensor *target; real sum; THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); } else { nframe = input->size[0]; dim = input->size[1]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } @@ -38,31 +33,31 @@ static int nn_(MultiLabelMarginCriterion_updateOutput)(lua_State *L) target_data = THTensor_(data)(target); sum = 0; - for(t = 0; t < nframe; t++) + for (t = 0; t < nframe; t++) { - for(dt = 0; dt < dim; dt++) + for (dt = 0; dt < dim; dt++) { long target_idx = (long)target_data[dt]-1; real input_target; - if(target_idx < 0) + if (target_idx < 0) break; input_target = input_data[target_idx]; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) { int istarget = 0; for(ddt = 0; ddt < dim; ddt++) { - if(!target_data[ddt]) + if (!target_data[ddt]) break; - if(((long)target_data[ddt])-1 == d) + if (((long)target_data[ddt])-1 == 
d) istarget = 1; } - if(!istarget) + if (!istarget) { real z = 1 - input_target + input_data[d]; - if(z > 0) + if (z > 0) sum += z; } } @@ -71,45 +66,36 @@ static int nn_(MultiLabelMarginCriterion_updateOutput)(lua_State *L) target_data += dim; } - if(sizeAverage) + if (sizeAverage) sum /= dim; - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - + THTensor_(set1d)(output, 0, sum); + THTensor_(free)(input); THTensor_(free)(target); - lua_pushnumber(L, sum); - return 1; } -static int nn_(MultiLabelMarginCriterion_updateGradInput)(lua_State *L) +void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real *input_data; real *gradInput_data; real *target_data; long nframe, dim; long t, d, dt, ddt; - THTensor *target; real g; THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); } else { nframe = input->size[0]; dim = input->size[1]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } @@ -127,31 +113,31 @@ static int nn_(MultiLabelMarginCriterion_updateGradInput)(lua_State *L) THTensor_(zero)(gradInput); gradInput_data = THTensor_(data)(gradInput); - for(t = 0; t < nframe; t++) + for (t = 0; t < nframe; t++) { - for(dt = 0; dt < dim; dt++) + for (dt = 0; dt < dim; dt++) { long target_idx = (long)target_data[dt]-1; real input_target; - if(target_idx < 0) + if 
(target_idx < 0) break; input_target = input_data[target_idx]; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) { int istarget = 0; - for(ddt = 0; ddt < dim; ddt++) + for (ddt = 0; ddt < dim; ddt++) { - if(!target_data[ddt]) + if (!target_data[ddt]) break; - if(((long)target_data[ddt])-1 == d) + if (((long)target_data[ddt])-1 == d) istarget = 1; } - if(!istarget) + if (!istarget) { real z = 1 - input_target + input_data[d]; - if(z > 0) + if (z > 0) { gradInput_data[target_idx] -= g; gradInput_data[d] += g; @@ -166,20 +152,6 @@ static int nn_(MultiLabelMarginCriterion_updateGradInput)(lua_State *L) THTensor_(free)(input); THTensor_(free)(target); - return 1; -} - -static const struct luaL_Reg nn_(MultiLabelMarginCriterion__) [] = { - {"MultiLabelMarginCriterion_updateOutput", nn_(MultiLabelMarginCriterion_updateOutput)}, - {"MultiLabelMarginCriterion_updateGradInput", nn_(MultiLabelMarginCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(MultiLabelMarginCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(MultiLabelMarginCriterion__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index df7fc256f4d..9cb1686cc67 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -2,113 +2,91 @@ #define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" #else -static int nn_(MultiMarginCriterion_updateOutput)(lua_State *L) +void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor* output, bool sizeAverage, int p) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - int p = luaT_getfieldchecknumber(L, 1, "p"); real *input_data, *target_data; long nframe, dim; long t, d; - real target_; - THTensor *target; real sum; THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix 
expected"); - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; - target_ = luaL_checknumber(L, 3); - target = THTensor_(newWithSize1d)(1); - THTensor_(fill)(target, target_); } else { nframe = input->size[0]; dim = input->size[1]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size"); - target = THTensor_(newContiguous)(target); } - for(t = 0; t < nframe; t++) + for (t = 0; t < nframe; t++) { real idx = THTensor_(get1d)(target, t); THArgCheck((idx >= 1) && (idx <= dim), 3, "target out of range"); } input = THTensor_(newContiguous)(input); + target = THTensor_(newContiguous)(target); input_data = THTensor_(data)(input); target_data = THTensor_(data)(target); sum = 0; - for(t = 0; t < nframe; t++) + for (t = 0; t < nframe; t++) { long target_idx = (long)(target_data[t]-1); real input_target = input_data[target_idx]; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) { real z = 1 - input_target + input_data[d]; - if(d == target_idx) + if (d == target_idx) continue; - if(z > 0) - sum += (p==1) ? z : z*z; + if (z > 0) + sum += (p == 1) ? 
z : z*z; } input_data += dim; } - if(sizeAverage) + if (sizeAverage) sum /= dim; - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); + THTensor_(set1d)(output, 0, sum); THTensor_(free)(input); THTensor_(free)(target); - lua_pushnumber(L, sum); - return 1; } -static int nn_(MultiMarginCriterion_updateGradInput)(lua_State *L) +void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, int p) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - int p = luaT_getfieldchecknumber(L, 1, "p"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real *input_data; real *gradInput_data; real *target_data; - THTensor *target; long nframe, dim; long t, d; - real target_; real g; THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; - target_ = luaL_checknumber(L, 3); - target = THTensor_(newWithSize1d)(1); - THTensor_(fill)(target, target_); } else { nframe = input->size[0]; dim = input->size[1]; - target = luaT_checkudata(L, 3, torch_Tensor); THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size"); - target = THTensor_(newContiguous)(target); } g = (sizeAverage ? 
1./((real)dim) : 1.); input = THTensor_(newContiguous)(input); + target = THTensor_(newContiguous)(target); input_data = THTensor_(data)(input); THTensor_(resizeAs)(gradInput, input); @@ -116,18 +94,18 @@ static int nn_(MultiMarginCriterion_updateGradInput)(lua_State *L) target_data = THTensor_(data)(target); - for(t = 0; t < nframe; t++) + for (t = 0; t < nframe; t++) { long target_idx = (long)(target_data[t])-1; real input_target = input_data[target_idx]; real gradInput_target = 0; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) { real z = 1 - input_target + input_data[d]; - if(d == target_idx) + if (d == target_idx) continue; - if(z > 0) + if (z > 0) { real h = (p == 1) ? g : 2*g*z; gradInput_target -= h; @@ -142,23 +120,8 @@ static int nn_(MultiMarginCriterion_updateGradInput)(lua_State *L) gradInput_data += dim; } - THTensor_(free)(input); THTensor_(free)(target); - return 1; -} - -static const struct luaL_Reg nn_(MultiMarginCriterion__) [] = { - {"MultiMarginCriterion_updateOutput", nn_(MultiMarginCriterion_updateOutput)}, - {"MultiMarginCriterion_updateGradInput", nn_(MultiMarginCriterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(MultiMarginCriterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(MultiMarginCriterion__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/PReLU.c b/generic/PReLU.c index 0862c284e40..31a1312acf8 100644 --- a/generic/PReLU.c +++ b/generic/PReLU.c @@ -2,22 +2,17 @@ #define TH_GENERIC_FILE "generic/PReLU.c" #else - -static int nn_(PReLU_updateOutput)(lua_State *L) +void THNN_(PReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THIndex_t nOutputPlane) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - long nOutputPlane = luaT_getfieldchecknumber(L, 1, 
"nOutputPlane"); - THTensor_(resizeAs)(output, input); if (nOutputPlane == 0) { // handle shared parameter case real w = *THTensor_(data)(weight); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = (*input_data > 0) ? *input_data : w*(*input_data);); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > 0) ? *input_data : w*(*input_data); + ); } else { @@ -26,102 +21,95 @@ static int nn_(PReLU_updateOutput)(lua_State *L) long input_ndim = THTensor_(nDimension)(input); switch (input_ndim) { - case 1: - bs = 1; - ks = 1; - break; - case 2: - bs = input->size[0]; - ks = 1; - break; - case 3: - bs = 1; - ks = input->size[1] * input->size[2]; - break; - case 4: - bs = input->size[0]; - ks = input->size[2] * input->size[3]; - break; + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; } - if(input->size[(input_ndim + 1) % 2] != nOutputPlane) - THError("wrong number of input planes"); + if (input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); } - real* output_data = THTensor_(data)(output); - real* input_data = THTensor_(data)(input); - real* weight_data = THTensor_(data)(weight); - long i,j,k; + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + real *weight_data = THTensor_(data)(weight); + THIndex_t i, j, k; #pragma omp parallel for private(j,k) - for (i=0; i < bs; ++i) + for (i = 0; i < bs; ++i) { real* n_input_data = input_data + i*nOutputPlane*ks; real* n_output_data = output_data + i*nOutputPlane*ks; - for (j=0; j < nOutputPlane; ++j) + for (j = 0; j < nOutputPlane; ++j) { - for (k=0; k < ks; ++k) - n_output_data[k] = (n_input_data[k] > 0) ? 
n_input_data[k] : weight_data[j] * n_input_data[k]; - n_input_data += ks; - n_output_data += ks; + for (k = 0; k < ks; ++k) + n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k]; + n_input_data += ks; + n_output_data += ks; } } } - - return 1; } -static int nn_(PReLU_updateGradInput)(lua_State *L) +void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THIndex_t nOutputPlane) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - long nOutputPlane = luaT_getfieldchecknumber(L, 1, "nOutputPlane"); - THTensor_(resizeAs)(gradInput, input); if (nOutputPlane == 0) { real w = THTensor_(data)(weight)[0]; - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if ((*input_data) > 0) *gradInput_data = *gradOutput_data; \ - else *gradInput_data = w* *gradOutput_data;); \ + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > 0) *gradInput_data = *gradOutput_data; + else *gradInput_data = w* *gradOutput_data; + ); } else { - const real* input_data = THTensor_(data)(input); - const real* gradOutput_data = THTensor_(data)(gradOutput); - const real* weight_data = THTensor_(data)(weight); - real* gradInput_data = THTensor_(data)(gradInput); + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradInput_data = THTensor_(data)(gradInput); long bs, ks; { long input_ndim = THTensor_(nDimension)(input); switch (input_ndim) { - case 1: - bs = 1; - ks = 1; - break; - case 2: - bs = input->size[0]; - ks = 1; - break; - case 3: - bs = 1; - ks = input->size[1] * input->size[2]; - break; 
- case 4: - bs = input->size[0]; - ks = input->size[2] * input->size[3]; - break; + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; } - if(input->size[(input_ndim + 1) % 2] != nOutputPlane) - THError("wrong number of input planes"); + if (input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); } - long i,j,k; + THIndex_t i, j, k; #pragma omp parallel for private(j,k) for (i = 0; i < bs; ++i) { @@ -129,41 +117,32 @@ static int nn_(PReLU_updateGradInput)(lua_State *L) const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; real* n_gradInput_data = gradInput_data + i*nOutputPlane*ks; - for (j=0; j < nOutputPlane; ++j) + for (j = 0; j < nOutputPlane; ++j) { - real w = weight_data[j]; - for (k=0; k < ks; ++k) - if (n_input_data[k] > 0) - n_gradInput_data[k] = n_gradOutput_data[k]; - else - n_gradInput_data[k] = n_gradOutput_data[k] * w; - n_input_data += ks; - n_gradInput_data += ks; - n_gradOutput_data += ks; + real w = weight_data[j]; + for (k = 0; k < ks; ++k) + if (n_input_data[k] > 0) + n_gradInput_data[k] = n_gradOutput_data[k]; + else + n_gradInput_data[k] = n_gradOutput_data[k] * w; + n_input_data += ks; + n_gradInput_data += ks; + n_gradOutput_data += ks; } } } - - return 1; } -static int nn_(PReLU_accGradParameters)(lua_State *L) +void THNN_(PReLU_accGradParameters)(THNNState* state, THTensor* input, THTensor* gradOutput, THTensor* gradInput, THTensor *weight, THTensor *gradWeight, THIndex_t nOutputPlane, real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - long nOutputPlane = 
luaT_getfieldchecknumber(L, 1, "nOutputPlane"); - real scale = luaL_optnumber(L, 4, 1); - real* gradWeight_data = THTensor_(data)(gradWeight); if (nOutputPlane == 0) { real sum = 0; TH_TENSOR_APPLY2(real, input, real, gradOutput, \ - if ((*input_data) <= 0) sum += *input_data* *gradOutput_data;); - gradWeight_data[0] += scale*sum; + if ((*input_data) <= 0) sum += *input_data* *gradOutput_data;); + gradWeight_data[0] += scale*sum; } else { @@ -172,67 +151,51 @@ static int nn_(PReLU_accGradParameters)(lua_State *L) long input_ndim = THTensor_(nDimension)(input); switch (input_ndim) { - case 1: - bs = 1; - ks = 1; - break; - case 2: - bs = input->size[0]; - ks = 1; - break; - case 3: - bs = 1; - ks = input->size[1] * input->size[2]; - break; - case 4: - bs = input->size[0]; - ks = input->size[2] * input->size[3]; - break; + case 1: + bs = 1; + ks = 1; + break; + case 2: + bs = input->size[0]; + ks = 1; + break; + case 3: + bs = 1; + ks = input->size[1] * input->size[2]; + break; + case 4: + bs = input->size[0]; + ks = input->size[2] * input->size[3]; + break; } - if(input->size[(input_ndim + 1) % 2] != nOutputPlane) - THError("wrong number of input planes"); + if (input->size[(input_ndim + 1) % 2] != nOutputPlane) + THError("wrong number of input planes"); } - const real* input_data = THTensor_(data)(input); - const real* gradOutput_data = THTensor_(data)(gradOutput); - const real* weight_data = THTensor_(data)(weight); - real* gradWeight_data = THTensor_(data)(gradWeight); + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradWeight_data = THTensor_(data)(gradWeight); - long i,j,k; + THIndex_t i, j, k; for (i = 0; i < bs; ++i) { const real* n_input_data = input_data + i*nOutputPlane*ks; const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; - for (j=0; j < nOutputPlane; ++j) + for (j = 0; j < nOutputPlane; ++j) { - real sum = 0; - 
for (k=0; k < ks; ++k) - if (n_input_data[k] <= 0) - sum += n_gradOutput_data[k] * n_input_data[k]; - gradWeight_data[j] += scale * sum; - n_input_data += ks; - n_gradOutput_data += ks; + real sum = 0; + for (k = 0; k < ks; ++k) + if (n_input_data[k] <= 0) + sum += n_gradOutput_data[k] * n_input_data[k]; + gradWeight_data[j] += scale * sum; + n_input_data += ks; + n_gradOutput_data += ks; } } } - return 1; -} - - -static const struct luaL_Reg nn_(PReLU__) [] = { - {"PReLU_updateOutput", nn_(PReLU_updateOutput)}, - {"PReLU_updateGradInput", nn_(PReLU_updateGradInput)}, - {"PReLU_accGradParameters", nn_(PReLU_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(PReLU_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(PReLU__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index e0f5cefd55f..5675b050dce 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -152,6 +152,86 @@ TH_API void THNN_(LookupTable_accGradParameters)( THTensor *sorted, THTensor *indices); + +TH_API void THNN_(MarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + real margin, + bool sizeAverage); +TH_API void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + real margin, + bool sizeAverage); + +TH_API void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +TH_API void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + 
bool sizeAverage); + +TH_API void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor* output, + bool sizeAverage, + int p); +TH_API void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + int p); + +TH_API void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THIndex_t nOutputPlane); +TH_API void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THIndex_t nOutputPlane); +TH_API void THNN_(PReLU_accGradParameters)( + THNNState* state, + THTensor* input, + THTensor* gradOutput, + THTensor* gradInput, + THTensor *weight, + THTensor *gradWeight, + THIndex_t nOutputPlane, + real scale); + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, From 3e865d0c6271b22bbe15776cc747f8510af88f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Mon, 25 Jan 2016 00:50:43 +0100 Subject: [PATCH 029/101] Add gradWeightBuf & gradWeightBuf2 params to PReLU_accGradParameters --- generic/MarginCriterion.c | 4 ++-- generic/PReLU.c | 40 +++++++++++++++++++++++++++------------ generic/THNN.h | 10 ++++++---- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/generic/MarginCriterion.c b/generic/MarginCriterion.c index 340ef8031ca..4c88318bd46 100644 --- a/generic/MarginCriterion.c +++ b/generic/MarginCriterion.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/MarginCriterion.c" #else -void THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, real margin, bool sizeAverage) +void THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, real margin) { real sum = 0; @@ -17,7 +17,7 @@ void 
THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTe THTensor_(set1d)(output, 0, sum); } -void THNN_(MarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, real margin, bool sizeAverage) +void THNN_(MarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, real margin) { real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); diff --git a/generic/PReLU.c b/generic/PReLU.c index 31a1312acf8..9a828dfacb3 100644 --- a/generic/PReLU.c +++ b/generic/PReLU.c @@ -71,8 +71,10 @@ void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *g { real w = THTensor_(data)(weight)[0]; TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, - if ((*input_data) > 0) *gradInput_data = *gradOutput_data; - else *gradInput_data = w* *gradOutput_data; + if ((*input_data) > 0) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = w * (*gradOutput_data); ); } else @@ -113,18 +115,20 @@ void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *g #pragma omp parallel for private(j,k) for (i = 0; i < bs; ++i) { - const real* n_input_data = input_data + i*nOutputPlane*ks; - const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; - real* n_gradInput_data = gradInput_data + i*nOutputPlane*ks; + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks; for (j = 0; j < nOutputPlane; ++j) { real w = weight_data[j]; for (k = 0; k < ks; ++k) + { if (n_input_data[k] > 0) n_gradInput_data[k] = n_gradOutput_data[k]; else n_gradInput_data[k] = n_gradOutput_data[k] * w; + } n_input_data += ks; n_gradInput_data += ks; n_gradOutput_data += ks; @@ -133,16 +137,28 @@ void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *g } 
} -void THNN_(PReLU_accGradParameters)(THNNState* state, THTensor* input, THTensor* gradOutput, THTensor* gradInput, THTensor *weight, THTensor *gradWeight, THIndex_t nOutputPlane, real scale) +void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, + THIndex_t nOutputPlane, + real scale) { - real* gradWeight_data = THTensor_(data)(gradWeight); + real *gradWeight_data = THTensor_(data)(gradWeight); if (nOutputPlane == 0) { real sum = 0; - TH_TENSOR_APPLY2(real, input, real, gradOutput, \ - if ((*input_data) <= 0) sum += *input_data* *gradOutput_data;); - gradWeight_data[0] += scale*sum; + TH_TENSOR_APPLY2(real, input, real, gradOutput, + if ((*input_data) <= 0) + sum += (*input_data) * (*gradOutput_data); + ); + gradWeight_data[0] += scale * sum; } else { @@ -181,8 +197,8 @@ void THNN_(PReLU_accGradParameters)(THNNState* state, THTensor* input, THTensor* THIndex_t i, j, k; for (i = 0; i < bs; ++i) { - const real* n_input_data = input_data + i*nOutputPlane*ks; - const real* n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; for (j = 0; j < nOutputPlane; ++j) { diff --git a/generic/THNN.h b/generic/THNN.h index 5675b050dce..d8fe30d96ce 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -158,15 +158,15 @@ TH_API void THNN_(MarginCriterion_updateOutput)( THTensor *input, THTensor *target, THTensor *output, - real margin, - bool sizeAverage); + bool sizeAverage, + real margin); TH_API void THNN_(MarginCriterion_updateGradInput)( THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, - real margin, - bool sizeAverage); + bool sizeAverage, + real margin); TH_API void THNN_(MSECriterion_updateOutput)( THNNState *state, @@ -229,6 +229,8 @@ TH_API void 
THNN_(PReLU_accGradParameters)( THTensor* gradInput, THTensor *weight, THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, THIndex_t nOutputPlane, real scale); From 1eea11e01955361d3afafa1374b7596867ca48a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Mon, 25 Jan 2016 22:35:03 +0100 Subject: [PATCH 030/101] Move {RReLU, Sigmoid, SmoothL1Criterion, SoftMax, SoftPlus}.c -> lib/THNN/generic --- generic/RReLU.c | 139 ++++++++++++++++++++++++++++++++ generic/Sigmoid.c | 44 ++++++++++ generic/SmoothL1Criterion.c | 60 ++++++++++++++ generic/SoftMax.c | 156 ++++++++++++++++++++++++++++++++++++ generic/SoftPlus.c | 56 +++++++++++++ generic/THNN.h | 2 +- init.c | 15 ++++ 7 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 generic/RReLU.c create mode 100644 generic/Sigmoid.c create mode 100644 generic/SmoothL1Criterion.c create mode 100644 generic/SoftMax.c create mode 100644 generic/SoftPlus.c diff --git a/generic/RReLU.c b/generic/RReLU.c new file mode 100644 index 00000000000..19b92b6e8c0 --- /dev/null +++ b/generic/RReLU.c @@ -0,0 +1,139 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/RReLU.c" +#else + +static int nn_(RReLU_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *noise = luaT_getfieldcheckudata(L, 1, "noise", torch_Tensor); + real lower = luaT_getfieldchecknumber(L, 1, "lower"); + real upper = luaT_getfieldchecknumber(L, 1, "upper"); + int train = luaT_getfieldcheckboolean(L, 1, "train"); + int inplace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (train) + { + // get default random generator + lua_getglobal(L, "torch"); + THGenerator *generator = luaT_getfieldcheckudata(L, -1, "_gen", torch_Generator); + lua_pop(L, 2); + + THTensor_(resizeAs)(noise, input); + if (inplace) + { + TH_TENSOR_APPLY2(real, input, real, noise, \ + if (*input_data <= 0) { \ + 
const real r = (real)THRandom_uniform(generator, lower, upper); \ + *input_data = (*input_data) * r; \ + *noise_data = r; \ + } \ + else { \ + *noise_data = 1; \ + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, output, real, noise, \ + if (*input_data <= 0) { \ + const real r = (real)THRandom_uniform(generator, lower, upper); \ + *output_data = (*input_data) * r; \ + *noise_data = r; \ + } \ + else { \ + *output_data = *input_data; + *noise_data = 1; \ + } + ); + } + } + else + { + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY(real, input, \ + if (*input_data <= 0) { \ + *input_data = *input_data * negSlope; \ + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, \ + const real r = (*input_data) <= 0 ? negSlope : 1; \ + *output_data = *input_data * r; \ + ); + } + } + return 1; +} + +static int nn_(RReLU_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + THTensor *noise = luaT_getfieldcheckudata(L, 1, "noise", torch_Tensor); + real lower = luaT_getfieldchecknumber(L, 1, "lower"); + real upper = luaT_getfieldchecknumber(L, 1, "upper"); + int train = luaT_getfieldcheckboolean(L, 1, "train"); + int inplace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THTensor_(cmul)(gradOutput, gradOutput, noise); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + THTensor_(cmul)(gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, \ + if (*input_data <= 0) { \ + *gradOutput_data = (*gradOutput_data) * negSlope; \ + } \ + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data); \ + ); + } + } + return 1; +} + +static const struct luaL_Reg nn_(RReLU__) [] = { + { "RReLU_updateOutput", nn_(RReLU_updateOutput) }, + { "RReLU_updateGradInput", nn_(RReLU_updateGradInput) }, + { NULL, NULL } +}; + +static void nn_(RReLU_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(RReLU__), "nn"); + lua_pop(L, 1); +} + +#endif diff --git a/generic/Sigmoid.c b/generic/Sigmoid.c new file mode 100644 index 00000000000..057ebc4f5af --- /dev/null +++ b/generic/Sigmoid.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sigmoid.c" +#else + +static int nn_(Sigmoid_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = 1./(1.+ exp(- *input_data));) + + return 1; +} + +static int nn_(Sigmoid_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", 
torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. - z) * z;) + return 1; +} + +static const struct luaL_Reg nn_(Sigmoid__) [] = { + {"Sigmoid_updateOutput", nn_(Sigmoid_updateOutput)}, + {"Sigmoid_updateGradInput", nn_(Sigmoid_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(Sigmoid_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(Sigmoid__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SmoothL1Criterion.c b/generic/SmoothL1Criterion.c new file mode 100644 index 00000000000..51cab0c46a8 --- /dev/null +++ b/generic/SmoothL1Criterion.c @@ -0,0 +1,60 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" +#else + +static int nn_(SmoothL1Criterion_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = fabs(*input_data - *target_data); + sum += z < 1 ? 0.5*z*z : z - 0.5;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + lua_pushnumber(L, sum); + lua_setfield(L, 1, "output"); + + lua_pushnumber(L, sum); + return 1; +} + +static int nn_(SmoothL1Criterion_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *target = luaT_checkudata(L, 3, torch_Tensor); + int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real norm = (sizeAverage ? 
1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if(x < -1.) + *gradInput_data = - norm; + else if(x > 1.) + *gradInput_data = norm; + else + *gradInput_data = norm * x;) + return 1; +} + +static const struct luaL_Reg nn_(SmoothL1Criterion__) [] = { + {"SmoothL1Criterion_updateOutput", nn_(SmoothL1Criterion_updateOutput)}, + {"SmoothL1Criterion_updateGradInput", nn_(SmoothL1Criterion_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SmoothL1Criterion_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SmoothL1Criterion__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SoftMax.c b/generic/SoftMax.c new file mode 100644 index 00000000000..0201aaf02c2 --- /dev/null +++ b/generic/SoftMax.c @@ -0,0 +1,156 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMax.c" +#else + +static int nn_(SoftMax_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + real *input_data, *output_data; + long nframe = 0, dim = 0, stride = 0; + long t; + + if(input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + stride = 1; + } + else if(input->nDimension == 2) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = 1; + } + else if(input->nDimension == 3) + { + nframe = 1; + dim = input->size[0]; + stride = input->size[1]*input->size[2]; + } + else if(input->nDimension == 4) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = input->size[2]*input->size[3]; + } + else + THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected"); + + input = THTensor_(newContiguous)(input); + THTensor_(resizeAs)(output, input); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(t) + for(t = 0; 
t < stride*nframe; t++) + { + real *input_ptr = input_data + (t/stride)*dim*stride + t % stride; + real *output_ptr = output_data + (t/stride)*dim*stride + t % stride; + + real inputMax = -THInf; + accreal sum; + + long d; + for(d = 0; d < dim; d++) { + if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride]; + } + + sum = 0; + for(d = 0; d < dim; d++) { + real z = THExpMinusApprox(inputMax - input_ptr[d*stride]); + output_ptr[d*stride] = z; + sum += z; + } + + for(d = 0; d < dim; d++) { + output_ptr[d*stride] *= 1/sum; + } + } + + THTensor_(free)(input); + + return 1; +} + +static int nn_(SoftMax_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real *gradInput_data, *gradOutput_data, *output_data; + long nframe = 0, dim = 0, stride = 0; + long t; + + if(output->nDimension == 1) + { + nframe = 1; + dim = output->size[0]; + stride = 1; + } + else if(output->nDimension == 2) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = 1; + } + else if(output->nDimension == 3) + { + nframe = 1; + dim = output->size[0]; + stride = output->size[1]*output->size[2]; + } + else if(output->nDimension == 4) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = output->size[2]*output->size[3]; + } + else + THError("1D, 2D, 3D or 4D tensor expected"); + + gradOutput = THTensor_(newContiguous)(gradOutput); + output = THTensor_(newContiguous)(output); + + THTensor_(resizeAs)(gradInput, output); + gradInput_data = THTensor_(data)(gradInput); + output_data = THTensor_(data)(output); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(t) + for(t = 0; t < stride*nframe; t++) + { + real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride; + real *output_ptr = output_data + (t/stride)*dim*stride + t 
% stride; + real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride; + + long d; + accreal sum = 0; + for(d = 0; d < dim; d++) + sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride]; + + for(d = 0; d < dim; d++) + gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum); + } + + THTensor_(free)(gradOutput); + THTensor_(free)(output); + + return 1; +} + +static const struct luaL_Reg nn_(SoftMax__) [] = { + {"SoftMax_updateOutput", nn_(SoftMax_updateOutput)}, + {"SoftMax_updateGradInput", nn_(SoftMax_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SoftMax_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SoftMax__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SoftPlus.c b/generic/SoftPlus.c new file mode 100644 index 00000000000..81f2a7ce167 --- /dev/null +++ b/generic/SoftPlus.c @@ -0,0 +1,56 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftPlus.c" +#else + +static int nn_(SoftPlus_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + real beta = luaT_getfieldchecknumber(L, 1, "beta"); + real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); + + THTensor_(resizeAs)(output, input); + + /* f(x) = 1/beta * log(1 + exp(beta * x)) */ + + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data * beta) > threshold ? 
*input_data : THLog1p(exp(*input_data * beta)) / beta;) + + return 1; +} + +static int nn_(SoftPlus_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + real beta = luaT_getfieldchecknumber(L, 1, "beta"); + real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); + + /* d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) + SINCE + y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) + THEREFORE: + d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) */ + + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + real z = exp(*output_data * beta); \ + *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;) + return 1; +} + +static const struct luaL_Reg nn_(SoftPlus__) [] = { + {"SoftPlus_updateOutput", nn_(SoftPlus_updateOutput)}, + {"SoftPlus_updateGradInput", nn_(SoftPlus_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SoftPlus_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SoftPlus__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index d8fe30d96ce..871ca976659 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -233,7 +233,7 @@ TH_API void THNN_(PReLU_accGradParameters)( THTensor *gradWeightBuf2, THIndex_t nOutputPlane, real scale); - + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index e1a847edaf9..43572b92721 100644 --- a/init.c +++ b/init.c @@ -55,6 +55,21 @@ #include "generic/PReLU.c" #include "THGenerateFloatTypes.h" +#include "generic/RReLU.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sigmoid.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SmoothL1Criterion.c" +#include 
"THGenerateFloatTypes.h" + +#include "generic/SoftMax.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SoftPlus.c" +#include "THGenerateFloatTypes.h" + #include "generic/unfold.c" #include "THGenerateFloatTypes.h" From db88a2b38b10e4ece8a323beb8259ee238c62aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Mon, 25 Jan 2016 23:23:02 +0100 Subject: [PATCH 031/101] Add THNN conversion of {RReLU, Sigmoid, SmoothL1Criterion,SoftMax, SoftPlus} --- generic/MultiMarginCriterion.c | 2 +- generic/RReLU.c | 102 ++++++++++++--------------------- generic/Sigmoid.c | 39 +++---------- generic/SmoothL1Criterion.c | 57 ++++++------------ generic/SoftMax.c | 63 ++++++++------------ generic/SoftPlus.c | 56 +++++------------- generic/THNN.h | 71 +++++++++++++++++++++++ 7 files changed, 172 insertions(+), 218 deletions(-) diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index 9cb1686cc67..6445bb040fc 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" #else -void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor* output, bool sizeAverage, int p) +void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, int p) { real *input_data, *target_data; long nframe, dim; diff --git a/generic/RReLU.c b/generic/RReLU.c index 19b92b6e8c0..74c5df547ad 100644 --- a/generic/RReLU.c +++ b/generic/RReLU.c @@ -2,34 +2,24 @@ #define TH_GENERIC_FILE "generic/RReLU.c" #else -static int nn_(RReLU_updateOutput)(lua_State *L) +void THNN_(RReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *noise, real lower, real upper, bool train, bool inplace, THGenerator *generator) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", 
torch_Tensor); - THTensor *noise = luaT_getfieldcheckudata(L, 1, "noise", torch_Tensor); - real lower = luaT_getfieldchecknumber(L, 1, "lower"); - real upper = luaT_getfieldchecknumber(L, 1, "upper"); - int train = luaT_getfieldcheckboolean(L, 1, "train"); - int inplace = luaT_getfieldcheckboolean(L, 1, "inplace"); - if (train) { // get default random generator - lua_getglobal(L, "torch"); - THGenerator *generator = luaT_getfieldcheckudata(L, -1, "_gen", torch_Generator); - lua_pop(L, 2); - THTensor_(resizeAs)(noise, input); if (inplace) { - TH_TENSOR_APPLY2(real, input, real, noise, \ - if (*input_data <= 0) { \ - const real r = (real)THRandom_uniform(generator, lower, upper); \ - *input_data = (*input_data) * r; \ - *noise_data = r; \ - } \ - else { \ - *noise_data = 1; \ + TH_TENSOR_APPLY2(real, input, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *input_data = (*input_data) * r; + *noise_data = r; + } + else + { + *noise_data = 1; } ); THTensor_(set)(output, input); @@ -37,15 +27,17 @@ static int nn_(RReLU_updateOutput)(lua_State *L) else { THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY3(real, input, real, output, real, noise, \ - if (*input_data <= 0) { \ - const real r = (real)THRandom_uniform(generator, lower, upper); \ - *output_data = (*input_data) * r; \ - *noise_data = r; \ - } \ - else { \ + TH_TENSOR_APPLY3(real, input, real, output, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *output_data = (*input_data) * r; + *noise_data = r; + } + else + { *output_data = *input_data; - *noise_data = 1; \ + *noise_data = 1; } ); } @@ -55,9 +47,10 @@ static int nn_(RReLU_updateOutput)(lua_State *L) const real negSlope = (lower + upper) / 2; if (inplace) { - TH_TENSOR_APPLY(real, input, \ - if (*input_data <= 0) { \ - *input_data = *input_data * negSlope; \ + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + { + *input_data = 
*input_data * negSlope; } ); THTensor_(set)(output, input); @@ -65,26 +58,16 @@ static int nn_(RReLU_updateOutput)(lua_State *L) else { THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, input, real, output, \ - const real r = (*input_data) <= 0 ? negSlope : 1; \ - *output_data = *input_data * r; \ + TH_TENSOR_APPLY2(real, input, real, output, + const real r = (*input_data) <= 0 ? negSlope : 1; + *output_data = *input_data * r; ); } } - return 1; } -static int nn_(RReLU_updateGradInput)(lua_State *L) +void THNN_(RReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *noise, real lower, real upper, bool train, bool inplace) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor *noise = luaT_getfieldcheckudata(L, 1, "noise", torch_Tensor); - real lower = luaT_getfieldchecknumber(L, 1, "lower"); - real upper = luaT_getfieldchecknumber(L, 1, "upper"); - int train = luaT_getfieldcheckboolean(L, 1, "train"); - int inplace = luaT_getfieldcheckboolean(L, 1, "inplace"); - if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU { // multiply the gradient by the noise tensor @@ -105,35 +88,22 @@ static int nn_(RReLU_updateGradInput)(lua_State *L) const real negSlope = (lower + upper) / 2; if (inplace) { - TH_TENSOR_APPLY2(real, gradOutput, real, input, \ - if (*input_data <= 0) { \ - *gradOutput_data = (*gradOutput_data) * negSlope; \ - } \ + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + { + *gradOutput_data = (*gradOutput_data) * negSlope; + } ); THTensor_(set)(gradInput, gradOutput); } else { THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - *gradInput_data = (*input_data) <= 0 ? 
(*gradOutput_data) * negSlope : (*gradOutput_data); \ + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data); ); } } - return 1; -} - -static const struct luaL_Reg nn_(RReLU__) [] = { - { "RReLU_updateOutput", nn_(RReLU_updateOutput) }, - { "RReLU_updateGradInput", nn_(RReLU_updateGradInput) }, - { NULL, NULL } -}; - -static void nn_(RReLU_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(RReLU__), "nn"); - lua_pop(L, 1); } #endif diff --git a/generic/Sigmoid.c b/generic/Sigmoid.c index 057ebc4f5af..f58d33bf2d8 100644 --- a/generic/Sigmoid.c +++ b/generic/Sigmoid.c @@ -2,43 +2,22 @@ #define TH_GENERIC_FILE "generic/Sigmoid.c" #else -static int nn_(Sigmoid_updateOutput)(lua_State *L) +void THNN_(Sigmoid_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = 1./(1.+ exp(- *input_data));) - - return 1; + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = 1./(1.+ exp(- *input_data)); + ); } -static int nn_(Sigmoid_updateGradInput)(lua_State *L) +void THNN_(Sigmoid_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, output); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - real z = *output_data; \ - *gradInput_data = *gradOutput_data * (1. 
- z) * z;) - return 1; -} - -static const struct luaL_Reg nn_(Sigmoid__) [] = { - {"Sigmoid_updateOutput", nn_(Sigmoid_updateOutput)}, - {"Sigmoid_updateGradInput", nn_(Sigmoid_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(Sigmoid_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(Sigmoid__), "nn"); - lua_pop(L,1); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; + *gradInput_data = *gradOutput_data * (1. - z) * z; + ); } #endif diff --git a/generic/SmoothL1Criterion.c b/generic/SmoothL1Criterion.c index 51cab0c46a8..3111b3dc693 100644 --- a/generic/SmoothL1Criterion.c +++ b/generic/SmoothL1Criterion.c @@ -2,59 +2,34 @@ #define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" #else -static int nn_(SmoothL1Criterion_updateOutput)(lua_State *L) +void THNN_(SmoothL1Criterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - real sum; - - sum = 0; + real sum = 0; TH_TENSOR_APPLY2(real, input, real, target, - real z = fabs(*input_data - *target_data); - sum += z < 1 ? 0.5*z*z : z - 0.5;) + real z = fabs(*input_data - *target_data); + sum += z < 1 ? 
0.5*z*z : z - 0.5; + ); - if(sizeAverage) + if (sizeAverage) sum /= THTensor_(nElement)(input); - lua_pushnumber(L, sum); - lua_setfield(L, 1, "output"); - - lua_pushnumber(L, sum); - return 1; + THTensor_(set1d)(output, 0, sum); } -static int nn_(SmoothL1Criterion_updateGradInput)(lua_State *L) +void THNN_(SmoothL1Criterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *target = luaT_checkudata(L, 3, torch_Tensor); - int sizeAverage = luaT_getfieldcheckboolean(L, 1, "sizeAverage"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, - real x = *input_data - *target_data; - if(x < -1.) - *gradInput_data = - norm; - else if(x > 1.) - *gradInput_data = norm; - else - *gradInput_data = norm * x;) - return 1; -} - -static const struct luaL_Reg nn_(SmoothL1Criterion__) [] = { - {"SmoothL1Criterion_updateOutput", nn_(SmoothL1Criterion_updateOutput)}, - {"SmoothL1Criterion_updateGradInput", nn_(SmoothL1Criterion_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SmoothL1Criterion_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SmoothL1Criterion__), "nn"); - lua_pop(L,1); + real x = *input_data - *target_data; + if (x < -1.) + *gradInput_data = - norm; + else if (x > 1.) 
+ *gradInput_data = norm; + else + *gradInput_data = norm * x; + ); } #endif diff --git a/generic/SoftMax.c b/generic/SoftMax.c index 0201aaf02c2..598d35e8af8 100644 --- a/generic/SoftMax.c +++ b/generic/SoftMax.c @@ -2,40 +2,40 @@ #define TH_GENERIC_FILE "generic/SoftMax.c" #else -static int nn_(SoftMax_updateOutput)(lua_State *L) +void THNN_(SoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); real *input_data, *output_data; long nframe = 0, dim = 0, stride = 0; long t; - if(input->nDimension == 1) + if (input->nDimension == 1) { nframe = 1; dim = input->size[0]; stride = 1; } - else if(input->nDimension == 2) + else if (input->nDimension == 2) { nframe = input->size[0]; dim = input->size[1]; stride = 1; } - else if(input->nDimension == 3) + else if (input->nDimension == 3) { nframe = 1; dim = input->size[0]; stride = input->size[1]*input->size[2]; } - else if(input->nDimension == 4) + else if (input->nDimension == 4) { nframe = input->size[0]; dim = input->size[1]; stride = input->size[2]*input->size[3]; } else + { THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected"); + } input = THTensor_(newContiguous)(input); THTensor_(resizeAs)(output, input); @@ -44,7 +44,7 @@ static int nn_(SoftMax_updateOutput)(lua_State *L) output_data = THTensor_(data)(output); #pragma omp parallel for private(t) - for(t = 0; t < stride*nframe; t++) + for (t = 0; t < stride*nframe; t++) { real *input_ptr = input_data + (t/stride)*dim*stride + t % stride; real *output_ptr = output_data + (t/stride)*dim*stride + t % stride; @@ -53,62 +53,62 @@ static int nn_(SoftMax_updateOutput)(lua_State *L) accreal sum; long d; - for(d = 0; d < dim; d++) { + for (d = 0; d < dim; d++) + { if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride]; } sum = 0; - for(d = 0; d < dim; d++) { + for (d = 0; d < dim; d++) + { real z = 
THExpMinusApprox(inputMax - input_ptr[d*stride]); output_ptr[d*stride] = z; sum += z; } - for(d = 0; d < dim; d++) { + for (d = 0; d < dim; d++) + { output_ptr[d*stride] *= 1/sum; } } THTensor_(free)(input); - - return 1; } -static int nn_(SoftMax_updateGradInput)(lua_State *L) +void THNN_(SoftMax_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); real *gradInput_data, *gradOutput_data, *output_data; long nframe = 0, dim = 0, stride = 0; long t; - if(output->nDimension == 1) + if (output->nDimension == 1) { nframe = 1; dim = output->size[0]; stride = 1; } - else if(output->nDimension == 2) + else if (output->nDimension == 2) { nframe = output->size[0]; dim = output->size[1]; stride = 1; } - else if(output->nDimension == 3) + else if (output->nDimension == 3) { nframe = 1; dim = output->size[0]; stride = output->size[1]*output->size[2]; } - else if(output->nDimension == 4) + else if (output->nDimension == 4) { nframe = output->size[0]; dim = output->size[1]; stride = output->size[2]*output->size[3]; } else + { THError("1D, 2D, 3D or 4D tensor expected"); + } gradOutput = THTensor_(newContiguous)(gradOutput); output = THTensor_(newContiguous)(output); @@ -119,7 +119,7 @@ static int nn_(SoftMax_updateGradInput)(lua_State *L) gradOutput_data = THTensor_(data)(gradOutput); #pragma omp parallel for private(t) - for(t = 0; t < stride*nframe; t++) + for (t = 0; t < stride*nframe; t++) { real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride; real *output_ptr = output_data + (t/stride)*dim*stride + t % stride; @@ -127,30 +127,15 @@ static int nn_(SoftMax_updateGradInput)(lua_State *L) long d; accreal sum = 0; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) sum += 
(accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride]; - for(d = 0; d < dim; d++) + for (d = 0; d < dim; d++) gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum); } THTensor_(free)(gradOutput); THTensor_(free)(output); - - return 1; -} - -static const struct luaL_Reg nn_(SoftMax__) [] = { - {"SoftMax_updateOutput", nn_(SoftMax_updateOutput)}, - {"SoftMax_updateGradInput", nn_(SoftMax_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SoftMax_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SoftMax__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SoftPlus.c b/generic/SoftPlus.c index 81f2a7ce167..76c9c1c4eb7 100644 --- a/generic/SoftPlus.c +++ b/generic/SoftPlus.c @@ -2,55 +2,29 @@ #define TH_GENERIC_FILE "generic/SoftPlus.c" #else -static int nn_(SoftPlus_updateOutput)(lua_State *L) +void THNN_(SoftPlus_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real beta, real threshold) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - real beta = luaT_getfieldchecknumber(L, 1, "beta"); - real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); - THTensor_(resizeAs)(output, input); - /* f(x) = 1/beta * log(1 + exp(beta * x)) */ - + // f(x) = 1/beta * log(1 + exp(beta * x)) TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;) - - return 1; + *output_data = (*input_data * beta) > threshold ? 
*input_data : THLog1p(exp(*input_data * beta)) / beta; + ); } -static int nn_(SoftPlus_updateGradInput)(lua_State *L) +void THNN_(SoftPlus_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output, real beta, real threshold) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - real beta = luaT_getfieldchecknumber(L, 1, "beta"); - real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); - - /* d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) - SINCE - y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) - THEREFORE: - d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) */ - THTensor_(resizeAs)(gradInput, output); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - real z = exp(*output_data * beta); \ - *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;) - return 1; -} - -static const struct luaL_Reg nn_(SoftPlus__) [] = { - {"SoftPlus_updateOutput", nn_(SoftPlus_updateOutput)}, - {"SoftPlus_updateGradInput", nn_(SoftPlus_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SoftPlus_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SoftPlus__), "nn"); - lua_pop(L,1); + + // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) + // SINCE + // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) + // THEREFORE: + // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = exp(*output_data * beta); + *gradInput_data = (*output_data * beta) > threshold ? 
*gradOutput_data : *gradOutput_data * (z - 1.)/z; + ); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 871ca976659..f3d2ce86ace 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -234,6 +234,77 @@ TH_API void THNN_(PReLU_accGradParameters)( THIndex_t nOutputPlane, real scale); +TH_API void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace, + THGenerator *generator); +TH_API void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace); + +TH_API void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +TH_API void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(SoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(SoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real beta, + real threshold); +TH_API void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real beta, + real threshold); + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, From e9c5c1a79f663649636315a2b16fe52ad1981893 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Mon, 25 Jan 2016 23:27:26 +0100 Subject: [PATCH 032/101] Move {SoftShrink, Sqrt, Square, Tanh, Threshold}.c -> lib/THNN/generic --- generic/SoftShrink.c | 50 +++++++++++++++++++++++++++++++ generic/Sqrt.c | 64 +++++++++++++++++++++++++++++++++++++++ generic/Square.c | 71 ++++++++++++++++++++++++++++++++++++++++++++ generic/Tanh.c | 63 +++++++++++++++++++++++++++++++++++++++ generic/Threshold.c | 66 ++++++++++++++++++++++++++++++++++++++++ init.c | 15 ++++++++++ 6 files changed, 329 insertions(+) create mode 100644 generic/SoftShrink.c create mode 100644 generic/Sqrt.c create mode 100644 generic/Square.c create mode 100644 generic/Tanh.c create mode 100644 generic/Threshold.c diff --git a/generic/SoftShrink.c b/generic/SoftShrink.c new file mode 100644 index 00000000000..985196dec6e --- /dev/null +++ b/generic/SoftShrink.c @@ -0,0 +1,50 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftShrink.c" +#else + +static int nn_(SoftShrink_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, \ + if ((*input_data) > lambda) *output_data = *input_data - lambda; \ + else if ((*input_data) < -lambda) *output_data = *input_data + lambda; \ + else *output_data = 0;); + return 1; +} + +static int nn_(SoftShrink_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if ((*input_data) > lambda || (*input_data) < 
-lambda) \ + *gradInput_data = (*gradOutput_data); \ + else \ + *gradInput_data = 0; \ + ); + return 1; +} + +static const struct luaL_Reg nn_(SoftShrink__) [] = { + {"SoftShrink_updateOutput", nn_(SoftShrink_updateOutput)}, + {"SoftShrink_updateGradInput", nn_(SoftShrink_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SoftShrink_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SoftShrink__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/Sqrt.c b/generic/Sqrt.c new file mode 100644 index 00000000000..c2261c9f7cd --- /dev/null +++ b/generic/Sqrt.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sqrt.c" +#else + +static int nn_(Sqrt_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real bias = luaT_getfieldchecknumber(L,1,"eps"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + THTensor_(sqrt)(output, input); + return 1; +} + +static int nn_(Sqrt_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + *gradInput_data = ((*output_data == 0.0) ? 
0.0 : \ + (0.5 * (*gradOutput_data / *output_data)));); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* output_data = THTensor_(data)(output); + long i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(output); i++) + if (output_data[i] == 0.0) { + gradInput_data[i] = 0.0; + } else { + gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]); + } + } + return 1; +} + +static const struct luaL_Reg nn_(Sqrt__) [] = { + {"Sqrt_updateOutput", nn_(Sqrt_updateOutput)}, + {"Sqrt_updateGradInput", nn_(Sqrt_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(Sqrt_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(Sqrt__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/Square.c b/generic/Square.c new file mode 100644 index 00000000000..4e20116c6e5 --- /dev/null +++ b/generic/Square.c @@ -0,0 +1,71 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Square.c" +#else + +static int nn_(Square_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data) * (*input_data);); + } + else + { + real* output_data = THTensor_(data)(output); + real* input_data = THTensor_(data)(input); + long i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(input); i++) + output_data[i] = input_data[i]*input_data[i]; + } + return 1; +} + +static int nn_(Square_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", 
torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);); + } + else + { + real* gradOutput_data = THTensor_(data)(gradOutput); + real* gradInput_data = THTensor_(data)(gradInput); + real* input_data = THTensor_(data)(input); + long i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(gradInput); i++) + gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i]; + } + return 1; +} + +static const struct luaL_Reg nn_(Square__) [] = { + {"Square_updateOutput", nn_(Square_updateOutput)}, + {"Square_updateGradInput", nn_(Square_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(Square_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(Square__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/Tanh.c b/generic/Tanh.c new file mode 100644 index 00000000000..f0a05795069 --- /dev/null +++ b/generic/Tanh.c @@ -0,0 +1,63 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Tanh.c" +#else + +static int nn_(Tanh_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + THTensor_(resizeAs)(output, input); + THTensor_(tanh)(output, input); + return 1; +} + +static int nn_(Tanh_updateGradInput)(lua_State *L) +{ + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, output); + + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + 
!THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. - z*z);); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_output = THTensor_(data)(output); + long i; + +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(gradInput); i++) + { + real z = ptr_output[i]; + ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z); + } + } + return 1; +} + +static const struct luaL_Reg nn_(Tanh__) [] = { + {"Tanh_updateOutput", nn_(Tanh_updateOutput)}, + {"Tanh_updateGradInput", nn_(Tanh_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(Tanh_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(Tanh__), "nn"); + lua_pop(L,1); + +} + +#endif diff --git a/generic/Threshold.c b/generic/Threshold.c new file mode 100644 index 00000000000..a309f78a732 --- /dev/null +++ b/generic/Threshold.c @@ -0,0 +1,66 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Threshold.c" +#else + +static int nn_(Threshold_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + real val = luaT_getfieldchecknumber(L, 1, "val"); + real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (inPlace) { + TH_TENSOR_APPLY(real, input, \ + if (*input_data <= threshold) { \ + *input_data = val; \ + }); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data > threshold) ? 
*input_data : val;); + + } + + return 1; +} + +static int nn_(Threshold_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); + + if (inPlace) { + TH_TENSOR_APPLY2(real, gradOutput, real, input, \ + if ((*input_data) <= threshold) { \ + *gradOutput_data = 0; \ + }); + THTensor_(set)(gradInput, gradOutput); + } else { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ + if ((*input_data) > threshold) *gradInput_data = *gradOutput_data; \ + else *gradInput_data = 0;); \ + } + + return 1; +} + +static const struct luaL_Reg nn_(Threshold__) [] = { + {"Threshold_updateOutput", nn_(Threshold_updateOutput)}, + {"Threshold_updateGradInput", nn_(Threshold_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(Threshold_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(Threshold__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/init.c b/init.c index 43572b92721..4328961713d 100644 --- a/init.c +++ b/init.c @@ -70,6 +70,21 @@ #include "generic/SoftPlus.c" #include "THGenerateFloatTypes.h" +#include "generic/SoftShrink.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Sqrt.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Square.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Tanh.c" +#include "THGenerateFloatTypes.h" + +#include "generic/Threshold.c" +#include "THGenerateFloatTypes.h" + #include "generic/unfold.c" #include "THGenerateFloatTypes.h" From fe8b616552c8acb276f354846108455207913b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Tue, 26 Jan 2016 00:02:33 +0100 Subject: [PATCH 033/101] Add THNN conversion of 
{oftShrink, Sqrt, Square, Tanh, Threshold} --- generic/SoftShrink.c | 52 ++++++++----------------- generic/Sqrt.c | 47 ++++++---------------- generic/Square.c | 50 ++++++++---------------- generic/THNN.h | 92 +++++++++++++++++++++++++++++++------------- generic/Tanh.c | 36 ++++------------- generic/Threshold.c | 80 +++++++++++++++----------------------- 6 files changed, 146 insertions(+), 211 deletions(-) diff --git a/generic/SoftShrink.c b/generic/SoftShrink.c index 985196dec6e..b15003fd2f6 100644 --- a/generic/SoftShrink.c +++ b/generic/SoftShrink.c @@ -2,49 +2,29 @@ #define TH_GENERIC_FILE "generic/SoftShrink.c" #else -static int nn_(SoftShrink_updateOutput)(lua_State *L) +void THNN_(SoftShrink_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real lambda) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - if ((*input_data) > lambda) *output_data = *input_data - lambda; \ - else if ((*input_data) < -lambda) *output_data = *input_data + lambda; \ - else *output_data = 0;); - return 1; + TH_TENSOR_APPLY2(real, output, real, input, + if ((*input_data) > lambda) + *output_data = *input_data - lambda; + else if ((*input_data) < -lambda) + *output_data = *input_data + lambda; + else + *output_data = 0; + ); } -static int nn_(SoftShrink_updateGradInput)(lua_State *L) +void THNN_(SoftShrink_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real lambda) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real lambda = luaT_getfieldchecknumber(L, 1, "lambda"); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, 
real, gradOutput, real, input, \ - if ((*input_data) > lambda || (*input_data) < -lambda) \ - *gradInput_data = (*gradOutput_data); \ - else \ - *gradInput_data = 0; \ - ); - return 1; -} - -static const struct luaL_Reg nn_(SoftShrink__) [] = { - {"SoftShrink_updateOutput", nn_(SoftShrink_updateOutput)}, - {"SoftShrink_updateGradInput", nn_(SoftShrink_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SoftShrink_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SoftShrink__), "nn"); - lua_pop(L,1); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > lambda || (*input_data) < -lambda) + *gradInput_data = (*gradOutput_data); + else + *gradInput_data = 0; + ); } #endif diff --git a/generic/Sqrt.c b/generic/Sqrt.c index c2261c9f7cd..a1cd4a06897 100644 --- a/generic/Sqrt.c +++ b/generic/Sqrt.c @@ -2,24 +2,14 @@ #define TH_GENERIC_FILE "generic/Sqrt.c" #else -static int nn_(Sqrt_updateOutput)(lua_State *L) +void THNN_(Sqrt_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real eps) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real bias = luaT_getfieldchecknumber(L,1,"eps"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); THTensor_(sqrt)(output, input); - return 1; } -static int nn_(Sqrt_updateGradInput)(lua_State *L) +void THNN_(Sqrt_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); if (output->nDimension == 1 || @@ -27,38 +17,25 @@ static int nn_(Sqrt_updateGradInput)(lua_State *L) 
!THTensor_(isContiguous)(gradOutput) || !THTensor_(isContiguous)(gradInput)) { - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - *gradInput_data = ((*output_data == 0.0) ? 0.0 : \ - (0.5 * (*gradOutput_data / *output_data)));); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data)); + ); } else { - real* gradOutput_data = THTensor_(data)(gradOutput); - real* gradInput_data = THTensor_(data)(gradInput); - real* output_data = THTensor_(data)(output); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *output_data = THTensor_(data)(output); long i; #pragma omp parallel for private(i) for(i = 0; i < THTensor_(nElement)(output); i++) - if (output_data[i] == 0.0) { + { + if (output_data[i] == 0.0) gradInput_data[i] = 0.0; - } else { + else gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]); - } + } } - return 1; -} - -static const struct luaL_Reg nn_(Sqrt__) [] = { - {"Sqrt_updateOutput", nn_(Sqrt_updateOutput)}, - {"Sqrt_updateGradInput", nn_(Sqrt_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(Sqrt_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(Sqrt__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/Square.c b/generic/Square.c index 4e20116c6e5..efdb54fb9d3 100644 --- a/generic/Square.c +++ b/generic/Square.c @@ -2,36 +2,29 @@ #define TH_GENERIC_FILE "generic/Square.c" #else -static int nn_(Square_updateOutput)(lua_State *L) +void THNN_(Square_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor_(resizeAs)(output, input); if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) { - TH_TENSOR_APPLY2(real, output, 
real, input, \ - *output_data = (*input_data) * (*input_data);); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data) * (*input_data); + ); } else { - real* output_data = THTensor_(data)(output); - real* input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); long i; #pragma omp parallel for private(i) - for(i = 0; i < THTensor_(nElement)(input); i++) + for (i = 0; i < THTensor_(nElement)(input); i++) output_data[i] = input_data[i]*input_data[i]; } - return 1; } -static int nn_(Square_updateGradInput)(lua_State *L) +void THNN_(Square_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, input); if (input->nDimension == 1 || @@ -39,33 +32,20 @@ static int nn_(Square_updateGradInput)(lua_State *L) !THTensor_(isContiguous)(gradOutput) || !THTensor_(isContiguous)(gradInput)) { - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data); + ); } else { - real* gradOutput_data = THTensor_(data)(gradOutput); - real* gradInput_data = THTensor_(data)(gradInput); - real* input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *input_data = THTensor_(data)(input); long i; #pragma omp parallel for private(i) - for(i = 0; i < THTensor_(nElement)(gradInput); i++) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i]; } - return 1; -} - -static const struct 
luaL_Reg nn_(Square__) [] = { - {"Square_updateOutput", nn_(Square_updateOutput)}, - {"Square_updateGradInput", nn_(Square_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(Square_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(Square__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index f3d2ce86ace..59d030baa0b 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -152,7 +152,6 @@ TH_API void THNN_(LookupTable_accGradParameters)( THTensor *sorted, THTensor *indices); - TH_API void THNN_(MarginCriterion_updateOutput)( THNNState *state, THTensor *input, @@ -198,7 +197,7 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)( THNNState *state, THTensor *input, THTensor *target, - THTensor* output, + THTensor *output, bool sizeAverage, int p); TH_API void THNN_(MultiMarginCriterion_updateGradInput)( @@ -223,10 +222,10 @@ TH_API void THNN_(PReLU_updateGradInput)( THTensor *weight, THIndex_t nOutputPlane); TH_API void THNN_(PReLU_accGradParameters)( - THNNState* state, - THTensor* input, - THTensor* gradOutput, - THTensor* gradInput, + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, THTensor *weight, THTensor *gradWeight, THTensor *gradWeightBuf, @@ -305,13 +304,73 @@ TH_API void THNN_(SoftPlus_updateGradInput)( real beta, real threshold); +TH_API void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real lambda); +TH_API void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda); + +TH_API void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real eps); +TH_API void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor 
*output); +TH_API void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real threshold, + real val, + bool inplace); +TH_API void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real threshold, + bool inplace); + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, - THTensor* finput, + THTensor *finput, THTensor *fgradInput, int kW, int kH, int dW, int dH, @@ -394,23 +453,4 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)( int padW, int padH, bool ceil_mode); -TH_API void THNN_(unfolded_acc)( - THTensor *finput, - THTensor *input, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int nInputPlane, - int inputWidth, int inputHeight, - int outputWidth, int outputHeight); -TH_API void THNN_(unfolded_copy)( - THTensor *finput, - THTensor *input, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int nInputPlane, - int inputWidth, int inputHeight, - int outputWidth, int outputHeight); - #endif diff --git a/generic/Tanh.c b/generic/Tanh.c index f0a05795069..ba8e2cea518 100644 --- a/generic/Tanh.c +++ b/generic/Tanh.c @@ -2,22 +2,14 @@ #define TH_GENERIC_FILE "generic/Tanh.c" #else -static int nn_(Tanh_updateOutput)(lua_State *L) +void THNN_(Tanh_updateOutput)(THNNState *state, THTensor *input, THTensor *output) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - 
THTensor_(resizeAs)(output, input); THTensor_(tanh)(output, input); - return 1; } -static int nn_(Tanh_updateGradInput)(lua_State *L) +void THNN_(Tanh_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) { - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor_(resizeAs)(gradInput, output); if (output->nDimension == 1 || @@ -25,9 +17,10 @@ static int nn_(Tanh_updateGradInput)(lua_State *L) !THTensor_(isContiguous)(gradOutput) || !THTensor_(isContiguous)(gradInput)) { - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, \ - real z = *output_data; \ - *gradInput_data = *gradOutput_data * (1. - z*z);); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. - z*z); + ); } else { @@ -37,27 +30,12 @@ static int nn_(Tanh_updateGradInput)(lua_State *L) long i; #pragma omp parallel for private(i) - for(i = 0; i < THTensor_(nElement)(gradInput); i++) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) { real z = ptr_output[i]; ptr_gradInput[i] = ptr_gradOutput[i] * (1. 
- z*z); } } - return 1; -} - -static const struct luaL_Reg nn_(Tanh__) [] = { - {"Tanh_updateOutput", nn_(Tanh_updateOutput)}, - {"Tanh_updateGradInput", nn_(Tanh_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(Tanh_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(Tanh__), "nn"); - lua_pop(L,1); - } #endif diff --git a/generic/Threshold.c b/generic/Threshold.c index a309f78a732..acf8ee566a6 100644 --- a/generic/Threshold.c +++ b/generic/Threshold.c @@ -2,65 +2,45 @@ #define TH_GENERIC_FILE "generic/Threshold.c" #else -static int nn_(Threshold_updateOutput)(lua_State *L) +void THNN_(Threshold_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real threshold, real val, bool inplace) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - real val = luaT_getfieldchecknumber(L, 1, "val"); - real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); - - if (inPlace) { - TH_TENSOR_APPLY(real, input, \ - if (*input_data <= threshold) { \ - *input_data = val; \ - }); + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= threshold) + *input_data = val; + ); THTensor_(set)(output, input); - } else { + } + else + { THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, \ - *output_data = (*input_data > threshold) ? *input_data : val;); - + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > threshold) ? 
*input_data : val; + ); } - - return 1; } -static int nn_(Threshold_updateGradInput)(lua_State *L) +void THNN_(Threshold_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real threshold, bool inplace) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real threshold = luaT_getfieldchecknumber(L, 1, "threshold"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - int inPlace = luaT_getfieldcheckboolean(L, 1, "inplace"); - - if (inPlace) { - TH_TENSOR_APPLY2(real, gradOutput, real, input, \ - if ((*input_data) <= threshold) { \ - *gradOutput_data = 0; \ - }); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if ((*input_data) <= threshold) + *gradOutput_data = 0; + ); THTensor_(set)(gradInput, gradOutput); - } else { - THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, \ - if ((*input_data) > threshold) *gradInput_data = *gradOutput_data; \ - else *gradInput_data = 0;); \ } - - return 1; -} - -static const struct luaL_Reg nn_(Threshold__) [] = { - {"Threshold_updateOutput", nn_(Threshold_updateOutput)}, - {"Threshold_updateGradInput", nn_(Threshold_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(Threshold_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(Threshold__), "nn"); - lua_pop(L,1); + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > threshold) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); + } } #endif From cb31ed4f9902037a96cc756a8d742cd9164799e5 Mon Sep 17 00:00:00 2001 From: soumith Date: Tue, 2 Feb 2016 09:25:18 -0500 Subject: [PATCH 034/101] fix for torch7 minall/maxall changes --- generic/MultiLabelMarginCriterion.c | 24 ++++++++++++------------ 1 file changed, 12 
insertions(+), 12 deletions(-) diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index cc2e52f0f3d..b942d80fd6e 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -14,7 +14,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i if (input->nDimension == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size[0]; THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); } else @@ -24,8 +24,8 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } - THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); - THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); + THArgCheck(THTensor_(minall)(NULL, target) >= 0, 3, "target out of range"); + THArgCheck(THTensor_(maxall)(NULL, target) <= dim, 3, "target out of range"); target = THTensor_(newContiguous)(target); input = THTensor_(newContiguous)(input); @@ -41,7 +41,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i real input_target; if (target_idx < 0) break; - + input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { @@ -53,7 +53,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i if (((long)target_data[ddt])-1 == d) istarget = 1; } - + if (!istarget) { real z = 1 - input_target + input_data[d]; @@ -70,7 +70,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i sum /= dim; THTensor_(set1d)(output, 0, sum); - + THTensor_(free)(input); THTensor_(free)(target); } @@ -89,7 +89,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor if (input->nDimension == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size[0]; 
THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); } else @@ -99,8 +99,8 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } - THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); - THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); + THArgCheck(THTensor_(minall)(NULL, target) >= 0, 3, "target out of range"); + THArgCheck(THTensor_(maxall)(NULL, target) <= dim, 3, "target out of range"); target = THTensor_(newContiguous)(target); input = THTensor_(newContiguous)(input); @@ -121,7 +121,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor real input_target; if (target_idx < 0) break; - + input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { @@ -133,7 +133,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor if (((long)target_data[ddt])-1 == d) istarget = 1; } - + if (!istarget) { real z = 1 - input_target + input_data[d]; @@ -150,7 +150,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor gradInput_data += dim; } - THTensor_(free)(input); + THTensor_(free)(input); THTensor_(free)(target); } From 0c14165f49bad236ac6421b55e05d6a20049b79f Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 2 Feb 2016 12:43:07 -0500 Subject: [PATCH 035/101] reverting b2af7eaddfc1de72661b6861115d3fdb97403bf3 because of Revert in torch7 https://github.com/torch/torch7/pull/523 --- generic/MultiLabelMarginCriterion.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index b942d80fd6e..a3cf96503af 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -24,8 +24,8 @@ void 
THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } - THArgCheck(THTensor_(minall)(NULL, target) >= 0, 3, "target out of range"); - THArgCheck(THTensor_(maxall)(NULL, target) <= dim, 3, "target out of range"); + THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); + THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); target = THTensor_(newContiguous)(target); input = THTensor_(newContiguous)(input); @@ -99,8 +99,8 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); } - THArgCheck(THTensor_(minall)(NULL, target) >= 0, 3, "target out of range"); - THArgCheck(THTensor_(maxall)(NULL, target) <= dim, 3, "target out of range"); + THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); + THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); target = THTensor_(newContiguous)(target); input = THTensor_(newContiguous)(input); From 5e02136dcf59428e71b3d2443e348457cc44a4c3 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 2 Feb 2016 16:02:43 -0500 Subject: [PATCH 036/101] ARM / 32-bit fixes for SpatialConvolutionMM as suggested in #495 --- generic/unfold.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/unfold.c b/generic/unfold.c index 89a0759354e..3581413badd 100644 --- a/generic/unfold.c +++ b/generic/unfold.c @@ -40,8 +40,8 @@ void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, } else { if (dW==1){ ix = (long long)(0 - padW + kw); - lpad = fmaxf(0,padW-kw); - rpad = fmaxf(0,padW-(kW-kw-1)); + lpad = fmaxf(0,(int)(padW-kw)); + rpad = fmaxf(0,(int)(padW-(kW-kw-1))); THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), 
src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ } else{ @@ -100,8 +100,8 @@ void THNN_(unfolded_copy)(THTensor *finput, THTensor *input, } else { if (dW==1){ ix = (long long)(0 - padW + kw); - lpad = fmaxf(0,padW-kw); - rpad = fmaxf(0,padW-(kW-kw-1)); + lpad = fmaxf(0,(int)(padW-kw)); + rpad = fmaxf(0,(int)(padW-(kW-kw-1))); if (outputWidth-rpad-lpad <= 0) { memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth); } else { From d9da22e53dec40059f9eab3207208c4eda79c521 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sun, 31 Jan 2016 00:30:54 +0100 Subject: [PATCH 037/101] Move generic/Volumetric* -> lib/THNN/generic --- generic/VolumetricAveragePooling.c | 262 +++++++++++++++++++ generic/VolumetricConvolution.c | 211 +++++++++++++++ generic/VolumetricConvolutionMM.c | 386 ++++++++++++++++++++++++++++ generic/VolumetricFullConvolution.c | 296 +++++++++++++++++++++ generic/VolumetricMaxPooling.c | 341 ++++++++++++++++++++++++ generic/VolumetricMaxUnpooling.c | 292 +++++++++++++++++++++ 6 files changed, 1788 insertions(+) create mode 100644 generic/VolumetricAveragePooling.c create mode 100644 generic/VolumetricConvolution.c create mode 100644 generic/VolumetricConvolutionMM.c create mode 100644 generic/VolumetricFullConvolution.c create mode 100644 generic/VolumetricMaxPooling.c create mode 100644 generic/VolumetricMaxUnpooling.c diff --git a/generic/VolumetricAveragePooling.c b/generic/VolumetricAveragePooling.c new file mode 100644 index 00000000000..e23438641fb --- /dev/null +++ b/generic/VolumetricAveragePooling.c @@ -0,0 +1,262 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c" +#else + +static void nn_(VolumetricAveragePooling_updateOutput_frame)( + real *input_p, real *output_p, long nslices, + long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int kT, int kW, int kH, int dT, int dW, int dH) { 
+ long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) { + /* loop over output */ + long i, j, ti; + for(ti = 0; ti < otime; ti++) { + for(i = 0; i < oheight; i++) { + for(j = 0; j < owidth; j++) { + /* local pointers */ + real *ip = input_p + k * itime * iwidth * iheight + + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* compute local sum: */ + real sum = 0.0; + int x,y,z; + + for(z=0; z < kT; z++) { + for(y = 0; y < kH; y++) { + for(x = 0; x < kW; x++) { + sum += *(ip + z * iwidth * iheight + y * iwidth + x); + } + } + } + + /* set output to local max */ + *op = sum / (kT * kW * kH); + } + } + } + } +} + +static int nn_(VolumetricAveragePooling_updateOutput)(lua_State *L) { + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + real *input_data; + real *output_data; + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch-mode) tensor expected"); + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) { + dimN++; + dimt++; + dimh++; + dimw++; + } + + luaL_argcheck(L, input->size[dimw] >= kW && input->size[dimh] >= kH && + input->size[dimt] >= kT, 2, + "input image smaller than kernel size"); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = (itime - kT) / dT + 1; + oheight = (iheight - kH) / dH + 1; + 
owidth = (iwidth - kW) / dW + 1; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 4) { /* non-batch mode */ + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + nn_(VolumetricAveragePooling_updateOutput_frame)(input_data, output_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, dT, dW, dH); + } else { /* batch mode */ + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) { + nn_(VolumetricAveragePooling_updateOutput_frame)( + input_data + p * istride, output_data + p * ostride, + nslices, itime, iwidth, iheight, otime, owidth, oheight, + kT, kW, kH, dT, dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(input); + return 1; +} + +static void nn_(VolumetricAveragePooling_updateGradInput_frame)( + real *gradInput_p, real *gradOutput_p, long nslices, + long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int kT, int kW, int kH, int dT, int dW, int dH) { + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) { + /* loop over output */ + long i, j, ti; + for(ti = 0; ti < otime; ti++) { + for(i = 0; i < oheight; i++) { + for(j = 0; j < owidth; j++) { + /* local pointers */ + real *ip = gradInput_p + k * itime * iwidth * iheight + + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; + real *op = gradOutput_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* scatter gradients out to footprint: */ + real val = *op / (kT * kW * kH); + int x,y,z; + 
for(z=0; z < kT; z++) { + for(y = 0; y < kH; y++) { + for(x = 0; x < kW; x++) { + *(ip + z * iwidth * iheight + y * iwidth + x) += val; + } + } + } + } + } + } + } +} + +static int nn_(VolumetricAveragePooling_updateGradInput)(lua_State *L) { + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", + torch_Tensor); + int nslices; + int itime; + int iheight; + int iwidth; + int otime; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 5) { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->nDimension == 4) { /* non-batch mode*/ + nn_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, nslices, + itime, iwidth, iheight, otime, owidth, oheight, + kT, kW, kH, dT, dW, dH); + } else { /* batch mode */ + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + +#pragma omp 
parallel for private(p) + for (p = 0; p < nBatch; p++) { + nn_(VolumetricAveragePooling_updateGradInput_frame)( + gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, + itime, iwidth, iheight, otime, owidth, oheight, + kT, kW, kH, dT, dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + return 1; +} + +static const struct luaL_Reg nn_(VolumetricAveragePooling__) [] = { + {"VolumetricAveragePooling_updateOutput", + nn_(VolumetricAveragePooling_updateOutput)}, + {"VolumetricAveragePooling_updateGradInput", + nn_(VolumetricAveragePooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(VolumetricAveragePooling_init)(lua_State *L) { + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(VolumetricAveragePooling__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/VolumetricConvolution.c b/generic/VolumetricConvolution.c new file mode 100644 index 00000000000..bb30a70d782 --- /dev/null +++ b/generic/VolumetricConvolution.c @@ -0,0 +1,211 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolution.c" +#else + +static int nn_(VolumetricConvolution_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, + 2, "4D or 5D (batch-mode) tensor expected"); + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) { + dimt++; + dimh++; + dimw++; + } + + long nOutputPlane = weight->size[0]; + long kT = weight->size[2]; + long kH = weight->size[3]; + long kW = weight->size[4]; + long inputDepth = input->size[dimt]; + 
long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long outputDepth = (inputDepth - kT) / dT + 1; + long outputWidth = (inputWidth - kW) / dW + 1; + long outputHeight = (inputHeight - kH) / dH + 1; + THTensor *outn = THTensor_(new)(); + long i,j; + if (input->nDimension == 4) { /* non-batch mode */ + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + /* add bias */ + for (i=0; isize[0]; i++) { + THTensor_(select)(outn,output,0,i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X"); + } else { /* batch mode */ + long nBatch = input->size[0]; + THTensor_(resize5d)(output, nBatch, nOutputPlane, + outputDepth, outputHeight, outputWidth); + THTensor *inb = THTensor_(new)(); + THTensor *outb = THTensor_(new)(); + + for (j=0; jsize[0]; i++) { + THTensor_(select)(outn,outb,0,i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + + /* do convolutions */ + THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X"); + } + + THTensor_(free)(inb); + THTensor_(free)(outb); + } + THTensor_(free)(outn); + + return 1; +} + + +static int nn_(VolumetricConvolution_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + THTensor *tweight; + + luaL_argcheck(L, gradOutput->nDimension == 4 || gradOutput->nDimension == 5, + 3, "4D or 5D (batch-mode) tensor expected"); + int dimPlane = 0; + if (gradOutput->nDimension == 5) { + dimPlane++; + } + 
THArgCheck( nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" ); + + /* gradient to input */ + tweight = THTensor_(newTranspose)(weight,0,1); + if (gradOutput->nDimension == 4) { /* non-batch mode */ + THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C"); + } else { /* batch mode */ + long nBatch = gradOutput->size[0]; + THTensor *ginpb = THTensor_(new)(); + THTensor *goutb = THTensor_(new)(); + long j; + THTensor_(resize5d)(gradInput, input->size[0], input->size[1], input->size[2], + input->size[3], input->size[4]); + + for (j=0; jnDimension == 5) { + dimPlane++; + } + + THArgCheck( nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" ); + + + if (gradOutput->nDimension == 4) { /* non-batch mode */ + /* gradient to bias */ + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for(k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, gradOutput, 0, k); + gradBias_data[k] += scale*THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW); + } else { /* batch mode */ + long nBatch = gradOutput->size[0]; + THTensor *inpb = THTensor_(new)(); + THTensor *goutb = THTensor_(new)(); + long j; + + for (j=0; j 0 || padH > 0 || padW > 0) + { + for(t = 0; t < outputDepth; t++) + { + it = t*dT - padT + kt; + for(y = 0; y < outputHeight; y++) + { + iy = y*dH - padH + kh; + for(x = 0; x < outputWidth; x++) + { + ix = x*dW - padW + kw; + if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) + {} + else + THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); + } + } + } + } + else { + for(t = 0; t < outputDepth; t++) { + it = t*dT + kt; + for(y = 0; y < outputHeight; y++) { + iy = 
y*dH + kh; + for(x = 0; x < outputWidth; x++) { + ix = x*dW + kw; + THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); + } + } + } + } + } + } + } + } +} +static void nn_(unfolded_copy_vol)(THTensor *finput, THTensor *input, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int nInputPlane, + int inputDepth, int inputWidth, int inputHeight, + int outputDepth, int outputWidth, int outputHeight) +{ + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); +// #pragma omp parallel for private(k) + for(k = 0; k < nInputPlane*kT*kH*kW; k++) { + int nip = k / (kT*kH*kW); + int rest = k % (kT*kH*kW); + int kt = rest / (kH*kW); + rest = rest % (kH*kW); + int kh = rest / kW; + int kw = rest % kW; + int t,x,y,it,ix,iy; + real *dst = finput_data + nip*(kT*kH*kW*outputDepth*outputHeight*outputWidth) + kt*(kH*kW*outputDepth*outputHeight*outputWidth) + kh*(kW*outputDepth*outputHeight*outputWidth) + kw*(outputDepth*outputHeight*outputWidth); + real *src = input_data + nip*(inputDepth*inputHeight*inputWidth); + + if (padT > 0 || padH > 0 || padW > 0) + { + for(t = 0; t < outputDepth; t++) + { + it = t*dT - padT + kt; + for(y = 0; y < outputHeight; y++) + { + iy = y*dH - padH + kh; + for(x = 0; x < outputWidth; x++) + { + ix = x*dW - padW + kw; + if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) + memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1)); + else + memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1)); + } + } + } + } + else { + for(t = 0; t < outputDepth; t++) { + it = t*dT + kt; + for(y = 0; y < outputHeight; y++) { + iy = y*dH + kh; + for(x = 0; x < outputWidth; x++) { + ix = x*dW + kw; + memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, 
sizeof(real)*(1)); + } + } + } + } + } +} + +static void nn_(VolumetricConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, + int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH, + long nInputPlane, long inputDepth, long inputWidth, long inputHeight, + long nOutputPlane, long outputDepth, long outputWidth, long outputHeight) +{ + long i; + THTensor *output2d; + + nn_(unfolded_copy_vol)(finput, input, kT, kW, kH, dT, dW, dH, padT, padW, padH, nInputPlane, inputDepth, inputWidth, inputHeight, outputDepth, outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputDepth*outputHeight*outputWidth, -1); + + for(i = 0; i < nOutputPlane; i++) + THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth); + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +static int nn_(VolumetricConvolutionMM_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + long 
nInputPlane; + long inputDepth; + long inputHeight; + long inputWidth; + long nOutputPlane; + long outputDepth; + long outputHeight; + long outputWidth; + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D(batch mode) tensor expected"); + + + if (input->nDimension == 5) { + dimf++; + dimt++; + dimh++; + dimw++; + } + + nInputPlane = input->size[dimf]; + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + nOutputPlane = weight->size[0]; + outputDepth = (inputDepth + 2*padT - kT) / dT + 1; + outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane,inputDepth,inputHeight,inputWidth,nInputPlane,outputDepth,outputHeight,outputWidth); + + if(input->nDimension == 4) + { + THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + nn_(VolumetricConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, + kT, kW, kH, dT, dW, dH, padT, padW, padH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth); + +// #pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(VolumetricConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + kT, kW, kH, dT, dW, dH, 
padT, padW, padH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + return 1; +} + + +static void nn_(VolumetricConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, + int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + nn_(unfolded_acc_vol)(fgradInput, gradInput, kT, kW, kH, dT, dW, dH, padT, padW, padH, gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]); +} + +static int nn_(VolumetricConvolutionMM_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = 
luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + THTensor_(transpose)(weight, weight, 0, 1); + + if(input->nDimension == 4) + { + nn_(VolumetricConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kT, kW, kH, dT, dW, dH, padT, padW, padH); + } + else + { + long T = input->size[0]; + long t; + +//#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + nn_(VolumetricConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kT, kW, kH, dT, dW, dH, padT, padW, padH); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(transpose)(weight, weight, 0, 1); + + return 1; +} + +static void nn_(VolumetricConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, + real scale) +{ + long i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1); + + THTensor_(transpose)(finput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); + THTensor_(transpose)(finput, finput, 0, 1); + + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += 
scale*sum; + } + + THTensor_(free)(gradOutput2d); +} + +static int nn_(VolumetricConvolutionMM_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real scale = luaL_optnumber(L, 4, 1); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + + if(input->nDimension == 4) + { + nn_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + } + else + { + long T = input->size[0]; + long t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + return 0; +} + +static const struct luaL_Reg nn_(VolumetricConvolutionMM__) [] = { + {"VolumetricConvolutionMM_updateOutput", nn_(VolumetricConvolutionMM_updateOutput)}, + {"VolumetricConvolutionMM_updateGradInput", nn_(VolumetricConvolutionMM_updateGradInput)}, + {"VolumetricConvolutionMM_accGradParameters", nn_(VolumetricConvolutionMM_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(VolumetricConvolutionMM_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(VolumetricConvolutionMM__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c new file mode 100644 index 00000000000..4c63c991198 --- 
/dev/null +++ b/generic/VolumetricFullConvolution.c @@ -0,0 +1,296 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" +#else + +static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { + // Input + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + + // Params: + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int pT = luaT_getfieldcheckint(L, 1, "pT"); + int pH = luaT_getfieldcheckint(L, 1, "pH"); + int pW = luaT_getfieldcheckint(L, 1, "pW"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + int inputDepth = input->size[2]; + int inputHeight = input->size[3]; + int inputWidth = input->size[4]; + + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; + + luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + + // Batch size + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, + outputHeight, outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + const real* weight_ptr = THTensor_(data)(weight); + const real* bias_ptr = THTensor_(data)(bias); + + int n; + for (n = 0; n < batchSize; ++n) { + THTensor_(select)(input_n, input, 0, 
n); + THTensor_(select)(output_n, output, 0, n); + + THTensor *outn = THTensor_(new)(); + // add bias first + int i; + for (i = 0; i < bias->size[0]; i++) { + THTensor_(select)(outn,output_n,0,i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + } + THTensor_(free)(outn); + + int t, h, w, kc_, kt_, kh_, kw_, c; + + const real* input_ptr = THTensor_(data)(input_n); + real* output_ptr = THTensor_(data)(output_n); + for (t = 0; t < inputDepth; t++) + for (h = 0; h < inputHeight; h++) + for (w = 0; w < inputWidth; w++) + for (kc_ = 0; kc_ < nOutputPlane; kc_++) + for (kt_ = 0; kt_ < kT; kt_++) + for (kh_ = 0; kh_ < kH; kh_++) + for (kw_ = 0; kw_ < kW; kw_++) { + int pt = t * dT - pT + kt_; + int ph = h * dH - pH + kh_; + int pw = w * dW - pW + kw_; + if (pt >=0 && ph >=0 && pw >= 0 && + pt < outputDepth && ph < outputHeight && pw < outputWidth) { + real val = 0; + for (c = 0; c < nInputPlane; c++) { + val += input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] + * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; + } + output_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] + += val; + } + } + } + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // return output + return 1; +} + +static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { + // Input + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + + + // Params: + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int pT = luaT_getfieldcheckint(L, 1, "pT"); + int pH = luaT_getfieldcheckint(L, 1, "pH"); + int pW = luaT_getfieldcheckint(L, 1, "pW"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = 
luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + int inputDepth = input->size[2]; + int inputHeight = input->size[3]; + int inputWidth = input->size[4]; + + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; + + luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + + // Batch size + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + const real* weight_ptr = THTensor_(data)(weight); + + // For each n in batch, do: + int n; + for (n = 0; n < batchSize; n++) { + THTensor_(select)(gradInput_n, gradInput, 0, n); + THTensor_(select)(gradOutput_n, gradOutput, 0, n); + THTensor_(fill)(gradInput_n, 0); + + int t, h, w, kc_, kt_, kh_, kw_, c; + + real* gradInput_ptr = THTensor_(data)(gradInput_n); + const real* gradOutput_ptr = THTensor_(data)(gradOutput_n); + for (t = 0; t < inputDepth; t++) + for (h = 0; h < inputHeight; h++) + for (w = 0; w < inputWidth; w++) + for (kc_ = 0; kc_ < nOutputPlane; kc_++) + for (kt_ = 0; kt_ < kT; kt_++) + for (kh_ = 0; kh_ < kH; kh_++) + for (kw_ = 0; kw_ < kW; kw_++) { + int pt = t * dT - pT + kt_; + int ph = h * dH - pH + kh_; + int pw = w * dW - pW + kw_; + if (pt >=0 && ph >=0 && pw >= 0 && + pt < outputDepth && ph < outputHeight && pw < outputWidth) { + for (c = 0; c < nInputPlane; c++) { + gradInput_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] += + gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] + 
* weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; + } + } + } + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Return gradInput + return 1; +} + +static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { + // Inputs + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + + // Params + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int pT = luaT_getfieldcheckint(L, 1, "pT"); + int pH = luaT_getfieldcheckint(L, 1, "pH"); + int pW = luaT_getfieldcheckint(L, 1, "pW"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + + THTensor_(resize1d)(gradBias, nOutputPlane); + THTensor_(resize5d)(gradWeight, nOutputPlane, nInputPlane, kT, kH, kW); + + int inputDepth = input->size[2]; + int inputHeight = input->size[3]; + int inputWidth = input->size[4]; + + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; + + // Batch size + long batchSize = input->size[0]; + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // reset gradBias = 0 + THTensor_(fill)(gradBias, 0); + // reset gradWeight = 0 + THTensor_(fill)(gradWeight, 0); + + real* 
gradWeight_ptr = THTensor_(data)(gradWeight); + real* gradBias_ptr = THTensor_(data)(gradBias); + + // For each n in batch, do: + int n; + for (n = 0; n < batchSize; n++) { + THTensor_(select)(input_n, input, 0, n); + THTensor_(select)(gradOutput_n, gradOutput, 0, n); + + THTensor *goutn = THTensor_(new)(); + + // accumulate bias gradient first + int i; + for (i = 0; i < gradBias->size[0]; i++) { + THTensor_(select)(goutn, gradOutput_n, 0, i); + gradBias_ptr[i] += THTensor_(sumall)(goutn); + } + THTensor_(free)(goutn); + + int t, h, w, kc_, kt_, kh_, kw_, c; + + const real* input_ptr = THTensor_(data)(input_n); + const real* gradOutput_ptr = THTensor_(data)(gradOutput_n); + for (t = 0; t < inputDepth; t++) + for (h = 0; h < inputHeight; h++) + for (w = 0; w < inputWidth; w++) + for (kc_ = 0; kc_ < nOutputPlane; kc_++) + for (kt_ = 0; kt_ < kT; kt_++) + for (kh_ = 0; kh_ < kH; kh_++) + for (kw_ = 0; kw_ < kW; kw_++) { + int pt = t * dT - pT + kt_; + int ph = h * dH - pH + kh_; + int pw = w * dW - pW + kw_; + if (pt >=0 && ph >=0 && pw >= 0 && + pt < outputDepth && ph < outputHeight && pw < outputWidth) { + for (c = 0; c < nInputPlane; c++) { + gradWeight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_] += + input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] * + gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw]; + } + } + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Return nothing + return 0; +} + +static const struct luaL_Reg nn_(VolumetricFullConvolution__) [] = { + {"VolumetricFullConvolution_updateOutput", nn_(VolumetricFullConvolution_updateOutput)}, + {"VolumetricFullConvolution_updateGradInput", nn_(VolumetricFullConvolution_updateGradInput)}, + {"VolumetricFullConvolution_accGradParameters", nn_(VolumetricFullConvolution_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(VolumetricFullConvolution_init)(lua_State *L) +{ + luaT_pushmetatable(L, 
torch_Tensor); + luaT_registeratname(L, nn_(VolumetricFullConvolution__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/VolumetricMaxPooling.c b/generic/VolumetricMaxPooling.c new file mode 100644 index 00000000000..04d2288a98f --- /dev/null +++ b/generic/VolumetricMaxPooling.c @@ -0,0 +1,341 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c" +#else + +static void nn_(VolumetricMaxPooling_updateOutput_frame)( + real *input_p, real *output_p, real *indz_p, + long nslices, long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH) { + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j, ti; + for(ti = 0; ti < otime; ti++) { + for(i = 0; i < oheight; i++) { + for(j = 0; j < owidth; j++) { + /* local pointers */ + + long start_t = ti * dT - padT; + long start_h = i * dH - padH; + long start_w = j * dW - padW; + + long kernel_t = fminf(kT, kT + start_t); + long kernel_h = fminf(kH, kH + start_h); + long kernel_w = fminf(kW, kW + start_w); + + start_t = fmaxf(start_t, 0); + start_h = fmaxf(start_h, 0); + start_w = fmaxf(start_w, 0); + + real *ip = input_p + k * itime * iwidth * iheight + + start_t * iwidth * iheight + start_h * iwidth + start_w; + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + real *indzp = indz_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* compute local max: */ + real maxval = -THInf; + int x,y,z; + int mx, my, mz; + + for(z = 0; z < kernel_t; z++) { + for(y = 0; y < kernel_h; y++) { + for(x = 0; x < kernel_w; x++) { + if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth)) + { + real val = *(ip + z * iwidth * iheight + y * iwidth + x); + if (val > maxval) { + maxval = val; + // Store indices w.r.t the kernel dimension + mz = z + (kT 
- kernel_t); + my = y + (kH - kernel_h); + mx = x + (kW - kernel_w); + } + } + } + } + } + + // set max values + ((unsigned char*)(indzp))[0] = mz; + ((unsigned char*)(indzp))[1] = my; + ((unsigned char*)(indzp))[2] = mx; + ((unsigned char*)(indzp))[3] = 0; + /* set output to local max */ + *op = maxval; + } + } + } + } +} + +static int nn_(VolumetricMaxPooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + real *input_data; + real *output_data; + real *indices_data; + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch-mode) tensor expected"); + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) { + dimN++; + dimt++; + dimh++; + dimw++; + } + + luaL_argcheck(L, input->size[dimw] >= kW && + input->size[dimh] >= kH && input->size[dimt] >= kT, 2, + "input image smaller than kernel size"); + + luaL_argcheck(L, kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceil_mode) { + otime = (int)(ceil((float)(itime - kT + 2 * 
padT) / dT) + 1); + oheight = (int)(ceil((float)(iheight - kH + 2 * padH) / dH) + 1); + owidth = (int)(ceil((float)(iwidth - kW + 2 * padW) / dW) + 1); + } else { + otime = (int)(floor((float)(itime - kT + 2 * padT) / dT) + 1); + oheight = (int)(floor((float)(iheight - kH + 2 * padH) / dH) + 1); + owidth = (int)(floor((float)(iwidth - kW + 2 * padW) / dW) + 1); + } + + if (padT || padW || padH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + padT) + --otime; + if ((oheight - 1)*dH >= iheight + padH) + --oheight; + if ((owidth - 1)*dW >= iwidth + padW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 4) { /* non-batch mode */ + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j uchar locations packed into float/double */ + THTensor_(resize4d)(indices, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + nn_(VolumetricMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, dT, dW, dH, padT, padW, padH); + } else { /* batch mode */ + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j locations for each output point */ + THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) { + nn_(VolumetricMaxPooling_updateOutput_frame)( + input_data + p * istride, + output_data 
+ p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, dT, dW, dH, padT, padW, padH); + } + } + + /* cleanup */ + THTensor_(free)(input); + return 1; +} + +static void nn_(VolumetricMaxPooling_updateGradInput_frame)( + real *gradInput_p, real *gradOutput_p, real *indz_p, + long nslices, + long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int dT, int dW, int dH, + int padT, int padW, int padH) { + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) { + real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; + real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; + real *indz_p_k = indz_p + k * otime * owidth * oheight; + + /* calculate max points */ + long ti, i, j; + for(ti = 0; ti < otime; ti++) { + for(i = 0; i < oheight; i++) { + for(j = 0; j < owidth; j++) { + /* retrieve position of max */ + real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j]; + long maxti = ((unsigned char*)(indzp))[0] + ti * dT - padT; + long maxi = ((unsigned char*)(indzp))[1] + i * dH - padH; + long maxj = ((unsigned char*)(indzp))[2] + j * dW - padW; + + /* update gradient */ + gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] += + gradOutput_p_k[ti * oheight * owidth + i * owidth + j]; + } + } + } + } +} + +static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", 
torch_Tensor); + int nslices; + int itime; + int iheight; + int iwidth; + int otime; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 5) { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 4) { /* non-batch mode*/ + nn_(VolumetricMaxPooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, padT, padW, padH); + } + else { /* batch mode */ + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) { + nn_(VolumetricMaxPooling_updateGradInput_frame)( + gradInput_data + p * istride, + gradOutput_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, padT, padW, padH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + return 1; +} + +static const struct luaL_Reg nn_(VolumetricMaxPooling__) [] = { + {"VolumetricMaxPooling_updateOutput", nn_(VolumetricMaxPooling_updateOutput)}, + {"VolumetricMaxPooling_updateGradInput", nn_(VolumetricMaxPooling_updateGradInput)}, + {NULL, 
NULL} +}; + +static void nn_(VolumetricMaxPooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(VolumetricMaxPooling__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/VolumetricMaxUnpooling.c b/generic/VolumetricMaxUnpooling.c new file mode 100644 index 00000000000..5e1fe1d476a --- /dev/null +++ b/generic/VolumetricMaxUnpooling.c @@ -0,0 +1,292 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c" +#else + +static void nn_(VolumetricMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, + real *ind_p, + long nslices, + long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + long ti, i, j, maxz, maxy, maxx; + for(ti = 0; ti < itime; ti++) + { + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + long start_t = ti * dT - padT; + long start_h = i * dH - padH; + long start_w = j * dW - padW; + + //real *output_p_k = output_p + k*otime*owidth*oheight + ti*owidth*oheight*dT + i*owidth*dH + j*dW; + real *input_p_k = input_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; + real *ind_p_k = ind_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; + + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ + maxy = ((unsigned char*)(ind_p_k))[1]; + maxx = ((unsigned char*)(ind_p_k))[2]; + + if(start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=otime || start_h+maxy>=oheight || start_w+maxx>=owidth) + { + THError("invalid max index z= %d, y= %d, x= %d, otime= %d, owidth= %d, oheight= %d", start_t+maxz, start_h+maxy, start_w+maxx, otime, owidth, oheight); + } + output_p[k*otime*owidth*oheight + oheight*owidth*(start_t+maxz) + owidth*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */ + } + } + } + } +} + +static int 
nn_(VolumetricMaxUnpooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int otime = luaT_getfieldcheckint(L, 1, "otime"); + int owidth = luaT_getfieldcheckint(L, 1, "owidth"); + int oheight = luaT_getfieldcheckint(L, 1, "oheight"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int itime; + int iheight; + int iwidth; + real *input_data; + real *output_data; + real *indices_data; + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5 , 2, "4D or 5D (batch mode) tensor expected"); + if (!THTensor_(isSameSizeAs)(input, indices)){ + THError("Invalid input size w.r.t current indices size"); + } + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + indices = THTensor_(newContiguous)(indices); + + /* resize output */ + if (input->nDimension == 4) + { + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + nn_(VolumetricMaxUnpooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, padT, padW, padH); + } + else + { + long p; + + 
THTensor_(resize5d)(output, nbatch, nslices, otime, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(VolumetricMaxUnpooling_updateOutput_frame)(input_data+p*nslices*itime*iwidth*iheight, output_data+p*nslices*otime*owidth*oheight, + indices_data+p*nslices*itime*iwidth*iheight, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, padT, padW, padH); + } + } + + /* cleanup */ + THTensor_(free)(input); + THTensor_(free)(indices); + return 1; +} + +static void nn_(VolumetricMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + real *ind_p, + long nslices, + long itime, long iwidth, long iheight, + long otime, long owidth, long oheight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + long ti, i, j, maxz, maxy, maxx; + for(ti = 0; ti < itime; ti++) + { + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + long start_t = ti * dT - padT; + long start_h = i * dH - padH; + long start_w = j * dW - padW; + + real *gradInput_p_k = gradInput_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; + //real *gradOutput_p_k = gradOutput_p + k*otime*owidth*oheight + ti*owidth*oheight*dT + i*owidth*dH + j*dW; + real *ind_p_k = ind_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; + + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ + maxy = ((unsigned char*)(ind_p_k))[1]; + maxx = ((unsigned char*)(ind_p_k))[2]; + + if(start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=otime || start_h+maxy>=oheight || start_w+maxx>=owidth) + { + THError("invalid max index z= %d, y= %d, x= %d, otime= %d, owidth= %d, oheight= %d", start_t+maxz, start_h+maxy, start_w+maxx, otime, owidth, 
oheight); + } + *gradInput_p_k = gradOutput_p[k*otime*owidth*oheight + oheight*owidth*(start_t+maxz) + owidth*(start_h+maxy) + (start_w+maxx)]; /* update gradient */ + } + } + } + } +} + +static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int otime = luaT_getfieldcheckint(L, 1, "otime"); + int owidth = luaT_getfieldcheckint(L, 1, "owidth"); + int oheight = luaT_getfieldcheckint(L, 1, "oheight"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int itime; + int iheight; + int iwidth; + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + if (!THTensor_(isSameSizeAs)(input, indices)){ + THError("Invalid input size w.r.t current indices size"); + } + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 5) { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if(otime!=gradOutput->size[dimt] || owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. 
otime= %d, oheight= %d, owidth= %d, gradOutput: %dx%d", otime, oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + } + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 4) + { + nn_(VolumetricMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + padT, padW, padH); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(VolumetricMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*itime*iwidth*iheight, gradOutput_data+p*nslices*otime*owidth*oheight, + indices_data+p*nslices*itime*iwidth*iheight, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + padT, padW, padH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THTensor_(free)(indices); + + return 1; +} + +static const struct luaL_Reg nn_(VolumetricMaxUnpooling__) [] = { + {"VolumetricMaxUnpooling_updateOutput", nn_(VolumetricMaxUnpooling_updateOutput)}, + {"VolumetricMaxUnpooling_updateGradInput", nn_(VolumetricMaxUnpooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(VolumetricMaxUnpooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(VolumetricMaxUnpooling__), "nn"); + lua_pop(L,1); +} + +#endif From f0d9886a7ff651b89565ce29a91cb3db6c8d88bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Sun, 31 Jan 2016 23:45:34 +0100 Subject: [PATCH 038/101] Add THNN conversion of Volumetric* modules --- generic/LookupTable.c | 13 +- generic/THNN.h | 144 +++++++++- generic/VolumetricAveragePooling.c | 183 +++++++------ generic/VolumetricConvolution.c | 227 ++++++++-------- generic/VolumetricConvolutionMM.c | 389 ++++++++++++++++------------ generic/VolumetricFullConvolution.c | 
265 +++++++++---------- generic/VolumetricMaxPooling.c | 221 ++++++++-------- generic/VolumetricMaxUnpooling.c | 280 ++++++++++---------- init.c | 18 ++ 9 files changed, 1011 insertions(+), 729 deletions(-) diff --git a/generic/LookupTable.c b/generic/LookupTable.c index 66a09a8854b..ed9656e9f16 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -20,9 +20,16 @@ static void THNN_(LookupTable_resetCount)(THInteger_t *count_data, THIndexTensor } } -void THNN_(LookupTable_accGradParameters)(THNNState *state, THIndexTensor *input, THTensor *gradOutput, - THTensor *gradWeight, real scale, bool scaleGradByFreq, THIntegerTensor *count, - THTensor *sorted, THTensor *indices) +void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, + THTensor *indices, + bool scaleGradByFreq, + real scale) { long i; THInteger_t *count_data = NULL; diff --git a/generic/THNN.h b/generic/THNN.h index 59d030baa0b..9977255d8f8 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -146,11 +146,11 @@ TH_API void THNN_(LookupTable_accGradParameters)( THIndexTensor *input, THTensor *gradOutput, THTensor *gradWeight, - real scale, - bool scaleGradByFreq, THIntegerTensor *count, THTensor *sorted, - THTensor *indices); + THTensor *indices, + bool scaleGradByFreq, + real scale); TH_API void THNN_(MarginCriterion_updateOutput)( THNNState *state, @@ -453,4 +453,142 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)( int padW, int padH, bool ceil_mode); +TH_API void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH); +TH_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH); + +TH_API void 
THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale); + +TH_API void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale); + +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState 
*state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale); + +TH_API void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode); +TH_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int dT, int dW, int dH, + int pT, int pW, int pH); + +TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); + #endif diff --git a/generic/VolumetricAveragePooling.c b/generic/VolumetricAveragePooling.c index e23438641fb..0206585e18c 100644 --- a/generic/VolumetricAveragePooling.c +++ b/generic/VolumetricAveragePooling.c @@ -2,19 +2,25 @@ #define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c" #else -static void nn_(VolumetricAveragePooling_updateOutput_frame)( +static void THNN_(VolumetricAveragePooling_updateOutput_frame)( real *input_p, real *output_p, long nslices, long itime, long iwidth, long iheight, long otime, long owidth, long oheight, - int kT, int kW, int kH, int dT, int dW, int dH) { + int kT, int kW, int kH, + int dT, int dW, int dH) +{ long k; #pragma omp parallel for private(k) - for (k = 0; k < nslices; k++) { + for (k = 0; k < nslices; k++) + { /* loop over output */ long i, j, ti; - for(ti = 0; ti < otime; ti++) { - for(i = 0; i < oheight; i++) { 
- for(j = 0; j < owidth; j++) { + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { /* local pointers */ real *ip = input_p + k * itime * iwidth * iheight + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; @@ -23,11 +29,14 @@ static void nn_(VolumetricAveragePooling_updateOutput_frame)( /* compute local sum: */ real sum = 0.0; - int x,y,z; + int x, y, z; - for(z=0; z < kT; z++) { - for(y = 0; y < kH; y++) { - for(x = 0; x < kW; x++) { + for (z=0; z < kT; z++) + { + for (y = 0; y < kH; y++) + { + for (x = 0; x < kW; x++) + { sum += *(ip + z * iwidth * iheight + y * iwidth + x); } } @@ -41,15 +50,11 @@ static void nn_(VolumetricAveragePooling_updateOutput_frame)( } } -static int nn_(VolumetricAveragePooling_updateOutput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); +void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH) +{ long nslices; long itime; long iheight; @@ -60,24 +65,26 @@ static int nn_(VolumetricAveragePooling_updateOutput)(lua_State *L) { real *input_data; real *output_data; - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, - "4D or 5D (batch-mode) tensor expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch-mode) tensor expected" + ); int dimN = 0; int dimt = 1; int dimh = 2; int dimw = 3; - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimN++; dimt++; dimh++; dimw++; } - luaL_argcheck(L, input->size[dimw] >= kW && input->size[dimh] >= kH 
&& - input->size[dimt] >= kT, 2, - "input image smaller than kernel size"); + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2, + "input image smaller than kernel size" + ); /* sizes */ nslices = input->size[dimN]; @@ -91,19 +98,24 @@ static int nn_(VolumetricAveragePooling_updateOutput)(lua_State *L) { /* get contiguous input */ input = THTensor_(newContiguous)(input); - if (input->nDimension == 4) { /* non-batch mode */ + if (input->nDimension == 4) /* non-batch mode */ + { /* resize output */ THTensor_(resize4d)(output, nslices, otime, oheight, owidth); input_data = THTensor_(data)(input); output_data = THTensor_(data)(output); - nn_(VolumetricAveragePooling_updateOutput_frame)(input_data, output_data, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - kT, kW, kH, dT, dW, dH); - } else { /* batch mode */ + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data, output_data, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); + } + else /* batch mode */ + { long p; long nBatch = input->size[0]; @@ -117,32 +129,41 @@ static int nn_(VolumetricAveragePooling_updateOutput)(lua_State *L) { output_data = THTensor_(data)(output); #pragma omp parallel for private(p) - for (p=0; p < nBatch; p++) { - nn_(VolumetricAveragePooling_updateOutput_frame)( - input_data + p * istride, output_data + p * ostride, - nslices, itime, iwidth, iheight, otime, owidth, oheight, - kT, kW, kH, dT, dW, dH); + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data + p * istride, output_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); } } /* cleanup */ THTensor_(free)(input); - return 1; } -static void nn_(VolumetricAveragePooling_updateGradInput_frame)( +static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( real *gradInput_p, real *gradOutput_p, long nslices, long 
itime, long iwidth, long iheight, long otime, long owidth, long oheight, - int kT, int kW, int kH, int dT, int dW, int dH) { + int kT, int kW, int kH, + int dT, int dW, int dH) +{ long k; #pragma omp parallel for private(k) - for (k = 0; k < nslices; k++) { + for (k = 0; k < nslices; k++) + { /* loop over output */ long i, j, ti; - for(ti = 0; ti < otime; ti++) { - for(i = 0; i < oheight; i++) { - for(j = 0; j < owidth; j++) { + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { /* local pointers */ real *ip = gradInput_p + k * itime * iwidth * iheight + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; @@ -152,9 +173,12 @@ static void nn_(VolumetricAveragePooling_updateGradInput_frame)( /* scatter gradients out to footprint: */ real val = *op / (kT * kW * kH); int x,y,z; - for(z=0; z < kT; z++) { - for(y = 0; y < kH; y++) { - for(x = 0; x < kW; x++) { + for (z=0; z < kT; z++) + { + for (y = 0; y < kH; y++) + { + for (x = 0; x < kW; x++) + { *(ip + z * iwidth * iheight + y * iwidth + x) += val; } } @@ -165,17 +189,14 @@ static void nn_(VolumetricAveragePooling_updateGradInput_frame)( } } -static int nn_(VolumetricAveragePooling_updateGradInput)(lua_State *L) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", - torch_Tensor); +void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH) +{ int nslices; int itime; int iheight; @@ -198,7 +219,8 @@ static int 
nn_(VolumetricAveragePooling_updateGradInput)(lua_State *L) { THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimN++; dimt++; dimh++; @@ -219,12 +241,18 @@ static int nn_(VolumetricAveragePooling_updateGradInput)(lua_State *L) { gradOutput_data = THTensor_(data)(gradOutput); /* backprop */ - if (input->nDimension == 4) { /* non-batch mode*/ - nn_(VolumetricAveragePooling_updateGradInput_frame)( + if (input->nDimension == 4) /* non-batch mode*/ + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( gradInput_data, gradOutput_data, nslices, - itime, iwidth, iheight, otime, owidth, oheight, - kT, kW, kH, dT, dW, dH); - } else { /* batch mode */ + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); + } + else /* batch mode */ + { long p; long nBatch = input->size[0]; @@ -232,31 +260,20 @@ static int nn_(VolumetricAveragePooling_updateGradInput)(lua_State *L) { long ostride = nslices * otime * owidth * oheight; #pragma omp parallel for private(p) - for (p = 0; p < nBatch; p++) { - nn_(VolumetricAveragePooling_updateGradInput_frame)( + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricAveragePooling_updateGradInput_frame)( gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, - itime, iwidth, iheight, otime, owidth, oheight, - kT, kW, kH, dT, dW, dH); + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); } } /* cleanup */ THTensor_(free)(gradOutput); - return 1; -} - -static const struct luaL_Reg nn_(VolumetricAveragePooling__) [] = { - {"VolumetricAveragePooling_updateOutput", - nn_(VolumetricAveragePooling_updateOutput)}, - {"VolumetricAveragePooling_updateGradInput", - nn_(VolumetricAveragePooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(VolumetricAveragePooling_init)(lua_State *L) { - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricAveragePooling__), "nn"); - 
lua_pop(L,1); } #endif diff --git a/generic/VolumetricConvolution.c b/generic/VolumetricConvolution.c index bb30a70d782..31ad4d58f22 100644 --- a/generic/VolumetricConvolution.c +++ b/generic/VolumetricConvolution.c @@ -2,24 +2,27 @@ #define TH_GENERIC_FILE "generic/VolumetricConvolution.c" #else -static int nn_(VolumetricConvolution_updateOutput)(lua_State *L) +void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch-mode) tensor expected" + ); - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, - 2, "4D or 5D (batch-mode) tensor expected"); int dimt = 1; int dimh = 2; int dimw = 3; - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimt++; dimh++; dimw++; @@ -36,33 +39,39 @@ static int nn_(VolumetricConvolution_updateOutput)(lua_State *L) long outputWidth = (inputWidth - kW) / dW + 1; long outputHeight = (inputHeight - kH) / dH + 1; THTensor *outn = THTensor_(new)(); - long i,j; - if (input->nDimension == 4) { /* non-batch mode */ + long i, j; + if (input->nDimension == 4) /* non-batch mode */ + { THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); - + /* add bias */ - for (i=0; isize[0]; i++) { - THTensor_(select)(outn,output,0,i); + for (i = 
0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); } /* do convolutions */ THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X"); - } else { /* batch mode */ + } + else /* batch mode */ + { long nBatch = input->size[0]; - THTensor_(resize5d)(output, nBatch, nOutputPlane, - outputDepth, outputHeight, outputWidth); + THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor *inb = THTensor_(new)(); THTensor *outb = THTensor_(new)(); - for (j=0; jsize[0]; i++) { - THTensor_(select)(outn,outb,0,i); - THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); + for (i = 0; i < bias->size[0]; i++) + { + THTensor_(select)(outn, outb, 0, i); + THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); } /* do convolutions */ @@ -73,48 +82,61 @@ static int nn_(VolumetricConvolution_updateOutput)(lua_State *L) THTensor_(free)(outb); } THTensor_(free)(outn); - - return 1; } - -static int nn_(VolumetricConvolution_updateGradInput)(lua_State *L) +void THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor *tweight; + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nOutputPlane x 
nInputPlane x kT x kH x kW)" + ); + + int nOutputPlane = (int)weight->size[0]; + + THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, + "4D or 5D (batch-mode) tensor expected" + ); - luaL_argcheck(L, gradOutput->nDimension == 4 || gradOutput->nDimension == 5, - 3, "4D or 5D (batch-mode) tensor expected"); int dimPlane = 0; - if (gradOutput->nDimension == 5) { + if (gradOutput->nDimension == 5) + { dimPlane++; } - THArgCheck( nOutputPlane == gradOutput->size[dimPlane], 1, - "Number of output features is not equal to nOutputPlane" ); + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); /* gradient to input */ - tweight = THTensor_(newTranspose)(weight,0,1); - if (gradOutput->nDimension == 4) { /* non-batch mode */ + THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1); + if (gradOutput->nDimension == 4) /* non-batch mode */ + { THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C"); - } else { /* batch mode */ + } + else /* batch mode */ + { long nBatch = gradOutput->size[0]; THTensor *ginpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); long j; - THTensor_(resize5d)(gradInput, input->size[0], input->size[1], input->size[2], - input->size[3], input->size[4]); - for (j=0; jsize[0], input->size[1], input->size[2], input->size[3], input->size[4] + ); + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(ginpb, gradInput, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C"); } THTensor_(free)(ginpb); @@ -122,90 +144,87 @@ static int nn_(VolumetricConvolution_updateGradInput)(lua_State *L) } THTensor_(free)(tweight); - - return 1; } -static int nn_(VolumetricConvolution_accGradParameters)(lua_State *L) +void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor 
*gradWeight, + THTensor *gradBias, + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THArgCheck(gradWeight->nDimension == 5, 4, + "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); + + int nOutputPlane = (int)gradWeight->size[0]; + + THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5, + "gradBias tensor has wrong size" + ); long k; real *gradBias_data; - THTensor* gradOutSlice; + THTensor *gradOutSlice; int dimPlane = 0; - if (gradOutput->nDimension == 5) { + if (gradOutput->nDimension == 5) + { dimPlane++; } - - THArgCheck( nOutputPlane == gradOutput->size[dimPlane], 1, - "Number of output features is not equal to nOutputPlane" ); - - if (gradOutput->nDimension == 4) { /* non-batch mode */ + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + if (gradOutput->nDimension == 4) /* non-batch mode */ + { /* gradient to bias */ gradBias_data = THTensor_(data)(gradBias); gradOutSlice = THTensor_(new)(); - for(k = 0; k < nOutputPlane; k++) - { - THTensor_(select)(gradOutSlice, gradOutput, 0, k); - gradBias_data[k] += scale*THTensor_(sumall)(gradOutSlice); - } + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, gradOutput, 0, k); + gradBias_data[k] += scale * 
THTensor_(sumall)(gradOutSlice); + } THTensor_(free)(gradOutSlice); - + /* gradient to kernels */ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW); - } else { /* batch mode */ + } + else /* batch mode */ + { long nBatch = gradOutput->size[0]; THTensor *inpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); long j; - for (j=0; j 0 || padH > 0 || padW > 0) + if (pT > 0 || pH > 0 || pW > 0) { - for(t = 0; t < outputDepth; t++) + for (t = 0; t < outputDepth; t++) { - it = t*dT - padT + kt; - for(y = 0; y < outputHeight; y++) + it = t*dT - pT + kt; + for (y = 0; y < outputHeight; y++) { - iy = y*dH - padH + kh; - for(x = 0; x < outputWidth; x++) + iy = y*dH - pH + kh; + for (x = 0; x < outputWidth; x++) { - ix = x*dW - padW + kw; + ix = x*dW - pW + kw; if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) - {} + { + } else + { THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); + } } } } } - else { - for(t = 0; t < outputDepth; t++) { + else + { + for (t = 0; t < outputDepth; t++) + { it = t*dT + kt; - for(y = 0; y < outputHeight; y++) { + for (y = 0; y < outputHeight; y++) + { iy = y*dH + kh; - for(x = 0; x < outputWidth; x++) { + for(x = 0; x < outputWidth; x++) + { ix = x*dW + kw; THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); } @@ -63,19 +76,22 @@ static void nn_(unfolded_acc_vol)(THTensor *finput, THTensor *input, } } } -static void nn_(unfolded_copy_vol)(THTensor *finput, THTensor *input, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int nInputPlane, - int inputDepth, int inputWidth, int inputHeight, - int outputDepth, int outputWidth, int outputHeight) + +static void THNN_(unfolded_copy_vol)( + THTensor *finput, THTensor *input, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int 
nInputPlane, + int inputDepth, int inputWidth, int inputHeight, + int outputDepth, int outputWidth, int outputHeight) { long k; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); // #pragma omp parallel for private(k) - for(k = 0; k < nInputPlane*kT*kH*kW; k++) { + for (k = 0; k < nInputPlane*kT*kH*kW; k++) + { int nip = k / (kT*kH*kW); int rest = k % (kT*kH*kW); int kt = rest / (kH*kW); @@ -83,20 +99,24 @@ static void nn_(unfolded_copy_vol)(THTensor *finput, THTensor *input, int kh = rest / kW; int kw = rest % kW; int t,x,y,it,ix,iy; - real *dst = finput_data + nip*(kT*kH*kW*outputDepth*outputHeight*outputWidth) + kt*(kH*kW*outputDepth*outputHeight*outputWidth) + kh*(kW*outputDepth*outputHeight*outputWidth) + kw*(outputDepth*outputHeight*outputWidth); + real *dst = finput_data + + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth) + + kt * (kH*kW*outputDepth*outputHeight*outputWidth) + + kh * (kW*outputDepth*outputHeight*outputWidth) + + kw * (outputDepth*outputHeight*outputWidth); real *src = input_data + nip*(inputDepth*inputHeight*inputWidth); - - if (padT > 0 || padH > 0 || padW > 0) + + if (pT > 0 || pH > 0 || pW > 0) { - for(t = 0; t < outputDepth; t++) + for (t = 0; t < outputDepth; t++) { - it = t*dT - padT + kt; - for(y = 0; y < outputHeight; y++) + it = t*dT - pT + kt; + for (y = 0; y < outputHeight; y++) { - iy = y*dH - padH + kh; - for(x = 0; x < outputWidth; x++) + iy = y*dH - pH + kh; + for (x = 0; x < outputWidth; x++) { - ix = x*dW - padW + kw; + ix = x*dW - pW + kw; if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1)); else @@ -105,12 +125,16 @@ static void nn_(unfolded_copy_vol)(THTensor *finput, THTensor *input, } } } - else { - for(t = 0; t < outputDepth; t++) { + else + { + for (t = 0; t < outputDepth; t++) + { it = t*dT + kt; - for(y = 0; y < outputHeight; y++) { + for (y = 0; y < 
outputHeight; y++) + { iy = y*dH + kh; - for(x = 0; x < outputWidth; x++) { + for(x = 0; x < outputWidth; x++) + { ix = x*dW + kw; memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1)); } @@ -120,46 +144,57 @@ static void nn_(unfolded_copy_vol)(THTensor *finput, THTensor *input, } } -static void nn_(VolumetricConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, - int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH, - long nInputPlane, long inputDepth, long inputWidth, long inputHeight, - long nOutputPlane, long outputDepth, long outputWidth, long outputHeight) +static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( + THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT,int pW, int pH, + long nInputPlane, long inputDepth, long inputWidth, long inputHeight, + long nOutputPlane, long outputDepth, long outputWidth, long outputHeight) { long i; THTensor *output2d; - nn_(unfolded_copy_vol)(finput, input, kT, kW, kH, dT, dW, dH, padT, padW, padH, nInputPlane, inputDepth, inputWidth, inputHeight, outputDepth, outputWidth, outputHeight); + THNN_(unfolded_copy_vol)( + finput, input, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, + inputDepth, inputWidth, inputHeight, + outputDepth, outputWidth, outputHeight + ); - output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, - nOutputPlane, -1, - outputDepth*outputHeight*outputWidth, -1); + output2d = THTensor_(newWithStorage2d)( + output->storage, output->storageOffset, nOutputPlane, -1, + outputDepth*outputHeight*outputWidth, -1 + ); - for(i = 0; i < nOutputPlane; i++) - THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth); + for (i = 0; i < 
nOutputPlane; i++) + { + THVector_(fill)( + output->storage->data+output->storageOffset+output->stride[0]*i, + THTensor_(get1d)(bias, i), + outputDepth*outputHeight*outputWidth + ); + } THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); THTensor_(free)(output2d); } -static int nn_(VolumetricConvolutionMM_updateOutput)(lua_State *L) +void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int dimf = 0; int dimt = 1; int dimh = 2; @@ -173,11 +208,13 @@ static int nn_(VolumetricConvolutionMM_updateOutput)(lua_State *L) long outputDepth; long outputHeight; long outputWidth; - - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D(batch mode) tensor expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D(batch mode) tensor expected" + ); - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimf++; dimt++; dimh++; @@ -189,24 +226,32 @@ static int nn_(VolumetricConvolutionMM_updateOutput)(lua_State *L) inputHeight = 
input->size[dimh]; inputWidth = input->size[dimw]; nOutputPlane = weight->size[0]; - outputDepth = (inputDepth + 2*padT - kT) / dT + 1; - outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - + outputDepth = (inputDepth + 2*pT - kT) / dT + 1; + outputHeight = (inputHeight + 2*pH - kH) / dH + 1; + outputWidth = (inputWidth + 2*pW - kW) / dW + 1; if (outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", - nInputPlane,inputDepth,inputHeight,inputWidth,nInputPlane,outputDepth,outputHeight,outputWidth); + { + THError( + "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane, inputDepth, inputHeight, inputWidth, + nOutputPlane, outputDepth, outputHeight, outputWidth + ); + } - if(input->nDimension == 4) + if (input->nDimension == 4) { THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); - nn_(VolumetricConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, - kT, kW, kH, dT, dW, dH, padT, padW, padH, - nInputPlane, inputDepth, inputWidth, inputHeight, - nOutputPlane, outputDepth, outputWidth, outputHeight); + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input, output, weight, bias, finput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); } else { @@ -217,70 +262,90 @@ static int nn_(VolumetricConvolutionMM_updateOutput)(lua_State *L) THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth); // #pragma omp parallel for private(t) - for(t = 0; t < T; t++) + for (t = 0; t < T; t++) { THTensor *input_t = THTensor_(newSelect)(input, 0, t); THTensor *output_t = THTensor_(newSelect)(output, 0, t); THTensor 
*finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(VolumetricConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, - kT, kW, kH, dT, dW, dH, padT, padW, padH, - nInputPlane, inputDepth, inputWidth, inputHeight, - nOutputPlane, outputDepth, outputWidth, outputHeight); + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input_t, output_t, weight, bias, finput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); THTensor_(free)(input_t); THTensor_(free)(output_t); THTensor_(free)(finput_t); } } - - return 1; } - -static void nn_(VolumetricConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, - int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH) +static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1); + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); THTensor_(free)(gradOutput2d); THTensor_(zero)(gradInput); - nn_(unfolded_acc_vol)(fgradInput, gradInput, kT, kW, kH, dT, dW, dH, padT, padW, padH, gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]); + THNN_(unfolded_acc_vol)( + fgradInput, gradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + gradInput->size[0], 
gradInput->size[1], gradInput->size[3], gradInput->size[2], + gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + ); } -static int nn_(VolumetricConvolutionMM_updateGradInput)(lua_State *L) +void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + // number of input/output planes and kernel size is indirectly defined by the weight tensor + THArgCheck(weight->nDimension == 2, 4, + "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))" + ); - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int nOutputPlane = (int)weight->size[0]; - THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 
1 : 0], 1, + "Number of output features is not equal to nOutputPlane" + ); THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); THTensor_(transpose)(weight, weight, 0, 1); - if(input->nDimension == 4) + if (input->nDimension == 4) { - nn_(VolumetricConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kT, kW, kH, dT, dW, dH, padT, padW, padH); + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput, gradOutput, weight, fgradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); } else { @@ -288,13 +353,18 @@ static int nn_(VolumetricConvolutionMM_updateGradInput)(lua_State *L) long t; //#pragma omp parallel for private(t) - for(t = 0; t < T; t++) + for (t = 0; t < T; t++) { THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); - nn_(VolumetricConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kT, kW, kH, dT, dW, dH, padT, padW, padH); + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput_t, gradOutput_t, weight, fgradInput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); THTensor_(free)(gradInput_t); THTensor_(free)(gradOutput_t); @@ -303,84 +373,79 @@ static int nn_(VolumetricConvolutionMM_updateGradInput)(lua_State *L) } THTensor_(transpose)(weight, weight, 0, 1); - - return 1; } -static void nn_(VolumetricConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, - real scale) +static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale) { long i; - THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1); + THTensor 
*gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); THTensor_(transpose)(finput, finput, 0, 1); THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); THTensor_(transpose)(finput, finput, 0, 1); - for(i = 0; i < gradBias->size[0]; i++) + for (i = 0; i < gradBias->size[0]; i++) { long k; real sum = 0; real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for(k = 0; k < gradOutput2d->size[1]; k++) + for (k = 0; k < gradOutput2d->size[1]; k++) sum += data[k]; - (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + + (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum; } THTensor_(free)(gradOutput2d); } -static int nn_(VolumetricConvolutionMM_accGradParameters)(lua_State *L) +void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck(gradWeight->nDimension == 2, 4, + "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))" + ); - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + int nOutputPlane = (int)gradWeight->size[0]; - THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 5 ? 
1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5, + "gradBias tensor has wrong size" + ); - if(input->nDimension == 4) + THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3, + "Number of output features is not equal to nOutputPlane" + ); + + if (input->nDimension == 4) // non-batch mode { - nn_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); } - else + else // batch mode { long T = input->size[0]; long t; - for(t = 0; t < T; t++) + for (t = 0; t < T; t++) { THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); THTensor_(free)(gradOutput_t); THTensor_(free)(finput_t); } } - - return 0; -} - -static const struct luaL_Reg nn_(VolumetricConvolutionMM__) [] = { - {"VolumetricConvolutionMM_updateOutput", nn_(VolumetricConvolutionMM_updateOutput)}, - {"VolumetricConvolutionMM_updateGradInput", nn_(VolumetricConvolutionMM_updateGradInput)}, - {"VolumetricConvolutionMM_accGradParameters", nn_(VolumetricConvolutionMM_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(VolumetricConvolutionMM_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricConvolutionMM__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c index 4c63c991198..73e81a140ec 100644 --- a/generic/VolumetricFullConvolution.c +++ b/generic/VolumetricFullConvolution.c @@ -2,44 +2,44 @@ #define TH_GENERIC_FILE 
"generic/VolumetricFullConvolution.c" #else -static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { - // Input - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); +void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, int dW, int dH, + int pT, int pW, int pH) +{ + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); - // Params: - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int pT = luaT_getfieldcheckint(L, 1, "pT"); - int pH = luaT_getfieldcheckint(L, 1, "pH"); - int pW = luaT_getfieldcheckint(L, 1, "pW"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kW = (int)weight->size[3]; + int kH = (int)weight->size[4]; - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + THArgCheck(input->size[1] == nInputPlane, 2, "input tensor has wrong number of planes"); - int inputDepth = input->size[2]; - int inputHeight = input->size[3]; - int 
inputWidth = input->size[4]; + // input tensor dimensions + long batchSize = input->size[0]; + int inputDepth = (int)input->size[2]; + int inputHeight = (int)input->size[3]; + int inputWidth = (int)input->size[4]; - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; - - luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); - - // Batch size - long batchSize = input->size[0]; + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; // Resize output - THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, - outputHeight, outputWidth); + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); // Helpers THTensor *input_n = THTensor_(new)(); @@ -49,14 +49,16 @@ static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { const real* bias_ptr = THTensor_(data)(bias); int n; - for (n = 0; n < batchSize; ++n) { + for (n = 0; n < batchSize; ++n) + { THTensor_(select)(input_n, input, 0, n); THTensor_(select)(output_n, output, 0, n); THTensor *outn = THTensor_(new)(); // add bias first int i; - for (i = 0; i < bias->size[0]; i++) { + for (i = 0; i < bias->size[0]; i++) + { THTensor_(select)(outn,output_n,0,i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); } @@ -64,69 +66,73 @@ static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { int t, h, w, kc_, kt_, kh_, kw_, c; - const real* input_ptr = THTensor_(data)(input_n); - real* output_ptr = THTensor_(data)(output_n); + const real *input_ptr = THTensor_(data)(input_n); + real *output_ptr = THTensor_(data)(output_n); for (t = 0; t < inputDepth; t++) + { for (h = 0; h < inputHeight; h++) for (w = 0; w < inputWidth; w++) for (kc_ = 0; kc_ < nOutputPlane; kc_++) for (kt_ = 0; 
kt_ < kT; kt_++) for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) { + for (kw_ = 0; kw_ < kW; kw_++) + { int pt = t * dT - pT + kt_; int ph = h * dH - pH + kh_; int pw = w * dW - pW + kw_; if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) { + pt < outputDepth && ph < outputHeight && pw < outputWidth) + { real val = 0; - for (c = 0; c < nInputPlane; c++) { + for (c = 0; c < nInputPlane; c++) + { val += input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] - * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; + * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; } output_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] += val; } } + } } THTensor_(free)(input_n); THTensor_(free)(output_n); - - // return output - return 1; } -static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { - // Input - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); +void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, int dW, int dH, + int pT, int pW, int pH +) +{ + // number of input/output planes and kernel size is indirectly defined by the weight tensor + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kW = (int)weight->size[3]; + int kH = (int)weight->size[4]; - // Params: - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - 
int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int pT = luaT_getfieldcheckint(L, 1, "pT"); - int pH = luaT_getfieldcheckint(L, 1, "pH"); - int pW = luaT_getfieldcheckint(L, 1, "pW"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + THArgCheck(input->size[1] == nInputPlane, 2, "input tensor has wrong number of planes"); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int inputDepth = (int)input->size[2]; + int inputHeight = (int)input->size[3]; + int inputWidth = (int)input->size[4]; - int inputDepth = input->size[2]; - int inputHeight = input->size[3]; - int inputWidth = input->size[4]; - - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; - - luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; // Batch size long batchSize = input->size[0]; @@ -142,31 +148,35 @@ static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { // For each n in batch, do: int n; - for (n = 0; n < batchSize; n++) { + for (n = 0; n < batchSize; n++) + { THTensor_(select)(gradInput_n, gradInput, 0, n); THTensor_(select)(gradOutput_n, gradOutput, 0, n); THTensor_(fill)(gradInput_n, 0); int t, h, w, kc_, kt_, kh_, kw_, c; - real* gradInput_ptr = THTensor_(data)(gradInput_n); - const real* gradOutput_ptr = THTensor_(data)(gradOutput_n); + real 
*gradInput_ptr = THTensor_(data)(gradInput_n); + const real *gradOutput_ptr = THTensor_(data)(gradOutput_n); for (t = 0; t < inputDepth; t++) for (h = 0; h < inputHeight; h++) for (w = 0; w < inputWidth; w++) for (kc_ = 0; kc_ < nOutputPlane; kc_++) for (kt_ = 0; kt_ < kT; kt_++) for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) { + for (kw_ = 0; kw_ < kW; kw_++) + { int pt = t * dT - pT + kt_; int ph = h * dH - pH + kh_; int pw = w * dW - pW + kw_; if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) { - for (c = 0; c < nInputPlane; c++) { + pt < outputDepth && ph < outputHeight && pw < outputWidth) + { + for (c = 0; c < nInputPlane; c++) + { gradInput_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] += - gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] - * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; + gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] + * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; } } } @@ -175,34 +185,38 @@ static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); - - // Return gradInput - return 1; } -static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { - // Inputs - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); +void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale) +{ + // number of input/output planes and kernel size is indirectly defined by the gradWeight tensor + THArgCheck(gradWeight->nDimension == 5, 
4, + "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); - // Params - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int pT = luaT_getfieldcheckint(L, 1, "pT"); - int pH = luaT_getfieldcheckint(L, 1, "pH"); - int pW = luaT_getfieldcheckint(L, 1, "pW"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int nOutputPlane = (int)gradWeight->size[0]; + int nInputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kW = (int)gradWeight->size[3]; + int kH = (int)gradWeight->size[4]; - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5, + "gradBias tensor has wrong size" + ); - luaL_argcheck(L, input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - luaL_argcheck(L, kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); + THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); + THArgCheck(input->size[1] == nInputPlane, 2, "input tensor has wrong number of planes"); THTensor_(resize1d)(gradBias, nOutputPlane); THTensor_(resize5d)(gradWeight, nOutputPlane, nInputPlane, kT, kH, kW); @@ -211,9 +225,9 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { int inputHeight = input->size[3]; int inputWidth = input->size[4]; - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; + int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - 
int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; + int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; // Batch size long batchSize = input->size[0]; @@ -227,12 +241,13 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { // reset gradWeight = 0 THTensor_(fill)(gradWeight, 0); - real* gradWeight_ptr = THTensor_(data)(gradWeight); - real* gradBias_ptr = THTensor_(data)(gradBias); + real *gradWeight_ptr = THTensor_(data)(gradWeight); + real *gradBias_ptr = THTensor_(data)(gradBias); // For each n in batch, do: int n; - for (n = 0; n < batchSize; n++) { + for (n = 0; n < batchSize; n++) + { THTensor_(select)(input_n, input, 0, n); THTensor_(select)(gradOutput_n, gradOutput, 0, n); @@ -240,32 +255,37 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { // accumulate bias gradient first int i; - for (i = 0; i < gradBias->size[0]; i++) { + for (i = 0; i < gradBias->size[0]; i++) + { THTensor_(select)(goutn, gradOutput_n, 0, i); - gradBias_ptr[i] += THTensor_(sumall)(goutn); + gradBias_ptr[i] += scale * THTensor_(sumall)(goutn); } THTensor_(free)(goutn); int t, h, w, kc_, kt_, kh_, kw_, c; - const real* input_ptr = THTensor_(data)(input_n); - const real* gradOutput_ptr = THTensor_(data)(gradOutput_n); + const real *input_ptr = THTensor_(data)(input_n); + const real *gradOutput_ptr = THTensor_(data)(gradOutput_n); for (t = 0; t < inputDepth; t++) for (h = 0; h < inputHeight; h++) for (w = 0; w < inputWidth; w++) for (kc_ = 0; kc_ < nOutputPlane; kc_++) for (kt_ = 0; kt_ < kT; kt_++) for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) { + for (kw_ = 0; kw_ < kW; kw_++) + { int pt = t * dT - pT + kt_; int ph = h * dH - pH + kh_; int pw = w * dW - pW + kw_; if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) { - for (c = 0; c < nInputPlane; c++) { + pt < outputDepth && ph < outputHeight && pw < outputWidth) + { + for (c = 0; c < nInputPlane; c++) + { 
gradWeight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_] += - input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] * - gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw]; + scale * + input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] * + gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw]; } } } @@ -274,23 +294,6 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { // Free THTensor_(free)(input_n); THTensor_(free)(gradOutput_n); - - // Return nothing - return 0; -} - -static const struct luaL_Reg nn_(VolumetricFullConvolution__) [] = { - {"VolumetricFullConvolution_updateOutput", nn_(VolumetricFullConvolution_updateOutput)}, - {"VolumetricFullConvolution_updateGradInput", nn_(VolumetricFullConvolution_updateGradInput)}, - {"VolumetricFullConvolution_accGradParameters", nn_(VolumetricFullConvolution_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(VolumetricFullConvolution_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricFullConvolution__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/VolumetricMaxPooling.c b/generic/VolumetricMaxPooling.c index 04d2288a98f..b32a3819a51 100644 --- a/generic/VolumetricMaxPooling.c +++ b/generic/VolumetricMaxPooling.c @@ -2,34 +2,40 @@ #define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c" #else -static void nn_(VolumetricMaxPooling_updateOutput_frame)( +static void THNN_(VolumetricMaxPooling_updateOutput_frame)( real *input_p, real *output_p, real *indz_p, long nslices, long itime, long iwidth, long iheight, long otime, long owidth, long oheight, - int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH) { + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH) +{ long k; #pragma omp parallel for private(k) for (k = 0; k < nslices; k++) { /* loop over output */ long i, j, ti; - 
for(ti = 0; ti < otime; ti++) { - for(i = 0; i < oheight; i++) { - for(j = 0; j < owidth; j++) { + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { /* local pointers */ - - long start_t = ti * dT - padT; - long start_h = i * dH - padH; - long start_w = j * dW - padW; - + + long start_t = ti * dT - pT; + long start_h = i * dH - pH; + long start_w = j * dW - pW; + long kernel_t = fminf(kT, kT + start_t); long kernel_h = fminf(kH, kH + start_h); long kernel_w = fminf(kW, kW + start_w); - + start_t = fmaxf(start_t, 0); start_h = fmaxf(start_h, 0); start_w = fmaxf(start_w, 0); - + real *ip = input_p + k * itime * iwidth * iheight + start_t * iwidth * iheight + start_h * iwidth + start_w; real *op = output_p + k * otime * owidth * oheight @@ -42,16 +48,20 @@ static void nn_(VolumetricMaxPooling_updateOutput_frame)( int x,y,z; int mx, my, mz; - for(z = 0; z < kernel_t; z++) { - for(y = 0; y < kernel_h; y++) { - for(x = 0; x < kernel_w; x++) { + for (z = 0; z < kernel_t; z++) + { + for (y = 0; y < kernel_h; y++) + { + for (x = 0; x < kernel_w; x++) + { if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth)) { real val = *(ip + z * iwidth * iheight + y * iwidth + x); - if (val > maxval) { + if (val > maxval) + { maxval = val; // Store indices w.r.t the kernel dimension - mz = z + (kT - kernel_t); + mz = z + (kT - kernel_t); my = y + (kH - kernel_h); mx = x + (kW - kernel_w); } @@ -65,6 +75,7 @@ static void nn_(VolumetricMaxPooling_updateOutput_frame)( ((unsigned char*)(indzp))[1] = my; ((unsigned char*)(indzp))[2] = mx; ((unsigned char*)(indzp))[3] = 0; + /* set output to local max */ *op = maxval; } @@ -73,21 +84,13 @@ static void nn_(VolumetricMaxPooling_updateOutput_frame)( } } -static int nn_(VolumetricMaxPooling_updateOutput)(lua_State *L) +void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *indices, + int kT, int kW, int kH, + 
int dT, int dW, int dH, + int pT, int pW, int pH, + bool ceilMode) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int ceil_mode = luaT_getfieldcheckboolean(L,1,"ceil_mode"); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); long nslices; long itime; long iheight; @@ -99,57 +102,65 @@ static int nn_(VolumetricMaxPooling_updateOutput)(lua_State *L) real *output_data; real *indices_data; - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, - "4D or 5D (batch-mode) tensor expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch-mode) tensor expected" + ); int dimN = 0; int dimt = 1; int dimh = 2; int dimw = 3; - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimN++; dimt++; dimh++; dimw++; } - luaL_argcheck(L, input->size[dimw] >= kW && - input->size[dimh] >= kH && input->size[dimt] >= kT, 2, - "input image smaller than kernel size"); + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2, + "input image smaller than kernel size" + ); - luaL_argcheck(L, kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2, + "pad should be smaller than half of kernel size" + ); /* sizes */ nslices = input->size[dimN]; itime = input->size[dimt]; iheight = input->size[dimh]; iwidth = input->size[dimw]; - if (ceil_mode) { - otime = 
(int)(ceil((float)(itime - kT + 2 * padT) / dT) + 1); - oheight = (int)(ceil((float)(iheight - kH + 2 * padH) / dH) + 1); - owidth = (int)(ceil((float)(iwidth - kW + 2 * padW) / dW) + 1); - } else { - otime = (int)(floor((float)(itime - kT + 2 * padT) / dT) + 1); - oheight = (int)(floor((float)(iheight - kH + 2 * padH) / dH) + 1); - owidth = (int)(floor((float)(iwidth - kW + 2 * padW) / dW) + 1); + if (ceilMode) + { + otime = (int)(ceil((float)(itime - kT + 2 * pT) / dT) + 1); + oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1); + owidth = (int)(ceil((float)(iwidth - kW + 2 * pW) / dW) + 1); + } + else + { + otime = (int)(floor((float)(itime - kT + 2 * pT) / dT) + 1); + oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1); + owidth = (int)(floor((float)(iwidth - kW + 2 * pW) / dW) + 1); } - if (padT || padW || padH) + if (pT || pW || pH) { // ensure that the last pooling starts inside the image - if ((otime - 1)*dT >= itime + padT) + if ((otime - 1)*dT >= itime + pT) --otime; - if ((oheight - 1)*dH >= iheight + padH) + if ((oheight - 1)*dH >= iheight + pH) --oheight; - if ((owidth - 1)*dW >= iwidth + padW) + if ((owidth - 1)*dW >= iwidth + pW) --owidth; } /* get contiguous input */ input = THTensor_(newContiguous)(input); - if (input->nDimension == 4) { /* non-batch mode */ + if (input->nDimension == 4) /* non-batch mode */ + { /* resize output */ THTensor_(resize4d)(output, nslices, otime, oheight, owidth); /* indices will contain ti,i,j uchar locations packed into float/double */ @@ -159,13 +170,19 @@ static int nn_(VolumetricMaxPooling_updateOutput)(lua_State *L) output_data = THTensor_(data)(output); indices_data = THTensor_(data)(indices); - nn_(VolumetricMaxPooling_updateOutput_frame)(input_data, output_data, - indices_data, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - kT, kW, kH, dT, dW, dH, padT, padW, padH); - } else { /* batch mode */ + THNN_(VolumetricMaxPooling_updateOutput_frame)( + input_data, output_data, + 
indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + } + else /* batch mode */ + { long p; long nBatch = input->size[0]; @@ -182,47 +199,55 @@ static int nn_(VolumetricMaxPooling_updateOutput)(lua_State *L) indices_data = THTensor_(data)(indices); #pragma omp parallel for private(p) - for (p=0; p < nBatch; p++) { - nn_(VolumetricMaxPooling_updateOutput_frame)( + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricMaxPooling_updateOutput_frame)( input_data + p * istride, output_data + p * ostride, indices_data + p * ostride, nslices, itime, iwidth, iheight, otime, owidth, oheight, - kT, kW, kH, dT, dW, dH, padT, padW, padH); + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); } } /* cleanup */ THTensor_(free)(input); - return 1; } -static void nn_(VolumetricMaxPooling_updateGradInput_frame)( +static void THNN_(VolumetricMaxPooling_updateGradInput_frame)( real *gradInput_p, real *gradOutput_p, real *indz_p, long nslices, long itime, long iwidth, long iheight, long otime, long owidth, long oheight, int dT, int dW, int dH, - int padT, int padW, int padH) { + int pT, int pW, int pH) +{ long k; #pragma omp parallel for private(k) - for (k = 0; k < nslices; k++) { + for (k = 0; k < nslices; k++) + { real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; real *indz_p_k = indz_p + k * otime * owidth * oheight; /* calculate max points */ long ti, i, j; - for(ti = 0; ti < otime; ti++) { - for(i = 0; i < oheight; i++) { - for(j = 0; j < owidth; j++) { + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { /* retrieve position of max */ real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j]; - long maxti = ((unsigned char*)(indzp))[0] + ti * dT - padT; - long maxi = ((unsigned char*)(indzp))[1] + i * dH - padH; - long maxj = ((unsigned char*)(indzp))[2] + j * dW - padW; + long 
maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT; + long maxi = ((unsigned char*)(indzp))[1] + i * dH - pH; + long maxj = ((unsigned char*)(indzp))[2] + j * dW - pW; /* update gradient */ gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] += @@ -233,18 +258,11 @@ static void nn_(VolumetricMaxPooling_updateGradInput_frame)( } } -static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) +void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); int nslices; int itime; int iheight; @@ -261,7 +279,6 @@ static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) int dimh = 2; int dimw = 3; - /* get contiguous gradOutput */ gradOutput = THTensor_(newContiguous)(gradOutput); @@ -269,7 +286,8 @@ static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); - if (input->nDimension == 5) { + if (input->nDimension == 5) + { dimN++; dimt++; dimh++; @@ -291,16 +309,20 @@ static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) indices_data = THTensor_(data)(indices); /* backprop */ - if (input->nDimension == 4) { /* non-batch mode*/ - nn_(VolumetricMaxPooling_updateGradInput_frame)( + if (input->nDimension == 4) /* non-batch mode*/ + { + 
THNN_(VolumetricMaxPooling_updateGradInput_frame)( gradInput_data, gradOutput_data, indices_data, nslices, itime, iwidth, iheight, otime, owidth, oheight, - dT, dW, dH, padT, padW, padH); + dT, dW, dH, + pT, pW, pH + ); } - else { /* batch mode */ + else /* batch mode */ + { long p; long nBatch = input->size[0]; @@ -308,34 +330,23 @@ static int nn_(VolumetricMaxPooling_updateGradInput)(lua_State *L) long ostride = nslices * otime * owidth * oheight; #pragma omp parallel for private(p) - for (p = 0; p < nBatch; p++) { - nn_(VolumetricMaxPooling_updateGradInput_frame)( + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricMaxPooling_updateGradInput_frame)( gradInput_data + p * istride, gradOutput_data + p * ostride, indices_data + p * ostride, nslices, itime, iwidth, iheight, otime, owidth, oheight, - dT, dW, dH, padT, padW, padH); + dT, dW, dH, + pT, pW, pH + ); } } /* cleanup */ THTensor_(free)(gradOutput); - return 1; -} - -static const struct luaL_Reg nn_(VolumetricMaxPooling__) [] = { - {"VolumetricMaxPooling_updateOutput", nn_(VolumetricMaxPooling_updateOutput)}, - {"VolumetricMaxPooling_updateGradInput", nn_(VolumetricMaxPooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(VolumetricMaxPooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricMaxPooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/VolumetricMaxUnpooling.c b/generic/VolumetricMaxUnpooling.c index 5e1fe1d476a..9c6239b7eb3 100644 --- a/generic/VolumetricMaxUnpooling.c +++ b/generic/VolumetricMaxUnpooling.c @@ -2,80 +2,84 @@ #define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c" #else -static void nn_(VolumetricMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, - real *ind_p, - long nslices, - long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int dT, int dW, int dH, - int padT, int padW, int padH) +static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + real *input_p, 
+ real *output_p, + real *ind_p, + long nslices, + long iT, long iW, long iH, + long oT, long oW, long oH, + int dT, int dW, int dH, + int pT, int pW, int pH) { long k; #pragma omp parallel for private(k) for (k = 0; k < nslices; k++) - { + { long ti, i, j, maxz, maxy, maxx; - for(ti = 0; ti < itime; ti++) + for (ti = 0; ti < iT; ti++) { - for(i = 0; i < iheight; i++) + for (i = 0; i < iH; i++) { - for(j = 0; j < iwidth; j++) + for (j = 0; j < iW; j++) { - long start_t = ti * dT - padT; - long start_h = i * dH - padH; - long start_w = j * dW - padW; - - //real *output_p_k = output_p + k*otime*owidth*oheight + ti*owidth*oheight*dT + i*owidth*dH + j*dW; - real *input_p_k = input_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; - real *ind_p_k = ind_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; - + long start_t = ti * dT - pT; + long start_h = i * dH - pH; + long start_w = j * dW - pW; + + //real *output_p_k = output_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW; + real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ maxy = ((unsigned char*)(ind_p_k))[1]; maxx = ((unsigned char*)(ind_p_k))[2]; - if(start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=otime || start_h+maxy>=oheight || start_w+maxx>=owidth) + if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW) { - THError("invalid max index z= %d, y= %d, x= %d, otime= %d, owidth= %d, oheight= %d", start_t+maxz, start_h+maxy, start_w+maxx, otime, owidth, oheight); + THError( + "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d", + start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH + ); } - output_p[k*otime*owidth*oheight + oheight*owidth*(start_t+maxz) + owidth*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */ + output_p[k*oT*oW*oH + 
oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */ } } } } } -static int nn_(VolumetricMaxUnpooling_updateOutput)(lua_State *L) +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int otime = luaT_getfieldcheckint(L, 1, "otime"); - int owidth = luaT_getfieldcheckint(L, 1, "owidth"); - int oheight = luaT_getfieldcheckint(L, 1, "oheight"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); int dimw = 3; int dimh = 2; int dimt = 1; int nbatch = 1; int nslices; - int itime; - int iheight; - int iwidth; + int iT; + int iH; + int iW; real *input_data; real *output_data; real *indices_data; - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5 , 2, "4D or 5D (batch mode) tensor expected"); - if (!THTensor_(isSameSizeAs)(input, indices)){ - THError("Invalid input size w.r.t current indices size"); - } + THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2, + "4D or 5D (batch mode) tensor expected" + ); - if (input->nDimension == 5) + if (!THTensor_(isSameSizeAs)(input, indices)) + { + THError("Invalid input size w.r.t current indices size"); + } + + if (input->nDimension == 5) { nbatch = input->size[0]; dimt++; @@ -85,9 +89,9 @@ static int nn_(VolumetricMaxUnpooling_updateOutput)(lua_State *L) /* sizes */ nslices = input->size[dimt-1]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = 
input->size[dimw]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; /* get contiguous input */ input = THTensor_(newContiguous)(input); @@ -96,25 +100,27 @@ static int nn_(VolumetricMaxUnpooling_updateOutput)(lua_State *L) /* resize output */ if (input->nDimension == 4) { - THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + THTensor_(resize4d)(output, nslices, oT, oH, oW); THTensor_(zero)(output); input_data = THTensor_(data)(input); output_data = THTensor_(data)(output); indices_data = THTensor_(data)(indices); - nn_(VolumetricMaxUnpooling_updateOutput_frame)(input_data, output_data, - indices_data, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - dT, dW, dH, padT, padW, padH); + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, pT, pW, pH + ); } else { long p; - THTensor_(resize5d)(output, nbatch, nslices, otime, oheight, owidth); + THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW); THTensor_(zero)(output); input_data = THTensor_(data)(input); @@ -124,93 +130,96 @@ static int nn_(VolumetricMaxUnpooling_updateOutput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(VolumetricMaxUnpooling_updateOutput_frame)(input_data+p*nslices*itime*iwidth*iheight, output_data+p*nslices*otime*owidth*oheight, - indices_data+p*nslices*itime*iwidth*iheight, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - dT, dW, dH, padT, padW, padH); + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iT*iW*iH, + output_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); } } /* cleanup */ THTensor_(free)(input); THTensor_(free)(indices); - return 1; } -static void nn_(VolumetricMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, - real *ind_p, - long nslices, - 
long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int dT, int dW, int dH, - int padT, int padW, int padH) +static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + real *gradInput_p, real *gradOutput_p, + real *ind_p, + long nslices, + long iT, long iW, long iH, + long oT, long oW, long oH, + int dT, int dW, int dH, + int pT, int pW, int pH) { long k; #pragma omp parallel for private(k) for (k = 0; k < nslices; k++) { long ti, i, j, maxz, maxy, maxx; - for(ti = 0; ti < itime; ti++) + for (ti = 0; ti < iT; ti++) { - for(i = 0; i < iheight; i++) + for (i = 0; i < iH; i++) { - for(j = 0; j < iwidth; j++) - { - long start_t = ti * dT - padT; - long start_h = i * dH - padH; - long start_w = j * dW - padW; - - real *gradInput_p_k = gradInput_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; - //real *gradOutput_p_k = gradOutput_p + k*otime*owidth*oheight + ti*owidth*oheight*dT + i*owidth*dH + j*dW; - real *ind_p_k = ind_p + k*itime*iwidth*iheight + ti*iwidth*iheight + i*iwidth + j; - + for (j = 0; j < iW; j++) + { + long start_t = ti * dT - pT; + long start_h = i * dH - pH; + long start_w = j * dW - pW; + + real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + //real *gradOutput_p_k = gradOutput_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW; + real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ maxy = ((unsigned char*)(ind_p_k))[1]; maxx = ((unsigned char*)(ind_p_k))[2]; - if(start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=otime || start_h+maxy>=oheight || start_w+maxx>=owidth) + if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW) { - THError("invalid max index z= %d, y= %d, x= %d, otime= %d, owidth= %d, oheight= %d", start_t+maxz, start_h+maxy, start_w+maxx, otime, owidth, oheight); - } - *gradInput_p_k = 
gradOutput_p[k*otime*owidth*oheight + oheight*owidth*(start_t+maxz) + owidth*(start_h+maxy) + (start_w+maxx)]; /* update gradient */ + THError( + "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d", + start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH + ); + } + *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */ } } } } } -static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - int otime = luaT_getfieldcheckint(L, 1, "otime"); - int owidth = luaT_getfieldcheckint(L, 1, "owidth"); - int oheight = luaT_getfieldcheckint(L, 1, "oheight"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); int dimw = 3; int dimh = 2; int dimt = 1; int nbatch = 1; int nslices; - int itime; - int iheight; - int iwidth; + int iT; + int iH; + int iW; real *gradInput_data; real *gradOutput_data; real *indices_data; - if (!THTensor_(isSameSizeAs)(input, indices)){ + if (!THTensor_(isSameSizeAs)(input, indices)) + { THError("Invalid input size w.r.t current indices size"); - } + } /* get contiguous gradOutput */ gradOutput = THTensor_(newContiguous)(gradOutput); @@ -220,7 +229,8 @@ static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) 
THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); - if (input->nDimension == 5) { + if (input->nDimension == 5) + { nbatch = input->size[0]; dimt++; dimw++; @@ -229,12 +239,16 @@ static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) /* sizes */ nslices = input->size[dimt-1]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; - if(otime!=gradOutput->size[dimt] || owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ - THError("Inconsistent gradOutput size. otime= %d, oheight= %d, owidth= %d, gradOutput: %dx%d", otime, oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%d", + oT, oH, oW,gradOutput->size[dimh], gradOutput->size[dimw] + ); } /* get raw pointers */ @@ -245,13 +259,15 @@ static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) /* backprop */ if (input->nDimension == 4) { - nn_(VolumetricMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, - indices_data, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - dT, dW, dH, - padT, padW, padH); + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); } else { @@ -259,34 +275,22 @@ static int nn_(VolumetricMaxUnpooling_updateGradInput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(VolumetricMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*itime*iwidth*iheight, gradOutput_data+p*nslices*otime*owidth*oheight, - indices_data+p*nslices*itime*iwidth*iheight, - nslices, - itime, iwidth, iheight, - otime, owidth, oheight, - dT, dW, dH, - padT, padW, 
padH); + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data+p*nslices*iT*iW*iH, + gradOutput_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); } } /* cleanup */ THTensor_(free)(gradOutput); THTensor_(free)(indices); - - return 1; -} - -static const struct luaL_Reg nn_(VolumetricMaxUnpooling__) [] = { - {"VolumetricMaxUnpooling_updateOutput", nn_(VolumetricMaxUnpooling_updateOutput)}, - {"VolumetricMaxUnpooling_updateGradInput", nn_(VolumetricMaxUnpooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(VolumetricMaxUnpooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricMaxUnpooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/init.c b/init.c index 4328961713d..4b34c61daac 100644 --- a/init.c +++ b/init.c @@ -99,3 +99,21 @@ #include "generic/SpatialMaxPooling.c" #include "THGenerateFloatTypes.h" + +#include "generic/VolumetricAveragePooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricConvolutionMM.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricFullConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxPooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/VolumetricMaxUnpooling.c" +#include "THGenerateFloatTypes.h" From 897d81b7bd949e7c671dffb095c53847f19c6378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Tue, 2 Feb 2016 01:31:58 +0100 Subject: [PATCH 039/101] Unify Volumetric* signatures with cunn --- generic/THNN.h | 5 +++++ generic/VolumetricConvolution.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/generic/THNN.h b/generic/THNN.h index 9977255d8f8..c937c0f8f12 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -473,6 +473,8 @@ TH_API void THNN_(VolumetricConvolution_updateOutput)( THTensor 
*output, THTensor *weight, THTensor *bias, + THTensor *finput, + THTensor *fgradInput, int dT, int dW, int dH, int pT, int pW, int pH); TH_API void THNN_(VolumetricConvolution_updateGradInput)( @@ -481,6 +483,7 @@ TH_API void THNN_(VolumetricConvolution_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, THTensor *weight, + THTensor *finput, int dT, int dW, int dH, int pT, int pW, int pH); TH_API void THNN_(VolumetricConvolution_accGradParameters)( @@ -489,6 +492,8 @@ TH_API void THNN_(VolumetricConvolution_accGradParameters)( THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, int dT, int dW, int dH, int pT, int pW, int pH, real scale); diff --git a/generic/VolumetricConvolution.c b/generic/VolumetricConvolution.c index 31ad4d58f22..9d4046ec764 100644 --- a/generic/VolumetricConvolution.c +++ b/generic/VolumetricConvolution.c @@ -8,6 +8,8 @@ void THNN_(VolumetricConvolution_updateOutput)( THTensor *output, THTensor *weight, THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl int dT, int dW, int dH, int pT, int pW, int pH) { @@ -90,6 +92,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, THTensor *weight, + THTensor *finput, // only used by cuda impl int dT, int dW, int dH, int pT, int pW, int pH) { @@ -152,6 +155,8 @@ void THNN_(VolumetricConvolution_accGradParameters)( THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl int dT, int dW, int dH, int pT, int pW, int pH, real scale) From 2555ac14d3896338fbfd8714f51c7b389c1ca9ac Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Wed, 10 Feb 2016 00:48:08 -0500 Subject: [PATCH 040/101] moving Temporal* modules to THNN --- generic/TemporalConvolution.c | 344 ++++++++++++++++++++++++++++++++++ generic/TemporalMaxPooling.c | 231 
+++++++++++++++++++++++ generic/TemporalSubSampling.c | 112 +++++++++++ init.c | 9 + 4 files changed, 696 insertions(+) create mode 100644 generic/TemporalConvolution.c create mode 100644 generic/TemporalMaxPooling.c create mode 100644 generic/TemporalSubSampling.c diff --git a/generic/TemporalConvolution.c b/generic/TemporalConvolution.c new file mode 100644 index 00000000000..a8109886a4a --- /dev/null +++ b/generic/TemporalConvolution.c @@ -0,0 +1,344 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalConvolution.c" +#else + +void THNN_(TemporalConvolution_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize + ) +{ + THTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + THArgCheck(input->size[dimF] == inputFrameSize, 2, "invalid input frame size"); + THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size"); + + input = THTensor_(newContiguous)(input); + outputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->nDimension == 2) + { + THTensor_(resize2d)(output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, output, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, 
input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THTensor_(transpose)(weight, NULL, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight); + THTensor_(transpose)(weight, NULL, 0, 1); + } + } + else + { + THTensor *outputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + THTensor_(resize3d)(output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(outputSample, output, 0, i); + THTensor_(select)(inputSample, input, 0, i); + long nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, outputSample, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THTensor_(transpose)(weight, NULL, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight); + THTensor_(transpose)(weight, NULL, 0, 1); + } + } + THTensor_(free)(outputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(outputWindow); + THTensor_(free)(inputWindow); + 
THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_updateGradInput)(THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW) +{ + long nInputFrame; + long nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *gradInputWindow; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (gradOutput->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + gradOutputWindow = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (gradOutput->nDimension == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *gradInputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(gradInputSample, gradInput, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + 
nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(gradInputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(gradInputWindow); + +} + +void THNN_(TemporalConvolution_accGradParameters)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + real scale) +{ + long nInputFrame; + long nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *inputWindow; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (gradOutput->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + if (input->nDimension == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutput, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + 
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor_(transpose)(gradOutputWindow, NULL, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow); + THTensor_(transpose)(gradOutputWindow, NULL, 0, 1); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor_(transpose)(gradOutputWindow, NULL, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow); + THTensor_(transpose)(gradOutputWindow, NULL, 0, 1); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(input); + +} + +#endif diff --git 
a/generic/TemporalMaxPooling.c b/generic/TemporalMaxPooling.c new file mode 100644 index 00000000000..2b3d9703e9a --- /dev/null +++ b/generic/TemporalMaxPooling.c @@ -0,0 +1,231 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c" +#else + +void THNN_(TemporalMaxPooling_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kW, int dW) +{ + long niframe; + long framesize; + long noframe; + + real *input_data; + real *output_data; + real *indices_data; + + long t, y; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size"); + + /* sizes */ + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 2) + { + /* resize output */ + THTensor_(resize2d)(output, noframe, framesize); + + /* indices will contain index locations for each output point */ + THTensor_(resize2d)(indices, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + for(t = 0; t < noframe; t++) + { + real *ip = input_data + t*framesize*dW; + real *op = output_data + t*framesize; + real *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + long maxindex = -1; + real maxval = -THInf; + long x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + else + { + /* 
number of batch frames */ + long nbframe = input->size[0]; + long i; + + /* resize output */ + THTensor_(resize3d)(output, nbframe, noframe, framesize); + + /* indices will contain index locations for each output point */ + THTensor_(resize3d)(indices, nbframe, noframe, framesize); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + for(i = 0; i < nbframe; i++) + { + real *inputSample_data = input_data + i*niframe*framesize; + real *outputSample_data = output_data + i*noframe*framesize; + real *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *ip = inputSample_data + t*framesize*dW; + real *op = outputSample_data + t*framesize; + real *xp = indicesSample_data + t*framesize; + +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + long maxindex = -1; + real maxval = -THInf; + long x; + for(x = 0; x < kW; x++) + { + real val = ip[x*framesize+y]; + if (val > maxval) + { + maxval = val; + maxindex = x; + } + } + + /* set output to local max */ + op[y] = maxval; + xp[y] = (real)maxindex; + } + } + } + } + + /* cleanup */ + THTensor_(free)(input); + +} + +void THNN_(TemporalMaxPooling_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int kW, int dW) +{ + long niframe; + int noframe; + long framesize; + + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + long t, y; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize and zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + /* sizes */ + niframe = input->size[dimS]; + noframe = gradOutput->size[dimS]; + framesize = 
gradOutput->size[dimF]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THTensor_(data)(indices); + + if (input->nDimension == 2) + { + for(t = 0; t < noframe; t++) + { + real *gip = gradInput_data + t*framesize*dW; + real *gop = gradOutput_data + t*framesize; + real *xp = indices_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + long maxindex = (long)xp[y]; + gip[maxindex*framesize+y] += gop[y]; + } + } + } + else + { + /* number of batch frames */ + long nbframe = input->size[0]; + long i; + + for(i = 0; i < nbframe; i++) + { + real *gradInputSample_data = gradInput_data + i*niframe*framesize; + real *gradOutputSample_data = gradOutput_data + i*noframe*framesize; + real *indicesSample_data = indices_data + i*noframe*framesize; + + for(t = 0; t < noframe; t++) + { + real *gip = gradInputSample_data + t*framesize*dW; + real *gop = gradOutputSample_data + t*framesize; + real *xp = indicesSample_data + t*framesize; +#pragma omp parallel for private(y) + for(y = 0; y < framesize; y++) + { + /* compute local max: */ + long maxindex = (long)xp[y]; + gip[maxindex*framesize+y] += gop[y]; + } + } + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/generic/TemporalSubSampling.c b/generic/TemporalSubSampling.c new file mode 100644 index 00000000000..012deb7d655 --- /dev/null +++ b/generic/TemporalSubSampling.c @@ -0,0 +1,112 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalSubSampling.c" +#else + +void THNN_(TemporalSubSampling_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize + ) +{ + THTensor *outputFrame, *inputWindow; + int nInputFrame, nOutputFrame; + long k; + + THArgCheck( input->nDimension == 2, 2, "2D tensor expected"); + THArgCheck( input->size[1] == 
inputFrameSize, 2, "invalid input frame size"); + THArgCheck( input->size[0] >= kW, 2, "input sequence smaller than kernel size"); + + outputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + THTensor_(resize2d)(output, + nOutputFrame, + inputFrameSize); + + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(outputFrame, output, 0, k); + THTensor_(sum)(outputFrame, inputWindow, 0); + THTensor_(cmul)(outputFrame, outputFrame, weight); + THTensor_(cadd)(outputFrame, outputFrame, 1, bias); + } + + THTensor_(free)(outputFrame); + THTensor_(free)(inputWindow); +} + +void THNN_(TemporalSubSampling_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW + ) +{ + + THTensor *gradOutputFrame; + THTensor *gradInputWindow, *buffer, *kwunit; + long k; + + gradOutputFrame = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + kwunit = THTensor_(newWithSize1d)(kW); + + THTensor_(fill)(kwunit, 1); + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(cmul)(buffer, weight, gradOutputFrame); + THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(gradInputWindow); + THTensor_(free)(buffer); + THTensor_(free)(kwunit); +} + +void THNN_(TemporalSubSampling_accGradParameters)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + real scale) +{ + THTensor *gradOutputFrame; + THTensor *inputWindow, *buffer; + long k; + + + gradOutputFrame = THTensor_(new)(); + inputWindow = 
THTensor_(new)(); + buffer = THTensor_(new)(); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(sum)(buffer, inputWindow, 0); + THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(inputWindow); + THTensor_(free)(buffer); +} + +#endif diff --git a/init.c b/init.c index 4b34c61daac..eea66580a1d 100644 --- a/init.c +++ b/init.c @@ -85,6 +85,15 @@ #include "generic/Threshold.c" #include "THGenerateFloatTypes.h" +#include "generic/TemporalConvolution.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/TemporalMaxPooling.c" +#include "THGenerateFloatTypes.h" + #include "generic/unfold.c" #include "THGenerateFloatTypes.h" From d474fbd198958c403f485003705e3d5a13f62a46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 11 Feb 2016 21:32:41 +0100 Subject: [PATCH 041/101] Move SparseLinear.c -> ../lib/THNN/generic --- generic/SparseLinear.c | 314 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 generic/SparseLinear.c diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c new file mode 100644 index 00000000000..7a5b25244d5 --- /dev/null +++ b/generic/SparseLinear.c @@ -0,0 +1,314 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SparseLinear.c" +#else + +#ifdef _OPENMP +#include +#endif + +static int nn_(checkInput)(THTensor* t) { + return t->nDimension == 2 && t->size[1] == 2; +} + +static int nn_(checkSize2D)(THTensor* t, long size0, long size1) { + return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static int nn_(checkSize1D)(THTensor* t, long size0) { + return t->nDimension == 1 && t->size[0] == size0; +} + 
+static int nn_(SparseLinear_updateOutput)(lua_State *L) +{ + long i; + THTensor * input = luaT_checkudata(L, 2, torch_Tensor); + THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2"); + luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong"); + luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong"); + + lua_getfield(L, 1, "shardBuffer"); + if (!lua_isnil(L, -1)) { + THTensor *buffer = + luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor); + long num_shards = buffer->size[1]; + luaL_argcheck(L, + buffer->nDimension == 2 && buffer->size[0] == outDim && + num_shards > 0, + 1, + "shardBuffer size wrong"); + + THTensor_(zero)(buffer); + #pragma omp parallel for private(i) schedule(static) num_threads(num_shards) + for (i = 0; i < input->size[0]; i++) { +#ifdef _OPENMP + int shardId = omp_get_thread_num(); +#else + int shardId = 1; +#endif + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + THTensor_(get2d)(input, i, 1), + THTensor_(data)(weight) + offset * weight->stride[1], + weight->stride[0], + THTensor_(data)(buffer) + shardId * buffer->stride[1], + buffer->stride[0]); + } else { + luaL_error(L, "index out of bound. updateOutput: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + + THTensor_(sum)(output, buffer, 1); + THTensor_(cadd)(output, bias, 1.0, output); + + lua_getfield(L, 1, "output"); + return 1; + } + + THTensor_(copy)(output, bias); + for(i = 0; i < input->size[0]; i++) + { + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. 
*/ + { + real val = THTensor_(get2d)(input, i, 1); + THBlas_(axpy)(output->size[0], + val, + THTensor_(data)(weight)+offset*weight->stride[1], + weight->stride[0], + THTensor_(data)(output), + output->stride[0]); + } + else { + luaL_error(L, "index out of bound. updateOutput: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + + lua_getfield(L, 1, "output"); + return 1; +} + +static int nn_(SparseLinear_accGradParameters)(lua_State *L) +{ + long i; + THTensor * input = luaT_checkudata(L, 2, torch_Tensor); + THTensor * gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real scale = luaL_optnumber(L, 4, 1); + THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay"); + + long nnz = input->size[0]; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2"); + luaL_argcheck( + L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); + luaL_argcheck( + L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong"); + luaL_argcheck( + L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + + #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) + for(i = 0; i < nnz; i++) + { + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + + if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */ + { + real val = scale*THTensor_(get2d)(input, i, 1); + + THBlas_(axpy)(outDim, + val, + THTensor_(data)(gradOutput), + gradOutput->stride[0], + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], + gradWeight->stride[0]); + } + else { + luaL_error(L, "index out of bound. 
accGradParameters: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + + if(weightDecay != 0) { + #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) + for(i = 0; i < nnz; i++) { + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + THBlas_(axpy)(outDim, + weightDecay, + THTensor_(data)(weight) + offset*weight->stride[1], + weight->stride[0], + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], + gradWeight->stride[0]); + } + THTensor_(cadd)(gradBias, gradBias, weightDecay, bias); + } + + return 0; +} + +int nn_(SparseLinear_updateParameters)(lua_State *L) +{ + long i; + real learningRate = luaL_checknumber(L, 2); + THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THTensor * gradWeight = luaT_getfieldcheckudata( + L, 1, "gradWeight", torch_Tensor); + THTensor * lastInput = luaT_getfieldcheckudata( + L, 1, "lastInput", torch_Tensor); + + long nnz = lastInput->size[0]; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + luaL_argcheck( + L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong"); + luaL_argcheck( + L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong"); + luaL_argcheck( + L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + + THTensor_(cadd)(bias, bias, -learningRate, gradBias); + + #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) + for(i = 0; i < nnz; i++) + { + long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; + + if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. 
*/ + { + real* pGradWeight = + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; + THBlas_(axpy)(outDim, + -learningRate, + pGradWeight, + gradWeight->stride[0], + THTensor_(data)(weight)+offset*weight->stride[1], + weight->stride[0]); + } + else { + luaL_error(L, "index out of bound. updateParameters: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + return 0; +} + +int nn_(SparseLinear_zeroGradParameters)(lua_State *L) +{ + long i; + THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THTensor * gradWeight = luaT_getfieldcheckudata( + L, 1, "gradWeight", torch_Tensor); + THTensor * lastInput = luaT_getfieldcheckudata( + L, 1, "lastInput", torch_Tensor); + + long nnz = lastInput->size[0]; + long outDim = gradWeight->size[0]; + long inDim = gradWeight->size[1]; + + luaL_argcheck( + L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + + THTensor_(zero)(gradBias); + #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) + for(i = 0; i < nnz; i++) + { + long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; + + if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */ + { + real* pGradWeight = + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; + if(gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + long j; + for(j = 0; j < outDim; ++j) { + pGradWeight[j * gradWeight->stride[0]] = 0; + } + } + } + else { + luaL_error(L, "index out of bound. 
zeroGradParameters: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + return 0; +} + +static int nn_(SparseLinear_updateGradInput)(lua_State *L) { + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = + luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + + long i; + long nnz = input->size[0]; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + luaL_argcheck( + L, nn_(checkInput)(input), 2, "input must be an nnz x 2 tensor"); + luaL_argcheck( + L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); + + THTensor_(resize2d)(gradInput, input->size[0], input->size[1]); + + #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) + for (i = 0; i < nnz; ++i) { + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + THTensor_(set2d)(gradInput, i, 0, offset + 1); + + if (offset >= 0 && offset < inDim) { + real val = + THBlas_(dot)(outDim, + THTensor_(data)(gradOutput), + gradOutput->stride[0], + THTensor_(data)(weight) + offset * weight->stride[1], + weight->stride[0]); + THTensor_(set2d)(gradInput, i, 1, val); + } else { + luaL_error(L, "index out of bound. 
updateGradInput: \ +%ld not between 1 and %ld", offset + 1, inDim); + } + } + return 0; +} + +static const struct luaL_Reg nn_(SparseLinear__) [] = { + {"SparseLinear_updateOutput", nn_(SparseLinear_updateOutput)}, + {"SparseLinear_accGradParameters", nn_(SparseLinear_accGradParameters)}, + {"SparseLinear_updateParameters", nn_(SparseLinear_updateParameters)}, + {"SparseLinear_zeroGradParameters", nn_(SparseLinear_zeroGradParameters)}, + {"SparseLinear_updateGradInput", nn_(SparseLinear_updateGradInput)}, + {NULL, NULL} +}; + +void nn_(SparseLinear_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SparseLinear__), "nn"); + lua_pop(L,1); +} + +#endif From bf7920e6f2c0a7f6dd728254cbc5ef3461e87245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 11 Feb 2016 22:43:10 +0100 Subject: [PATCH 042/101] Add THNN conversion of SparseLinear --- generic/SparseLinear.c | 355 ++++++++++++++++++++--------------------- generic/THNN.h | 37 +++++ init.c | 3 + 3 files changed, 209 insertions(+), 186 deletions(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 7a5b25244d5..e7abecf75d4 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -6,309 +6,292 @@ #include #endif -static int nn_(checkInput)(THTensor* t) { +static bool THNN_(checkInput)(THTensor* t) +{ return t->nDimension == 2 && t->size[1] == 2; } -static int nn_(checkSize2D)(THTensor* t, long size0, long size1) { +static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1) +{ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; } -static int nn_(checkSize1D)(THTensor* t, long size0) { +static bool THNN_(checkSize1D)(THTensor* t, long size0) +{ return t->nDimension == 1 && t->size[0] == size0; } -static int nn_(SparseLinear_updateOutput)(lua_State *L) +void THNN_(SparseLinear_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *shardBuffer) { 
long i; - THTensor * input = luaT_checkudata(L, 2, torch_Tensor); - THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - long outDim = weight->size[0]; long inDim = weight->size[1]; - luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2"); - luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong"); - luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 2"); + THArgCheck(THNN_(checkSize1D)(output, outDim), 3, "output size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); - lua_getfield(L, 1, "shardBuffer"); - if (!lua_isnil(L, -1)) { - THTensor *buffer = - luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor); - long num_shards = buffer->size[1]; - luaL_argcheck(L, - buffer->nDimension == 2 && buffer->size[0] == outDim && - num_shards > 0, - 1, - "shardBuffer size wrong"); + if (shardBuffer != NULL) + { + long num_shards = shardBuffer->size[1]; + THArgCheck( + shardBuffer->nDimension == 2 && shardBuffer->size[0] == outDim && num_shards > 0, + 6, + "shardBuffer size wrong" + ); - THTensor_(zero)(buffer); + THTensor_(zero)(shardBuffer); #pragma omp parallel for private(i) schedule(static) num_threads(num_shards) - for (i = 0; i < input->size[0]; i++) { + for (i = 0; i < input->size[0]; i++) + { #ifdef _OPENMP int shardId = omp_get_thread_num(); #else int shardId = 1; #endif - long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) { - THBlas_(axpy)(outDim, - THTensor_(get2d)(input, i, 1), - THTensor_(data)(weight) + offset * weight->stride[1], - weight->stride[0], - THTensor_(data)(buffer) + shardId * buffer->stride[1], - buffer->stride[0]); - } else { - luaL_error(L, "index out of bound. 
updateOutput: \ + long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + if (offset >= 0 && offset < inDim) + { + THBlas_(axpy)( + outDim, + THTensor_(get2d)(input, i, 1), + THTensor_(data)(weight) + offset * weight->stride[1], + weight->stride[0], + THTensor_(data)(shardBuffer) + shardId * shardBuffer->stride[1], + shardBuffer->stride[0] + ); + } + else + { + THError("index out of bound. updateOutput: \ %ld not between 1 and %ld", offset + 1, inDim); } } - THTensor_(sum)(output, buffer, 1); + THTensor_(sum)(output, shardBuffer, 1); THTensor_(cadd)(output, bias, 1.0, output); - lua_getfield(L, 1, "output"); - return 1; + return; } THTensor_(copy)(output, bias); - for(i = 0; i < input->size[0]; i++) + for (i = 0; i < input->size[0]; i++) { long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */ + if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. { - real val = THTensor_(get2d)(input, i, 1); - THBlas_(axpy)(output->size[0], - val, - THTensor_(data)(weight)+offset*weight->stride[1], - weight->stride[0], - THTensor_(data)(output), - output->stride[0]); + real val = THTensor_(get2d)(input, i, 1); + THBlas_(axpy)( + output->size[0], + val, + THTensor_(data)(weight)+offset*weight->stride[1], + weight->stride[0], + THTensor_(data)(output), + output->stride[0] + ); } - else { - luaL_error(L, "index out of bound. updateOutput: \ + else + { + THError("index out of bound. 
updateOutput: \ %ld not between 1 and %ld", offset + 1, inDim); } } - - lua_getfield(L, 1, "output"); - return 1; } -static int nn_(SparseLinear_accGradParameters)(lua_State *L) +void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale) { long i; - THTensor * input = luaT_checkudata(L, 2, torch_Tensor); - THTensor * gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay"); - long nnz = input->size[0]; long outDim = weight->size[0]; long inDim = weight->size[1]; - luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2"); - luaL_argcheck( - L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); - luaL_argcheck( - L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong"); - luaL_argcheck( - L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 2"); + THArgCheck(THNN_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for(i = 0; i < nnz; i++) + for (i = 0; i < nnz; i++) { long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. 
*/ + if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. { - real val = scale*THTensor_(get2d)(input, i, 1); + real val = scale*THTensor_(get2d)(input, i, 1); - THBlas_(axpy)(outDim, - val, - THTensor_(data)(gradOutput), - gradOutput->stride[0], - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], - gradWeight->stride[0]); + THBlas_(axpy)( + outDim, + val, + THTensor_(data)(gradOutput), + gradOutput->stride[0], + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], + gradWeight->stride[0] + ); } - else { - luaL_error(L, "index out of bound. accGradParameters: \ + else + { + THError("index out of bound. accGradParameters: \ %ld not between 1 and %ld", offset + 1, inDim); } } THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); - if(weightDecay != 0) { + if (weightDecay != 0) + { #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for(i = 0; i < nnz; i++) { + for (i = 0; i < nnz; i++) + { long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - THBlas_(axpy)(outDim, - weightDecay, - THTensor_(data)(weight) + offset*weight->stride[1], - weight->stride[0], - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], - gradWeight->stride[0]); + THBlas_(axpy)( + outDim, + weightDecay, + THTensor_(data)(weight) + offset*weight->stride[1], + weight->stride[0], + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], + gradWeight->stride[0] + ); } THTensor_(cadd)(gradBias, gradBias, weightDecay, bias); } - - return 0; } -int nn_(SparseLinear_updateParameters)(lua_State *L) +void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate) { long i; - real learningRate = luaL_checknumber(L, 2); - THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor * gradBias = 
luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - THTensor * gradWeight = luaT_getfieldcheckudata( - L, 1, "gradWeight", torch_Tensor); - THTensor * lastInput = luaT_getfieldcheckudata( - L, 1, "lastInput", torch_Tensor); - long nnz = lastInput->size[0]; long outDim = weight->size[0]; long inDim = weight->size[1]; - luaL_argcheck( - L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong"); - luaL_argcheck( - L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong"); - luaL_argcheck( - L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THTensor_(cadd)(bias, bias, -learningRate, gradBias); #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) - for(i = 0; i < nnz; i++) + for (i = 0; i < nnz; i++) { - long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; + long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; - if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */ - { - real* pGradWeight = - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; - THBlas_(axpy)(outDim, - -learningRate, - pGradWeight, - gradWeight->stride[0], - THTensor_(data)(weight)+offset*weight->stride[1], - weight->stride[0]); - } - else { - luaL_error(L, "index out of bound. updateParameters: \ + if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. + { + real* pGradWeight = + THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; + THBlas_(axpy)( + outDim, + -learningRate, + pGradWeight, + gradWeight->stride[0], + THTensor_(data)(weight)+offset*weight->stride[1], + weight->stride[0] + ); + } + else + { + THError("index out of bound. 
updateParameters: \ %ld not between 1 and %ld", offset + 1, inDim); - } + } } - return 0; } -int nn_(SparseLinear_zeroGradParameters)(lua_State *L) +void THNN_(SparseLinear_zeroGradParameters)(THNNState *state, THTensor *gradWeight, THTensor *gradBias, THTensor *lastInput) { long i; - THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - THTensor * gradWeight = luaT_getfieldcheckudata( - L, 1, "gradWeight", torch_Tensor); - THTensor * lastInput = luaT_getfieldcheckudata( - L, 1, "lastInput", torch_Tensor); - long nnz = lastInput->size[0]; long outDim = gradWeight->size[0]; long inDim = gradWeight->size[1]; - luaL_argcheck( - L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THTensor_(zero)(gradBias); #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) - for(i = 0; i < nnz; i++) + for (i = 0; i < nnz; i++) { - long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; + long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; - if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */ + if(offset >= 0 && offset < inDim) // make sure indices are in bounds.. + { + real* pGradWeight = THTensor_(data)(gradWeight) + offset * gradWeight->stride[1]; + if (gradWeight->stride[0] == 1) { - real* pGradWeight = - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; - if(gradWeight->stride[0] == 1) { - THVector_(fill)(pGradWeight, 0, outDim); - } else { - long j; - for(j = 0; j < outDim; ++j) { - pGradWeight[j * gradWeight->stride[0]] = 0; - } - } + THVector_(fill)(pGradWeight, 0, outDim); } - else { - luaL_error(L, "index out of bound. zeroGradParameters: \ + else + { + long j; + for (j = 0; j < outDim; ++j) + { + pGradWeight[j * gradWeight->stride[0]] = 0; + } + } + } + else + { + THError("index out of bound. 
zeroGradParameters: \ %ld not between 1 and %ld", offset + 1, inDim); - } + } } - return 0; } -static int nn_(SparseLinear_updateGradInput)(lua_State *L) { - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = - luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - +void THNN_(SparseLinear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ long i; long nnz = input->size[0]; long outDim = weight->size[0]; long inDim = weight->size[1]; - luaL_argcheck( - L, nn_(checkInput)(input), 2, "input must be an nnz x 2 tensor"); - luaL_argcheck( - L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, "input must be an nnz x 2 tensor"); + THArgCheck(THNN_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); THTensor_(resize2d)(gradInput, input->size[0], input->size[1]); #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for (i = 0; i < nnz; ++i) { + for (i = 0; i < nnz; ++i) + { long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; THTensor_(set2d)(gradInput, i, 0, offset + 1); - if (offset >= 0 && offset < inDim) { - real val = - THBlas_(dot)(outDim, - THTensor_(data)(gradOutput), - gradOutput->stride[0], - THTensor_(data)(weight) + offset * weight->stride[1], - weight->stride[0]); + if (offset >= 0 && offset < inDim) + { + real val = + THBlas_(dot)( + outDim, + THTensor_(data)(gradOutput), + gradOutput->stride[0], + THTensor_(data)(weight) + offset * weight->stride[1], + weight->stride[0] + ); THTensor_(set2d)(gradInput, i, 1, val); - } else { - luaL_error(L, "index out of bound. updateGradInput: \ + } + else + { + THError("index out of bound. 
updateGradInput: \ %ld not between 1 and %ld", offset + 1, inDim); } } - return 0; -} - -static const struct luaL_Reg nn_(SparseLinear__) [] = { - {"SparseLinear_updateOutput", nn_(SparseLinear_updateOutput)}, - {"SparseLinear_accGradParameters", nn_(SparseLinear_accGradParameters)}, - {"SparseLinear_updateParameters", nn_(SparseLinear_updateParameters)}, - {"SparseLinear_zeroGradParameters", nn_(SparseLinear_zeroGradParameters)}, - {"SparseLinear_updateGradInput", nn_(SparseLinear_updateGradInput)}, - {NULL, NULL} -}; - -void nn_(SparseLinear_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SparseLinear__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index c937c0f8f12..fca4c5c6cc2 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -316,6 +316,43 @@ TH_API void THNN_(SoftShrink_updateGradInput)( THTensor *gradInput, real lambda); +TH_API void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *shardBuffer); +TH_API void THNN_(SparseLinear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale); +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate); + TH_API void THNN_(Sqrt_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index eea66580a1d..e49ab8d2b13 100644 --- a/init.c +++ b/init.c @@ -73,6 
+73,9 @@ #include "generic/SoftShrink.c" #include "THGenerateFloatTypes.h" +#include "generic/SparseLinear.c" +#include "THGenerateFloatTypes.h" + #include "generic/Sqrt.c" #include "THGenerateFloatTypes.h" From 5a7b38a2a84ee357550d77d3d5894976c6026ad1 Mon Sep 17 00:00:00 2001 From: Frederic Besse Date: Wed, 17 Feb 2016 11:28:26 +0000 Subject: [PATCH 043/101] Replacing implementation of VolumetricFullConvolution. Now works like SpatialFullConvolution. --- generic/THNN.h | 32 --- generic/VolumetricFullConvolution.c | 299 ---------------------------- init.c | 3 - 3 files changed, 334 deletions(-) delete mode 100644 generic/VolumetricFullConvolution.c diff --git a/generic/THNN.h b/generic/THNN.h index fca4c5c6cc2..a7d36bd42f9 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -565,38 +565,6 @@ TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( THTensor *finput, real scale); -TH_API void THNN_(VolumetricFullConvolution_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *finput, - THTensor *fgradInput, - int dT, int dW, int dH, - int pT, int pW, int pH); -TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *finput, - THTensor *fgradInput, - int dT, int dW, int dH, - int pT, int pW, int pH); -TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *finput, - THTensor *fgradInput, - int dT, int dW, int dH, - int pT, int pW, int pH, - real scale); - TH_API void THNN_(VolumetricMaxPooling_updateOutput)( THNNState *state, THTensor *input, diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c deleted file mode 100644 index 73e81a140ec..00000000000 --- a/generic/VolumetricFullConvolution.c +++ /dev/null @@ 
-1,299 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" -#else - -void THNN_(VolumetricFullConvolution_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH) -{ - // number of input & output planes and kernel size is indirectly defined by the weight tensor - THArgCheck(weight->nDimension == 5, 4, - "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kW = (int)weight->size[3]; - int kH = (int)weight->size[4]; - - THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); - THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - THArgCheck(input->size[1] == nInputPlane, 2, "input tensor has wrong number of planes"); - - // input tensor dimensions - long batchSize = input->size[0]; - int inputDepth = (int)input->size[2]; - int inputHeight = (int)input->size[3]; - int inputWidth = (int)input->size[4]; - - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; - int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; - - // Resize output - THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); - - // Helpers - THTensor *input_n = THTensor_(new)(); - THTensor *output_n = THTensor_(new)(); - - const real* weight_ptr = THTensor_(data)(weight); - const real* bias_ptr = THTensor_(data)(bias); - - int n; - for (n = 0; n < batchSize; ++n) - { - THTensor_(select)(input_n, input, 0, n); - THTensor_(select)(output_n, output, 0, n); - - THTensor *outn = THTensor_(new)(); - // add bias first - int i; - for (i = 0; i < bias->size[0]; i++) - { - 
THTensor_(select)(outn,output_n,0,i); - THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); - } - THTensor_(free)(outn); - - int t, h, w, kc_, kt_, kh_, kw_, c; - - const real *input_ptr = THTensor_(data)(input_n); - real *output_ptr = THTensor_(data)(output_n); - for (t = 0; t < inputDepth; t++) - { - for (h = 0; h < inputHeight; h++) - for (w = 0; w < inputWidth; w++) - for (kc_ = 0; kc_ < nOutputPlane; kc_++) - for (kt_ = 0; kt_ < kT; kt_++) - for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) - { - int pt = t * dT - pT + kt_; - int ph = h * dH - pH + kh_; - int pw = w * dW - pW + kw_; - if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) - { - real val = 0; - for (c = 0; c < nInputPlane; c++) - { - val += input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] - * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; - } - output_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] - += val; - } - } - } - } - THTensor_(free)(input_n); - THTensor_(free)(output_n); -} - -void THNN_(VolumetricFullConvolution_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH -) -{ - // number of input/output planes and kernel size is indirectly defined by the weight tensor - THArgCheck(weight->nDimension == 5, 4, - "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kW = (int)weight->size[3]; - int kH = (int)weight->size[4]; - - THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); - THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - THArgCheck(input->size[1] == 
nInputPlane, 2, "input tensor has wrong number of planes"); - - int inputDepth = (int)input->size[2]; - int inputHeight = (int)input->size[3]; - int inputWidth = (int)input->size[4]; - - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; - int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; - - // Batch size - long batchSize = input->size[0]; - - // Resize output - THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); - - // Helpers - THTensor *gradInput_n = THTensor_(new)(); - THTensor *gradOutput_n = THTensor_(new)(); - - const real* weight_ptr = THTensor_(data)(weight); - - // For each n in batch, do: - int n; - for (n = 0; n < batchSize; n++) - { - THTensor_(select)(gradInput_n, gradInput, 0, n); - THTensor_(select)(gradOutput_n, gradOutput, 0, n); - THTensor_(fill)(gradInput_n, 0); - - int t, h, w, kc_, kt_, kh_, kw_, c; - - real *gradInput_ptr = THTensor_(data)(gradInput_n); - const real *gradOutput_ptr = THTensor_(data)(gradOutput_n); - for (t = 0; t < inputDepth; t++) - for (h = 0; h < inputHeight; h++) - for (w = 0; w < inputWidth; w++) - for (kc_ = 0; kc_ < nOutputPlane; kc_++) - for (kt_ = 0; kt_ < kT; kt_++) - for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) - { - int pt = t * dT - pT + kt_; - int ph = h * dH - pH + kh_; - int pw = w * dW - pW + kw_; - if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) - { - for (c = 0; c < nInputPlane; c++) - { - gradInput_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] += - gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw] - * weight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_]; - } - } - } - } - - // Free - THTensor_(free)(gradInput_n); - THTensor_(free)(gradOutput_n); -} - -void THNN_(VolumetricFullConvolution_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor 
*gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH, - real scale) -{ - // number of input/output planes and kernel size is indirectly defined by the gradWeight tensor - THArgCheck(gradWeight->nDimension == 5, 4, - "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)gradWeight->size[0]; - int nInputPlane = (int)gradWeight->size[1]; - int kT = (int)gradWeight->size[2]; - int kW = (int)gradWeight->size[3]; - int kH = (int)gradWeight->size[4]; - - THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5, - "gradBias tensor has wrong size" - ); - - THArgCheck(input->nDimension == 5, 2, "5D (batch mode) tensor is expected"); - THArgCheck(kH == kW && pH == pW, 2, "kH == kW && pH == pW is expected"); - THArgCheck(input->size[1] == nInputPlane, 2, "input tensor has wrong number of planes"); - - THTensor_(resize1d)(gradBias, nOutputPlane); - THTensor_(resize5d)(gradWeight, nOutputPlane, nInputPlane, kT, kH, kW); - - int inputDepth = input->size[2]; - int inputHeight = input->size[3]; - int inputWidth = input->size[4]; - - int outputDepth = (inputDepth - 1) * dT - 2 * pT + kT; - int outputHeight = (inputHeight - 1) * dH - 2 * pH + kH; - int outputWidth = (inputWidth - 1) * dW - 2 * pW + kW; - - // Batch size - long batchSize = input->size[0]; - - // Helpers - THTensor *input_n = THTensor_(new)(); - THTensor *gradOutput_n = THTensor_(new)(); - - // reset gradBias = 0 - THTensor_(fill)(gradBias, 0); - // reset gradWeight = 0 - THTensor_(fill)(gradWeight, 0); - - real *gradWeight_ptr = THTensor_(data)(gradWeight); - real *gradBias_ptr = THTensor_(data)(gradBias); - - // For each n in batch, do: - int n; - for (n = 0; n < batchSize; n++) - { - THTensor_(select)(input_n, input, 0, n); - THTensor_(select)(gradOutput_n, gradOutput, 0, n); - - THTensor 
*goutn = THTensor_(new)(); - - // accumulate bias gradient first - int i; - for (i = 0; i < gradBias->size[0]; i++) - { - THTensor_(select)(goutn, gradOutput_n, 0, i); - gradBias_ptr[i] += scale * THTensor_(sumall)(goutn); - } - THTensor_(free)(goutn); - - int t, h, w, kc_, kt_, kh_, kw_, c; - - const real *input_ptr = THTensor_(data)(input_n); - const real *gradOutput_ptr = THTensor_(data)(gradOutput_n); - for (t = 0; t < inputDepth; t++) - for (h = 0; h < inputHeight; h++) - for (w = 0; w < inputWidth; w++) - for (kc_ = 0; kc_ < nOutputPlane; kc_++) - for (kt_ = 0; kt_ < kT; kt_++) - for (kh_ = 0; kh_ < kH; kh_++) - for (kw_ = 0; kw_ < kW; kw_++) - { - int pt = t * dT - pT + kt_; - int ph = h * dH - pH + kh_; - int pw = w * dW - pW + kw_; - if (pt >=0 && ph >=0 && pw >= 0 && - pt < outputDepth && ph < outputHeight && pw < outputWidth) - { - for (c = 0; c < nInputPlane; c++) - { - gradWeight_ptr[(((kc_ * nInputPlane + c) * kT + kt_) * kH + kh_) * kW + kw_] += - scale * - input_ptr[((c * inputDepth + t) * inputHeight + h) * inputWidth + w] * - gradOutput_ptr[((kc_ * outputDepth + pt) * outputHeight + ph) * outputWidth + pw]; - } - } - } - } - - // Free - THTensor_(free)(input_n); - THTensor_(free)(gradOutput_n); -} - -#endif diff --git a/init.c b/init.c index e49ab8d2b13..c14f913f50a 100644 --- a/init.c +++ b/init.c @@ -121,9 +121,6 @@ #include "generic/VolumetricConvolutionMM.c" #include "THGenerateFloatTypes.h" -#include "generic/VolumetricFullConvolution.c" -#include "THGenerateFloatTypes.h" - #include "generic/VolumetricMaxPooling.c" #include "THGenerateFloatTypes.h" From d7974f89419f985ca8a3b9cba3a67b901acb697a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Wed, 17 Feb 2016 20:44:11 +0100 Subject: [PATCH 044/101] Move generic/VolumetricFullConvolution.c -> lib/THNN --- generic/VolumetricFullConvolution.c | 444 ++++++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 generic/VolumetricFullConvolution.c diff --git 
a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c new file mode 100644 index 00000000000..ba1341dc0ae --- /dev/null +++ b/generic/VolumetricFullConvolution.c @@ -0,0 +1,444 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" +#else + + +static void nn_(vol2col)(const real* data_vol, const int channels, + const int depth, const int height, const int width, const int kernel_t, const int kernel_h, const int kernel_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, + real* data_col) { + int c, t, h, w; + int depth_col = (depth + 2 * pad_t - kernel_t) / stride_t + 1; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_t * kernel_h * kernel_w; + for (c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int t_offset = (c / kernel_w / kernel_h) % kernel_t; + int c_vol = c / kernel_t / kernel_h / kernel_w; + for (t = 0; t < depth_col; ++t) { + for (h = 0; h < height_col; ++h) { + for (w = 0; w < width_col; ++w) { + int t_pad = t * stride_t - pad_t + t_offset; + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0; + } + } + } + } +} + +static void nn_(col2vol)(const real* data_col, const int channels, + const int depth, const int height, const int width, const int patch_t, const int patch_h, const int patch_w, + const int pad_t, const int pad_h, const int pad_w, + const int stride_t, const int stride_h, const int stride_w, 
+ real* data_vol) { + int c, t, h, w; + memset(data_vol, 0, sizeof(real) * depth * height * width * channels); + int depth_col = (depth + 2 * pad_t - patch_t) / stride_t + 1; + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_t * patch_h * patch_w; + for (c = 0; c < channels_col; ++c) { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int t_offset = (c / patch_w / patch_h) % patch_t; + int c_vol = c / patch_t / patch_h / patch_w; + for (t = 0; t < depth_col; ++t) { + for (h = 0; h < height_col; ++h) { + for (w = 0; w < width_col; ++w) { + int t_pad = t * stride_t - pad_t + t_offset; + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col[((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { + // Input + THTensor *input = (THTensor*)luaT_checkudata(L, 2, torch_Tensor); + + // Params: + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + int adjT = luaT_getfieldcheckint(L, 1, "adjT"); + + THTensor *weight = 
(THTensor*)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = (THTensor*)luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + THTensor *output = (THTensor*)luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } else { + luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + long n = columns->size[1]; + long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + nn_(col2vol)( + THTensor_(data)(columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + // return output + return 1; +} + +static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { + // Inputs + THTensor *input = 
(THTensor *)luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); + + // Params + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dT = luaT_getfieldcheckint(L, 1, "dT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + int adjT = luaT_getfieldcheckint(L, 1, "adjT"); + + THTensor *weight = (THTensor *)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradColumns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *gradInput = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, 
inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + nn_(vol2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THTensor_(data)(gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = gradColumns->size[1]; + long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + // Return gradInput + return 1; +} + + +static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { + // Inputs + THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); + + // Params + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int dT = luaT_getfieldcheckint(L, 
1, "dT"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int kT = luaT_getfieldcheckint(L, 1, "kT"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int padT = luaT_getfieldcheckint(L, 1, "padT"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + int adjT = luaT_getfieldcheckint(L, 1, "adjT"); + float scale = luaL_optnumber(L, 4, 1); + + THTensor *gradWeight = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + nn_(vol2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long n = columns->size[0]; // nOutputPlane * kt * kh * kw + long m = input_n->size[0]; // nInputPlane + long k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + // Return 
nothing + return 0; +} + +static const struct luaL_Reg nn_(VolumetricFullConvolution__) [] = { + {"VolumetricFullConvolution_updateOutput", nn_(VolumetricFullConvolution_updateOutput)}, + {"VolumetricFullConvolution_updateGradInput", nn_(VolumetricFullConvolution_updateGradInput)}, + {"VolumetricFullConvolution_accGradParameters", nn_(VolumetricFullConvolution_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(VolumetricFullConvolution_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(VolumetricFullConvolution__), "nn"); + lua_pop(L,1); +} + +#endif From 959b6343b8c8baf35c88c40b461d4b246c37c2db Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 31 Jan 2016 14:00:22 +0100 Subject: [PATCH 045/101] Move Spatial* to lib/THNN/generic --- generic/SpatialBatchNormalization.c | 162 ++++++++++ generic/SpatialConvolutionLocal.c | 280 +++++++++++++++++ generic/SpatialFractionalMaxPooling.c | 270 +++++++++++++++++ generic/SpatialFullConvolution.c | 417 ++++++++++++++++++++++++++ generic/SpatialMaxUnpooling.c | 240 +++++++++++++++ generic/SpatialSubSampling.c | 291 ++++++++++++++++++ generic/SpatialUpSamplingNearest.c | 159 ++++++++++ 7 files changed, 1819 insertions(+) create mode 100644 generic/SpatialBatchNormalization.c create mode 100644 generic/SpatialConvolutionLocal.c create mode 100644 generic/SpatialFractionalMaxPooling.c create mode 100644 generic/SpatialFullConvolution.c create mode 100644 generic/SpatialMaxUnpooling.c create mode 100644 generic/SpatialSubSampling.c create mode 100644 generic/SpatialUpSamplingNearest.c diff --git a/generic/SpatialBatchNormalization.c b/generic/SpatialBatchNormalization.c new file mode 100644 index 00000000000..25171c67c74 --- /dev/null +++ b/generic/SpatialBatchNormalization.c @@ -0,0 +1,162 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialBatchNormalization.c" +#else + +static int nn_(SpatialBatchNormalization_updateOutput)(lua_State *L) +{ + THTensor 
*input = luaT_checkudata(L, 1, torch_Tensor); + THTensor *output = luaT_checkudata(L, 2, torch_Tensor); + THTensor *weight = luaT_toudata(L, 3, torch_Tensor); + THTensor *bias = luaT_toudata(L, 4, torch_Tensor); + int train = lua_toboolean(L, 5); + double eps = lua_tonumber(L, 6); + double momentum = lua_tonumber(L, 7); + THTensor *running_mean = luaT_checkudata(L, 8, torch_Tensor); + THTensor *running_var = luaT_checkudata(L, 9, torch_Tensor); + THTensor *save_mean = luaT_toudata(L, 10, torch_Tensor); + THTensor *save_std = luaT_toudata(L, 11, torch_Tensor); + + long nBatch = THTensor_(size)(input, 0); + long nFeature = THTensor_(size)(input, 1); + long iH = THTensor_(size)(input, 2); + long iW = THTensor_(size)(input, 3); + long n = nBatch * iH * iW; + + #pragma parallel for + for (long f = 0; f < nFeature; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *out = THTensor_(newSelect)(output, 1, f); + + real mean, invstd; + + if (train) { + // compute mean per feature plane + accreal sum = 0; + TH_TENSOR_APPLY(real, in, sum += *in_data;); + + mean = (real) sum / n; + THTensor_(set1d)(save_mean, f, (real) mean); + + // compute variance per feature plane + sum = 0; + TH_TENSOR_APPLY(real, in, + sum += (*in_data - mean) * (*in_data - mean);); + + if (sum == 0 && eps == 0.0) { + invstd = 0; + } else { + invstd = (real) (1 / sqrt(sum/n + eps)); + } + THTensor_(set1d)(save_std, f, (real) invstd); + + // update running averages + THTensor_(set1d)(running_mean, f, + (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f))); + + accreal unbiased_var = sum / (n - 1); + THTensor_(set1d)(running_var, f, + (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f))); + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // compute output + real w = weight ? THTensor_(get1d)(weight, f) : 1; + real b = bias ? 
THTensor_(get1d)(bias, f) : 0; + + TH_TENSOR_APPLY2(real, in, real, out, + *out_data = (real) (((*in_data - mean) * invstd) * w + b);); + + THTensor_(free)(out); + THTensor_(free)(in); + } + + return 0; +} + +static int nn_(SpatialBatchNormalization_backward)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 1, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradInput = luaT_toudata(L, 3, torch_Tensor); + THTensor *gradWeight = luaT_toudata(L, 4, torch_Tensor); + THTensor *gradBias = luaT_toudata(L, 5, torch_Tensor); + THTensor *weight = luaT_toudata(L, 6, torch_Tensor); + THTensor *save_mean = luaT_toudata(L, 7, torch_Tensor); + THTensor *save_std = luaT_toudata(L, 8, torch_Tensor); + double scale = lua_tonumber(L, 9); + + long nBatch = THTensor_(size)(input, 0); + long nFeature = THTensor_(size)(input, 1); + long iH = THTensor_(size)(input, 2); + long iW = THTensor_(size)(input, 3); + long n = nBatch * iH * iW; + + // Q(X) = X - E[x] ; i.e. input centered to zero mean + // Y = Q(X) / σ ; i.e. BN output before weight and bias + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ + + #pragma parallel for + for (long f = 0; f < nFeature; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); + real mean = THTensor_(get1d)(save_mean, f); + real invstd = THTensor_(get1d)(save_std, f); + real w = weight ? 
THTensor_(get1d)(weight, f) : 1; + + // sum over all gradOutput in feature plane + accreal sum = 0; + TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;); + + // dot product of the Q(X) and gradOuput + accreal dotp = 0; + TH_TENSOR_APPLY2(real, in, real, gradOut, + dotp += (*in_data - mean) * (*gradOut_data);); + + if (gradInput) { + THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f); + + // projection of gradOutput on to output scaled by std + real k = (real) dotp * invstd * invstd / n; + TH_TENSOR_APPLY2(real, gradIn, real, in, + *gradIn_data = (*in_data - mean) * k;); + + accreal gradMean = sum / n; + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); + + THTensor_(free)(gradIn); + } + + if (gradWeight) { + real val = THTensor_(get1d)(gradWeight, f); + THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd); + } + + if (gradBias) { + real val = THTensor_(get1d)(gradBias, f); + THTensor_(set1d)(gradBias, f, val + scale * sum); + } + + THTensor_(free)(gradOut); + THTensor_(free)(in); + } + + return 0; +} + +static const struct luaL_Reg nn_(SpatialBatchNormalization__) [] = { + {"SpatialBatchNormalization_updateOutput", nn_(SpatialBatchNormalization_updateOutput)}, + {"SpatialBatchNormalization_backward", nn_(SpatialBatchNormalization_backward)}, + {NULL, NULL} +}; + +static void nn_(SpatialBatchNormalization_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialBatchNormalization__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialConvolutionLocal.c b/generic/SpatialConvolutionLocal.c new file mode 100644 index 00000000000..6377ecda6a0 --- /dev/null +++ b/generic/SpatialConvolutionLocal.c @@ -0,0 +1,280 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c" +#else + +#ifdef _WIN32 +# include +#endif + +#include "unfold.h" + + +static void 
nn_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + long i; + THTensor *output3d, *finput3d; + + nn_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); + + THTensor_(copy)(output, bias); + + output3d = THTensor_(newWithStorage3d)(output->storage, output->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + + finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THTensor_(free)(output3d); + THTensor_(free)(finput3d); +} + +static int nn_(SpatialConvolutionLocal_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); + long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); + long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); + long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = 
luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + if(input->nDimension == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + nn_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + return 1; +} + + +static void nn_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + THTensor *gradOutput3d, *fgradInput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, 
nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + THTensor_(free)(gradOutput3d); + THTensor_(free)(fgradInput3d); + + THTensor_(zero)(gradInput); + + nn_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); +} + +static int nn_(SpatialConvolutionLocal_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); + long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); + long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); + long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + 
THTensor_(transpose)(weight, weight, 1, 2); + + if(input->nDimension == 3) + { + nn_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + nn_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(transpose)(weight, weight, 1, 2); + + return 1; +} + +static void nn_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + + THTensor *gradOutput3d, *finput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x 
kW*kH*nInputPlane + + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + + THTensor_(free)(gradOutput3d); + THTensor_(free)(finput3d); +} + +static int nn_(SpatialConvolutionLocal_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + real scale = luaL_optnumber(L, 4, 1); + long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); + long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); + long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); + long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + if(input->nDimension == 3) + { + nn_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + nn_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradOutput_t); + 
THTensor_(free)(finput_t); + } + } + + return 0; +} + +static const struct luaL_Reg nn_(SpatialConvolutionLocal__) [] = { + {"SpatialConvolutionLocal_updateOutput", nn_(SpatialConvolutionLocal_updateOutput)}, + {"SpatialConvolutionLocal_updateGradInput", nn_(SpatialConvolutionLocal_updateGradInput)}, + {"SpatialConvolutionLocal_accGradParameters", nn_(SpatialConvolutionLocal_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialConvolutionLocal_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialConvolutionLocal__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialFractionalMaxPooling.c b/generic/SpatialFractionalMaxPooling.c new file mode 100644 index 00000000000..f90f92a3d6e --- /dev/null +++ b/generic/SpatialFractionalMaxPooling.c @@ -0,0 +1,270 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c" +#else + +static long* nn_(SpatialFractionalMaxPooling_generateIntervals)( + real sample, + long inputSize, + long outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + long* sequence = (long*) THAlloc(sizeof(long) * outputSize); + + long i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (long) ((i + sample) * alpha) - (long) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void nn_(SpatialFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + real* indices, + real* randomSamples, + long numPlanes, + long inputW, long inputH, + long outputW, long outputH, + int poolSizeW, int poolSizeH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 2 random samples, one for W and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 2; + + /* Generate interval sequence */ + long* sequenceW = + 
nn_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputW, outputW, poolSizeW); + long* sequenceH = + nn_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputH, outputH, poolSizeH); + + /* loop over output */ + long h, w; + + real* inputForPlane = input + plane * inputW * inputH; + real* outputForPlane = output + plane * outputW * outputH; + real* indicesForPlane = indices + plane * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + long inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + long inputWStart = sequenceW[w]; + + real maxVal = -THInf; + long maxIndex = -1; + + long h2, w2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + + long planeIndex = h2 * inputW + w2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW + w] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW + w] = (real) maxIndex + 1; + } + } + + THFree(sequenceW); + THFree(sequenceH); + } +} + +static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { + THTensor* output = luaT_checkudata(L, 1, torch_Tensor); + THTensor* input = luaT_checkudata(L, 2, torch_Tensor); + int outputW = luaL_checknumber(L, 3); + int outputH = luaL_checknumber(L, 4); + int poolSizeW = luaL_checknumber(L, 5); + int poolSizeH = luaL_checknumber(L, 6); + THTensor* indices = luaT_checkudata(L, 7, torch_Tensor); + THTensor* randomSamples = luaT_checkudata(L, 8, torch_Tensor); + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + long numInputDims = THTensor_(nDimension)(input); + luaL_argcheck(L, numInputDims == 3 || numInputDims == 4, 2, + "3D or 4D (batch mode) tensor expected"); + + if 
(numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + + luaL_argcheck(L, outputH + poolSizeH - 1 < inputH, 6, + "poolSizeH too large relative to input height"); + luaL_argcheck(L, outputW + poolSizeW - 1 < inputW, 5, + "poolSizeW too large relative to input width"); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 3) { + /* resize output */ + THTensor_(resize3d)(output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THTensor_(resize3d)(indices, numPlanes, outputH, outputW); + + nn_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } else { + THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW); + + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + nn_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW, + THTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + THTensor_(data)(randomSamples) + batch * numPlanes * 2, + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); + + return 0; +} + +static void nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + real* indices, + long numPlanes, + long inputW, long inputH, + long 
outputW, long outputH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputW * outputH; + real* indicesForPlane = indices + plane * outputW * outputH; + + long h, w; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + long outputIndex = h * outputW + w; + long index = indicesForPlane[outputIndex] - 1; + THAssert(index >= 0 && index < inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } +} + +static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { + THTensor* gradInput = luaT_checkudata(L, 1, torch_Tensor); + THTensor* input = luaT_checkudata(L, 2, torch_Tensor); + THTensor* gradOutput = luaT_checkudata(L, 3, torch_Tensor); + long outputW = luaL_checknumber(L, 4); + long outputH = luaL_checknumber(L, 5); + int poolSizeW = luaL_checknumber(L, 6); + int poolSizeH = luaL_checknumber(L, 7); + THTensor* indices = luaT_checkudata(L, 8, torch_Tensor); + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + long numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + + luaL_argcheck(L, outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + luaL_argcheck(L, outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 3) { + 
nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THTensor_(data)(indices), + numPlanes, inputW, inputH, outputW, outputH); + } else { + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW, + THTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + numPlanes, inputW, inputH, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + + return 0; +} + +static const struct luaL_Reg nn_(SpatialFractionalMaxPooling__) [] = { + {"SpatialFractionalMaxPooling_updateOutput", nn_(SpatialFractionalMaxPooling_updateOutput)}, + {"SpatialFractionalMaxPooling_updateGradInput", nn_(SpatialFractionalMaxPooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SpatialFractionalMaxPooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialFractionalMaxPooling__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialFullConvolution.c b/generic/SpatialFullConvolution.c new file mode 100644 index 00000000000..964179226e3 --- /dev/null +++ b/generic/SpatialFullConvolution.c @@ -0,0 +1,417 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c" +#else + + +static void nn_(im2col)(const real* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + real* data_col) { + int c, h, w; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + for (c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; 
+ int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + for (h = 0; h < height_col; ++h) { + for (w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = 0; + } + } + } +} + +static void nn_(col2im)(const real* data_col, const int channels, + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + real* data_im) { + int c, h, w; + memset(data_im, 0, sizeof(real)*height * width * channels); + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_h * patch_w; + for (c = 0; c < channels_col; ++c) { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int c_im = c / patch_h / patch_w; + for (h = 0; h < height_col; ++h) { + for (w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_im[(c_im * height + h_pad) * width + w_pad] += + data_col[(c * height_col + h) * width_col + w]; + } + } + } +} + +static int nn_(SpatialFullConvolution_updateOutput)(lua_State *L) { + // Input + THTensor *input = (THTensor*)luaT_checkudata(L, 2, torch_Tensor); + + // Params: + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = 
luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + + THTensor *weight = (THTensor*)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = (THTensor*)luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + THTensor *output = (THTensor*)luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } else { + luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[1] * weight->size[2] * weight->size[3]; + long n = columns->size[1]; + long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + nn_(col2im)( + THTensor_(data)(columns), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + // return output + return 1; +} + +static int nn_(SpatialFullConvolution_updateGradInput)(lua_State *L) { + // Inputs + THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 
3, torch_Tensor); + + // Params + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + + THTensor *weight = (THTensor *)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradColumns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *gradInput = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + 
THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + nn_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + THTensor_(data)(gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = gradColumns->size[1]; + long k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + // Return gradInput + return 1; +} + + +static int nn_(SpatialFullConvolution_accGradParameters)(lua_State *L) { + // Inputs + THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); + + // Params + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int padW = luaT_getfieldcheckint(L, 1, "padW"); + int padH = luaT_getfieldcheckint(L, 1, "padH"); + int adjW = luaT_getfieldcheckint(L, 1, "adjW"); + int adjH = luaT_getfieldcheckint(L, 1, "adjH"); + float scale = luaL_optnumber(L, 4, 1); + + THTensor *gradWeight = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + 
THTensor *gradBias = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); + THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + nn_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long n = columns->size[0]; // nOutputPlane * kh * kw + long m = input_n->size[0]; // nInputPlane + long k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } + + // Return nothing + return 0; +} + +static const struct luaL_Reg nn_(SpatialFullConvolution__) [] = { + 
{"SpatialFullConvolution_updateOutput", nn_(SpatialFullConvolution_updateOutput)}, + {"SpatialFullConvolution_updateGradInput", nn_(SpatialFullConvolution_updateGradInput)}, + {"SpatialFullConvolution_accGradParameters", nn_(SpatialFullConvolution_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialFullConvolution_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialFullConvolution__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialMaxUnpooling.c b/generic/SpatialMaxUnpooling.c new file mode 100644 index 00000000000..045a3e63c2f --- /dev/null +++ b/generic/SpatialMaxUnpooling.c @@ -0,0 +1,240 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c" +#else + +static void nn_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, + real *ind_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k*owidth*oheight; + real *input_p_k = input_p + k*iwidth*iheight; + real *ind_p_k = ind_p + k*iwidth*iheight; + + long i, j, maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max */ + if(maxp<0 || maxp>=owidth*oheight){ + THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight); + } + output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */ + } + } + } +} + +static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + int owidth = luaT_getfieldcheckint(L, 1, "owidth"); + int oheight = luaT_getfieldcheckint(L, 1, "oheight"); + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int 
iheight; + int iwidth; + real *input_data; + real *output_data; + real *indices_data; + + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + if (!THTensor_(isSameSizeAs)(input, indices)){ + THError("Invalid input size w.r.t current indices size"); + } + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + /* get contiguous input and indices */ + input = THTensor_(newContiguous)(input); + indices = THTensor_(newContiguous)(indices); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + + nn_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(input); + THTensor_(free)(indices); + + return 1; +} + +static void nn_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + real *ind_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + 
k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + real *ind_p_k = ind_p + k*iwidth*iheight; + + long i, j, maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max */ + if(maxp<0 || maxp>=owidth*oheight){ + THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight); + } + gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } +} + +static int nn_(SpatialMaxUnpooling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + int owidth = luaT_getfieldcheckint(L, 1, "owidth"); + int oheight = luaT_getfieldcheckint(L, 1, "oheight"); + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *gradInput_data; + real *gradOutput_data; + real *indices_data; + + if (!THTensor_(isSameSizeAs)(input, indices)){ + THError("Invalid input size w.r.t current indices size"); + } + + /* get contiguous gradOutput and indices */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. 
oheight= %d, owidth= %d, gradOutput: %dx%d", oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + } + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + nn_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + nn_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THTensor_(free)(indices); + + return 1; +} + +static const struct luaL_Reg nn_(SpatialMaxUnpooling__) [] = { + {"SpatialMaxUnpooling_updateOutput", nn_(SpatialMaxUnpooling_updateOutput)}, + {"SpatialMaxUnpooling_updateGradInput", nn_(SpatialMaxUnpooling_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SpatialMaxUnpooling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialMaxUnpooling__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialSubSampling.c b/generic/SpatialSubSampling.c new file mode 100644 index 00000000000..912592c7285 --- /dev/null +++ b/generic/SpatialSubSampling.c @@ -0,0 +1,291 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialSubSampling.c" +#else + +static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + + 
THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + + long k; + + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + luaL_argcheck(L, input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + luaL_argcheck(L, inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + long xx, yy; + /* For all output pixels... 
*/ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + /* Get the good mask for (k,i) (k out, i in) */ + real the_weight = weight_data[k]; + /* Initialize to the bias */ + real z = bias_data[k]; + long i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = z; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... */ + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real sum = 0; + long kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += inputWidth; /* next input line */ + } + /* Update output */ + *ptr_output++ += the_weight*sum; + } + } + } + } + THTensor_(free)(input); + + return 1; +} + +static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + int dimw = 2; + int dimh = 1; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + + real *weight_data; + real *gradOutput_data; + real *input_data, *gradInput_data; + + long k; + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + weight_data = THTensor_(data)(weight); + 
gradOutput_data = THTensor_(data)(gradOutput); + + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real the_weight = weight_data[k]; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + long xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + long i; + for(i=0; inDimension == 4) { + dimw++; + dimh++; + nbatch = input->size[0]; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + gradWeight_data = THTensor_(data)(gradWeight); + gradBias_data = THTensor_(data)(gradBias); + gradOutput_data = THTensor_(data)(gradOutput); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + real sum; + long xx, yy; + long i; + + sum = 0; + for(i = 0; i < outputWidth*outputHeight; i++) + sum += ptr_gradOutput[i]; + gradBias_data[k] += scale*sum; + + sum = 0; + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real z = *ptr_gradOutput++; + long kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += z * ptr_input[kx]; + ptr_input += inputWidth; + } + } + } + gradWeight_data[k] += scale*sum; + } + } + + THTensor_(free)(input); + + return 0; +} + +static const struct 
luaL_Reg nn_(SpatialSubSampling__) [] = { + {"SpatialSubSampling_updateOutput", nn_(SpatialSubSampling_updateOutput)}, + {"SpatialSubSampling_updateGradInput", nn_(SpatialSubSampling_updateGradInput)}, + {"SpatialSubSampling_accGradParameters", nn_(SpatialSubSampling_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialSubSampling_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialSubSampling__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialUpSamplingNearest.c b/generic/SpatialUpSamplingNearest.c new file mode 100644 index 00000000000..c3cddb05101 --- /dev/null +++ b/generic/SpatialUpSamplingNearest.c @@ -0,0 +1,159 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c" +#else + +static int nn_(SpatialUpSamplingNearest_updateOutput)(lua_State *L) +{ + // get all params + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int scale_factor = luaT_getfieldcheckint(L, 1, "scale_factor"); + int dW = scale_factor; + int dH = scale_factor; + int xDim = input->nDimension-2; + int yDim = input->nDimension-1; + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + // dims + int idim = input->nDimension; // Gauranteed to be between 3 and 5 + int osz0 = output->size[0]; + int osz1 = output->size[1]; + int osz2 = output->size[2]; + int osz3 = 1; + if (idim > 3) { + osz3 = output->size[3]; + } + + // get strides + long *is = input->stride; + long *os = output->stride; + + // get raw pointers + real *pin = THTensor_(data)(input); + real *pout = THTensor_(data)(output); + + // perform the upsampling + int i0, i1, i2, i3, isrc, idst; + int iout[4]; // Output indices + int iin[4]; // Input indices + + for (i0 = 0; i0 < osz0; i0++) { + iout[0] = i0; + iin[0] = i0; + for (i1 = 0; i1 < osz1; i1++) { + iout[1] = i1; + iin[1] = i1; + for (i2 = 0; i2 < osz2; i2++) { + iout[2] = i2; + iin[2] = i2; + for (i3 = 0; i3 < osz3; i3++) { + iout[3] = i3; + 
iin[3] = i3; + + // set the indices for the upsampled dimensions + iin[xDim] = iout[xDim] / dW; + iin[yDim] = iout[yDim] / dH; + + idst = i0*os[0] + i1*os[1] + i2*os[2]; + isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2]; + if (idim > 3) { + idst += i3*os[3]; + isrc += iin[3]*is[3]; + } + + pout[idst] = pin[isrc]; + } + } + } + } + return 1; +} + +static int nn_(SpatialUpSamplingNearest_updateGradInput)(lua_State *L) +{ + // get all params + //THTensor *input = luaT_checkudata(L,2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L,3, torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L,1, "gradInput", torch_Tensor); + + int scale_factor = luaT_getfieldcheckint(L, 1, "scale_factor"); + int dW = scale_factor; + int dH = scale_factor; + int xDim = gradInput->nDimension-2; + int yDim = gradInput->nDimension-1; + + // dims + int idim = gradInput->nDimension; // Gauranteed to be between 3 and 5 + int isz0 = gradInput->size[0]; + int isz1 = gradInput->size[1]; + int isz2 = gradInput->size[2]; + int isz3 = 1; + if (idim > 3) { + isz3 = gradInput->size[3]; + } + + // get strides + long *is = gradInput->stride; + long *os = gradOutput->stride; + + // get raw pointers + real *pin = THTensor_(data)(gradInput); + real *pout = THTensor_(data)(gradOutput); + + // perform the upsampling + int i0, i1, i2, i3, isrc, idst, x, y; + int iin[4]; // Input indices + int iout[4]; // Output indices + + THTensor_(zero)(gradInput); + + for (i0 = 0; i0 < isz0; i0++) { + iin[0] = i0; + iout[0] = i0; + for (i1 = 0; i1 < isz1; i1++) { + iin[1] = i1; + iout[1] = i1; + for (i2 = 0; i2 < isz2; i2++) { + iin[2] = i2; + iout[2] = i2; + for (i3 = 0; i3 < isz3; i3++) { + iin[3] = i3; + iout[3] = i3; + + idst = i0*is[0] + i1*is[1] + i2*is[2]; + if (idim > 3) { + idst += i3*is[3]; + } + + // Now accumulate the gradients from gradOutput + for (y = 0; y < dH; y++) { + for (x = 0; x < dW; x++) { + iout[xDim] = dW * iin[xDim] + x; + iout[yDim] = dH * iin[yDim] + y; + isrc = iout[0]*os[0] 
+ iout[1]*os[1] + iout[2]*os[2]; + if (idim > 3) { + isrc += iout[3]*os[3]; + } + pin[idst] += pout[isrc]; + } + } + } + } + } + } + return 1; +} + +static const struct luaL_Reg nn_(SpatialUpSamplingNearest__) [] = { + {"SpatialUpSamplingNearest_updateOutput", nn_(SpatialUpSamplingNearest_updateOutput)}, + {"SpatialUpSamplingNearest_updateGradInput", nn_(SpatialUpSamplingNearest_updateGradInput)}, + {NULL, NULL} +}; + +static void nn_(SpatialUpSamplingNearest_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialUpSamplingNearest__), "nn"); + lua_pop(L,1); +} + +#endif From e59dad9a108a3a7ecd9cdfaefcab575360ffd524 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sun, 31 Jan 2016 15:32:30 +0100 Subject: [PATCH 046/101] Add THNN conversion for Spatial* modules Add THNN conversion of SpatialBatchNormalization, SpatialFractionalMaxPooling and SpatialSubSampling Add THNN convertion of SpatialConvolutionLocal, SpatialFullConvolution and SpatialUpSamplingNearest THNN conversion of SpatialMaxUnpooling Remove unfold from generic Add functional conversion of SpatialCrossMapLRN Plus fix in the init.c Fix --- generic/SpatialBatchNormalization.c | 43 +----- generic/SpatialConvolutionLocal.c | 153 ++++++++------------ generic/SpatialFractionalMaxPooling.c | 91 +++++------- generic/SpatialFullConvolution.c | 151 ++++++++------------ generic/SpatialMaxUnpooling.c | 57 +++----- generic/SpatialSubSampling.c | 96 +++++-------- generic/SpatialUpSamplingNearest.c | 38 ++--- generic/THNN.h | 192 ++++++++++++++++++++++++++ init.c | 21 +++ 9 files changed, 430 insertions(+), 412 deletions(-) diff --git a/generic/SpatialBatchNormalization.c b/generic/SpatialBatchNormalization.c index 25171c67c74..bf3108bf11a 100644 --- a/generic/SpatialBatchNormalization.c +++ b/generic/SpatialBatchNormalization.c @@ -2,20 +2,8 @@ #define TH_GENERIC_FILE "generic/SpatialBatchNormalization.c" #else -static int 
nn_(SpatialBatchNormalization_updateOutput)(lua_State *L) +void THNN_(SpatialBatchNormalization_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *running_mean, THTensor *running_var, THTensor *save_mean, THTensor *save_std, bool train, double momentum, double eps) { - THTensor *input = luaT_checkudata(L, 1, torch_Tensor); - THTensor *output = luaT_checkudata(L, 2, torch_Tensor); - THTensor *weight = luaT_toudata(L, 3, torch_Tensor); - THTensor *bias = luaT_toudata(L, 4, torch_Tensor); - int train = lua_toboolean(L, 5); - double eps = lua_tonumber(L, 6); - double momentum = lua_tonumber(L, 7); - THTensor *running_mean = luaT_checkudata(L, 8, torch_Tensor); - THTensor *running_var = luaT_checkudata(L, 9, torch_Tensor); - THTensor *save_mean = luaT_toudata(L, 10, torch_Tensor); - THTensor *save_std = luaT_toudata(L, 11, torch_Tensor); - long nBatch = THTensor_(size)(input, 0); long nFeature = THTensor_(size)(input, 1); long iH = THTensor_(size)(input, 2); @@ -71,22 +59,10 @@ static int nn_(SpatialBatchNormalization_updateOutput)(lua_State *L) THTensor_(free)(out); THTensor_(free)(in); } - - return 0; } -static int nn_(SpatialBatchNormalization_backward)(lua_State *L) +void THNN_(SpatialBatchNormalization_backward)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *save_mean, THTensor *save_std, double scale) { - THTensor *input = luaT_checkudata(L, 1, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradInput = luaT_toudata(L, 3, torch_Tensor); - THTensor *gradWeight = luaT_toudata(L, 4, torch_Tensor); - THTensor *gradBias = luaT_toudata(L, 5, torch_Tensor); - THTensor *weight = luaT_toudata(L, 6, torch_Tensor); - THTensor *save_mean = luaT_toudata(L, 7, torch_Tensor); - THTensor *save_std = luaT_toudata(L, 8, torch_Tensor); - double scale = lua_tonumber(L, 9); - long 
nBatch = THTensor_(size)(input, 0); long nFeature = THTensor_(size)(input, 1); long iH = THTensor_(size)(input, 2); @@ -142,21 +118,6 @@ static int nn_(SpatialBatchNormalization_backward)(lua_State *L) THTensor_(free)(gradOut); THTensor_(free)(in); } - - return 0; -} - -static const struct luaL_Reg nn_(SpatialBatchNormalization__) [] = { - {"SpatialBatchNormalization_updateOutput", nn_(SpatialBatchNormalization_updateOutput)}, - {"SpatialBatchNormalization_backward", nn_(SpatialBatchNormalization_backward)}, - {NULL, NULL} -}; - -static void nn_(SpatialBatchNormalization_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialBatchNormalization__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialConvolutionLocal.c b/generic/SpatialConvolutionLocal.c index 6377ecda6a0..091c6f01c6a 100644 --- a/generic/SpatialConvolutionLocal.c +++ b/generic/SpatialConvolutionLocal.c @@ -2,14 +2,8 @@ #define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c" #else -#ifdef _WIN32 -# include -#endif -#include "unfold.h" - - -static void nn_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, +static void THNN_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, int kW, int kH, int dW, int dH, int padW, int padH, long nInputPlane, long inputWidth, long inputHeight, long nOutputPlane, long outputWidth, long outputHeight) @@ -17,7 +11,7 @@ static void nn_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THT long i; THTensor *output3d, *finput3d; - nn_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); THTensor_(copy)(output, bias); @@ -39,33 +33,29 @@ static void 
nn_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THT THTensor_(free)(finput3d); } -static int nn_(SpatialConvolutionLocal_updateOutput)(lua_State *L) +void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); - long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); - long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); - long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + long nInputPlane = THTensor_(size)(weight,2)/(kW*kH); + long nOutputPlane = THTensor_(size)(weight,1); if(input->nDimension == 3) { THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); - nn_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput, + THNN_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, 
inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -85,7 +75,7 @@ static int nn_(SpatialConvolutionLocal_updateOutput)(lua_State *L) THTensor *output_t = THTensor_(newSelect)(output, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, + THNN_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -95,12 +85,10 @@ static int nn_(SpatialConvolutionLocal_updateOutput)(lua_State *L) THTensor_(free)(finput_t); } } - - return 1; } -static void nn_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, +static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH, long nInputPlane, long inputWidth, long inputHeight, long nOutputPlane, long outputWidth, long outputHeight) @@ -124,31 +112,26 @@ static void nn_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInp THTensor_(zero)(gradInput); - nn_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); } -static int nn_(SpatialConvolutionLocal_updateGradInput)(lua_State *L) +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, 
torch_Tensor); - long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); - long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); - long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); - long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *fgradInput = luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + long nInputPlane = THTensor_(size)(weight,2)/(kW*kH); + long nOutputPlane = THTensor_(size)(weight,1); THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); @@ -156,7 +139,7 @@ static int nn_(SpatialConvolutionLocal_updateGradInput)(lua_State *L) if(input->nDimension == 3) { - nn_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH, + THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); } @@ -172,7 +155,7 @@ static int nn_(SpatialConvolutionLocal_updateGradInput)(lua_State *L) THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); - nn_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH, + 
THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -183,11 +166,9 @@ static int nn_(SpatialConvolutionLocal_updateGradInput)(lua_State *L) } THTensor_(transpose)(weight, weight, 1, 2); - - return 1; } -static void nn_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale, +static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale, int kW, int kH, int dW, int dH, int padW, int padH, long nInputPlane, long inputWidth, long inputHeight, long nOutputPlane, long outputWidth, long outputHeight) @@ -213,31 +194,27 @@ static void nn_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradO THTensor_(free)(finput3d); } -static int nn_(SpatialConvolutionLocal_accGradParameters)(lua_State *L) +void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - long inputWidth = luaT_getfieldcheckint(L, 1, "iW"); - long inputHeight = luaT_getfieldcheckint(L, 1, "iH"); - long outputWidth = luaT_getfieldcheckint(L, 1, "oW"); - long outputHeight = luaT_getfieldcheckint(L, 1, "oH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int padW = 
luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - long nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - long nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - - THTensor *finput = luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH); + long nOutputPlane = THTensor_(size)(gradWeight,1); if(input->nDimension == 3) { - nn_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH, + THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); } @@ -251,7 +228,7 @@ static int nn_(SpatialConvolutionLocal_accGradParameters)(lua_State *L) THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); - nn_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH, + THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); @@ -259,22 +236,6 @@ static int nn_(SpatialConvolutionLocal_accGradParameters)(lua_State *L) THTensor_(free)(finput_t); } } - - return 0; -} - -static const struct luaL_Reg nn_(SpatialConvolutionLocal__) [] = { - {"SpatialConvolutionLocal_updateOutput", nn_(SpatialConvolutionLocal_updateOutput)}, - {"SpatialConvolutionLocal_updateGradInput", nn_(SpatialConvolutionLocal_updateGradInput)}, - {"SpatialConvolutionLocal_accGradParameters", 
nn_(SpatialConvolutionLocal_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialConvolutionLocal_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialConvolutionLocal__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialFractionalMaxPooling.c b/generic/SpatialFractionalMaxPooling.c index f90f92a3d6e..1c2b6ab1900 100644 --- a/generic/SpatialFractionalMaxPooling.c +++ b/generic/SpatialFractionalMaxPooling.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c" #else -static long* nn_(SpatialFractionalMaxPooling_generateIntervals)( +static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)( real sample, long inputSize, long outputSize, @@ -20,7 +20,7 @@ static long* nn_(SpatialFractionalMaxPooling_generateIntervals)( return sequence; } -static void nn_(SpatialFractionalMaxPooling_updateOutput_frame)( +static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( real* input, real* output, real* indices, @@ -37,10 +37,10 @@ static void nn_(SpatialFractionalMaxPooling_updateOutput_frame)( /* Generate interval sequence */ long* sequenceW = - nn_(SpatialFractionalMaxPooling_generateIntervals)( + THNN_(SpatialFractionalMaxPooling_generateIntervals)( randomSamplesForPlane[0], inputW, outputW, poolSizeW); long* sequenceH = - nn_(SpatialFractionalMaxPooling_generateIntervals)( + THNN_(SpatialFractionalMaxPooling_generateIntervals)( randomSamplesForPlane[1], inputH, outputH, poolSizeH); /* loop over output */ @@ -88,24 +88,23 @@ static void nn_(SpatialFractionalMaxPooling_updateOutput_frame)( } } -static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { - THTensor* output = luaT_checkudata(L, 1, torch_Tensor); - THTensor* input = luaT_checkudata(L, 2, torch_Tensor); - int outputW = luaL_checknumber(L, 3); - int outputH = luaL_checknumber(L, 4); - int poolSizeW = luaL_checknumber(L, 5); - int poolSizeH = luaL_checknumber(L, 6); - THTensor* indices = 
luaT_checkudata(L, 7, torch_Tensor); - THTensor* randomSamples = luaT_checkudata(L, 8, torch_Tensor); - +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices, + THTensor *randomSamples) { + long numBatch = 1; int planeDim = 0; int heightDim = 1; int widthDim = 2; long numInputDims = THTensor_(nDimension)(input); - luaL_argcheck(L, numInputDims == 3 || numInputDims == 4, 2, - "3D or 4D (batch mode) tensor expected"); + THArgCheck(numInputDims == 3 || numInputDims == 4, 2, + "3D or 4D (batch mode) tensor expected"); if (numInputDims == 4) { numBatch = THTensor_(size)(input, 0); @@ -119,10 +118,10 @@ static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { long inputH = THTensor_(size)(input, heightDim); long inputW = THTensor_(size)(input, widthDim); - luaL_argcheck(L, outputH + poolSizeH - 1 < inputH, 6, - "poolSizeH too large relative to input height"); - luaL_argcheck(L, outputW + poolSizeW - 1 < inputW, 5, - "poolSizeW too large relative to input width"); + THArgCheck(outputH + poolSizeH - 1 < inputH, 7, + "poolSizeH too large relative to input height"); + THArgCheck(outputW + poolSizeW - 1 < inputW, 6, + "poolSizeW too large relative to input width"); /* get contiguous input */ input = THTensor_(newContiguous)(input); @@ -133,7 +132,7 @@ static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { /* indices will contain the locations for each output point */ THTensor_(resize3d)(indices, numPlanes, outputH, outputW); - nn_(SpatialFractionalMaxPooling_updateOutput_frame)( + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( THTensor_(data)(input), THTensor_(data)(output), THTensor_(data)(indices), @@ -147,7 +146,7 @@ static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { long batch; #pragma omp parallel for private(batch) for (batch = 0; batch < numBatch; ++batch) { - 
nn_(SpatialFractionalMaxPooling_updateOutput_frame)( + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( THTensor_(data)(input) + batch * numPlanes * inputH * inputW, THTensor_(data)(output) + batch * numPlanes * outputH * outputW, THTensor_(data)(indices) + batch * numPlanes * outputH * outputW, @@ -158,11 +157,9 @@ static int nn_(SpatialFractionalMaxPooling_updateOutput)(lua_State *L) { /* cleanup */ THTensor_(free)(input); - - return 0; } -static void nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( +static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( real* gradInput, real* gradOutput, real* indices, @@ -189,15 +186,14 @@ static void nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( } } -static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { - THTensor* gradInput = luaT_checkudata(L, 1, torch_Tensor); - THTensor* input = luaT_checkudata(L, 2, torch_Tensor); - THTensor* gradOutput = luaT_checkudata(L, 3, torch_Tensor); - long outputW = luaL_checknumber(L, 4); - long outputH = luaL_checknumber(L, 5); - int poolSizeW = luaL_checknumber(L, 6); - int poolSizeH = luaL_checknumber(L, 7); - THTensor* indices = luaT_checkudata(L, 8, torch_Tensor); +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices) { long numBatch = 1; int planeDim = 0; @@ -217,10 +213,10 @@ static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { long inputH = THTensor_(size)(input, heightDim); long inputW = THTensor_(size)(input, widthDim); - luaL_argcheck(L, outputW == THTensor_(size)(gradOutput, widthDim), 3, - "gradOutput width unexpected"); - luaL_argcheck(L, outputH == THTensor_(size)(gradOutput, heightDim), 3, - "gradOutput height unexpected"); + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + 
THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); /* get contiguous gradOutput */ gradOutput = THTensor_(newContiguous)(gradOutput); @@ -231,7 +227,7 @@ static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { /* backprop */ if (numInputDims == 3) { - nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( THTensor_(data)(gradInput), THTensor_(data)(gradOutput), THTensor_(data)(indices), @@ -240,7 +236,7 @@ static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { long batch; #pragma omp parallel for private(batch) for (batch = 0; batch < numBatch; ++batch) { - nn_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW, THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW, THTensor_(data)(indices) + batch * numPlanes * outputH * outputW, @@ -250,21 +246,6 @@ static int nn_(SpatialFractionalMaxPooling_updateGradInput)(lua_State *L) { /* cleanup */ THTensor_(free)(gradOutput); - - return 0; -} - -static const struct luaL_Reg nn_(SpatialFractionalMaxPooling__) [] = { - {"SpatialFractionalMaxPooling_updateOutput", nn_(SpatialFractionalMaxPooling_updateOutput)}, - {"SpatialFractionalMaxPooling_updateGradInput", nn_(SpatialFractionalMaxPooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialFractionalMaxPooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialFractionalMaxPooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialFullConvolution.c b/generic/SpatialFullConvolution.c index 964179226e3..de2c18fc06a 100644 --- a/generic/SpatialFullConvolution.c +++ b/generic/SpatialFullConvolution.c @@ -3,7 +3,7 @@ #else -static void nn_(im2col)(const real* data_im, const int channels, +static void THNN_(im2col)(const 
real* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, @@ -30,7 +30,7 @@ static void nn_(im2col)(const real* data_im, const int channels, } } -static void nn_(col2im)(const real* data_col, const int channels, +static void THNN_(col2im)(const real* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, @@ -56,38 +56,32 @@ static void nn_(col2im)(const real* data_col, const int channels, } } -static int nn_(SpatialFullConvolution_updateOutput)(lua_State *L) { - // Input - THTensor *input = (THTensor*)luaT_checkudata(L, 2, torch_Tensor); +void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + int nInputPlane = THTensor_(size)(weight,0); + int nOutputPlane = THTensor_(size)(weight,1); - // Params: - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = luaT_getfieldcheckint(L, 1, "adjH"); - - THTensor *weight = (THTensor*)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = (THTensor*)luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 
1, "fgradInput", torch_Tensor); - THTensor *output = (THTensor*)luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { - luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); } else { - luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); } long inputWidth = input->size[3]; @@ -142,7 +136,7 @@ static int nn_(SpatialFullConvolution_updateOutput)(lua_State *L) { ); // Unpack columns back into input: - nn_(col2im)( + THNN_(col2im)( THTensor_(data)(columns), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(output_n) @@ -177,33 +171,24 @@ static int nn_(SpatialFullConvolution_updateOutput)(lua_State *L) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } - - // return output - return 1; } -static int nn_(SpatialFullConvolution_updateGradInput)(lua_State *L) { - // Inputs - THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); +void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + int nInputPlane = 
THTensor_(size)(weight,0); + int nOutputPlane = THTensor_(size)(weight,1); - // Params - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = luaT_getfieldcheckint(L, 1, "adjH"); - - THTensor *weight = (THTensor *)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradColumns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *gradInput = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { @@ -239,7 +224,7 @@ static int nn_(SpatialFullConvolution_updateGradInput)(lua_State *L) { THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: - nn_(im2col)( + THNN_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(gradColumns) @@ -275,36 +260,27 @@ static int nn_(SpatialFullConvolution_updateGradInput)(lua_State *L) { THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); } - - // Return gradInput - return 1; } -static int nn_(SpatialFullConvolution_accGradParameters)(lua_State *L) { - // Inputs - THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); +void 
THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + real scale) +{ + int nInputPlane = THTensor_(size)(gradWeight,0); + int nOutputPlane = THTensor_(size)(gradWeight,1); - // Params - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = luaT_getfieldcheckint(L, 1, "adjH"); - float scale = luaL_optnumber(L, 4, 1); - - THTensor *gradWeight = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { @@ -344,7 +320,7 @@ static int nn_(SpatialFullConvolution_accGradParameters)(lua_State *L) { THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: - nn_(im2col)( + THNN_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(columns) @@ -395,23 +371,6 @@ static int 
nn_(SpatialFullConvolution_accGradParameters)(lua_State *L) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } - - // Return nothing - return 0; -} - -static const struct luaL_Reg nn_(SpatialFullConvolution__) [] = { - {"SpatialFullConvolution_updateOutput", nn_(SpatialFullConvolution_updateOutput)}, - {"SpatialFullConvolution_updateGradInput", nn_(SpatialFullConvolution_updateGradInput)}, - {"SpatialFullConvolution_accGradParameters", nn_(SpatialFullConvolution_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialFullConvolution_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialFullConvolution__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialMaxUnpooling.c b/generic/SpatialMaxUnpooling.c index 045a3e63c2f..6e7a76e9880 100644 --- a/generic/SpatialMaxUnpooling.c +++ b/generic/SpatialMaxUnpooling.c @@ -2,7 +2,7 @@ #define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c" #else -static void nn_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, +static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, real *ind_p, long nslices, long iwidth, long iheight, @@ -31,13 +31,13 @@ static void nn_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *out } } -static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) +void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int owidth, int oheight) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - int owidth = luaT_getfieldcheckint(L, 1, "owidth"); - int oheight = luaT_getfieldcheckint(L, 1, "oheight"); int dimw = 2; int dimh = 1; int nbatch = 1; @@ -49,7 +49,7 @@ 
static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) real *indices_data; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); if (!THTensor_(isSameSizeAs)(input, indices)){ THError("Invalid input size w.r.t current indices size"); } @@ -80,7 +80,7 @@ static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) output_data = THTensor_(data)(output); indices_data = THTensor_(data)(indices); - nn_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, + THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, indices_data, nslices, iwidth, iheight, @@ -100,7 +100,7 @@ static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, + THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight, indices_data+p*nslices*iwidth*iheight, nslices, iwidth, iheight, @@ -111,11 +111,9 @@ static int nn_(SpatialMaxUnpooling_updateOutput)(lua_State *L) /* cleanup */ THTensor_(free)(input); THTensor_(free)(indices); - - return 1; } -static void nn_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, +static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, real *ind_p, long nslices, long iwidth, long iheight, @@ -144,14 +142,14 @@ static void nn_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, re } } -static int nn_(SpatialMaxUnpooling_updateGradInput)(lua_State *L) +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int owidth, int oheight) { - 
THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - int owidth = luaT_getfieldcheckint(L, 1, "owidth"); - int oheight = luaT_getfieldcheckint(L, 1, "oheight"); int dimw = 2; int dimh = 1; int nbatch = 1; @@ -197,7 +195,7 @@ static int nn_(SpatialMaxUnpooling_updateGradInput)(lua_State *L) /* backprop */ if (input->nDimension == 3) { - nn_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, indices_data, nslices, iwidth, iheight, @@ -209,7 +207,7 @@ static int nn_(SpatialMaxUnpooling_updateGradInput)(lua_State *L) #pragma omp parallel for private(p) for (p = 0; p < nbatch; p++) { - nn_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, indices_data+p*nslices*iwidth*iheight, nslices, iwidth, iheight, @@ -220,21 +218,6 @@ static int nn_(SpatialMaxUnpooling_updateGradInput)(lua_State *L) /* cleanup */ THTensor_(free)(gradOutput); THTensor_(free)(indices); - - return 1; -} - -static const struct luaL_Reg nn_(SpatialMaxUnpooling__) [] = { - {"SpatialMaxUnpooling_updateOutput", nn_(SpatialMaxUnpooling_updateOutput)}, - {"SpatialMaxUnpooling_updateGradInput", nn_(SpatialMaxUnpooling_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialMaxUnpooling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialMaxUnpooling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialSubSampling.c b/generic/SpatialSubSampling.c index 912592c7285..abfbfceaeff 100644 --- 
a/generic/SpatialSubSampling.c +++ b/generic/SpatialSubSampling.c @@ -2,19 +2,16 @@ #define TH_GENERIC_FILE "generic/SpatialSubSampling.c" #else -static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) +void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - + real *weight_data = THTensor_(data)(weight); real *bias_data = THTensor_(data)(bias); real *output_data; @@ -29,9 +26,11 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) long outputWidth; long outputHeight; + int nInputPlane = THTensor_(size)(weight,0); + long k; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); if (input->nDimension == 4) { nbatch = input->size[0]; @@ -44,8 +43,8 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; - luaL_argcheck(L, input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); - luaL_argcheck(L, inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); + THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image 
smaller than kernel size"); if (input->nDimension == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); @@ -95,23 +94,18 @@ static int nn_(SpatialSubSampling_updateOutput)(lua_State *L) } } THTensor_(free)(input); - - return 1; } -static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) +void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - + int dimw = 2; int dimh = 1; long nbatch = 1; @@ -121,6 +115,8 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) long outputWidth; long outputHeight; + int nInputPlane = THTensor_(size)(weight,0); + real *weight_data; real *gradOutput_data; real *input_data, *gradInput_data; @@ -180,24 +176,18 @@ static int nn_(SpatialSubSampling_updateGradInput)(lua_State *L) } } } - - return 1; } -static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) +void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - real scale = luaL_optnumber(L, 4, 1); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = 
luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - long nbatch = 1; long dimw = 2; long dimh = 1; @@ -207,6 +197,8 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) long outputWidth; long outputHeight; + int nInputPlane = THTensor_(size)(gradWeight,0); + real *gradWeight_data; real *gradBias_data; real *gradOutput_data; @@ -270,22 +262,6 @@ static int nn_(SpatialSubSampling_accGradParameters)(lua_State *L) } THTensor_(free)(input); - - return 0; -} - -static const struct luaL_Reg nn_(SpatialSubSampling__) [] = { - {"SpatialSubSampling_updateOutput", nn_(SpatialSubSampling_updateOutput)}, - {"SpatialSubSampling_updateGradInput", nn_(SpatialSubSampling_updateGradInput)}, - {"SpatialSubSampling_accGradParameters", nn_(SpatialSubSampling_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialSubSampling_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialSubSampling__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialUpSamplingNearest.c b/generic/SpatialUpSamplingNearest.c index c3cddb05101..7ef093c9488 100644 --- a/generic/SpatialUpSamplingNearest.c +++ b/generic/SpatialUpSamplingNearest.c @@ -2,16 +2,16 @@ #define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c" #else -static int nn_(SpatialUpSamplingNearest_updateOutput)(lua_State *L) +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor) { - // get all params - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int scale_factor = luaT_getfieldcheckint(L, 1, "scale_factor"); int dW = scale_factor; int dH = scale_factor; int xDim = input->nDimension-2; int yDim = input->nDimension-1; - 
THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); // dims int idim = input->nDimension; // Gauranteed to be between 3 and 5 @@ -65,17 +65,15 @@ static int nn_(SpatialUpSamplingNearest_updateOutput)(lua_State *L) } } } - return 1; } -static int nn_(SpatialUpSamplingNearest_updateGradInput)(lua_State *L) +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor) { - // get all params - //THTensor *input = luaT_checkudata(L,2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L,3, torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L,1, "gradInput", torch_Tensor); - - int scale_factor = luaT_getfieldcheckint(L, 1, "scale_factor"); int dW = scale_factor; int dH = scale_factor; int xDim = gradInput->nDimension-2; @@ -140,20 +138,6 @@ static int nn_(SpatialUpSamplingNearest_updateGradInput)(lua_State *L) } } } - return 1; -} - -static const struct luaL_Reg nn_(SpatialUpSamplingNearest__) [] = { - {"SpatialUpSamplingNearest_updateOutput", nn_(SpatialUpSamplingNearest_updateOutput)}, - {"SpatialUpSamplingNearest_updateGradInput", nn_(SpatialUpSamplingNearest_updateGradInput)}, - {NULL, NULL} -}; - -static void nn_(SpatialUpSamplingNearest_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialUpSamplingNearest__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index fca4c5c6cc2..0c0f801a5a6 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -401,6 +401,31 @@ TH_API void THNN_(Threshold_updateGradInput)( real threshold, bool inplace); +TH_API void THNN_(SpatialBatchNormalization_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *running_mean, + THTensor *running_var, + THTensor *save_mean, + THTensor *save_std, + bool train, + double momentum, + double eps); +TH_API void 
THNN_(SpatialBatchNormalization_backward)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *save_mean, + THTensor *save_std, + double scale); + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, @@ -437,6 +462,47 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( int padW, int padH, real scale); +TH_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + real scale); + TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( THNNState *state, THTensor *input, @@ -470,6 +536,60 @@ TH_API void THNN_(SpatialAveragePooling_updateGradInput)( bool ceil_mode, bool count_include_pad); +TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices, + THTensor *randomSamples); +TH_API void 
THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices); + +TH_API void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + real scale); + TH_API void THNN_(SpatialMaxPooling_updateOutput)( THNNState *state, THTensor *input, @@ -490,6 +610,77 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)( int padW, int padH, bool ceil_mode); +TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int owidth, int oheight); +TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + 
int dW, int dH); +TH_API void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + real scale); + +TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor); +TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor); + +TH_API void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); +TH_API void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); + TH_API void THNN_(VolumetricAveragePooling_updateOutput)( THNNState *state, THTensor *input, @@ -633,4 +824,5 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int pT, int pW, int pH); + #endif diff --git a/init.c b/init.c index e49ab8d2b13..614ef263b01 100644 --- a/init.c +++ b/init.c @@ -97,21 +97,42 @@ #include "generic/TemporalMaxPooling.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialBatchNormalization.c" +#include "THGenerateFloatTypes.h" + #include "generic/unfold.c" #include "THGenerateFloatTypes.h" #include "generic/SpatialConvolutionMM.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialConvolutionLocal.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialFullConvolution.c" +#include "THGenerateFloatTypes.h" + #include "generic/SpatialAdaptiveMaxPooling.c" #include "THGenerateFloatTypes.h" #include "generic/SpatialAveragePooling.c" #include "THGenerateFloatTypes.h" +#include 
"generic/SpatialFractionalMaxPooling.c" +#include "THGenerateFloatTypes.h" + #include "generic/SpatialMaxPooling.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialMaxUnpooling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialSubSampling.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialUpSamplingNearest.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" From d30e400f08674297d23689907030c50969e936c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Wed, 17 Feb 2016 22:57:58 +0100 Subject: [PATCH 047/101] Add THNN conversion of VolumetricFullConvolution --- generic/THNN.h | 35 ++ generic/VolumetricFullConvolution.c | 476 ++++++++++++++-------------- init.c | 3 + 3 files changed, 283 insertions(+), 231 deletions(-) diff --git a/generic/THNN.h b/generic/THNN.h index a7d36bd42f9..37594824468 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -565,6 +565,41 @@ TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( THTensor *finput, real scale); +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, // [OUT] volumetric convolution output + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, // gradBias tensor (nOutputPlane) + THTensor *finput, // [OUT] internal columns buffer + THTensor *fgradInput, // [OUT] internal ones buffer + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH); // extra output adjustment +TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, // library state + THTensor *input, // 4D or 5D (batch) tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *gradBias, // gradBias tensor (nOutputPlane) + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + real scale); // scaling factor + TH_API void THNN_(VolumetricMaxPooling_updateOutput)( THNNState *state, THTensor *input, diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c index ba1341dc0ae..5a6a1a74705 100644 --- a/generic/VolumetricFullConvolution.c +++ b/generic/VolumetricFullConvolution.c @@ -2,28 +2,34 @@ #define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" #else - -static void nn_(vol2col)(const real* data_vol, const int channels, - const int depth, const int height, const int width, const int kernel_t, const int kernel_h, const int kernel_w, - const int pad_t, const int pad_h, const int pad_w, - const int stride_t, const int stride_h, const int stride_w, - real* data_col) { +static void THNN_(vol2col)( + const real *data_vol, const int channels, + const int depth, const int height, const int width, + const int kT, const int kH, const int kW, + const int pT, const int pH, const int pW, + const int dT, const int dH, const int dW, + real *data_col) +{ int c, t, h, w; - int depth_col = (depth + 2 * pad_t - kernel_t) / stride_t + 1; - int height_col = (height 
+ 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int channels_col = channels * kernel_t * kernel_h * kernel_w; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % kernel_w; - int h_offset = (c / kernel_w) % kernel_h; - int t_offset = (c / kernel_w / kernel_h) % kernel_t; - int c_vol = c / kernel_t / kernel_h / kernel_w; - for (t = 0; t < depth_col; ++t) { - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int t_pad = t * stride_t - pad_t + t_offset; - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; + int depth_col = (depth + 2 * pT - kT) / dT + 1; + int height_col = (height + 2 * pH - kH) / dH + 1; + int width_col = (width + 2 * pW - kW) / dW + 1; + int channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int w_offset = c % kW; + int h_offset = (c / kW) % kH; + int t_offset = (c / kW / kH) % kT; + int c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + for (h = 0; h < height_col; ++h) + { + for (w = 0; w < width_col; ++w) + { + int t_pad = t * dT - pT + t_offset; + int h_pad = h * dH - pH + h_offset; + int w_pad = w * dW - pW + w_offset; if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) data_col[((c * depth_col + t) * height_col + h) * width_col + w] = data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; @@ -35,28 +41,35 @@ static void nn_(vol2col)(const real* data_vol, const int channels, } } -static void nn_(col2vol)(const real* data_col, const int channels, - const int depth, const int height, const int width, const int patch_t, const int patch_h, const int patch_w, - const int pad_t, const int pad_h, const int pad_w, - const int stride_t, const int stride_h, const int stride_w, - real* data_vol) { +static void THNN_(col2vol)( + const real* data_col, const int channels, + const int depth, const int height, const int width, + 
const int kT, const int kH, const int kW, + const int pT, const int pH, const int pW, + const int dT, const int dH, const int dW, + real* data_vol) +{ int c, t, h, w; memset(data_vol, 0, sizeof(real) * depth * height * width * channels); - int depth_col = (depth + 2 * pad_t - patch_t) / stride_t + 1; - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_t * patch_h * patch_w; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int t_offset = (c / patch_w / patch_h) % patch_t; - int c_vol = c / patch_t / patch_h / patch_w; - for (t = 0; t < depth_col; ++t) { - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int t_pad = t * stride_t - pad_t + t_offset; - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; + int depth_col = (depth + 2 * pT - kT) / dT + 1; + int height_col = (height + 2 * pH - kH) / dH + 1; + int width_col = (width + 2 * pW - kW) / dW + 1; + int channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int w_offset = c % kW; + int h_offset = (c / kW) % kH; + int t_offset = (c / kW / kH) % kT; + int c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + for (h = 0; h < height_col; ++h) + { + for (w = 0; w < width_col; ++w) + { + int t_pad = t * dT - pT + t_offset; + int h_pad = h * dH - pH + h_offset; + int w_pad = w * dW - pW + w_offset; if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += data_col[((c * depth_col + t) * height_col + h) * width_col + w]; @@ -66,53 +79,58 @@ static void nn_(col2vol)(const real* data_col, const int channels, } } -static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { - // Input - THTensor *input = 
(THTensor*)luaT_checkudata(L, 2, torch_Tensor); +void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *columns = finput; + THTensor *ones = fgradInput; - // Params: - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = luaT_getfieldcheckint(L, 1, "adjH"); - int adjT = luaT_getfieldcheckint(L, 1, "adjT"); + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)" + ); - THTensor *weight = (THTensor*)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = (THTensor*)luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - THTensor *output = (THTensor*)luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + const int nInputPlane = 
(int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); int batch = 1; - if (input->nDimension == 4) { - luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + if (input->nDimension == 4) + { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); // Force batch batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - } else { - luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + else + { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); } - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; + const long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; // Batch size + input planes - long batchSize = input->size[0]; + const long batchSize = input->size[0]; // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -123,7 +141,8 @@ static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { // Define a buffer of 
ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -135,51 +154,55 @@ static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { int elt; // For each elt in batch, do: - for (elt = 0; elt < batchSize; elt ++) { + for (elt = 0; elt < batchSize; ++elt) + { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - long n = columns->size[1]; - long k = weight->size[0]; + const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const long n = columns->size[1]; + const long k = weight->size[0]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( - 'n', 't', - n, m, k, - 1, - THTensor_(data)(input_n), n, - THTensor_(data)(weight), m, - 0, - THTensor_(data)(columns), n + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n ); // Unpack columns back into input: - nn_(col2vol)( + THNN_(col2vol)( THTensor_(data)(columns), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + nOutputPlane, outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, THTensor_(data)(output_n) ); // Do Bias after: // M,N,K are dims of matrix A and B // (see 
http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputDepth * outputHeight * outputWidth; - long k_ = 1; + const long m_ = nOutputPlane; + const long n_ = outputDepth * outputHeight * outputWidth; + const long k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( - 't', 'n', - n_, m_, k_, - 1, - THTensor_(data)(ones), k_, - THTensor_(data)(bias), k_, - 1, - THTensor_(data)(output_n), n_ + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ ); } @@ -188,59 +211,60 @@ static int nn_(VolumetricFullConvolution_updateOutput)(lua_State *L) { THTensor_(free)(output_n); // Resize output - if (batch == 0) { + if (batch == 0) + { THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); } - - // return output - return 1; } -static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { - // Inputs - THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); +void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *gradColumns = finput; - // Params - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = 
luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = luaT_getfieldcheckint(L, 1, "adjH"); - int adjT = luaT_getfieldcheckint(L, 1, "adjT"); + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)" + ); - THTensor *weight = (THTensor *)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradColumns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *gradInput = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); int batch = 1; - if (input->nDimension == 4) { + if (input->nDimension == 4) + { // Force batch batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); } - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; 
+ const long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; // Batch size + input planes - long batchSize = input->size[0]; + const long batchSize = input->size[0]; // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -254,103 +278,106 @@ static int nn_(VolumetricFullConvolution_updateGradInput)(lua_State *L) { int elt; // For each elt in batch, do: - for (elt = 0; elt < batchSize; elt ++) { + for (elt = 0; elt < batchSize; ++elt) + { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: - nn_(vol2col)( + THNN_(vol2col)( THTensor_(data)(gradOutput_n), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + nOutputPlane, outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, THTensor_(data)(gradColumns) ); - // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[0]; - long n = gradColumns->size[1]; - long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const long m = weight->size[0]; + const long n = gradColumns->size[1]; + const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( - 'n', 'n', - n, m, k, - 1, - THTensor_(data)(gradColumns), n, - THTensor_(data)(weight), k, - 0, - THTensor_(data)(gradInput_n), n + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n ); } - // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); // Resize output - 
if (batch == 0) { + if (batch == 0) + { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); } - - // Return gradInput - return 1; } +void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + real scale) +{ + // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor + THArgCheck(gradWeight->nDimension == 5, 4, + "5D gradWeight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)" + ); -static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { - // Inputs - THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); + int nInputPlane = (int)gradWeight->size[0]; + int nOutputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kH = (int)gradWeight->size[3]; + int kW = (int)gradWeight->size[4]; - // Params - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int dT = luaT_getfieldcheckint(L, 1, "dT"); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int kT = luaT_getfieldcheckint(L, 1, "kT"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int padW = luaT_getfieldcheckint(L, 1, "padW"); - int padH = luaT_getfieldcheckint(L, 1, "padH"); - int padT = luaT_getfieldcheckint(L, 1, "padT"); - int adjW = luaT_getfieldcheckint(L, 1, "adjW"); - int adjH = 
luaT_getfieldcheckint(L, 1, "adjH"); - int adjT = luaT_getfieldcheckint(L, 1, "adjT"); - float scale = luaL_optnumber(L, 4, 1); + THTensor *columns = finput; + THTensor *ones = fgradInput; - THTensor *gradWeight = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); - THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); - - luaL_argcheck(L, input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); int batch = 1; - if (input->nDimension == 4) { + if (input->nDimension == 4) + { // Force batch batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); } - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; + const long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; // Batch size + input planes - long batchSize = input->size[0]; + const long batchSize = input->size[0]; // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < 
outputDepth*outputHeight*outputWidth) { + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -365,51 +392,54 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { int elt; // For each elt in batch, do: - for (elt = 0; elt < batchSize; elt ++) { + for (elt = 0; elt < batchSize; ++elt) + { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: - nn_(vol2col)( - THTensor_(data)(gradOutput_n), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), nOutputPlane, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long n = columns->size[0]; // nOutputPlane * kt * kh * kw - long m = input_n->size[0]; // nInputPlane - long k = columns->size[1]; // inputHeight * inputWidth + const long n = columns->size[0]; // nOutputPlane * kt * kh * kw + const long m = input_n->size[0]; // nInputPlane + const long k = columns->size[1]; // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( - 't', 'n', - n, m, k, - scale, - THTensor_(data)(columns), k, - THTensor_(data)(input_n), k, - 1, - THTensor_(data)(gradWeight), n + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n ); - // Do Bias: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputDepth * outputHeight * outputWidth; + const long 
m_ = nOutputPlane; + const long k_ = outputDepth * outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) THBlas_(gemv)( - 't', - k_, m_, - scale, - THTensor_(data)(gradOutput_n), k_, - THTensor_(data)(ones), 1, - 1, - THTensor_(data)(gradBias), 1 + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 ); } @@ -418,27 +448,11 @@ static int nn_(VolumetricFullConvolution_accGradParameters)(lua_State *L) { THTensor_(free)(gradOutput_n); // Resize - if (batch == 0) { + if (batch == 0) + { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); } - - // Return nothing - return 0; -} - -static const struct luaL_Reg nn_(VolumetricFullConvolution__) [] = { - {"VolumetricFullConvolution_updateOutput", nn_(VolumetricFullConvolution_updateOutput)}, - {"VolumetricFullConvolution_updateGradInput", nn_(VolumetricFullConvolution_updateGradInput)}, - {"VolumetricFullConvolution_accGradParameters", nn_(VolumetricFullConvolution_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(VolumetricFullConvolution_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(VolumetricFullConvolution__), "nn"); - lua_pop(L,1); } #endif diff --git a/init.c b/init.c index c14f913f50a..e49ab8d2b13 100644 --- a/init.c +++ b/init.c @@ -121,6 +121,9 @@ #include "generic/VolumetricConvolutionMM.c" #include "THGenerateFloatTypes.h" +#include "generic/VolumetricFullConvolution.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricMaxPooling.c" #include "THGenerateFloatTypes.h" From 2fb50f928210c773f136e02f9e51965b7cd2550f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 18 Feb 2016 10:13:59 +0100 Subject: [PATCH 048/101] Move generic/Spatial*Map.c -> lib/THNN --- 
generic/SpatialConvolutionMap.c | 279 ++++++++++++++++++++++++++++ generic/SpatialFullConvolutionMap.c | 268 ++++++++++++++++++++++++++ 2 files changed, 547 insertions(+) create mode 100644 generic/SpatialConvolutionMap.c create mode 100644 generic/SpatialFullConvolutionMap.c diff --git a/generic/SpatialConvolutionMap.c b/generic/SpatialConvolutionMap.c new file mode 100644 index 00000000000..004913021e3 --- /dev/null +++ b/generic/SpatialConvolutionMap.c @@ -0,0 +1,279 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c" +#else + +static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + real *input_data; + real *output_data; + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + + long p; + + int dimw = 2; + int dimh = 1; + int dimc = 0; + long nbatch = 1; + luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimc++; + dimw++; + dimh++; + } + luaL_argcheck(L, input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); + luaL_argcheck(L, input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel 
size"); + + long input_w = input->size[dimw]; + long input_h = input->size[dimh]; + long output_w = (input_w - kW) / dW + 1; + long output_h = (input_h - kH) / dH + 1; + + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); + else + THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) { + + long m; + for(m = 0; m < nbatch; m++){ + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h; + long j,k; + real z= bias_data[p]; + for(j = 0; j < output_h*output_w; j++) + ptr_output[j] = z; + + /* convolve all maps */ + int nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + + if (o == p){ + THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, + 1.0, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + weight_data + k*kW*kH, kH, kW, + dH, dW); + + } + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(output); + + return 1; +} + +static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = 
luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + real *gradInput_data; + real *gradOutput_data; + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + long input_w = input->size[dimw]; + long input_h = input->size[dimh]; + long weight_h = weight->size[1]; + long weight_w = weight->size[2]; + long output_h = gradOutput->size[dimh]; + long output_w = gradOutput->size[dimw]; + + long p; + + /* contiguous */ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + +#pragma omp parallel for private(p) + for(p = 0; p < nInputPlane; p++){ + long m; + for(m = 0; m < nbatch; m++){ + long k; + /* backward all */ + int nkernel = connTable->size[0]; + for(k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + if (i == p){ + /* gradient to input */ + THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, dH, dW); + } + } + } + } + + /* clean up */ + THTensor_(free)(gradInput); + THTensor_(free)(gradOutput); + + return 1; +} + +static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = 
luaT_getfieldcheckint(L, 1, "dH"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + real scale = luaL_optnumber(L, 4, 1); + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + real *input_data; + real *gradOutput_data; + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + /* and dims */ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + long input_w = input->size[dimw]; + long input_h = input->size[dimh]; + long output_h = gradOutput->size[dimh]; + long output_w = gradOutput->size[dimw]; + long weight_h = weight->size[1]; + long weight_w = weight->size[2]; + + int nkernel; + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + gradOutput_data = THTensor_(data)(gradOutput); + + long k; + /* gradients wrt bias */ +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) { + long m; + for(m = 0; m < nbatch; m++){ + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h; + long l; + for(l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + } + + /* gradients wrt weight */ + nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for(k = 0; k < nkernel; k++){ + long m; + for(m = 0; m < nbatch; m++){ + int o = (int)THTensor_(get2d)(connTable,k,1)-1; + int i = (int)THTensor_(get2d)(connTable,k,0)-1; + + /* gradient to kernel */ + 
THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, + scale, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w, + dH, dW); + } + } + + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); + return 0; +} + +static const struct luaL_Reg nn_(SpatialConvolutionMap__) [] = { + {"SpatialConvolutionMap_updateOutput", nn_(SpatialConvolutionMap_updateOutput)}, + {"SpatialConvolutionMap_updateGradInput", nn_(SpatialConvolutionMap_updateGradInput)}, + {"SpatialConvolutionMap_accGradParameters", nn_(SpatialConvolutionMap_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialConvolutionMap_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialConvolutionMap__), "nn"); + lua_pop(L,1); +} + +#endif diff --git a/generic/SpatialFullConvolutionMap.c b/generic/SpatialFullConvolutionMap.c new file mode 100644 index 00000000000..9d5cff2b385 --- /dev/null +++ b/generic/SpatialFullConvolutionMap.c @@ -0,0 +1,268 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c" +#else + +static int nn_(SpatialFullConvolutionMap_updateOutput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + int kW = luaT_getfieldcheckint(L, 1, "kW"); + int kH = luaT_getfieldcheckint(L, 1, "kH"); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); + THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + + real 
*input_data; + real *output_data; + real *weight_data; + real *bias_data; + real *connTable_data; + + long input_h; + long input_w; + long output_h; + long output_w; + long weight_h; + long weight_w; + + long p; + + luaL_argcheck(L, input->nDimension == 3, 2, "3D tensor expected"); + luaL_argcheck(L, input->size[0] >= nInputPlane, 2, "invalid number of input planes"); + + + THTensor_(resize3d)(output, nOutputPlane, + (input->size[1] - 1) * dH + kH, + (input->size[2] - 1) * dW + kW); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + weight_data = THTensor_(data)(weight); + bias_data = THTensor_(data)(bias); + connTable_data = THTensor_(data)(connTable); + + /* and dims */ + input_h = input->size[1]; + input_w = input->size[2]; + output_h = output->size[1]; + output_w = output->size[2]; + weight_h = weight->size[1]; + weight_w = weight->size[2]; + +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h; + long j; + int nweight; + long k; + + for(j = 0; j < output_h*output_w; j++) + ptr_output[j] = bias_data[p]; + + /* convolve all maps */ + nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + + if (o == p) + { + THTensor_(fullConv2Dptr)(output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW); + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(output); + + return 1; +} + +static int nn_(SpatialFullConvolutionMap_updateGradInput)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, 
torch_Tensor); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); + + real *gradInput_data; + real *gradOutput_data; + real *weight_data; + real *connTable_data; + + long input_h; + long input_w; + long output_h; + long output_w; + long weight_h; + long weight_w; + + long p; + + /* contiguous */ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + weight_data = THTensor_(data)(weight); + connTable_data = THTensor_(data)(connTable); + + /* and dims */ + input_h = input->size[1]; + input_w = input->size[2]; + output_h = gradOutput->size[1]; + output_w = gradOutput->size[2]; + weight_h = weight->size[1]; + weight_w = weight->size[2]; + +#pragma omp parallel for private(p) + for(p = 0; p < nInputPlane; p++) + { + long k; + /* backward all */ + int nkernel = connTable->size[0]; + for(k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + if (i == p) + { + /* gradient to input */ + THTensor_(validXCorr2Dptr)(gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW); + } + } + } + + /* clean up */ + THTensor_(free)(gradInput); + THTensor_(free)(gradOutput); + + return 1; +} + +static int nn_(SpatialFullConvolutionMap_accGradParameters)(lua_State *L) +{ + THTensor *input = luaT_checkudata(L, 2, 
torch_Tensor); + THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); + int dW = luaT_getfieldcheckint(L, 1, "dW"); + int dH = luaT_getfieldcheckint(L, 1, "dH"); + int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + real scale = luaL_optnumber(L, 4, 1); + + THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); + THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); + THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); + THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); + + real *input_data; + real *gradOutput_data; + real *gradWeight_data; + real *gradBias_data; + + long input_h; + long input_w; + long output_h; + long output_w; + long weight_h; + long weight_w; + + long k; + int nkernel; + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* get raw pointers */ + input_data = THTensor_(data)(input); + gradOutput_data = THTensor_(data)(gradOutput); + gradWeight_data = THTensor_(data)(gradWeight); + gradBias_data = THTensor_(data)(gradBias); + + /* and dims */ + input_h = input->size[1]; + input_w = input->size[2]; + output_h = gradOutput->size[1]; + output_w = gradOutput->size[2]; + weight_h = weight->size[1]; + weight_w = weight->size[2]; + + /* gradients wrt bias */ +#pragma omp parallel for private(k) + for(k = 0; k < nOutputPlane; k++) { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; + long l; + for(l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + + /* gradients wrt weight */ + nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for(k = 0; k < nkernel; k++) + { + int o = (int)THTensor_(get2d)(connTable,k,1)-1; + int i = (int)THTensor_(get2d)(connTable,k,0)-1; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, + scale, + gradOutput_data + 
o*output_w*output_h, output_h, output_w, + input_data + i*input_w*input_h, input_h, input_w, + dH, dW); + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); + return 0; +} + +static const struct luaL_Reg nn_(SpatialFullConvolutionMapStuff__) [] = { + {"SpatialFullConvolutionMap_updateOutput", nn_(SpatialFullConvolutionMap_updateOutput)}, + {"SpatialFullConvolutionMap_updateGradInput", nn_(SpatialFullConvolutionMap_updateGradInput)}, + {"SpatialFullConvolutionMap_accGradParameters", nn_(SpatialFullConvolutionMap_accGradParameters)}, + {NULL, NULL} +}; + +static void nn_(SpatialFullConvolutionMap_init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_registeratname(L, nn_(SpatialFullConvolutionMapStuff__), "nn"); + lua_pop(L,1); +} + +#endif From a5dd23e000a1e8c6622946d27b739b3c23047d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=B6pf?= Date: Thu, 18 Feb 2016 11:44:42 +0100 Subject: [PATCH 049/101] Add THNN conversion of Spatial*ConvolutionMap --- generic/SpatialConvolutionMap.c | 248 +++++++++++------------ generic/SpatialFullConvolutionMap.c | 298 +++++++++++----------------- generic/THNN.h | 66 ++++++ init.c | 6 + 4 files changed, 307 insertions(+), 311 deletions(-) diff --git a/generic/SpatialConvolutionMap.c b/generic/SpatialConvolutionMap.c index 004913021e3..aef0b1e2ee7 100644 --- a/generic/SpatialConvolutionMap.c +++ b/generic/SpatialConvolutionMap.c @@ -2,50 +2,46 @@ #define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c" #else -static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) +void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH 
= luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "3D weight tensor expected (connTable:size(1) x kH x kW)" + ); - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); - - real *input_data; - real *output_data; real *weight_data = THTensor_(data)(weight); real *bias_data = THTensor_(data)(bias); real *connTable_data = THTensor_(data)(connTable); - - long p; - int dimw = 2; int dimh = 1; int dimc = 0; long nbatch = 1; - luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); - if (input->nDimension == 4) { + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + + if (input->nDimension == 4) + { nbatch = input->size[0]; dimc++; dimw++; dimh++; } - luaL_argcheck(L, input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); - luaL_argcheck(L, input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size"); - long input_w = input->size[dimw]; - long input_h = input->size[dimh]; - long output_w = (input_w - kW) / dW + 1; - long output_h = (input_h - kH) / dH + 1; + const long kH = weight->size[1]; + const long kW = weight->size[2]; + THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + + const long input_w = input->size[dimw]; + const long input_h = input->size[dimh]; + const long output_w = 
(input_w - kW) / dW + 1; + const long output_h = (input_h - kH) / dH + 1; if (input->nDimension == 3) THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); @@ -57,36 +53,41 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) output = THTensor_(newContiguous)(output); /* get raw pointers */ - input_data = THTensor_(data)(input); - output_data = THTensor_(data)(output); - + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + long p; #pragma omp parallel for private(p) - for (p = 0; p < nOutputPlane; p++) { - + for (p = 0; p < nOutputPlane; p++) + { long m; - for(m = 0; m < nbatch; m++){ + for (m = 0; m < nbatch; m++) + { /* add bias */ real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h; - long j,k; + long j, k; real z= bias_data[p]; - for(j = 0; j < output_h*output_w; j++) + for (j = 0; j < output_h*output_w; j++) ptr_output[j] = z; /* convolve all maps */ int nweight = connTable->size[0]; - for (k = 0; k < nweight; k++) { + for (k = 0; k < nweight; k++) + { /* get offsets for input/output */ int o = (int)connTable_data[k*2+1]-1; int i = (int)connTable_data[k*2+0]-1; - if (o == p){ - THTensor_(validXCorr2Dptr)(output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, - 1.0, - input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, - weight_data + k*kW*kH, kH, kW, - dH, dW); - + if (o == p) + { + THTensor_(validXCorr2Dptr)( + output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, + 1.0, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + weight_data + k*kW*kH, + kH, kW, + dH, dW + ); } } } @@ -95,46 +96,39 @@ static int nn_(SpatialConvolutionMap_updateOutput)(lua_State *L) /* clean up */ THTensor_(free)(input); THTensor_(free)(output); - - return 1; } -static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) +void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState 
*state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "3D weight tensor expected (connTable:size(1) x kH x kW)" + ); - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - - real *gradInput_data; - real *gradOutput_data; real *weight_data = THTensor_(data)(weight); real *connTable_data = THTensor_(data)(connTable); - /* and dims */ + /* and dims */ int dimw = 2; int dimh = 1; long nbatch = 1; - if (input->nDimension == 4) { + if (input->nDimension == 4) + { nbatch = input->size[0]; dimw++; dimh++; } - long input_w = input->size[dimw]; - long input_h = input->size[dimh]; - long weight_h = weight->size[1]; - long weight_w = weight->size[2]; - long output_h = gradOutput->size[dimh]; - long output_w = gradOutput->size[dimw]; - - long p; + const long input_h = input->size[dimh]; + const long input_w = input->size[dimw]; + const long output_h = gradOutput->size[dimh]; + const long output_w = gradOutput->size[dimw]; + const long kH = weight->size[1]; + const long kW = weight->size[2]; /* contiguous */ gradInput = THTensor_(newContiguous)(gradInput); @@ -145,26 +139,31 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) THTensor_(zero)(gradInput); /* get raw pointers */ - gradInput_data 
= THTensor_(data)(gradInput); - gradOutput_data = THTensor_(data)(gradOutput); - + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + long p; #pragma omp parallel for private(p) - for(p = 0; p < nInputPlane; p++){ + for (p = 0; p < nInputPlane; p++) + { long m; - for(m = 0; m < nbatch; m++){ + for (m = 0; m < nbatch; m++) + { long k; /* backward all */ int nkernel = connTable->size[0]; - for(k = 0; k < nkernel; k++) + for (k = 0; k < nkernel; k++) { int o = (int)connTable_data[k*2+1]-1; int i = (int)connTable_data[k*2+0]-1; - if (i == p){ + if (i == p) + { /* gradient to input */ - THTensor_(fullConv2Dptr)(gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, - gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, - weight_data + k*weight_w*weight_h, weight_h, weight_w, dH, dW); + THTensor_(fullConv2Dptr)( + gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, dH, dW + ); } } } @@ -173,107 +172,88 @@ static int nn_(SpatialConvolutionMap_updateGradInput)(lua_State *L) /* clean up */ THTensor_(free)(gradInput); THTensor_(free)(gradOutput); - - return 1; } -static int nn_(SpatialConvolutionMap_accGradParameters)(lua_State *L) +void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH, real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - real 
scale = luaL_optnumber(L, 4, 1); + THArgCheck( + gradWeight != NULL && gradWeight->nDimension == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "3D gradWeight tensor expected (connTable:size(1) x kH x kW)" + ); - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - - real *input_data; - real *gradOutput_data; real *gradWeight_data = THTensor_(data)(gradWeight); real *gradBias_data = THTensor_(data)(gradBias); - /* and dims */ + /* and dims */ int dimw = 2; int dimh = 1; long nbatch = 1; - if (input->nDimension == 4) { + if (input->nDimension == 4) + { nbatch = input->size[0]; dimw++; dimh++; } - long input_w = input->size[dimw]; - long input_h = input->size[dimh]; - long output_h = gradOutput->size[dimh]; - long output_w = gradOutput->size[dimw]; - long weight_h = weight->size[1]; - long weight_w = weight->size[2]; - - int nkernel; + const long input_h = input->size[dimh]; + const long input_w = input->size[dimw]; + const long output_h = gradOutput->size[dimh]; + const long output_w = gradOutput->size[dimw]; + const long kH = gradWeight->size[1]; + const long kW = gradWeight->size[2]; /* contiguous */ input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); /* get raw pointers */ - input_data = THTensor_(data)(input); - gradOutput_data = THTensor_(data)(gradOutput); + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); long k; - /* gradients wrt bias */ + /* gradients wrt bias */ #pragma omp parallel for private(k) - for(k = 0; k < nOutputPlane; k++) { + for (k = 0; k < nOutputPlane; k++) + { long m; - for(m = 0; m < nbatch; m++){ + for (m = 0; m < nbatch; m++) + { real *ptr_gradOutput = 
gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h; long l; - for(l = 0; l < output_h*output_w; l++) + for (l = 0; l < output_h*output_w; l++) gradBias_data[k] += scale*ptr_gradOutput[l]; } } /* gradients wrt weight */ - nkernel = connTable->size[0]; + const int nkernel = connTable->size[0]; #pragma omp parallel for private(k) - for(k = 0; k < nkernel; k++){ + for (k = 0; k < nkernel; k++) + { long m; - for(m = 0; m < nbatch; m++){ + for (m = 0; m < nbatch; m++) + { int o = (int)THTensor_(get2d)(connTable,k,1)-1; int i = (int)THTensor_(get2d)(connTable,k,0)-1; /* gradient to kernel */ - THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, - scale, - input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, - gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w, - dH, dW); + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*kW*kH, + scale, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w, + dH, dW + ); } } - /* clean up */ THTensor_(free)(input); THTensor_(free)(gradOutput); - return 0; -} - -static const struct luaL_Reg nn_(SpatialConvolutionMap__) [] = { - {"SpatialConvolutionMap_updateOutput", nn_(SpatialConvolutionMap_updateOutput)}, - {"SpatialConvolutionMap_updateGradInput", nn_(SpatialConvolutionMap_updateGradInput)}, - {"SpatialConvolutionMap_accGradParameters", nn_(SpatialConvolutionMap_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialConvolutionMap_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialConvolutionMap__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/SpatialFullConvolutionMap.c b/generic/SpatialFullConvolutionMap.c index 9d5cff2b385..b1ebcb89a89 100644 --- a/generic/SpatialFullConvolutionMap.c +++ b/generic/SpatialFullConvolutionMap.c @@ 
-2,125 +2,97 @@ #define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c" #else -static int nn_(SpatialFullConvolutionMap_updateOutput)(lua_State *L) +void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - int kW = luaT_getfieldcheckint(L, 1, "kW"); - int kH = luaT_getfieldcheckint(L, 1, "kH"); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "3D weight tensor expected (connTable:size(1) x kH x kW)" + ); + const int kH = (int)weight->size[1]; + const int kW = (int)weight->size[2]; - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); - THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); + THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected"); + THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); - real *input_data; - real *output_data; - real *weight_data; - real *bias_data; - real *connTable_data; - - long input_h; - long input_w; - long output_h; - long output_w; - long weight_h; - long weight_w; - - long p; - - luaL_argcheck(L, input->nDimension == 3, 2, "3D tensor expected"); - luaL_argcheck(L, input->size[0] >= nInputPlane, 2, "invalid number of input planes"); - - - THTensor_(resize3d)(output, nOutputPlane, - (input->size[1] - 1) * dH + kH, - (input->size[2] - 1) * dW + kW); + THTensor_(resize3d)( 
+ output, nOutputPlane, + (input->size[1] - 1) * dH + kH, + (input->size[2] - 1) * dW + kW + ); /* contiguous */ input = THTensor_(newContiguous)(input); output = THTensor_(newContiguous)(output); /* get raw pointers */ - input_data = THTensor_(data)(input); - output_data = THTensor_(data)(output); - weight_data = THTensor_(data)(weight); - bias_data = THTensor_(data)(bias); - connTable_data = THTensor_(data)(connTable); + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); /* and dims */ - input_h = input->size[1]; - input_w = input->size[2]; - output_h = output->size[1]; - output_w = output->size[2]; - weight_h = weight->size[1]; - weight_w = weight->size[2]; + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = output->size[1]; + const long output_w = output->size[2]; + const long weight_h = weight->size[1]; + const long weight_w = weight->size[2]; + long p; #pragma omp parallel for private(p) - for (p = 0; p < nOutputPlane; p++) { + for (p = 0; p < nOutputPlane; p++) + { /* add bias */ real *ptr_output = output_data + p*output_w*output_h; long j; int nweight; long k; - for(j = 0; j < output_h*output_w; j++) + for (j = 0; j < output_h*output_w; j++) ptr_output[j] = bias_data[p]; /* convolve all maps */ nweight = connTable->size[0]; - for (k = 0; k < nweight; k++) { + for (k = 0; k < nweight; k++) + { /* get offsets for input/output */ int o = (int)connTable_data[k*2+1]-1; int i = (int)connTable_data[k*2+0]-1; if (o == p) - { - THTensor_(fullConv2Dptr)(output_data + o*output_w*output_h, - 1.0, - input_data + i*input_w*input_h, input_h, input_w, - weight_data + k*weight_w*weight_h, weight_h, weight_w, - dH, dW); - } + { + THTensor_(fullConv2Dptr)( + output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, 
input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW + ); + } } } /* clean up */ THTensor_(free)(input); THTensor_(free)(output); - - return 1; } -static int nn_(SpatialFullConvolutionMap_updateGradInput)(lua_State *L) +void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dW = luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); - - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); - - real *gradInput_data; - real *gradOutput_data; - real *weight_data; - real *connTable_data; - - long input_h; - long input_w; - long output_h; - long output_w; - long weight_h; - long weight_w; - - long p; + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "3D weight tensor expected (connTable:size(1) x kH x kW)" + ); /* contiguous */ gradInput = THTensor_(newContiguous)(gradInput); @@ -131,138 +103,110 @@ static int nn_(SpatialFullConvolutionMap_updateGradInput)(lua_State *L) THTensor_(zero)(gradInput); /* get raw pointers */ - gradInput_data = THTensor_(data)(gradInput); - gradOutput_data = THTensor_(data)(gradOutput); - weight_data = THTensor_(data)(weight); - connTable_data = THTensor_(data)(connTable); + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = 
THTensor_(data)(connTable); /* and dims */ - input_h = input->size[1]; - input_w = input->size[2]; - output_h = gradOutput->size[1]; - output_w = gradOutput->size[2]; - weight_h = weight->size[1]; - weight_w = weight->size[2]; + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = gradOutput->size[1]; + const long output_w = gradOutput->size[2]; + const long kH = weight->size[1]; + const long kW = weight->size[2]; + long p; #pragma omp parallel for private(p) - for(p = 0; p < nInputPlane; p++) + for (p = 0; p < nInputPlane; p++) + { + long k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) { - long k; - /* backward all */ - int nkernel = connTable->size[0]; - for(k = 0; k < nkernel; k++) - { - int o = (int)connTable_data[k*2+1]-1; - int i = (int)connTable_data[k*2+0]-1; - if (i == p) - { - /* gradient to input */ - THTensor_(validXCorr2Dptr)(gradInput_data + i*input_w*input_h, - 1.0, - gradOutput_data + o*output_w*output_h, output_h, output_w, - weight_data + k*weight_w*weight_h, weight_h, weight_w, - dH, dW); - } - } + int o = (int)connTable_data[k*2+1]-1; + int i = (int)connTable_data[k*2+0]-1; + if (i == p) + { + /* gradient to input */ + THTensor_(validXCorr2Dptr)( + gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, + dH, dW + ); + } } + } /* clean up */ THTensor_(free)(gradInput); THTensor_(free)(gradOutput); - - return 1; } -static int nn_(SpatialFullConvolutionMap_accGradParameters)(lua_State *L) +void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH, real scale) { - THTensor *input = luaT_checkudata(L, 2, torch_Tensor); - THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); - int dW = 
luaT_getfieldcheckint(L, 1, "dW"); - int dH = luaT_getfieldcheckint(L, 1, "dH"); - int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); - real scale = luaL_optnumber(L, 4, 1); - - THTensor *connTable = luaT_getfieldcheckudata(L, 1, "connTable", torch_Tensor); - THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); - THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); - THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); - - real *input_data; - real *gradOutput_data; - real *gradWeight_data; - real *gradBias_data; - - long input_h; - long input_w; - long output_h; - long output_w; - long weight_h; - long weight_w; - - long k; - int nkernel; + THArgCheck( + gradWeight != NULL && gradWeight->nDimension == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "3D gradWeight tensor expected (connTable:size(1) x kH x kW)" + ); /* contiguous */ input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); /* get raw pointers */ - input_data = THTensor_(data)(input); - gradOutput_data = THTensor_(data)(gradOutput); - gradWeight_data = THTensor_(data)(gradWeight); - gradBias_data = THTensor_(data)(gradBias); + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); /* and dims */ - input_h = input->size[1]; - input_w = input->size[2]; - output_h = gradOutput->size[1]; - output_w = gradOutput->size[2]; - weight_h = weight->size[1]; - weight_w = weight->size[2]; + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = gradOutput->size[1]; + const long output_w = gradOutput->size[2]; + const long weight_h = gradWeight->size[1]; + const long weight_w = gradWeight->size[2]; /* gradients wrt bias */ + long k; #pragma omp parallel for private(k) - for(k = 
0; k < nOutputPlane; k++) { + for (k = 0; k < nOutputPlane; k++) + { real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; long l; - for(l = 0; l < output_h*output_w; l++) + for (l = 0; l < output_h*output_w; l++) gradBias_data[k] += scale*ptr_gradOutput[l]; } /* gradients wrt weight */ - nkernel = connTable->size[0]; + int nkernel = connTable->size[0]; #pragma omp parallel for private(k) - for(k = 0; k < nkernel; k++) - { - int o = (int)THTensor_(get2d)(connTable,k,1)-1; - int i = (int)THTensor_(get2d)(connTable,k,0)-1; + for (k = 0; k < nkernel; k++) + { + int o = (int)THTensor_(get2d)(connTable,k,1)-1; + int i = (int)THTensor_(get2d)(connTable,k,0)-1; - /* gradient to kernel */ - THTensor_(validXCorr2DRevptr)(gradWeight_data + k*weight_w*weight_h, - scale, - gradOutput_data + o*output_w*output_h, output_h, output_w, - input_data + i*input_w*input_h, input_h, input_w, - dH, dW); - } + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*weight_w*weight_h, + scale, + gradOutput_data + o*output_w*output_h, output_h, output_w, + input_data + i*input_w*input_h, input_h, input_w, + dH, dW + ); + } /* clean up */ THTensor_(free)(input); THTensor_(free)(gradOutput); - return 0; -} - -static const struct luaL_Reg nn_(SpatialFullConvolutionMapStuff__) [] = { - {"SpatialFullConvolutionMap_updateOutput", nn_(SpatialFullConvolutionMap_updateOutput)}, - {"SpatialFullConvolutionMap_updateGradInput", nn_(SpatialFullConvolutionMap_updateGradInput)}, - {"SpatialFullConvolutionMap_accGradParameters", nn_(SpatialFullConvolutionMap_accGradParameters)}, - {NULL, NULL} -}; - -static void nn_(SpatialFullConvolutionMap_init)(lua_State *L) -{ - luaT_pushmetatable(L, torch_Tensor); - luaT_registeratname(L, nn_(SpatialFullConvolutionMapStuff__), "nn"); - lua_pop(L,1); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 0c0f801a5a6..811b39c32f4 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -426,6 +426,39 @@ TH_API void 
THNN_(SpatialBatchNormalization_backward)( THTensor *save_std, double scale); +TH_API void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + real scale); // scaling factor + TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, @@ -590,6 +623,39 @@ TH_API void THNN_(SpatialFullConvolution_accGradParameters)( int adjW, int adjH, real scale); +TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + real scale); // scaling factor + TH_API void THNN_(SpatialMaxPooling_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index 614ef263b01..ea3191ad45c 100644 --- a/init.c +++ b/init.c @@ -103,6 +103,9 @@ #include "generic/unfold.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialConvolutionMap.c" +#include "THGenerateFloatTypes.h" + #include "generic/SpatialConvolutionMM.c" #include "THGenerateFloatTypes.h" @@ -112,6 +115,9 @@ #include "generic/SpatialFullConvolution.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialFullConvolutionMap.c" +#include "THGenerateFloatTypes.h" + #include "generic/SpatialAdaptiveMaxPooling.c" #include "THGenerateFloatTypes.h" From 8215d476d939b6d8619fc2a3904b855777c644de Mon Sep 17 00:00:00 2001 From: soumith Date: Fri, 19 Feb 2016 12:12:30 -0800 Subject: [PATCH 050/101] multi margin sizeAverage fix --- generic/MultiLabelMarginCriterion.c | 7 ++++--- generic/MultiMarginCriterion.c | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index a3cf96503af..3e45fe8596c 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -66,8 +66,9 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i target_data += dim; } - if (sizeAverage) - sum /= dim; + sum /= dim; + if(sizeAverage) + sum /= nframe; THTensor_(set1d)(output, 0, sum); @@ -107,7 +108,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor input_data = THTensor_(data)(input); target_data = THTensor_(data)(target); - g = (sizeAverage ? 
1./((real)dim) : 1.); + g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)nframe)); THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index 6445bb040fc..39255ab18eb 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -51,8 +51,9 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, input_data += dim; } - if (sizeAverage) - sum /= dim; + sum /= dim; + if(sizeAverage) + sum /= nframe; THTensor_(set1d)(output, 0, sum); @@ -83,7 +84,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *inp THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size"); } - g = (sizeAverage ? 1./((real)dim) : 1.); + g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim)); input = THTensor_(newContiguous)(input); target = THTensor_(newContiguous)(target); From d07fc0e8127551206f9ba054a169f06a75599f82 Mon Sep 17 00:00:00 2001 From: soumith Date: Fri, 19 Feb 2016 12:22:14 -0800 Subject: [PATCH 051/101] adding sanity checks to ClassNLLCriterion --- generic/ClassNLLCriterion.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index de8a82e8a35..6c5787aae68 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -2,7 +2,11 @@ #define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" #else -void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, THIndexTensor *target, THTensor *output, bool sizeAverage, THTensor *weights, THTensor *total_weight) +void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, + THIndexTensor *target, + THTensor *output, bool sizeAverage, + THTensor *weights, + THTensor *total_weight) { int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); @@ -33,6 
+37,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, TH output_data[0] = -input_data[cur_target] * total_weight_data[0]; } else if (THTensor_(nDimension)(input) == 2) { int batch_size = THTensor_(size)(input, 0); + THAssert(THTensor_(size)(target, 0) == batch_size); + int n_target = THTensor_(size)(input, 1); int i; @@ -57,7 +63,12 @@ void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, TH THIndexTensor_(free)(target); } -void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, THIndexTensor *target, THTensor *gradInput, bool sizeAverage, THTensor *weights, THTensor *total_weight) +void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) { int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); @@ -96,6 +107,8 @@ void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, } else if (THTensor_(nDimension)(input) == 2) { int batch_size = THTensor_(size)(input, 0); + THAssert(THTensor_(size)(target, 0) == batch_size); + int n_target = THTensor_(size)(input, 1); int i; From 5193bc3b537c5036f1b9c4d6945fa4c970698623 Mon Sep 17 00:00:00 2001 From: soumith Date: Fri, 19 Feb 2016 12:33:58 -0800 Subject: [PATCH 052/101] fixing previous sanity check for THIndexTensor change, and changing SpatialConvolution's reset to be more flexible wrt no bias --- generic/ClassNLLCriterion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index 6c5787aae68..73be314428f 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -37,7 +37,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, output_data[0] = -input_data[cur_target] * total_weight_data[0]; } else if 
(THTensor_(nDimension)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THTensor_(size)(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); @@ -107,7 +107,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, } else if (THTensor_(nDimension)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THTensor_(size)(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); From 687400c4fda8381fc97298f44ec4fc9546be1e5a Mon Sep 17 00:00:00 2001 From: soumith Date: Fri, 19 Feb 2016 10:31:42 -0800 Subject: [PATCH 053/101] adding paddingValue to LookupTable --- generic/LookupTable.c | 29 +++++++++++++++++------------ generic/THNN.h | 1 + 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/generic/LookupTable.c b/generic/LookupTable.c index ed9656e9f16..47fbe0801d7 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -29,11 +29,12 @@ void THNN_(LookupTable_accGradParameters)( THTensor *sorted, THTensor *indices, bool scaleGradByFreq, + int paddingValue, real scale) { long i; THInteger_t *count_data = NULL; - + if (scaleGradByFreq) { THIntegerTensor_(resize1d)(count, gradWeight->size[0]); @@ -81,13 +82,15 @@ void THNN_(LookupTable_accGradParameters)( long end = start + (numw/nthreads + 1); for (i=0; i= start && k < end) + if (input_data[i] != paddingValue) { - real lr = scale; - if (count_data) - lr /= count_data[k]; - THBlas_(axpy)(stride, lr, go + i*stride, 1, gw + k*stride, 1); + long k = input_data[i] - 1; + if (k >= start && k < end) + { + real scale_ = scale; + if (count_data) scale_ /= count_data[k]; + THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); + } } } } @@ -99,11 +102,13 @@ void THNN_(LookupTable_accGradParameters)( for (i=0; i Date: Fri, 19 Feb 2016 17:40:57 -0800 Subject: [PATCH 054/101] adding weights to 
MultiMarginCriterion --- generic/MultiMarginCriterion.c | 43 ++++++++++++++++++++++++---------- generic/THNN.h | 6 +++-- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index 39255ab18eb..b57d9698cf0 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -2,9 +2,11 @@ #define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" #else -void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, int p) +void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, + THTensor *target, THTensor *output, + bool sizeAverage, int p, THTensor *weights) { - real *input_data, *target_data; + real *input_data, *target_data, *weights_data; long nframe, dim; long t, d; real sum; @@ -14,7 +16,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, if (input->nDimension == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size[0]; } else { @@ -31,8 +33,10 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, input = THTensor_(newContiguous)(input); target = THTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; input_data = THTensor_(data)(input); target_data = THTensor_(data)(target); + weights_data = weights ? THTensor_(data)(weights) : NULL; sum = 0; for (t = 0; t < nframe; t++) @@ -44,9 +48,13 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, real z = 1 - input_target + input_data[d]; if (d == target_idx) continue; - - if (z > 0) - sum += (p == 1) ? z : z*z; + + if (z > 0) { + real h = (p==1) ? 
z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } } input_data += dim; } @@ -59,13 +67,18 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor_(free)(input); THTensor_(free)(target); + if(weights) + THTensor_(free)(weights); } -void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, int p) +void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, + THTensor *target, THTensor *gradInput, + bool sizeAverage, int p, THTensor *weights) { real *input_data; real *gradInput_data; real *target_data; + real *weights_data; long nframe, dim; long t, d; real g; @@ -75,7 +88,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *inp if (input->nDimension == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size[0]; } else { @@ -94,7 +107,9 @@ void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *inp gradInput_data = THTensor_(data)(gradInput); target_data = THTensor_(data)(target); - + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + weights_data = weights ? THTensor_(data)(weights) : NULL; + for (t = 0; t < nframe; t++) { long target_idx = (long)(target_data[t])-1; @@ -105,10 +120,12 @@ void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *inp real z = 1 - input_target + input_data[d]; if (d == target_idx) continue; - + if (z > 0) { real h = (p == 1) ? 
g : 2*g*z; + if(weights_data) + h *= weights_data[target_idx]; gradInput_target -= h; gradInput_data[d] = h; } @@ -116,13 +133,15 @@ void THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *inp gradInput_data[d] = 0; } gradInput_data[target_idx] = gradInput_target; - + input_data += dim; gradInput_data += dim; } - THTensor_(free)(input); + THTensor_(free)(input); THTensor_(free)(target); + if(weights) + THTensor_(free)(weights); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index e7582359f6c..c8c80c3e8d5 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -200,14 +200,16 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)( THTensor *target, THTensor *output, bool sizeAverage, - int p); + int p, + THTensor* weights); TH_API void THNN_(MultiMarginCriterion_updateGradInput)( THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, - int p); + int p, + THTensor *weights); TH_API void THNN_(PReLU_updateOutput)( THNNState *state, From ec829c89c14d8f740c2c272d1ec3b374733e54fb Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 20 Feb 2016 16:10:40 +0100 Subject: [PATCH 055/101] Fix THNN header formatting --- generic/Abs.c | 11 +- generic/AbsCriterion.c | 14 +- generic/ClassNLLCriterion.c | 27 ++-- generic/DistKLDivCriterion.c | 14 +- generic/ELU.c | 16 +- generic/HardShrink.c | 13 +- generic/HardTanh.c | 15 +- generic/L1Cost.c | 11 +- generic/LeakyReLU.c | 15 +- generic/LogSigmoid.c | 13 +- generic/LogSoftMax.c | 12 +- generic/LookupTable.c | 24 +-- generic/MSECriterion.c | 14 +- generic/MarginCriterion.c | 16 +- generic/MultiLabelMarginCriterion.c | 14 +- generic/MultiMarginCriterion.c | 22 ++- generic/PReLU.c | 35 ++-- generic/RReLU.c | 22 ++- generic/Sigmoid.c | 12 +- generic/SmoothL1Criterion.c | 14 +- generic/SoftMax.c | 12 +- generic/SoftPlus.c | 16 +- generic/SoftShrink.c | 13 +- generic/SparseLinear.c | 56 ++++--- generic/SpatialAdaptiveMaxPooling.c | 52 ++++-- generic/SpatialAveragePooling.c | 
27 +++- generic/SpatialConvolutionMM.c | 91 +++++++++-- generic/SpatialMaxPooling.c | 66 ++++++-- generic/Sqrt.c | 13 +- generic/Square.c | 11 +- generic/THNN.h | 238 ++++++++++++++-------------- generic/Tanh.c | 12 +- generic/TemporalConvolution.c | 51 +++--- generic/TemporalMaxPooling.c | 26 +-- generic/TemporalSubSampling.c | 48 +++--- generic/Threshold.c | 16 +- generic/VolumetricAveragePooling.c | 68 +++++--- generic/VolumetricConvolution.c | 66 ++++---- generic/VolumetricConvolutionMM.c | 165 +++++++++++++------ generic/VolumetricFullConvolution.c | 69 ++++---- generic/VolumetricMaxPooling.c | 80 +++++++--- generic/VolumetricMaxUnpooling.c | 89 +++++++---- generic/unfold.c | 39 +++-- 43 files changed, 1143 insertions(+), 515 deletions(-) diff --git a/generic/Abs.c b/generic/Abs.c index cc96d5d4409..c5e36ff7e58 100644 --- a/generic/Abs.c +++ b/generic/Abs.c @@ -2,13 +2,20 @@ #define TH_GENERIC_FILE "generic/Abs.c" #else -void THNN_(Abs_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(Abs_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { THTensor_(resizeAs)(output, input); THTensor_(abs)(output, input); } -void THNN_(Abs_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) +void THNN_(Abs_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) { THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, diff --git a/generic/AbsCriterion.c b/generic/AbsCriterion.c index 8469ac5cc31..e87bb5b31f4 100644 --- a/generic/AbsCriterion.c +++ b/generic/AbsCriterion.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/AbsCriterion.c" #else -void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) +void THNN_(AbsCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + 
bool sizeAverage) { real sum = 0; @@ -16,7 +21,12 @@ void THNN_(AbsCriterion_updateOutput)(THNNState *state, THTensor *input, THTenso THTensor_(set1d)(output, 0, sum); } -void THNN_(AbsCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) +void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) { real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index 73be314428f..eb02f7c6b19 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -2,11 +2,14 @@ #define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" #else -void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, - THIndexTensor *target, - THTensor *output, bool sizeAverage, - THTensor *weights, - THTensor *total_weight) +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) { int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); @@ -63,12 +66,14 @@ void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input, THIndexTensor_(free)(target); } -void THNN_(ClassNLLCriterion_updateGradInput)(THNNState *state, THTensor *input, - THIndexTensor *target, - THTensor *gradInput, - bool sizeAverage, - THTensor *weights, - THTensor *total_weight) +void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) { int n_dims = THTensor_(nDimension)(input); int n_classes = THTensor_(size)(input, n_dims - 1); diff --git a/generic/DistKLDivCriterion.c b/generic/DistKLDivCriterion.c index 62b10faaa04..507324d0c40 100644 
--- a/generic/DistKLDivCriterion.c +++ b/generic/DistKLDivCriterion.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" #else -void THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) +void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) { real sum = 0; @@ -16,7 +21,12 @@ void THNN_(DistKLDivCriterion_updateOutput)(THNNState *state, THTensor *input, T THTensor_(set1d)(output, 0, sum); } -void THNN_(DistKLDivCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) +void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) { real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); diff --git a/generic/ELU.c b/generic/ELU.c index f748ee95782..f313212cee1 100644 --- a/generic/ELU.c +++ b/generic/ELU.c @@ -2,7 +2,11 @@ #define TH_GENERIC_FILE "generic/ELU.c" #else -void THNN_(ELU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real alpha) +void THNN_(ELU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real alpha) { THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, input, real, output, @@ -10,8 +14,14 @@ void THNN_(ELU_updateOutput)(THNNState *state, THTensor *input, THTensor *output ); } -void THNN_(ELU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output, real alpha) -{ +void THNN_(ELU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real alpha) +{ THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, *gradInput_data = *output_data <= 0 ? 
*gradOutput_data * (*output_data + alpha) : *gradOutput_data; diff --git a/generic/HardShrink.c b/generic/HardShrink.c index 9abee6b1b17..689f565fb90 100644 --- a/generic/HardShrink.c +++ b/generic/HardShrink.c @@ -2,7 +2,11 @@ #define TH_GENERIC_FILE "generic/HardShrink.c" #else -void THNN_(HardShrink_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real lambda) +void THNN_(HardShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real lambda) { THTensor_(resizeAs)(output, input); @@ -16,7 +20,12 @@ void THNN_(HardShrink_updateOutput)(THNNState *state, THTensor *input, THTensor ); } -void THNN_(HardShrink_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real lambda) +void THNN_(HardShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) { THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, diff --git a/generic/HardTanh.c b/generic/HardTanh.c index 1fe54df8016..9764ec09c50 100644 --- a/generic/HardTanh.c +++ b/generic/HardTanh.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/HardTanh.c" #else -void THNN_(HardTanh_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real min_val, real max_val) +void THNN_(HardTanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real min_val, + real max_val) { THTensor_(resizeAs)(output, input); @@ -36,7 +41,13 @@ void THNN_(HardTanh_updateOutput)(THNNState *state, THTensor *input, THTensor *o } } -void THNN_(HardTanh_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real min_val, real max_val) +void THNN_(HardTanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real min_val, + real max_val) { THTensor_(resizeAs)(gradInput, input); diff --git a/generic/L1Cost.c 
b/generic/L1Cost.c index 2d8d39e71d7..86f69a677eb 100644 --- a/generic/L1Cost.c +++ b/generic/L1Cost.c @@ -2,7 +2,10 @@ #define TH_GENERIC_FILE "generic/L1Cost.c" #else -void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(L1Cost_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { accreal sum = 0; @@ -13,7 +16,11 @@ void THNN_(L1Cost_updateOutput)(THNNState *state, THTensor *input, THTensor *out THTensor_(set1d)(output, 0, sum); } -void THNN_(L1Cost_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) +void THNN_(L1Cost_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) { THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY2(real, gradInput, real, input, diff --git a/generic/LeakyReLU.c b/generic/LeakyReLU.c index 2fc533b9d28..527698912eb 100644 --- a/generic/LeakyReLU.c +++ b/generic/LeakyReLU.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/LeakyReLU.c" #else -void THNN_(LeakyReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real negval, bool inplace) +void THNN_(LeakyReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real negval, + bool inplace) { if (inplace) { @@ -21,7 +26,13 @@ void THNN_(LeakyReLU_updateOutput)(THNNState *state, THTensor *input, THTensor * } } -void THNN_(LeakyReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real negval, bool inplace) +void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real negval, + bool inplace) { if (inplace) { diff --git a/generic/LogSigmoid.c b/generic/LogSigmoid.c index c0510377e1f..20932f1ceb3 100644 --- a/generic/LogSigmoid.c +++ b/generic/LogSigmoid.c @@ -2,7 +2,11 @@ #define TH_GENERIC_FILE "generic/LogSigmoid.c" #else -void THNN_(LogSigmoid_updateOutput)(THNNState *state, 
THTensor *input, THTensor *output, THTensor *buffer) +void THNN_(LogSigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer) { THTensor_(resizeAs)(output, input); THTensor_(resizeAs)(buffer, input); @@ -14,7 +18,12 @@ void THNN_(LogSigmoid_updateOutput)(THNNState *state, THTensor *input, THTensor ); } -void THNN_(LogSigmoid_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *buffer) +void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer) { THTensor_(resizeAs)(gradInput, buffer); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer, diff --git a/generic/LogSoftMax.c b/generic/LogSoftMax.c index f23622f7e56..73d96232d96 100644 --- a/generic/LogSoftMax.c +++ b/generic/LogSoftMax.c @@ -2,7 +2,10 @@ #define TH_GENERIC_FILE "generic/LogSoftMax.c" #else -void THNN_(LogSoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(LogSoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { real *input_data, *output_data; long nframe = 0, dim = 0; @@ -53,7 +56,12 @@ void THNN_(LogSoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor THTensor_(free)(input); } -void THNN_(LogSoftMax_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) +void THNN_(LogSoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) { real *gradInput_data, *gradOutput_data, *output_data; long nframe = 0, dim = 0; diff --git a/generic/LookupTable.c b/generic/LookupTable.c index 47fbe0801d7..852598253e6 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -2,7 +2,9 @@ #define TH_GENERIC_FILE "generic/LookupTable.c" #else -static void THNN_(LookupTable_resetCount)(THInteger_t *count_data, THIndexTensor 
*input) +static void THNN_(LookupTable_resetCount)( + THInteger_t *count_data, + THIndexTensor *input) { int i; THIndex_t *input_data = THIndexTensor_(data)(input); @@ -21,16 +23,16 @@ static void THNN_(LookupTable_resetCount)(THInteger_t *count_data, THIndexTensor } void THNN_(LookupTable_accGradParameters)( - THNNState *state, - THIndexTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THIntegerTensor *count, - THTensor *sorted, - THTensor *indices, - bool scaleGradByFreq, - int paddingValue, - real scale) + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, + THTensor *indices, + bool scaleGradByFreq, + int paddingValue, + real scale) { long i; THInteger_t *count_data = NULL; diff --git a/generic/MSECriterion.c b/generic/MSECriterion.c index 048829581c4..c576e3d015c 100644 --- a/generic/MSECriterion.c +++ b/generic/MSECriterion.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/MSECriterion.c" #else -void THNN_(MSECriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) +void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) { real sum = 0; @@ -17,7 +22,12 @@ void THNN_(MSECriterion_updateOutput)(THNNState *state, THTensor *input, THTenso THTensor_(set1d)(output, 0, sum); } -void THNN_(MSECriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) +void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) { real norm = (sizeAverage ? 
2./((real)THTensor_(nElement)(input)) : 2.); diff --git a/generic/MarginCriterion.c b/generic/MarginCriterion.c index 4c88318bd46..792ce7b56c9 100644 --- a/generic/MarginCriterion.c +++ b/generic/MarginCriterion.c @@ -2,7 +2,13 @@ #define TH_GENERIC_FILE "generic/MarginCriterion.c" #else -void THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, real margin) +void THNN_(MarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + real margin) { real sum = 0; @@ -17,7 +23,13 @@ void THNN_(MarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTe THTensor_(set1d)(output, 0, sum); } -void THNN_(MarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, real margin) +void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + real margin) { real norm = (sizeAverage ? 
1./((real)THTensor_(nElement)(input)) : 1.); diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index 3e45fe8596c..633e062be67 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" #else -void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) { real *input_data, *target_data; long nframe, dim; @@ -76,7 +81,12 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(THNNState *state, THTensor *i THTensor_(free)(target); } -void THNN_(MultiLabelMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) { real *input_data; real *gradInput_data; diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index b57d9698cf0..f3309e57ea1 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -2,9 +2,14 @@ #define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" #else -void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, - THTensor *target, THTensor *output, - bool sizeAverage, int p, THTensor *weights) +void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + int p, + THTensor *weights) { real *input_data, *target_data, *weights_data; long nframe, dim; @@ -71,9 +76,14 @@ void THNN_(MultiMarginCriterion_updateOutput)(THNNState *state, THTensor *input, THTensor_(free)(weights); } -void 
THNN_(MultiMarginCriterion_updateGradInput)(THNNState *state, THTensor *input, - THTensor *target, THTensor *gradInput, - bool sizeAverage, int p, THTensor *weights) +void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + int p, + THTensor *weights) { real *input_data; real *gradInput_data; diff --git a/generic/PReLU.c b/generic/PReLU.c index 9a828dfacb3..b1b2c0f8b8a 100644 --- a/generic/PReLU.c +++ b/generic/PReLU.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/PReLU.c" #else -void THNN_(PReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THIndex_t nOutputPlane) +void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THIndex_t nOutputPlane) { THTensor_(resizeAs)(output, input); @@ -63,7 +68,13 @@ void THNN_(PReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *outp } } -void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THIndex_t nOutputPlane) +void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THIndex_t nOutputPlane) { THTensor_(resizeAs)(gradInput, input); @@ -138,16 +149,16 @@ void THNN_(PReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *g } void THNN_(PReLU_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *gradWeight, - THTensor *gradWeightBuf, - THTensor *gradWeightBuf2, - THIndex_t nOutputPlane, - real scale) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, + THIndex_t nOutputPlane, + real scale) { real *gradWeight_data = 
THTensor_(data)(gradWeight); diff --git a/generic/RReLU.c b/generic/RReLU.c index 74c5df547ad..8bf6764e5e1 100644 --- a/generic/RReLU.c +++ b/generic/RReLU.c @@ -2,7 +2,16 @@ #define TH_GENERIC_FILE "generic/RReLU.c" #else -void THNN_(RReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *noise, real lower, real upper, bool train, bool inplace, THGenerator *generator) +void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace, + THGenerator *generator) { if (train) { @@ -66,7 +75,16 @@ void THNN_(RReLU_updateOutput)(THNNState *state, THTensor *input, THTensor *outp } } -void THNN_(RReLU_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *noise, real lower, real upper, bool train, bool inplace) +void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace) { if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU { diff --git a/generic/Sigmoid.c b/generic/Sigmoid.c index f58d33bf2d8..0a1b3750d1e 100644 --- a/generic/Sigmoid.c +++ b/generic/Sigmoid.c @@ -2,7 +2,10 @@ #define TH_GENERIC_FILE "generic/Sigmoid.c" #else -void THNN_(Sigmoid_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { THTensor_(resizeAs)(output, input); @@ -11,7 +14,12 @@ void THNN_(Sigmoid_updateOutput)(THNNState *state, THTensor *input, THTensor *ou ); } -void THNN_(Sigmoid_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) +void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) { THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, diff --git a/generic/SmoothL1Criterion.c b/generic/SmoothL1Criterion.c index 3111b3dc693..8b53100a525 100644 --- a/generic/SmoothL1Criterion.c +++ b/generic/SmoothL1Criterion.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" #else -void THNN_(SmoothL1Criterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage) +void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) { real sum = 0; TH_TENSOR_APPLY2(real, input, real, target, @@ -16,7 +21,12 @@ void THNN_(SmoothL1Criterion_updateOutput)(THNNState *state, THTensor *input, TH THTensor_(set1d)(output, 0, sum); } -void THNN_(SmoothL1Criterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage) +void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) { real norm = 
(sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); diff --git a/generic/SoftMax.c b/generic/SoftMax.c index 598d35e8af8..78bec4dbba6 100644 --- a/generic/SoftMax.c +++ b/generic/SoftMax.c @@ -2,7 +2,10 @@ #define TH_GENERIC_FILE "generic/SoftMax.c" #else -void THNN_(SoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(SoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { real *input_data, *output_data; long nframe = 0, dim = 0, stride = 0; @@ -75,7 +78,12 @@ void THNN_(SoftMax_updateOutput)(THNNState *state, THTensor *input, THTensor *ou THTensor_(free)(input); } -void THNN_(SoftMax_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) +void THNN_(SoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) { real *gradInput_data, *gradOutput_data, *output_data; long nframe = 0, dim = 0, stride = 0; diff --git a/generic/SoftPlus.c b/generic/SoftPlus.c index 76c9c1c4eb7..407413f5ea2 100644 --- a/generic/SoftPlus.c +++ b/generic/SoftPlus.c @@ -2,7 +2,12 @@ #define TH_GENERIC_FILE "generic/SoftPlus.c" #else -void THNN_(SoftPlus_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real beta, real threshold) +void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real beta, + real threshold) { THTensor_(resizeAs)(output, input); @@ -12,7 +17,14 @@ void THNN_(SoftPlus_updateOutput)(THNNState *state, THTensor *input, THTensor *o ); } -void THNN_(SoftPlus_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output, real beta, real threshold) +void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real beta, + real threshold) { THTensor_(resizeAs)(gradInput, output); diff --git 
a/generic/SoftShrink.c b/generic/SoftShrink.c index b15003fd2f6..7bd1cc8a034 100644 --- a/generic/SoftShrink.c +++ b/generic/SoftShrink.c @@ -2,7 +2,11 @@ #define TH_GENERIC_FILE "generic/SoftShrink.c" #else -void THNN_(SoftShrink_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real lambda) +void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real lambda) { THTensor_(resizeAs)(output, input); @@ -16,7 +20,12 @@ void THNN_(SoftShrink_updateOutput)(THNNState *state, THTensor *input, THTensor ); } -void THNN_(SoftShrink_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real lambda) +void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) { THTensor_(resizeAs)(gradInput, input); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index e7abecf75d4..2cf2697b0c9 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -21,7 +21,13 @@ static bool THNN_(checkSize1D)(THTensor* t, long size0) return t->nDimension == 1 && t->size[0] == size0; } -void THNN_(SparseLinear_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *shardBuffer) +void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *shardBuffer) { long i; long outDim = weight->size[0]; @@ -100,15 +106,15 @@ void THNN_(SparseLinear_updateOutput)(THNNState *state, THTensor *input, THTenso } void THNN_(SparseLinear_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *weight, - THTensor *bias, - real weightDecay, - real scale) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + 
THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale) { long i; long nnz = input->size[0]; @@ -167,13 +173,13 @@ void THNN_(SparseLinear_accGradParameters)( } void THNN_(SparseLinear_updateParameters)( - THNNState *state, - THTensor *weight, - THTensor *bias, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *lastInput, - real learningRate) + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate) { long i; long nnz = lastInput->size[0]; @@ -212,7 +218,11 @@ void THNN_(SparseLinear_updateParameters)( } } -void THNN_(SparseLinear_zeroGradParameters)(THNNState *state, THTensor *gradWeight, THTensor *gradBias, THTensor *lastInput) +void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) { long i; long nnz = lastInput->size[0]; @@ -252,11 +262,11 @@ void THNN_(SparseLinear_zeroGradParameters)(THNNState *state, THTensor *gradWeig } void THNN_(SparseLinear_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) { long i; long nnz = input->size[0]; diff --git a/generic/SpatialAdaptiveMaxPooling.c b/generic/SpatialAdaptiveMaxPooling.c index c34a5e779cc..61afc40734f 100644 --- a/generic/SpatialAdaptiveMaxPooling.c +++ b/generic/SpatialAdaptiveMaxPooling.c @@ -2,13 +2,19 @@ #define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" #else -static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,real *output_p, - real *indx_p, real *indy_p, - long nslices, - long iwidth, long iheight, - long owidth, long oheight, - long stridew,long strideh, - long strided) +static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + real *indx_p, 
+ real *indy_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + long stridew, + long strideh, + long strided) { long k; #pragma omp parallel for private(k) @@ -65,7 +71,13 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(real *input_p,re } } -void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *indices, int owidth, int oheight) +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int owidth, + int oheight) { int dimw = 2; int dimh = 1; @@ -148,13 +160,16 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(THNNState *state, THTensor *i } } - - -static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, - real *indx_p, real *indy_p, - long nslices, - long iwidth, long iheight, - long owidth, long oheight) +static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + real *indx_p, + real *indy_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight) { long k; #pragma omp parallel for private(k) @@ -184,7 +199,12 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(real *gradInp } } -void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices) +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices) { int dimw = 2; int dimh = 1; diff --git a/generic/SpatialAveragePooling.c b/generic/SpatialAveragePooling.c index 1bd297a12ad..37ee274850b 100644 --- a/generic/SpatialAveragePooling.c +++ b/generic/SpatialAveragePooling.c @@ -2,7 +2,18 @@ #define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" #else -void 
THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) +void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) { real *output_data; real *input_data; @@ -119,7 +130,19 @@ void THNN_(SpatialAveragePooling_updateOutput)(THNNState *state, THTensor *input THTensor_(free)(input); } -void THNN_(SpatialAveragePooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) +void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) { int dimw = 2; int dimh = 1; diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index e13037df557..a8427c4e4be 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -2,10 +2,24 @@ #define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" #else -static void THNN_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, - int kW, int kH, int dW, int dH, int padW, int padH, - long nInputPlane, long inputWidth, long inputHeight, - long nOutputPlane, long outputWidth, long outputHeight) +static void THNN_(SpatialConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + long nInputPlane, + long inputWidth, + long inputHeight, + long nOutputPlane, + long outputWidth, + long outputHeight) { 
long i; THTensor *output2d; @@ -24,7 +38,20 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(THTensor *input, THTe THTensor_(free)(output2d); } -void THNN_(SpatialConvolutionMM_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, THTensor* fgradInput, int kW, int kH, int dW, int dH, int padW, int padH) +void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) { int dimf = 0; int dimw = 2; @@ -96,9 +123,17 @@ void THNN_(SpatialConvolutionMM_updateOutput)(THNNState *state, THTensor *input, } } - -static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, - int kW, int kH, int dW, int dH, int padW, int padH) +static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, gradOutput->size[0], -1, @@ -111,7 +146,21 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(THTensor *gradInpu THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]); } -void THNN_(SpatialConvolutionMM_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, THTensor *finput, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH) +void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor 
*weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) { long nOutputPlane = weight->size[0]; @@ -148,8 +197,12 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(THNNState *state, THTensor *inp THTensor_(transpose)(weight, weight, 0, 1); } -static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, - real scale) +static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) { long i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, @@ -173,7 +226,21 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(THTensor *gradOu THTensor_(free)(gradOutput2d); } -void THNN_(SpatialConvolutionMM_accGradParameters)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH, real scale) +void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + real scale) { long nOutputPlane = gradWeight->size[0]; THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c index 30352822dff..d28fe85f17c 100644 --- a/generic/SpatialMaxPooling.c +++ b/generic/SpatialMaxPooling.c @@ -2,13 +2,21 @@ #define TH_GENERIC_FILE "generic/SpatialMaxPooling.c" #else -static void THNN_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *output_p, - real *ind_p, - long nslices, - long iwidth, long iheight, - long owidth, long oheight, - int kW, int kH, int dW, int dH, - int padW, int padH) +static void THNN_(SpatialMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + real *ind_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) { long k; #pragma omp parallel for private(k) @@ -61,7 +69,18 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(real *input_p, real *out } } -void THNN_(SpatialMaxPooling_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) +void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) { int dimw = 2; int dimh = 1; @@ -163,12 +182,17 @@ void THNN_(SpatialMaxPooling_updateOutput)(THNNState *state, THTensor *input, TH THTensor_(free)(input); } -static void THNN_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, - real *ind_p, - long nslices, - long iwidth, long iheight, - long owidth, long oheight, - int dW, int dH) +static void THNN_(SpatialMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + real *ind_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + int dW, + int dH) { long k; #pragma omp parallel for private(k) @@ -193,7 +217,19 @@ 
static void THNN_(SpatialMaxPooling_updateGradInput_frame)(real *gradInput_p, re } } -void THNN_(SpatialMaxPooling_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) +void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) { int dimw = 2; int dimh = 1; diff --git a/generic/Sqrt.c b/generic/Sqrt.c index a1cd4a06897..826ed1daade 100644 --- a/generic/Sqrt.c +++ b/generic/Sqrt.c @@ -2,13 +2,22 @@ #define TH_GENERIC_FILE "generic/Sqrt.c" #else -void THNN_(Sqrt_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real eps) +void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real eps) { THTensor_(resizeAs)(output, input); THTensor_(sqrt)(output, input); } -void THNN_(Sqrt_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) +void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) { THTensor_(resizeAs)(gradInput, input); diff --git a/generic/Square.c b/generic/Square.c index efdb54fb9d3..a26c001261f 100644 --- a/generic/Square.c +++ b/generic/Square.c @@ -2,7 +2,10 @@ #define TH_GENERIC_FILE "generic/Square.c" #else -void THNN_(Square_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { THTensor_(resizeAs)(output, input); @@ -23,7 +26,11 @@ void THNN_(Square_updateOutput)(THNNState *state, THTensor *input, THTensor *out } } -void THNN_(Square_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput) +void 
THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) { THTensor_(resizeAs)(gradInput, input); diff --git a/generic/THNN.h b/generic/THNN.h index c8c80c3e8d5..24d4fec7049 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -3,143 +3,145 @@ #else TH_API void THNN_(Abs_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] Abs output TH_API void THNN_(Abs_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput); // [OUT] gradient w.r.t. input TH_API void THNN_(AbsCriterion_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *output, - bool sizeAverage); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage); // if true, the loss will be divided by batch size TH_API void THNN_(AbsCriterion_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *gradInput, - bool sizeAverage); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + bool sizeAverage); // if true, the gradient will be normalized by batch size TH_API void THNN_(ClassNLLCriterion_updateOutput)( - THNNState *state, - THTensor *input, - THIndexTensor *target, - THTensor *output, - bool sizeAverage, - THTensor *weights, - THTensor *total_weight); + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] TH_API void THNN_(ClassNLLCriterion_updateGradInput)( - THNNState *state, - THTensor *input, - THIndexTensor *target, - THTensor *gradInput, - bool sizeAverage, - THTensor *weights, - THTensor *total_weight); + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *gradInput, // [OUT] gradient w.r.t. input + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] TH_API void THNN_(ELU_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - real alpha); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] ELU output + real alpha); // an ELU parameter (as in paper) TH_API void THNN_(ELU_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *output, - real alpha); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + THTensor *output, // output from a forward pass + real alpha); // an ELU parameter (as in paper) TH_API void THNN_(DistKLDivCriterion_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *output, - bool sizeAverage); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage); // if true, the loss will be normalized **by total number of elements** TH_API void THNN_(DistKLDivCriterion_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *gradInput, - bool sizeAverage); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *gradInput, // [OUT] gradient w.r.t. input + bool sizeAverage); // if true, the loss will be normalized **by total number of elements** +// HardShink outputs 0 on interval of (-lambda; lambda) or original value otherwise. TH_API void THNN_(HardShrink_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - real lambda); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + real lambda); // HardShrink parameter TH_API void THNN_(HardShrink_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - real lambda); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + real lambda); // HardShrink parameter +// HardTanh clamps the values to the interval [min_val; max_val]. 
TH_API void THNN_(HardTanh_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - real min_val, - real max_val); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + real min_val, // lower threshold + real max_val); // upper threshold TH_API void THNN_(HardTanh_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - real min_val, - real max_val); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + real min_val, // lower threshold + real max_val); // upper threshold TH_API void THNN_(L1Cost_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor TH_API void THNN_(L1Cost_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t module's output + THTensor *gradInput); // [OUT] gradient w.r.t the input TH_API void THNN_(LeakyReLU_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - real negval, - bool inplace); + THNNState *state, // library's state + THTensor *input, // [MODIFIED] input tensor + THTensor *output, // [OUT] output tensor + real negval, // negative part slope + bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated) TH_API void THNN_(LeakyReLU_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - real negval, - bool inplace); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [MODIFIED] gradient 
w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + real negval, // negative part slope + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(LogSigmoid_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *buffer); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // output tensor + THTensor *buffer); // [BUFFER] TH_API void THNN_(LogSigmoid_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *buffer); + THNNState *state, // library's state + THTensor *input, // input + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *buffer); // [BUFFER] TH_API void THNN_(LogSoftMax_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor TH_API void THNN_(LogSoftMax_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *output); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + THTensor *output); // module's output TH_API void THNN_(LookupTable_accGradParameters)( THNNState *state, @@ -154,19 +156,19 @@ TH_API void THNN_(LookupTable_accGradParameters)( real scale); TH_API void THNN_(MarginCriterion_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *output, - bool sizeAverage, - real margin); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contain only 1s and -1s) + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage, // if true, the loss is normalized by **total number of elements** + real margin); // a margin that is required for the loss to be 0 TH_API void THNN_(MarginCriterion_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *target, - THTensor *gradInput, - bool sizeAverage, - real margin); + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contin only 1s and -1s) + THTensor *gradInput, // [OUT] gradient w.r.t. 
module's input + bool sizeAverage, // if true, the gradient is normalized by **total number of elements** + real margin); // a margin that is required for the loss to be 0 TH_API void THNN_(MSECriterion_updateOutput)( THNNState *state, diff --git a/generic/Tanh.c b/generic/Tanh.c index ba8e2cea518..d6da1e451e2 100644 --- a/generic/Tanh.c +++ b/generic/Tanh.c @@ -2,13 +2,21 @@ #define TH_GENERIC_FILE "generic/Tanh.c" #else -void THNN_(Tanh_updateOutput)(THNNState *state, THTensor *input, THTensor *output) +void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) { THTensor_(resizeAs)(output, input); THTensor_(tanh)(output, input); } -void THNN_(Tanh_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *output) +void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) { THTensor_(resizeAs)(gradInput, output); diff --git a/generic/TemporalConvolution.c b/generic/TemporalConvolution.c index a8109886a4a..a29a353abd9 100644 --- a/generic/TemporalConvolution.c +++ b/generic/TemporalConvolution.c @@ -2,16 +2,17 @@ #define TH_GENERIC_FILE "generic/TemporalConvolution.c" #else -void THNN_(TemporalConvolution_updateOutput)(THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - int kW, int dW, - int inputFrameSize, - int outputFrameSize - ) -{ +void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize, + int outputFrameSize) +{ THTensor *outputWindow, *inputWindow; int nInputFrame, nOutputFrame; long k, i; @@ -129,12 +130,14 @@ void THNN_(TemporalConvolution_updateOutput)(THNNState *state, } -void THNN_(TemporalConvolution_updateGradInput)(THNNState* state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - int 
kW, int dW) +void THNN_(TemporalConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) { long nInputFrame; long nOutputFrame; @@ -226,13 +229,15 @@ void THNN_(TemporalConvolution_updateGradInput)(THNNState* state, } -void THNN_(TemporalConvolution_accGradParameters)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - int kW, int dW, - real scale) +void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + real scale) { long nInputFrame; long nOutputFrame; diff --git a/generic/TemporalMaxPooling.c b/generic/TemporalMaxPooling.c index 2b3d9703e9a..48cbcab56c5 100644 --- a/generic/TemporalMaxPooling.c +++ b/generic/TemporalMaxPooling.c @@ -2,11 +2,13 @@ #define TH_GENERIC_FILE "generic/TemporalMaxPooling.c" #else -void THNN_(TemporalMaxPooling_updateOutput)(THNNState *state, - THTensor *input, - THTensor *output, - THTensor *indices, - int kW, int dW) +void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kW, + int dW) { long niframe; long framesize; @@ -138,12 +140,14 @@ void THNN_(TemporalMaxPooling_updateOutput)(THNNState *state, } -void THNN_(TemporalMaxPooling_updateGradInput)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *indices, - int kW, int dW) +void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int kW, + int dW) { long niframe; int noframe; diff --git a/generic/TemporalSubSampling.c b/generic/TemporalSubSampling.c index 012deb7d655..7fa323d0ab4 100644 --- a/generic/TemporalSubSampling.c +++ b/generic/TemporalSubSampling.c @@ -2,14 +2,15 @@ #define 
TH_GENERIC_FILE "generic/TemporalSubSampling.c" #else -void THNN_(TemporalSubSampling_updateOutput)(THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - int kW, int dW, - int inputFrameSize - ) +void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize) { THTensor *outputFrame, *inputWindow; int nInputFrame, nOutputFrame; @@ -42,13 +43,14 @@ void THNN_(TemporalSubSampling_updateOutput)(THNNState *state, THTensor_(free)(inputWindow); } -void THNN_(TemporalSubSampling_updateGradInput)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - int kW, int dW - ) +void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) { THTensor *gradOutputFrame; @@ -78,13 +80,15 @@ void THNN_(TemporalSubSampling_updateGradInput)(THNNState *state, THTensor_(free)(kwunit); } -void THNN_(TemporalSubSampling_accGradParameters)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - int kW, int dW, - real scale) +void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + real scale) { THTensor *gradOutputFrame; THTensor *inputWindow, *buffer; diff --git a/generic/Threshold.c b/generic/Threshold.c index acf8ee566a6..ac003608fed 100644 --- a/generic/Threshold.c +++ b/generic/Threshold.c @@ -2,7 +2,13 @@ #define TH_GENERIC_FILE "generic/Threshold.c" #else -void THNN_(Threshold_updateOutput)(THNNState *state, THTensor *input, THTensor *output, real threshold, real val, bool inplace) +void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + real 
threshold, + real val, + bool inplace) { if (inplace) { @@ -21,7 +27,13 @@ void THNN_(Threshold_updateOutput)(THNNState *state, THTensor *input, THTensor * } } -void THNN_(Threshold_updateGradInput)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, real threshold, bool inplace) +void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real threshold, + bool inplace) { if (inplace) { diff --git a/generic/VolumetricAveragePooling.c b/generic/VolumetricAveragePooling.c index 0206585e18c..49b311e286d 100644 --- a/generic/VolumetricAveragePooling.c +++ b/generic/VolumetricAveragePooling.c @@ -3,11 +3,21 @@ #else static void THNN_(VolumetricAveragePooling_updateOutput_frame)( - real *input_p, real *output_p, long nslices, - long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int kT, int kW, int kH, - int dT, int dW, int dH) + real *input_p, + real *output_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) { long k; #pragma omp parallel for private(k) @@ -51,9 +61,15 @@ static void THNN_(VolumetricAveragePooling_updateOutput_frame)( } void THNN_(VolumetricAveragePooling_updateOutput)( - THNNState *state, THTensor *input, THTensor *output, - int kT, int kW, int kH, - int dT, int dW, int dH) + THNNState *state, + THTensor *input, + THTensor *output, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) { long nslices; long itime; @@ -146,11 +162,21 @@ void THNN_(VolumetricAveragePooling_updateOutput)( } static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( - real *gradInput_p, real *gradOutput_p, long nslices, - long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int kT, int kW, int kH, - int dT, int dW, int dH) + real *gradInput_p, + real *gradOutput_p, + long nslices, + long itime, 
+ long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) { long k; #pragma omp parallel for private(k) @@ -190,12 +216,16 @@ static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( } void THNN_(VolumetricAveragePooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - int kT, int kW, int kH, - int dT, int dW, int dH) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) { int nslices; int itime; diff --git a/generic/VolumetricConvolution.c b/generic/VolumetricConvolution.c index 9d4046ec764..852dd54e7c7 100644 --- a/generic/VolumetricConvolution.c +++ b/generic/VolumetricConvolution.c @@ -3,15 +3,19 @@ #else void THNN_(VolumetricConvolution_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version @@ -87,14 +91,18 @@ void THNN_(VolumetricConvolution_updateOutput)( } void THNN_(VolumetricConvolution_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *finput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, // only used by cuda 
impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version @@ -150,16 +158,20 @@ void THNN_(VolumetricConvolution_updateGradInput)( } void THNN_(VolumetricConvolution_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH, - real scale) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + real scale) { THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version diff --git a/generic/VolumetricConvolutionMM.c b/generic/VolumetricConvolutionMM.c index fc9e2e46197..a226350bf17 100644 --- a/generic/VolumetricConvolutionMM.c +++ b/generic/VolumetricConvolutionMM.c @@ -4,13 +4,24 @@ /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ static void THNN_(unfolded_acc_vol)( - THTensor *finput, THTensor *input, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH, - int nInputPlane, - int inputDepth, int inputWidth, int inputHeight, - int outputDepth, int outputWidth, int outputHeight) + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int nInputPlane, + int inputDepth, + int inputWidth, + int inputHeight, + int outputDepth, + int outputWidth, + int outputHeight) { int nip; real *input_data = THTensor_(data)(input); @@ -78,13 +89,24 @@ static void THNN_(unfolded_acc_vol)( } static void THNN_(unfolded_copy_vol)( - 
THTensor *finput, THTensor *input, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH, - int nInputPlane, - int inputDepth, int inputWidth, int inputHeight, - int outputDepth, int outputWidth, int outputHeight) + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int nInputPlane, + int inputDepth, + int inputWidth, + int inputHeight, + int outputDepth, + int outputWidth, + int outputHeight) { long k; real *input_data = THTensor_(data)(input); @@ -145,12 +167,28 @@ static void THNN_(unfolded_copy_vol)( } static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( - THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT,int pW, int pH, - long nInputPlane, long inputDepth, long inputWidth, long inputHeight, - long nOutputPlane, long outputDepth, long outputWidth, long outputHeight) + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + long nInputPlane, + long inputDepth, + long inputWidth, + long inputHeight, + long nOutputPlane, + long outputDepth, + long outputWidth, + long outputHeight) { long i; THTensor *output2d; @@ -185,15 +223,21 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( } void THNN_(VolumetricConvolutionMM_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *finput, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { int dimf = 0; int dimt = 1; @@ -285,10 +329,19 @@ void 
THNN_(VolumetricConvolutionMM_updateOutput)( } static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( - THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH) + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d)( gradOutput->storage, gradOutput->storageOffset, @@ -312,16 +365,22 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( } void THNN_(VolumetricConvolutionMM_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *finput, - THTensor *fgradInput, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { // number of input/output planes and kernel size is indirectly defined by the weight tensor THArgCheck(weight->nDimension == 2, 4, @@ -376,7 +435,11 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)( } static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( - THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale) + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) { long i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)( @@ -404,13 +467,13 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } void THNN_(VolumetricConvolutionMM_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *finput, - real 
scale) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) { THArgCheck(gradWeight->nDimension == 2, 4, "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))" diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c index 73e81a140ec..267649ff0b9 100644 --- a/generic/VolumetricFullConvolution.c +++ b/generic/VolumetricFullConvolution.c @@ -3,15 +3,19 @@ #else void THNN_(VolumetricFullConvolution_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { // number of input & output planes and kernel size is indirectly defined by the weight tensor THArgCheck(weight->nDimension == 5, 4, @@ -100,16 +104,19 @@ void THNN_(VolumetricFullConvolution_updateOutput)( } void THNN_(VolumetricFullConvolution_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH -) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { // number of input/output planes and kernel size is indirectly defined by the weight tensor THArgCheck(weight->nDimension == 5, 4, @@ -188,16 +195,20 @@ 
void THNN_(VolumetricFullConvolution_updateGradInput)( } void THNN_(VolumetricFullConvolution_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *finput, // only used by cuda impl - THTensor *fgradInput, // only used by cuda impl - int dT, int dW, int dH, - int pT, int pW, int pH, - real scale) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, // only used by cuda impl + THTensor *fgradInput, // only used by cuda impl + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + real scale) { // number of input/output planes and kernel size is indirectly defined by the gradWeight tensor THArgCheck(gradWeight->nDimension == 5, 4, diff --git a/generic/VolumetricMaxPooling.c b/generic/VolumetricMaxPooling.c index b32a3819a51..053c02c0250 100644 --- a/generic/VolumetricMaxPooling.c +++ b/generic/VolumetricMaxPooling.c @@ -3,12 +3,25 @@ #else static void THNN_(VolumetricMaxPooling_updateOutput_frame)( - real *input_p, real *output_p, real *indz_p, - long nslices, long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH) + real *input_p, + real *output_p, + real *indz_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { long k; #pragma omp parallel for private(k) @@ -85,11 +98,20 @@ static void THNN_(VolumetricMaxPooling_updateOutput_frame)( } void THNN_(VolumetricMaxPooling_updateOutput)( - THNNState *state, THTensor *input, THTensor *output, THTensor *indices, - int kT, int kW, int kH, - int dT, int dW, int dH, - int pT, int pW, int pH, - bool ceilMode) + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kT, + int kW, + int kH, 
+ int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) { long nslices; long itime; @@ -220,12 +242,22 @@ void THNN_(VolumetricMaxPooling_updateOutput)( } static void THNN_(VolumetricMaxPooling_updateGradInput_frame)( - real *gradInput_p, real *gradOutput_p, real *indz_p, - long nslices, - long itime, long iwidth, long iheight, - long otime, long owidth, long oheight, - int dT, int dW, int dH, - int pT, int pW, int pH) + real *gradInput_p, + real *gradOutput_p, + real *indz_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { long k; #pragma omp parallel for private(k) @@ -259,9 +291,17 @@ static void THNN_(VolumetricMaxPooling_updateGradInput_frame)( } void THNN_(VolumetricMaxPooling_updateGradInput)( - THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *indices, - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { int nslices; int itime; diff --git a/generic/VolumetricMaxUnpooling.c b/generic/VolumetricMaxUnpooling.c index 9c6239b7eb3..247dd5fd9df 100644 --- a/generic/VolumetricMaxUnpooling.c +++ b/generic/VolumetricMaxUnpooling.c @@ -3,14 +3,22 @@ #else static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( - real *input_p, - real *output_p, - real *ind_p, - long nslices, - long iT, long iW, long iH, - long oT, long oW, long oH, - int dT, int dW, int dH, - int pT, int pW, int pH) + real *input_p, + real *output_p, + real *ind_p, + long nslices, + long iT, + long iW, + long iH, + long oT, + long oW, + long oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { long k; #pragma omp parallel for private(k) @@ -50,13 +58,19 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( } void 
THNN_(VolumetricMaxUnpooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *indices, - int oT, int oW, int oH, - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { int dimw = 3; int dimh = 2; @@ -149,13 +163,22 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)( } static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( - real *gradInput_p, real *gradOutput_p, - real *ind_p, - long nslices, - long iT, long iW, long iH, - long oT, long oW, long oH, - int dT, int dW, int dH, - int pT, int pW, int pH) + real *gradInput_p, + real *gradOutput_p, + real *ind_p, + long nslices, + long iT, + long iW, + long iH, + long oT, + long oW, + long oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { long k; #pragma omp parallel for private(k) @@ -195,14 +218,20 @@ static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( } void THNN_(VolumetricMaxUnpooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *indices, - int oT, int oW, int oH, - int dT, int dW, int dH, - int pT, int pW, int pH) + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { int dimw = 3; int dimh = 2; diff --git a/generic/unfold.c b/generic/unfold.c index 3581413badd..25146c0fe64 100644 --- a/generic/unfold.c +++ b/generic/unfold.c @@ -7,10 +7,20 @@ #endif /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ -void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, - int kW, int kH, int dW, int dH, int padW, int padH, - int nInputPlane, int inputWidth, int inputHeight, - int outputWidth, int outputHeight) +void THNN_(unfolded_acc)( 
+ THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) { #ifdef _WIN32 LONG_PTR nip; @@ -24,7 +34,7 @@ void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, #pragma omp parallel for private(nip) for(nip = 0; nip < nInputPlane; nip++) { - size_t kw, kh, y, x; + size_t kw, kh, y, x; long long ix = 0, iy = 0; for(kh = 0; kh < kH; kh++) { @@ -71,11 +81,20 @@ void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, } } - -void THNN_(unfolded_copy)(THTensor *finput, THTensor *input, - int kW, int kH, int dW, int dH, int padW, int padH, - int nInputPlane, int inputWidth, int inputHeight, - int outputWidth, int outputHeight) +void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int nInputPlane, + int inputWidth, + int inputHeight, + int outputWidth, + int outputHeight) { long k; real *input_data = THTensor_(data)(input); From bae607590676d7eb0c0eec8be976caa4cb91f639 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 20 Feb 2016 16:11:27 +0100 Subject: [PATCH 056/101] Add basic THNN docs --- README.md | 80 +++++++----------------- doc/api_reference.md | 133 ++++++++++++++++++++++++++++++++++++++++ doc/style_guidelines.md | 59 ++++++++++++++++++ 3 files changed, 213 insertions(+), 59 deletions(-) create mode 100644 doc/api_reference.md create mode 100644 doc/style_guidelines.md diff --git a/README.md b/README.md index dec5ffc64b4..eaf88555b96 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,34 @@ -## API design guidelines +# THNN -All functions should accept arguments in the following order. Dots represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. 
They should follow the order -``` -[weight], [bias], [any buffers], [additional arguments], [optional arugments] -``` +THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions, and an object oriented C/C++ wrapper will be created soon as another library. -### Modules -``` -updateOutput: state, input, output, ... -updateGradInput: state, input, gradOutput, gradInput, ... -accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... -``` +There is also a CUDA counterpart of THNN (CUTHNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN). -e.g. -```C -void THNN_(HardShrink_updateGradInput)( - THNNState* state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - real lambda) -``` +## Links -### Criterions -``` -updateOutput: state, input, target, output, ... -updateGradInput: state, input, target, gradInput, ... -``` +* [API reference](doc/api_reference.md) +* [Style guidelines](doc/style_guidelines.md) -e.g. +## Motivation -```C -void THNN_(ClassNLLCriterion_updateOutput)( - THNNState* state, - THTensor *input, - THLongTensor *target, - THTensor *output, - THTensor *weights, - THTensor *total_weight, - bool sizeAverage) -``` +Torch's nn module provided many optimized C implementations of modules, but the source files contained Lua specific code and headers so they couldn't be easily compiled and included anywhere else. -## Code style guide +THNN is based on the same code, but is written in pure C, so it can be easily included in other code. 
**Future C implementations should be committed to THNN.** -```C -void THNN_Linear_updateOutput( - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias); -//<- 10 -> -``` +## API -All arguments should start on a new line after function name, and they should be indented using 10 spaces. +THNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations: -Use 2 spaces for block indentation. +* **updateOutput** - applies the module to an input +* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input +* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters +For information on argument types please check the [API reference](doc/api_reference.md). -### Conversion Steps +This is all THNN library provides. An object oriented implementation similar to nn will be provided in a separate library. This one is just a set of CPU kernels. -1. copy old .c file to lib/THNN/generic - - replace static int nn_ -> void THNN_ - - replace lua_State \*L with 'actual' parameters (+ add THNNState\* state) - - remove any numeric values from return statements, remove the return at the end of the function body - - remove old luaL_Reg & _init function -2. add forward declarations to generic/THNN.h -3. include the generic/xyz.c file in init.c -4. add functions to ffi.lua -5. copy & adapt lua file: specify module THNN for torch.class(), use THNN.errcheck -6. include module lua file in init.lua -7. add & run unit test to lua/tests/test.lua +## Developer docs + +* [Style guidelines](doc/style_guidelines.md) + +This section will be expanded when FFI refactoring will be finished. 
diff --git a/doc/api_reference.md b/doc/api_reference.md new file mode 100644 index 00000000000..9440dd2ba2a --- /dev/null +++ b/doc/api_reference.md @@ -0,0 +1,133 @@ +# API docs + +This document only describes a THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc). + +### Note on function names + +Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions: + +* `void THNN_FloatAbs_updateOutput(...)` +* `void THNN_DoubleAbs_updateOutput(...)` + +In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. + +## Module list + +These are all modules implemented in THNN: + +* Nonlinear functions + * [Abs](#abs) + * [ELU](#elu) + * HardShrink + * HardTanh + * LeakyReLU + * LogSigmoid + * LogSoftMax + * PReLU + * RReLU + * Sigmoid + * SoftMax + * SoftPlus + * SoftShrink + * Sqrt + * Square + * Tanh + * Threshold +* Criterions + * AbsCriterion + * ClassNLLCriterion + * DistKLDivCriterion + * L1Cost + * MSECriterion + * MarginCriterion + * MultiLabelMarginCriterion + * MultiMarginCriterion + * SmoothL1Criterion +* Modules + * LookupTable + * SparseLinear +* Spatial modules + * SpatialAdaptiveMaxPooling + * SpatialAdaptiveMaxPooling + * SpatialAveragePooling + * SpatialConvolutionMM +* Volumetric modules + * VolumetricAveragePooling + * VolumetricConvoluion + * VolumetricConvoluionMM + * VolumetricFullConvolution + * VolumetricMaxPooling + * VolumetricMaxUnpooling + +## Abs + +```C +void THNN_Abs_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` + +`state` - library's state +
+`input` - input tensor +
+`output` - **[OUT]** Abs output + +```C +void THNN_Abs_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); +``` + +`state` - library's state +
+`input` - input tensor +
+`gradOutput` - gradient w.r.t. output +
+`gradInput` - **[OUT]** gradient w.r.t. input + +## ELU + +For reference see [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](http://arxiv.org/abs/1511.07289). + +```C +void THNN_ELU_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real alpha); +``` + +`state` - library state +
+`input` - input tensor +
+`output` - **[OUT]** ELU output +
+`alpha` - an ELU parameter + +```C +void THNN_ELU_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real alpha); +``` + +`state` - library state +
+`input` - input tensor +
+`gradOutput` - gradient w.r.t. output +
+`gradInput` - **[OUT]** gradient w.r.t. input +
+`output` - module output for given input +
+`alpha` - an ELU parameter diff --git a/doc/style_guidelines.md b/doc/style_guidelines.md new file mode 100644 index 00000000000..a7254540c8f --- /dev/null +++ b/doc/style_guidelines.md @@ -0,0 +1,59 @@ +## API design guidelines + +Functions should return `void`. + +All functions should accept arguments in the following order. `...` represent any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this: +``` +[weight], [bias], [any buffers], [additional arguments], [optional arguments] +``` + +### Modules +``` +updateOutput: state, input, output, ... +updateGradInput: state, input, gradOutput, gradInput, ... +accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ... +``` + +e.g. +```C +void THNN_(HardShrink_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda) +``` + +### Criterions +``` +updateOutput: state, input, target, output, ... +updateGradInput: state, input, target, gradInput, ... +``` + +e.g. + +```C +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState* state, + THTensor *input, + THLongTensor *target, + THTensor *output, + THTensor *weights, + THTensor *total_weight, + bool sizeAverage) +``` + +## Code style guide + +```C +void THNN_Linear_updateOutput( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +//<- 10 -> +``` + +All arguments should start on a new line after function name, and they should be indented using 10 spaces. + +Use 2 spaces for block indentation. 
From 6ee6aedf20f6e8d89aff8fa9c37d4230cc03af6f Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Tue, 23 Feb 2016 09:31:19 -0800 Subject: [PATCH 057/101] Adding SparseLinear with CUDA, requires buffer variable --- generic/SparseLinear.c | 3 ++- generic/THNN.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 2cf2697b0c9..3eeaf387791 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -27,6 +27,7 @@ void THNN_(SparseLinear_updateOutput)( THTensor *output, THTensor *weight, THTensor *bias, + THTensor *cudaBuffer, THTensor *shardBuffer) { long i; @@ -286,7 +287,7 @@ void THNN_(SparseLinear_updateGradInput)( if (offset >= 0 && offset < inDim) { - real val = + real val = THBlas_(dot)( outDim, THTensor_(data)(gradOutput), diff --git a/generic/THNN.h b/generic/THNN.h index 24d4fec7049..293158df930 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -327,6 +327,7 @@ TH_API void THNN_(SparseLinear_updateOutput)( THTensor *output, THTensor *weight, THTensor *bias, + THTensor *cudaBuffer, THTensor *shardBuffer); TH_API void THNN_(SparseLinear_updateGradInput)( THNNState *state, From 9c1a72d932dcb0142a0d1eca2d9f25d38837f198 Mon Sep 17 00:00:00 2001 From: soumith Date: Tue, 23 Feb 2016 18:49:24 -0800 Subject: [PATCH 058/101] sparselinear performance fixes --- generic/SparseLinear.c | 388 ++++++++++++++++++++++------------------- 1 file changed, 206 insertions(+), 182 deletions(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 3eeaf387791..834d97b2ae5 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -6,9 +6,12 @@ #include #endif +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) + static bool THNN_(checkInput)(THTensor* t) { - return t->nDimension == 2 && t->size[1] == 2; + return t->nDimension == 3 && t->size[2] == 2; } static bool THNN_(checkSize2D)(THTensor* t, long 
size0, long size1) @@ -21,6 +24,18 @@ static bool THNN_(checkSize1D)(THTensor* t, long size0) return t->nDimension == 1 && t->size[0] == size0; } +static void THNN_(set1d)(THTensor *t, long x0, real value) { + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); +} +static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); +} +static real THNN_(get2d)(const THTensor *t, long x0, long x1) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1]); +} + void THNN_(SparseLinear_updateOutput)( THNNState *state, THTensor *input, @@ -30,80 +45,49 @@ void THNN_(SparseLinear_updateOutput)( THTensor *cudaBuffer, THTensor *shardBuffer) { - long i; - long outDim = weight->size[0]; - long inDim = weight->size[1]; + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); - THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 2"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THNN_(checkSize1D)(output, outDim), 3, "output size wrong"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); - if (shardBuffer != NULL) - { - long num_shards = shardBuffer->size[1]; - THArgCheck( - shardBuffer->nDimension == 2 && shardBuffer->size[0] == outDim && num_shards > 0, - 6, - "shardBuffer size wrong" - ); + long batchSize = THTensor_(size)(input, 0); + long nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(output, batchSize, outDim); - THTensor_(zero)(shardBuffer); - #pragma omp parallel for private(i) schedule(static) num_threads(num_shards) - for (i = 0; i < input->size[0]; i++) - { -#ifdef _OPENMP - int shardId = omp_get_thread_num(); -#else - int shardId = 1; -#endif - - long offset = 
(long)(THTensor_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) - { - THBlas_(axpy)( - outDim, - THTensor_(get2d)(input, i, 1), - THTensor_(data)(weight) + offset * weight->stride[1], - weight->stride[0], - THTensor_(data)(shardBuffer) + shardId * shardBuffer->stride[1], - shardBuffer->stride[0] - ); + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(h, i) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; } - else - { - THError("index out of bound. updateOutput: \ -%ld not between 1 and %ld", offset + 1, inDim); + + long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); } } - - THTensor_(sum)(output, shardBuffer, 1); - THTensor_(cadd)(output, bias, 1.0, output); - - return; } - THTensor_(copy)(output, bias); - for (i = 0; i < input->size[0]; i++) - { - long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. - { - real val = THTensor_(get2d)(input, i, 1); - THBlas_(axpy)( - output->size[0], - val, - THTensor_(data)(weight)+offset*weight->stride[1], - weight->stride[0], - THTensor_(data)(output), - output->stride[0] - ); - } - else - { - THError("index out of bound. 
updateOutput: \ -%ld not between 1 and %ld", offset + 1, inDim); - } + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); } + THTensor_(free)(output_row); } void THNN_(SparseLinear_accGradParameters)( @@ -117,59 +101,58 @@ void THNN_(SparseLinear_accGradParameters)( real weightDecay, real scale) { - long i; - long nnz = input->size[0]; - long outDim = weight->size[0]; - long inDim = weight->size[1]; + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); - THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 2"); - THArgCheck(THNN_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); - THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); - THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, + "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); - #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for (i = 0; i < nnz; i++) - { - long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; + long batchSize = THTensor_(size)(input, 0); + long nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(gradOutput, batchSize, outDim); - if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. 
- { - real val = scale*THTensor_(get2d)(input, i, 1); - - THBlas_(axpy)( - outDim, - val, - THTensor_(data)(gradOutput), - gradOutput->stride[0], - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], - gradWeight->stride[0] - ); + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i) schedule(static) if (\ + batchSize * nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + for (h = 0; h < batchSize; h++) { + real val = scale * THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; } - else - { - THError("index out of bound. accGradParameters: \ -%ld not between 1 and %ld", offset + 1, inDim); + + long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); } + } } - THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + // gradBias += gradOutput + THTensor* gradOutput_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(gradOutput_row, gradOutput, 0, h); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); + } + THTensor_(free)(gradOutput_row); - if (weightDecay != 0) - { - #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for (i = 0; i < nnz; i++) - { - long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - THBlas_(axpy)( - outDim, - weightDecay, - THTensor_(data)(weight) + offset*weight->stride[1], - weight->stride[0], - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1], - gradWeight->stride[0] - ); - } - THTensor_(cadd)(gradBias, gradBias, weightDecay, bias); + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } } @@ -182,41 +165,70 @@ void THNN_(SparseLinear_updateParameters)( THTensor *lastInput, real learningRate) { - long i; - 
long nnz = lastInput->size[0]; + long h, i; long outDim = weight->size[0]; long inDim = weight->size[1]; - THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 6, + "input size must be batchsize x nnz x 2"); - THTensor_(cadd)(bias, bias, -learningRate, gradBias); - #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) - for (i = 0; i < nnz; i++) - { - long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; + long batchSize = THTensor_(size)(lastInput, 0); + long nnz = THTensor_(size)(lastInput, 1); - if (offset >= 0 && offset < inDim) // make sure indices are in bounds.. - { - real* pGradWeight = - THTensor_(data)(gradWeight)+offset*gradWeight->stride[1]; - THBlas_(axpy)( - outDim, - -learningRate, - pGradWeight, - gradWeight->stride[0], - THTensor_(data)(weight)+offset*weight->stride[1], - weight->stride[0] - ); - } - else - { - THError("index out of bound. updateParameters: \ -%ld not between 1 and %ld", offset + 1, inDim); + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz); + long cnt = 0; + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(lastInput, h, i, 1); + if (val == 0 ) { + continue; + } + long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. 
updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } } } + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + long offset = (long)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); } void THNN_(SparseLinear_zeroGradParameters)( @@ -225,40 +237,46 @@ void THNN_(SparseLinear_zeroGradParameters)( THTensor *gradBias, THTensor *lastInput) { - long i; - long nnz = lastInput->size[0]; + long h, i, j; + long outDim = gradWeight->size[0]; long inDim = gradWeight->size[1]; THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 4, + "input size must be batchsize x nnz x 2"); THTensor_(zero)(gradBias); - #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000) - for (i = 0; i < nnz; i++) - { - long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1; - if(offset >= 0 && offset < inDim) // make sure indices are in bounds.. 
- { - real* pGradWeight = THTensor_(data)(gradWeight) + offset * gradWeight->stride[1]; - if (gradWeight->stride[0] == 1) - { + long batchSize = THTensor_(size)(lastInput, 0); + long nnz = THTensor_(size)(lastInput, 1); + +#pragma omp parallel for private(h, i, j) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) { + continue; + } + + long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { THVector_(fill)(pGradWeight, 0, outDim); - } - else - { - long j; - for (j = 0; j < outDim; ++j) - { - pGradWeight[j * gradWeight->stride[0]] = 0; + } else { + long stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } } + } else { + THError( + "index out of bound. zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); } } - else - { - THError("index out of bound. 
zeroGradParameters: \ -%ld not between 1 and %ld", offset + 1, inDim); - } } } @@ -269,40 +287,46 @@ void THNN_(SparseLinear_updateGradInput)( THTensor *gradInput, THTensor *weight) { - long i; - long nnz = input->size[0]; + long h, i; long outDim = weight->size[0]; long inDim = weight->size[1]; - THArgCheck(THNN_(checkInput)(input), 2, "input must be an nnz x 2 tensor"); - THArgCheck(THNN_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, + "input must be a batchSize x nnz x 2 tensor"); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, + "gradInput must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 3, + "gradOutput must be contiguous"); - THTensor_(resize2d)(gradInput, input->size[0], input->size[1]); + long batchSize = THTensor_(size)(input, 0); + long nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(gradOutput, batchSize, outDim); + THTensor_(resize3d)(gradInput, batchSize, nnz, 2); - #pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000) - for (i = 0; i < nnz; ++i) - { - long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; - THTensor_(set2d)(gradInput, i, 0, offset + 1); +#pragma omp parallel for private(h, i) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; ++i) { + long offset = (long)(THTensor_(get3d)(input, h, i, 0)) - 1; + THTensor_(set3d)(gradInput, h, i, 0, offset + 1); - if (offset >= 0 && offset < inDim) - { - real val = - THBlas_(dot)( + if (offset >= 0 && offset < inDim) { + real val = THBlas_(dot)( outDim, - THTensor_(data)(gradOutput), - gradOutput->stride[0], - THTensor_(data)(weight) + offset * weight->stride[1], - weight->stride[0] - ); - THTensor_(set2d)(gradInput, i, 1, val); - } - else - { - THError("index out of bound. 
updateGradInput: \ -%ld not between 1 and %ld", offset + 1, inDim); + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(weight, offset), weight->stride[0]); + THTensor_(set3d)(gradInput, h, i, 1, val); + } else { + THError( + "index out of bound. updateGradInput: %d not between 1 and %d", + offset + 1, + inDim); + } } } } +#undef ROW_PTR2 +#undef COL_PTR2 + #endif From 291681e102acd3426a159c24bbfe86a5f7c9f4fb Mon Sep 17 00:00:00 2001 From: soumith Date: Wed, 24 Feb 2016 16:59:30 -0800 Subject: [PATCH 059/101] Adding SpatialReflection and SpatialReplication padding --- generic/SpatialReflectionPadding.c | 255 ++++++++++++++++++++++++++++ generic/SpatialReplicationPadding.c | 254 +++++++++++++++++++++++++++ generic/THNN.h | 25 +++ init.c | 6 + 4 files changed, 540 insertions(+) create mode 100644 generic/SpatialReflectionPadding.c create mode 100644 generic/SpatialReplicationPadding.c diff --git a/generic/SpatialReflectionPadding.c b/generic/SpatialReflectionPadding.c new file mode 100644 index 00000000000..08e0ba00fd5 --- /dev/null +++ b/generic/SpatialReflectionPadding.c @@ -0,0 +1,255 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c" +#else + +static void THNN_(SpatialReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } 
else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THArgCheck(input->nDimension == 3 || + input->nDimension == 4 , 2, "input must be 3 or 4-dimensional"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, "input is too small"); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + 
pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected"); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput 
height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 3) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/generic/SpatialReplicationPadding.c b/generic/SpatialReplicationPadding.c new file mode 100644 index 00000000000..cdd6fc545c1 --- /dev/null +++ b/generic/SpatialReplicationPadding.c @@ -0,0 +1,254 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c" +#else + +static void THNN_(SpatialReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = 
i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THArgCheck(input->nDimension == 3 || input->nDimension == 4, + 2, "input must be 3 or 4-dimensional"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, "input is too small"); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + 
THTensor_(free)(input); +} + +static void THNN_(SpatialReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected"); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = 
THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 3) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index b48bb133e7f..8189021e9a5 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -899,5 +899,30 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int pT, int pW, int pH); +TH_API void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); #endif diff --git a/init.c b/init.c index ea3191ad45c..abc88bb540a 100644 --- a/init.c +++ b/init.c @@ -156,3 +156,9 @@ #include "generic/VolumetricMaxUnpooling.c" #include "THGenerateFloatTypes.h" + +#include 
"generic/SpatialReflectionPadding.c" +#include "THGenerateFloatTypes.h" + +#include "generic/SpatialReplicationPadding.c" +#include "THGenerateFloatTypes.h" From f581fba18df2da237222802c6b156bab693ce08e Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 24 Feb 2016 14:18:13 -0800 Subject: [PATCH 060/101] Add VolumetricBatchNormalization The BatchNormalization modules now all extend nn.BatchNormalization and use the same THNN/THCUNN implementation. --- ...chNormalization.c => BatchNormalization.c} | 38 ++++++++++--------- init.c | 2 +- 2 files changed, 21 insertions(+), 19 deletions(-) rename generic/{SpatialBatchNormalization.c => BatchNormalization.c} (71%) diff --git a/generic/SpatialBatchNormalization.c b/generic/BatchNormalization.c similarity index 71% rename from generic/SpatialBatchNormalization.c rename to generic/BatchNormalization.c index bf3108bf11a..9bcee2106de 100644 --- a/generic/SpatialBatchNormalization.c +++ b/generic/BatchNormalization.c @@ -1,31 +1,33 @@ #ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/SpatialBatchNormalization.c" +#define TH_GENERIC_FILE "generic/BatchNormalization.c" #else -void THNN_(SpatialBatchNormalization_updateOutput)(THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *running_mean, THTensor *running_var, THTensor *save_mean, THTensor *save_std, bool train, double momentum, double eps) +void THNN_(BatchNormalization_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double momentum, double eps) { - long nBatch = THTensor_(size)(input, 0); - long nFeature = THTensor_(size)(input, 1); - long iH = THTensor_(size)(input, 2); - long iW = THTensor_(size)(input, 3); - long n = nBatch * iH * iW; + long nInput = THTensor_(size)(input, 1); + long n = THTensor_(nElement)(input) / nInput; #pragma parallel for - for (long 
f = 0; f < nFeature; ++f) { + for (long f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *out = THTensor_(newSelect)(output, 1, f); real mean, invstd; if (train) { - // compute mean per feature plane + // compute mean per input accreal sum = 0; TH_TENSOR_APPLY(real, in, sum += *in_data;); mean = (real) sum / n; THTensor_(set1d)(save_mean, f, (real) mean); - // compute variance per feature plane + // compute variance per input sum = 0; TH_TENSOR_APPLY(real, in, sum += (*in_data - mean) * (*in_data - mean);); @@ -61,20 +63,20 @@ void THNN_(SpatialBatchNormalization_updateOutput)(THNNState *state, THTensor *i } } -void THNN_(SpatialBatchNormalization_backward)(THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *save_mean, THTensor *save_std, double scale) +void THNN_(BatchNormalization_backward)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, + THTensor *gradWeight, THTensor *gradBias, THTensor *weight, + THTensor *save_mean, THTensor *save_std, double scale) { - long nBatch = THTensor_(size)(input, 0); - long nFeature = THTensor_(size)(input, 1); - long iH = THTensor_(size)(input, 2); - long iW = THTensor_(size)(input, 3); - long n = nBatch * iH * iW; + long nInput = THTensor_(size)(input, 1); + long n = THTensor_(nElement)(input) / nInput; // Q(X) = X - E[x] ; i.e. input centered to zero mean // Y = Q(X) / σ ; i.e. 
BN output before weight and bias - // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w #pragma parallel for - for (long f = 0; f < nFeature; ++f) { + for (long f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); real mean = THTensor_(get1d)(save_mean, f); diff --git a/init.c b/init.c index ea3191ad45c..21c48b698a9 100644 --- a/init.c +++ b/init.c @@ -97,7 +97,7 @@ #include "generic/TemporalMaxPooling.c" #include "THGenerateFloatTypes.h" -#include "generic/SpatialBatchNormalization.c" +#include "generic/BatchNormalization.c" #include "THGenerateFloatTypes.h" #include "generic/unfold.c" From aef48d21c4308bd3abf1ceae0adc1878b173e1f1 Mon Sep 17 00:00:00 2001 From: Nimalan Mahendran Date: Tue, 16 Feb 2016 12:09:14 -0800 Subject: [PATCH 061/101] Making margin parameterizable in nn.MultiMarginCriterion --- generic/MultiMarginCriterion.c | 10 ++++++---- generic/THNN.h | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/generic/MultiMarginCriterion.c b/generic/MultiMarginCriterion.c index f3309e57ea1..2463da1451b 100644 --- a/generic/MultiMarginCriterion.c +++ b/generic/MultiMarginCriterion.c @@ -9,7 +9,8 @@ void THNN_(MultiMarginCriterion_updateOutput)( THTensor *output, bool sizeAverage, int p, - THTensor *weights) + THTensor *weights, + real margin) { real *input_data, *target_data, *weights_data; long nframe, dim; @@ -50,7 +51,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( real input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { - real z = 1 - input_target + input_data[d]; + real z = margin - input_target + input_data[d]; if (d == target_idx) continue; @@ -83,7 +84,8 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THTensor *gradInput, bool sizeAverage, int p, - THTensor *weights) + THTensor *weights, + real margin) { real *input_data; real *gradInput_data; @@ -127,7 +129,7 @@ void 
THNN_(MultiMarginCriterion_updateGradInput)( real gradInput_target = 0; for (d = 0; d < dim; d++) { - real z = 1 - input_target + input_data[d]; + real z = margin - input_target + input_data[d]; if (d == target_idx) continue; diff --git a/generic/THNN.h b/generic/THNN.h index 8189021e9a5..b080c3b6e77 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -203,7 +203,8 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)( THTensor *output, bool sizeAverage, int p, - THTensor* weights); + THTensor* weights, + real margin); TH_API void THNN_(MultiMarginCriterion_updateGradInput)( THNNState *state, THTensor *input, @@ -211,7 +212,8 @@ TH_API void THNN_(MultiMarginCriterion_updateGradInput)( THTensor *gradInput, bool sizeAverage, int p, - THTensor *weights); + THTensor *weights, + real margin); TH_API void THNN_(PReLU_updateOutput)( THNNState *state, From d2e6c39fed987e552192848aab951d8d3211d939 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 27 Feb 2016 12:11:04 +0100 Subject: [PATCH 062/101] Add checks for convolution parameters --- generic/SpatialConvolutionMM.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index a8427c4e4be..e3eec5120d6 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -64,7 +64,9 @@ void THNN_(SpatialConvolutionMM_updateOutput)( long outputWidth; long outputHeight; - THArgCheck( input->nDimension == 3 || input->nDimension == 4, 1, "3D or 4D (batch mode) tensor expected"); + THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); if (input->nDimension == 4) { dimf++; @@ -164,7 +166,9 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( { long nOutputPlane = weight->size[0]; - THArgCheck( nOutputPlane == 
gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" ); + THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); @@ -243,7 +247,9 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( real scale) { long nOutputPlane = gradWeight->size[0]; - THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); + THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" ); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); if(input->nDimension == 3) { From cbee35a43b8c87e4320072a5ae8ab95dea249ce1 Mon Sep 17 00:00:00 2001 From: soumith Date: Mon, 29 Feb 2016 13:17:10 -0800 Subject: [PATCH 063/101] SoftMarginCriterion --- generic/SoftMarginCriterion.c | 40 +++++++++++++++++++++++++++++++++++ generic/THNN.h | 14 ++++++++++++ init.c | 3 +++ 3 files changed, 57 insertions(+) create mode 100644 generic/SoftMarginCriterion.c diff --git a/generic/SoftMarginCriterion.c b/generic/SoftMarginCriterion.c new file mode 100644 index 00000000000..d9b618d6e63 --- /dev/null +++ b/generic/SoftMarginCriterion.c @@ -0,0 +1,40 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = log(1. 
+ exp(-*input_data* *target_data)); + sum += z;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -norm*(*target_data)*z/(1. + z);) +} + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index b080c3b6e77..c3e8e91e6a6 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -170,6 +170,20 @@ TH_API void THNN_(MarginCriterion_updateGradInput)( bool sizeAverage, // if true, the gradient is normalized by **total number of elements** real margin); // a margin that is required for the loss to be 0 +TH_API void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); + +TH_API void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + TH_API void THNN_(MSECriterion_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index c9f7b95c4da..6367b20e9b5 100644 --- a/init.c +++ b/init.c @@ -46,6 +46,9 @@ #include "generic/MarginCriterion.c" #include "THGenerateFloatTypes.h" +#include "generic/SoftMarginCriterion.c" +#include "THGenerateFloatTypes.h" + #include "generic/MultiLabelMarginCriterion.c" #include "THGenerateFloatTypes.h" From 7bad18d2c5eb3796465f5e44803d856d9f475676 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Thu, 25 Feb 2016 21:23:11 +0000 Subject: [PATCH 064/101] Add script that generates api_reference.md --- README.md | 6 +- doc/api_reference.md | 1518 ++++++++++++++++++++++++++++++++++-- doc/generate_reference.lua | 
106 +++ 3 files changed, 1555 insertions(+), 75 deletions(-) create mode 100644 doc/generate_reference.lua diff --git a/README.md b/README.md index eaf88555b96..e6c61601d1e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions, and an object oriented C/C++ wrapper will be created soon as another library. -There is also a CUDA counterpart of THNN (CUTHNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN). +There is also a CUDA counterpart of THNN (THCUNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN). ## Links @@ -11,7 +11,7 @@ There is also a CUDA counterpart of THNN (CUTHNN) in the [cunn repository](https ## Motivation -Torch's nn module provided many optimized C implementations of modules, but the source files contained Lua specific code and headers so they couldn't be easily compiled and included anywhere else. +Torch's neural network package (nn) provided many optimized C implementations of modules, but the source files contained Lua specific code and headers so they couldn't be easily compiled and included anywhere else. THNN is based on the same code, but is written in pure C, so it can be easily included in other code. **Future C implementations should be committed to THNN.** @@ -25,8 +25,6 @@ THNN is a purely functional library. It provides 2-3 functions for each module, For information on argument types please check the [API reference](doc/api_reference.md). -This is all THNN library provides. An object oriented implementation similar to nn will be provided in a separate library. This one is just a set of CPU kernels. 
- ## Developer docs * [Style guidelines](doc/style_guidelines.md) diff --git a/doc/api_reference.md b/doc/api_reference.md index 9440dd2ba2a..830cc3d685f 100644 --- a/doc/api_reference.md +++ b/doc/api_reference.md @@ -11,69 +11,79 @@ Please remember, that because C doesn't support function overloading, functions In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. +### Argument types + +Some arguments have additional tags placed in square brackets: +* **[OUT]** - This is the output argument. It will be reshaped if needed. +* **[OPTIONAL]** - This argument is optional and can be safely set to NULL +* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call. +* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output). + ## Module list These are all modules implemented in THNN: -* Nonlinear functions - * [Abs](#abs) - * [ELU](#elu) - * HardShrink - * HardTanh - * LeakyReLU - * LogSigmoid - * LogSoftMax - * PReLU - * RReLU - * Sigmoid - * SoftMax - * SoftPlus - * SoftShrink - * Sqrt - * Square - * Tanh - * Threshold -* Criterions - * AbsCriterion - * ClassNLLCriterion - * DistKLDivCriterion - * L1Cost - * MSECriterion - * MarginCriterion - * MultiLabelMarginCriterion - * MultiMarginCriterion - * SmoothL1Criterion -* Modules - * LookupTable - * SparseLinear -* Spatial modules - * SpatialAdaptiveMaxPooling - * SpatialAdaptiveMaxPooling - * SpatialAveragePooling - * SpatialConvolutionMM -* Volumetric modules - * VolumetricAveragePooling - * VolumetricConvoluion - * VolumetricConvoluionMM - * VolumetricFullConvolution - * VolumetricMaxPooling - * VolumetricMaxUnpooling +* [Abs](#abs) +* [AbsCriterion](#abscriterion) +* [ClassNLLCriterion](#classnllcriterion) +* [DistKLDivCriterion](#distkldivcriterion) +* 
[ELU](#elu) +* [HardShrink](#hardshrink) +* [HardTanh](#hardtanh) +* [L1Cost](#l1cost) +* [LeakyReLU](#leakyrelu) +* [LogSigmoid](#logsigmoid) +* [LogSoftMax](#logsoftmax) +* [LookupTable](#lookuptable) +* [MSECriterion](#msecriterion) +* [MarginCriterion](#margincriterion) +* [MultiLabelMarginCriterion](#multilabelmargincriterion) +* [MultiMarginCriterion](#multimargincriterion) +* [PReLU](#prelu) +* [RReLU](#rrelu) +* [Sigmoid](#sigmoid) +* [SmoothL1Criterion](#smoothl1criterion) +* [SoftMax](#softmax) +* [SoftPlus](#softplus) +* [SoftShrink](#softshrink) +* [SparseLinear](#sparselinear) +* [SpatialAdaptiveMaxPooling](#spatialadaptivemaxpooling) +* [SpatialAveragePooling](#spatialaveragepooling) +* [SpatialBatchNormalization](#spatialbatchnormalization) +* [SpatialConvolutionLocal](#spatialconvolutionlocal) +* [SpatialConvolutionMM](#spatialconvolutionmm) +* [SpatialConvolutionMap](#spatialconvolutionmap) +* [SpatialFractionalMaxPooling](#spatialfractionalmaxpooling) +* [SpatialFullConvolution](#spatialfullconvolution) +* [SpatialFullConvolutionMap](#spatialfullconvolutionmap) +* [SpatialMaxPooling](#spatialmaxpooling) +* [SpatialMaxUnpooling](#spatialmaxunpooling) +* [SpatialSubSampling](#spatialsubsampling) +* [SpatialUpSamplingNearest](#spatialupsamplingnearest) +* [Sqrt](#sqrt) +* [Square](#square) +* [Tanh](#tanh) +* [Threshold](#threshold) +* [VolumetricAveragePooling](#volumetricaveragepooling) +* [VolumetricConvolution](#volumetricconvolution) +* [VolumetricConvolutionMM](#volumetricconvolutionmm) +* [VolumetricFullConvolution](#volumetricfullconvolution) +* [VolumetricMaxPooling](#volumetricmaxpooling) +* [VolumetricMaxUnpooling](#volumetricmaxunpooling) ## Abs - ```C void THNN_Abs_updateOutput( THNNState *state, THTensor *input, THTensor *output); ``` - -`state` - library's state +`THNNState *state` - library's state
-`input` - input tensor +`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** Abs output
-`output` - **[OUT]** Abs output - ```C void THNN_Abs_updateGradInput( THNNState *state, @@ -81,19 +91,138 @@ void THNN_Abs_updateGradInput( THTensor *gradOutput, THTensor *gradInput); ``` - -`state` - library's state +`THNNState *state` - library's state
-`input` - input tensor +`THTensor *input` - input tensor
-`gradOutput` - gradient w.r.t. output +`THTensor *gradOutput` - gradient w.r.t. output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+## AbsCriterion +```C +void THNN_AbsCriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - tensor with target values +
+`THTensor *output` - **[OUT]** a one-element tensor with loss +
+`bool sizeAverage` - if true, the loss will be divided by batch size +
+```C +void THNN_AbsCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - tensor with target values +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`bool sizeAverage` - if true, the gradient will be normalized by batch size +
+## ClassNLLCriterion +```C +void THNN_ClassNLLCriterion_updateOutput( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor (1D/2D) +
+`THIndexTensor *target` - tensor containing indexes of target classes +
+`THTensor *output` - **[OUT]** a one-element tensor with loss +
+`bool sizeAverage` - if true, the loss will be normalized by batch size and class weights +
+`THTensor *weights` - **[OPTIONAL]** class weights +
+`THTensor *total_weight` - **[BUFFER]** +
+```C +void THNN_ClassNLLCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor (1D/2D) +
+`THIndexTensor *target` - tensor containing indexes of target classes +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`bool sizeAverage` - if true, the loss will be normalized by batch size and class weights +
+`THTensor *weights` - **[OPTIONAL]** class weights +
+`THTensor *total_weight` - **[BUFFER]** +
+## DistKLDivCriterion +```C +void THNN_DistKLDivCriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - target tensor +
+`THTensor *output` - **[OUT]** a one-element tensor containing the loss +
+`bool sizeAverage` - if true, the loss will be normalized **by total number of elements** +
+```C +void THNN_DistKLDivCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - target tensor +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`bool sizeAverage` - if true, the loss will be normalized **by total number of elements**
-`gradInput` - **[OUT]** gradient w.r.t. input - ## ELU - -For reference see [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](http://arxiv.org/abs/1511.07289). - ```C void THNN_ELU_updateOutput( THNNState *state, @@ -101,15 +230,14 @@ void THNN_ELU_updateOutput( THTensor *output, real alpha); ``` - -`state` - library state +`THNNState *state` - library's state
-`input` - input tensor +`THTensor *input` - input tensor
-`output` - **[OUT]** ELU output +`THTensor *output` - **[OUT]** ELU output +
+`real alpha` - an ELU parameter (as in paper)
-`alpha` - an ELU parameter - ```C void THNN_ELU_updateGradInput( THNNState *state, @@ -119,15 +247,1263 @@ void THNN_ELU_updateGradInput( THTensor *output, real alpha); ``` - -`state` - library state +`THNNState *state` - library's state
-`input` - input tensor +`THTensor *input` - input tensor
-`gradOutput` - gradient w.r.t. output +`THTensor *gradOutput` - gradient w.r.t. output
-`gradInput` - **[OUT]** gradient w.r.t. input +`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
-`output` - module output for given input +`THTensor *output` - output from a forward pass
-`alpha` - an ELU parameter +`real alpha` - an ELU parameter (as in paper) +
+## HardShrink +```C +void THNN_HardShrink_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real lambda); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** output tensor +
+`real lambda` - HardShrink parameter +
+```C +void THNN_HardShrink_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`real lambda` - HardShrink parameter +
+## HardTanh +```C +void THNN_HardTanh_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real min_val, + real max_val); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** output tensor +
+`real min_val` - lower threshold +
+`real max_val` - upper threshold +
+```C +void THNN_HardTanh_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real min_val, + real max_val); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. the input +
+`real min_val` - lower threshold +
+`real max_val` - upper threshold +
+## L1Cost +```C +void THNN_L1Cost_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** output tensor +
+```C +void THNN_L1Cost_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. the input +
+## LeakyReLU +```C +void THNN_LeakyReLU_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real negval, + bool inplace); +``` +`THNNState *state` - library's state +
+`THTensor *input` - **[MODIFIED]** input tensor +
+`THTensor *output` - **[OUT]** output tensor +
+`real negval` - negative part slope +
+`bool inplace` - if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated) +
+```C +void THNN_LeakyReLU_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real negval, + bool inplace); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - **[MODIFIED]** gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. the input +
+`real negval` - negative part slope +
+`bool inplace` - if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) +
+## LogSigmoid +```C +void THNN_LogSigmoid_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *output` - output tensor +
+`THTensor *buffer` - **[BUFFER]** +
+```C +void THNN_LogSigmoid_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`THTensor *buffer` - **[BUFFER]** +
+## LogSoftMax +```C +void THNN_LogSoftMax_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** output tensor +
+```C +void THNN_LogSoftMax_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. module's output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`THTensor *output` - module's output +
+## LookupTable +```C +void THNN_LookupTable_accGradParameters( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, + THTensor *indices, + bool scaleGradByFreq, + int paddingValue, + real scale); +``` +## MSECriterion +```C +void THNN_MSECriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +``` +```C +void THNN_MSECriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); +``` +## MarginCriterion +```C +void THNN_MarginCriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + real margin); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - target tensor (should contain only 1s and -1s) +
+`THTensor *output` - **[OUT]** a one-element tensor containing the loss +
+`bool sizeAverage` - if true, the loss is normalized by **total number of elements** +
+`real margin` - a margin that is required for the loss to be 0 +
+```C +void THNN_MarginCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + real margin); +``` +`THNNState *state` - library's state +
+`THTensor *input` - input tensor +
+`THTensor *target` - target tensor (should contain only 1s and -1s) +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. module's input +
+`bool sizeAverage` - if true, the gradient is normalized by **total number of elements** +
+`real margin` - a margin that is required for the loss to be 0 +
+## MultiLabelMarginCriterion +```C +void THNN_MultiLabelMarginCriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +``` +```C +void THNN_MultiLabelMarginCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); +``` +## MultiMarginCriterion +```C +void THNN_MultiMarginCriterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + int p, + THTensor* weights); +``` +```C +void THNN_MultiMarginCriterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + int p, + THTensor *weights); +``` +## PReLU +```C +void THNN_PReLU_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THIndex_t nOutputPlane); +``` +```C +void THNN_PReLU_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THIndex_t nOutputPlane); +``` +```C +void THNN_PReLU_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, + THIndex_t nOutputPlane, + real scale); +``` +## RReLU +```C +void THNN_RReLU_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace, + THGenerator *generator); +``` +```C +void THNN_RReLU_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + real lower, + real upper, + bool train, + bool inplace); +``` +## Sigmoid +```C +void THNN_Sigmoid_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +```C +void THNN_Sigmoid_updateGradInput( + THNNState *state, + THTensor *input, + 
THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); +``` +## SmoothL1Criterion +```C +void THNN_SmoothL1Criterion_updateOutput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +``` +```C +void THNN_SmoothL1Criterion_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); +``` +## SoftMax +```C +void THNN_SoftMax_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +```C +void THNN_SoftMax_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); +``` +## SoftPlus +```C +void THNN_SoftPlus_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real beta, + real threshold); +``` +```C +void THNN_SoftPlus_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + real beta, + real threshold); +``` +## SoftShrink +```C +void THNN_SoftShrink_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real lambda); +``` +```C +void THNN_SoftShrink_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real lambda); +``` +## SparseLinear +```C +void THNN_SparseLinear_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *shardBuffer); +``` +```C +void THNN_SparseLinear_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +``` +```C +void THNN_SparseLinear_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale); +``` +```C +void THNN_SparseLinear_zeroGradParameters( + THNNState *state, + THTensor *gradWeight, + THTensor 
*gradBias, + THTensor *lastInput); +``` +```C +void THNN_SparseLinear_updateParameters( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate); +``` +## SpatialAdaptiveMaxPooling +```C +void THNN_SpatialAdaptiveMaxPooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int owidth, int oheight); +``` +```C +void THNN_SpatialAdaptiveMaxPooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices); +``` +## SpatialAveragePooling +```C +void THNN_SpatialAveragePooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); +``` +```C +void THNN_SpatialAveragePooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); +``` +## SpatialBatchNormalization +```C +void THNN_SpatialBatchNormalization_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *running_mean, + THTensor *running_var, + THTensor *save_mean, + THTensor *save_std, + bool train, + double momentum, + double eps); +``` +```C +void THNN_SpatialBatchNormalization_backward( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *save_mean, + THTensor *save_std, + double scale); +``` +## SpatialConvolutionLocal +```C +void THNN_SpatialConvolutionLocal_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + 
long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +``` +```C +void THNN_SpatialConvolutionLocal_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +``` +```C +void THNN_SpatialConvolutionLocal_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + real scale); +``` +## SpatialConvolutionMM +```C +void THNN_SpatialConvolutionMM_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +``` +```C +void THNN_SpatialConvolutionMM_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +``` +```C +void THNN_SpatialConvolutionMM_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + real scale); +``` +## SpatialConvolutionMap +```C +void THNN_SpatialConvolutionMap_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** convolution output +
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW) +
+`THTensor *bias` - 1D bias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+```C +void THNN_SpatialConvolutionMap_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW) +
+`THTensor *bias` - 1D bias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+```C +void THNN_SpatialConvolutionMap_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + real scale); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. output +
+`THTensor *gradWeight` - 3D gradWeight tensor (connTable:size(1) x kH x kW) +
+`THTensor *gradBias` - 1D gradBias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+`real scale` - scaling factor +
+## SpatialFractionalMaxPooling +```C +void THNN_SpatialFractionalMaxPooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices, + THTensor *randomSamples); +``` +```C +void THNN_SpatialFractionalMaxPooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THTensor *indices); +``` +## SpatialFullConvolution +```C +void THNN_SpatialFullConvolution_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +``` +```C +void THNN_SpatialFullConvolution_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +``` +```C +void THNN_SpatialFullConvolution_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + real scale); +``` +## SpatialFullConvolutionMap +```C +void THNN_SpatialFullConvolutionMap_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *output` - **[OUT]** convolution output +
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW) +
+`THTensor *bias` - 1D bias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+```C +void THNN_SpatialFullConvolutionMap_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. output +
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input +
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW) +
+`THTensor *bias` - 1D bias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+```C +void THNN_SpatialFullConvolutionMap_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + real scale); +``` +`THNNState *state` - library state +
+`THTensor *input` - input tensor +
+`THTensor *gradOutput` - gradient w.r.t. output +
+`THTensor *gradWeight` - 3D gradWeight tensor (connTable:size(1) x kH x kW) +
+`THTensor *gradBias` - 1D gradBias tensor (nOutputPlane) +
+`THTensor *connTable` - connection table +
+`int nInputPlane` - number of input planes +
+`int nOutputPlane` - number of output planes +
+`int dW, int dH` - stride +
+`real scale` - scaling factor +
+## SpatialMaxPooling +```C +void THNN_SpatialMaxPooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); +``` +```C +void THNN_SpatialMaxPooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); +``` +## SpatialMaxUnpooling +```C +void THNN_SpatialMaxUnpooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int owidth, int oheight); +``` +```C +void THNN_SpatialMaxUnpooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int owidth, int oheight); +``` +## SpatialSubSampling +```C +void THNN_SpatialSubSampling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH); +``` +```C +void THNN_SpatialSubSampling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH); +``` +```C +void THNN_SpatialSubSampling_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + real scale); +``` +## SpatialUpSamplingNearest +```C +void THNN_SpatialUpSamplingNearest_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor); +``` +```C +void THNN_SpatialUpSamplingNearest_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor); +``` +## Sqrt +```C +void THNN_Sqrt_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real eps); +``` +```C +void THNN_Sqrt_updateGradInput( + 
THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); +``` +## Square +```C +void THNN_Square_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +```C +void THNN_Square_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); +``` +## Tanh +```C +void THNN_Tanh_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output); +``` +```C +void THNN_Tanh_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); +``` +## Threshold +```C +void THNN_Threshold_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + real threshold, + real val, + bool inplace); +``` +```C +void THNN_Threshold_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + real threshold, + bool inplace); +``` +## VolumetricAveragePooling +```C +void THNN_VolumetricAveragePooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH); +``` +```C +void THNN_VolumetricAveragePooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH); +``` +## VolumetricConvolution +```C +void THNN_VolumetricConvolution_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricConvolution_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricConvolution_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + 
THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale); +``` +## VolumetricConvolutionMM +```C +void THNN_VolumetricConvolutionMM_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricConvolutionMM_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricConvolutionMM_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale); +``` +## VolumetricFullConvolution +```C +void THNN_VolumetricFullConvolution_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricFullConvolution_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricFullConvolution_accGradParameters( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + real scale); +``` +## VolumetricMaxPooling +```C +void THNN_VolumetricMaxPooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + 
int pT, int pW, int pH, + bool ceilMode); +``` +```C +void THNN_VolumetricMaxPooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +## VolumetricMaxUnpooling +```C +void THNN_VolumetricMaxUnpooling_updateOutput( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` +```C +void THNN_VolumetricMaxUnpooling_updateGradInput( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +``` diff --git a/doc/generate_reference.lua b/doc/generate_reference.lua new file mode 100644 index 00000000000..0f75474b49f --- /dev/null +++ b/doc/generate_reference.lua @@ -0,0 +1,106 @@ +--[[ + This script regenerates api_reference.md based on comments placed in THNN.h. +]]-- + +local header = [[ +# API docs + +This document only describes a THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc). + +### Note on function names + +Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions: + +* `void THNN_FloatAbs_updateOutput(...)` +* `void THNN_DoubleAbs_updateOutput(...)` + +In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type. + +### Argument types + +Some arguments have additional tags placed in square brackets: +* **[OUT]** - This is the output argument. It will be reshaped if needed. +* **[OPTIONAL]** - This argument is optional and can be safely set to NULL +* **[BUFFER]** - A buffer. 
`updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call. +* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output). + +## Module list + +These are all modules implemented in THNN: + +]] + +local hfile = io.open('../generic/THNN.h', 'r') +local lines = hfile:read('*a'):split('\n') +hfile:close() + +-- Parse input +local declarations = {} +local current_declaration +local declaration_module +for i,line in ipairs(lines) do + if line:sub(1, 6) == 'TH_API' then + current_declaration = '' + declaration_module = line:match('THNN_%((.+)_.+%)') + end + + if current_declaration then + current_declaration = current_declaration .. line .. '\n' + end + + if line:match('%);') then + current_declaration = current_declaration:sub(1, -2) -- remove a trailing newline + declarations[declaration_module] = declarations[declaration_module] or {} + table.insert(declarations[declaration_module], current_declaration) + current_declaration = nil + declaration_module = nil + end +end +declarations["unfolded"] = nil + +-- Sort modules +modules = {} +for k,_ in pairs(declarations) do table.insert(modules, k) end +table.sort(modules) + +-- Create an index +local outfile = io.open('api_reference.md', 'w') +outfile:write(header) +for i, name in ipairs(modules) do + outfile:write(string.format('* [%s](#%s)\n', name, name:lower())) +end +outfile:write('\n') + +-- Write proper docs +for i,name in ipairs(modules) do + outfile:write('## ' .. name ..'\n') + + for i,declaration in ipairs(declarations[name]) do + + -- Write source code + outfile:write('```C' .. '\n') + local declaration_lines = declaration:split('\n') + for i, line in ipairs(declaration_lines) do + if i == 1 then + line = line:gsub('TH_API ', ''):gsub('%(', ''):gsub('%)', '') .. '(' -- remove macro junk + else + line = line:gsub('%s*//.*$', '') -- remove the comment + end + outfile:write(line .. 
'\n') + end + outfile:write('```' .. '\n') + + -- Describe arguments + table.remove(declaration_lines, 1) + for i,line in ipairs(declaration_lines) do + local param, comment = line:match('^%s*(.*),%s*// (.*)$') + if param == nil then param, comment = line:match('^%s*(.*)%);%s*// (.*)$') end + + if param ~= nil then + comment = comment:gsub('%[', '%*%*%['):gsub('%]', '%]%*%*') -- use bold font for tags + outfile:write(string.format('`%s` - %s\n
\n', param, comment)) + end + end + end +end +outfile:close() From b5d4cbbecbda7410ea01c69061018cbe8a7d70ca Mon Sep 17 00:00:00 2001 From: soumith Date: Thu, 3 Mar 2016 15:04:03 -0800 Subject: [PATCH 065/101] add test for SparseLinear, fix a spurious check --- generic/SparseLinear.c | 1 - 1 file changed, 1 deletion(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 834d97b2ae5..a84e0303956 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -50,7 +50,6 @@ void THNN_(SparseLinear_updateOutput)( long inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); - THArgCheck(THNN_(checkSize1D)(output, outDim), 3, "output size wrong"); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); From c07e0a6166e75b567b41c508a1ab08276ec09fd2 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 27 Feb 2016 19:20:05 +0100 Subject: [PATCH 066/101] Add missing declarations to THNN.h --- generic/THNN.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/generic/THNN.h b/generic/THNN.h index c3e8e91e6a6..86c63da12da 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -423,7 +423,68 @@ TH_API void THNN_(Threshold_updateGradInput)( real threshold, bool inplace); -TH_API void THNN_(SpatialBatchNormalization_updateOutput)( +TH_API void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); +TH_API void THNN_(TemporalConvolution_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + 
THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + real scale); +TH_API void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize); +TH_API void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + real scale); + +TH_API void THNN_(BatchNormalization_updateOutput)( THNNState *state, THTensor *input, THTensor *output, @@ -436,7 +497,7 @@ TH_API void THNN_(SpatialBatchNormalization_updateOutput)( bool train, double momentum, double eps); -TH_API void THNN_(SpatialBatchNormalization_backward)( +TH_API void THNN_(BatchNormalization_backward)( THNNState *state, THTensor *input, THTensor *gradOutput, From 5450e44cefa20fcc81efdc577a58148e03ceb084 Mon Sep 17 00:00:00 2001 From: fsuzanomassa Date: Tue, 8 Mar 2016 19:40:22 +0100 Subject: [PATCH 067/101] Move compilation flags from nn to THNN CMakeLists --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b3bf40595ad..2cc1960943e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,50 @@ IF(NOT THNN_INSTALL_LIB_SUBDIR) SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory") ENDIF() +# Flags +# When using MSVC +IF(MSVC) + # we want to 
respect the standard, and we are bored of those **** . + ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) +ENDIF(MSVC) + +IF (CMAKE_VERSION VERSION_LESS "3.1") + SET(CMAKE_C_FLAGS "--std=c99 ${CMAKE_C_FLAGS}") +ELSE () + SET(CMAKE_C_STANDARD 99) +ENDIF () + +# OpenMP support? +SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") +IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) + EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") + IF (DARWIN_VERSION GREATER 9) + SET(APPLE_OPENMP_SUCKS 1) + ENDIF (DARWIN_VERSION GREATER 9) + EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION) + IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) + MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") + MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas") + SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" 
FORCE) + ENDIF () +ENDIF () + +IF (WITH_OPENMP) + FIND_PACKAGE(OpenMP) + IF(OPENMP_FOUND) + MESSAGE(STATUS "Compiling with OpenMP support") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + ENDIF(OPENMP_FOUND) +ENDIF (WITH_OPENMP) + +LINK_DIRECTORIES("${Torch_INSTALL_LIB}") + SET(src init.c) ADD_LIBRARY(THNN MODULE init.c) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) From be7cefa4a4d0bad9a280f056d378369ca02d9480 Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Mon, 7 Mar 2016 13:12:37 -0800 Subject: [PATCH 068/101] Adding table input support for batched SparseLinear, implementing gradInput correctly, fixing other bugs --- generic/SparseLinear.c | 167 +++++++++++++++++++++++++++++------------ generic/THNN.h | 23 ++++-- 2 files changed, 133 insertions(+), 57 deletions(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index a84e0303956..2e24d9192c0 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -5,15 +5,21 @@ #ifdef _OPENMP #include #endif +#include #define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) #define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) -static bool THNN_(checkInput)(THTensor* t) +static bool THNN_(checkLegacyInput)(THTensor* t) { return t->nDimension == 3 && t->size[2] == 2; } +static bool THNN_(checkInput)(THTensor* t) +{ + return t->nDimension == 2 && t->size[1] == 3; +} + static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1) { return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; @@ -41,15 +47,61 @@ void THNN_(SparseLinear_updateOutput)( THTensor *input, THTensor *output, THTensor *weight, - THTensor *bias, - THTensor *cudaBuffer, - THTensor *shardBuffer) + THTensor *bias) +{ + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + long batchSize = 
THTensor_(size)(output, 0); + + THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + long nnz = THTensor_(size)(input, 0); + + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(i) schedule(static) if (nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } + + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + long h = (long)(THNN_(get2d)(input, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) { long h, i; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); - THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); @@ -105,6 +157,65 @@ void THNN_(SparseLinear_accGradParameters)( long inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, + "input must be in coo format, nnz x 3"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + 
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + long nnz = THTensor_(size)(input, 0); + // THTensor_(resize2d)(gradOutput, batchSize, outDim); + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i) schedule(static) if (\ + nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + real val = scale * THNN_(get2d)(input, i, 2); + + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + long h = (long)(THNN_(get2d)(input, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + + // gradBias += gradOutput + THTensor* buf = THTensor_(new)(); + THTensor_(sum)(buf, gradOutput, 0); + THTensor_(cadd)(gradBias, gradBias, scale, buf); + THTensor_(free)(buf); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } +} + +void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale) +{ + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); @@ -279,51 +390,7 @@ void THNN_(SparseLinear_zeroGradParameters)( } } -void THNN_(SparseLinear_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight) -{ - long h, i; - long outDim = weight->size[0]; - long inDim = weight->size[1]; - - 
THArgCheck(THNN_(checkInput)(input), 2, - "input must be a batchSize x nnz x 2 tensor"); - THArgCheck(THTensor_(isContiguous)(gradInput), 4, - "gradInput must be contiguous"); - THArgCheck(THTensor_(isContiguous)(gradOutput), 3, - "gradOutput must be contiguous"); - - long batchSize = THTensor_(size)(input, 0); - long nnz = THTensor_(size)(input, 1); - THTensor_(resize2d)(gradOutput, batchSize, outDim); - THTensor_(resize3d)(gradInput, batchSize, nnz, 2); - -#pragma omp parallel for private(h, i) schedule(static) if ( \ - batchSize > 1 && batchSize * nnz * outDim > 10000) - for (h = 0; h < batchSize; h++) { - for (i = 0; i < nnz; ++i) { - long offset = (long)(THTensor_(get3d)(input, h, i, 0)) - 1; - THTensor_(set3d)(gradInput, h, i, 0, offset + 1); - - if (offset >= 0 && offset < inDim) { - real val = THBlas_(dot)( - outDim, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(weight, offset), weight->stride[0]); - THTensor_(set3d)(gradInput, h, i, 1, val); - } else { - THError( - "index out of bound. 
updateGradInput: %d not between 1 and %d", - offset + 1, - inDim); - } - } - } -} +void THNN_(SparseLinear_cudaClearState)(THNNState *state) {} #undef ROW_PTR2 #undef COL_PTR2 diff --git a/generic/THNN.h b/generic/THNN.h index 86c63da12da..544d317dd57 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -342,16 +342,24 @@ TH_API void THNN_(SparseLinear_updateOutput)( THTensor *input, THTensor *output, THTensor *weight, - THTensor *bias, - THTensor *cudaBuffer, - THTensor *shardBuffer); -TH_API void THNN_(SparseLinear_updateGradInput)( + THTensor *bias); +TH_API void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight); -TH_API void THNN_(SparseLinear_accGradParameters)( + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + real weightDecay, + real scale); +TH_API void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_legacyAccGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, @@ -374,6 +382,7 @@ TH_API void THNN_(SparseLinear_updateParameters)( THTensor *gradBias, THTensor *lastInput, real learningRate); +TH_API void THNN_(SparseLinear_cudaClearState)(THNNState *state); TH_API void THNN_(Sqrt_updateOutput)( THNNState *state, From 747614b1aa6916e61bc6593d108076a1ef92c922 Mon Sep 17 00:00:00 2001 From: Aiden Nibali Date: Wed, 9 Mar 2016 15:54:00 +1100 Subject: [PATCH 069/101] In-place ELU --- generic/ELU.c | 40 ++++++++++++++++++++++++++++++---------- generic/THNN.h | 6 ++++-- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/generic/ELU.c b/generic/ELU.c index f313212cee1..8303de09afd 100644 --- a/generic/ELU.c +++ b/generic/ELU.c @@ -6,12 +6,22 @@ void THNN_(ELU_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - real alpha) + real alpha, + bool inplace) { - 
THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, input, real, output, - *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data; - ); + if(inplace) { + TH_TENSOR_APPLY(real, input, + if(*input_data <= 0) { + *input_data = (exp(*input_data) - 1) * alpha; + } + ); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data; + ); + } } void THNN_(ELU_updateGradInput)( @@ -20,12 +30,22 @@ void THNN_(ELU_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, THTensor *output, - real alpha) + real alpha, + bool inplace) { - THTensor_(resizeAs)(gradInput, output); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, - *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data; - ); + if(inplace) { + TH_TENSOR_APPLY2(real, gradOutput, real, output, + if(*output_data <= 0) { + *gradOutput_data *= *output_data + alpha; + } + ); + THTensor_(set)(gradInput, gradOutput); + } else { + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data; + ); + } } #endif diff --git a/generic/THNN.h b/generic/THNN.h index 86c63da12da..0253d9beb07 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -46,14 +46,16 @@ TH_API void THNN_(ELU_updateOutput)( THNNState *state, // library's state THTensor *input, // input tensor THTensor *output, // [OUT] ELU output - real alpha); // an ELU parameter (as in paper) + real alpha, // an ELU parameter (as in paper) + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(ELU_updateGradInput)( THNNState *state, // library's state THTensor *input, // input tensor THTensor *gradOutput, // gradient w.r.t. 
output THTensor *gradInput, // [OUT] gradient w.r.t. input THTensor *output, // output from a forward pass - real alpha); // an ELU parameter (as in paper) + real alpha, // an ELU parameter (as in paper) + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, // library's state From 0da5e29063a3a7473f475401e6d2b199b71ba99b Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Sun, 20 Mar 2016 00:33:22 -0400 Subject: [PATCH 070/101] Sparse Linear now does sparse updates from the last input --- generic/SparseLinear.c | 122 ++++++++++++++++++++++++++++++++++++++++- generic/THNN.h | 18 +++++- 2 files changed, 134 insertions(+), 6 deletions(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 2e24d9192c0..a3be92a9d68 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -5,7 +5,6 @@ #ifdef _OPENMP #include #endif -#include #define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) #define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) @@ -284,6 +283,79 @@ void THNN_(SparseLinear_updateParameters)( THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THNN_(checkInput)(lastInput), 6, + "input must be in coo format, nnz x 3"); + + + long nnz = THTensor_(size)(lastInput, 0); + + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(nnz); + long cnt = 0; + for (i = 0; i < nnz; i++) { + real val = THNN_(get2d)(lastInput, i, 2); + if (val == 0) { + continue; + } + long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. 
updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + if (cnt == 0) return; + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + long offset = (long)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate) +{ + long h, i; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, "input size must be batchsize x nnz x 2"); @@ -354,6 +426,52 @@ void THNN_(SparseLinear_zeroGradParameters)( THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkInput)(lastInput), 4, + "input must be in coo format, nnz x 3"); + + THTensor_(zero)(gradBias); + + long nnz = THTensor_(size)(lastInput, 0); + +#pragma omp 
parallel for private(i, j) schedule(static) if ( \ + nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + if (THNN_(get2d)(lastInput, i, 2) == 0 ) { + continue; + } + + long offset = (long)(THNN_(get2d)(lastInput, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + long stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } +} + +void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + long h, i, j; + + long outDim = gradWeight->size[0]; + long inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, "input size must be batchsize x nnz x 2"); THTensor_(zero)(gradBias); @@ -390,8 +508,6 @@ void THNN_(SparseLinear_zeroGradParameters)( } } -void THNN_(SparseLinear_cudaClearState)(THNNState *state) {} - #undef ROW_PTR2 #undef COL_PTR2 diff --git a/generic/THNN.h b/generic/THNN.h index 90f6890d9a1..6270d1d6be3 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -355,6 +355,19 @@ TH_API void THNN_(SparseLinear_accGradParameters)( THTensor *bias, real weightDecay, real scale); +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + real learningRate); TH_API void THNN_(SparseLinear_legacyUpdateOutput)( THNNState *state, THTensor *input, @@ -371,12 +384,12 @@ TH_API void 
THNN_(SparseLinear_legacyAccGradParameters)( THTensor *bias, real weightDecay, real scale); -TH_API void THNN_(SparseLinear_zeroGradParameters)( +TH_API void THNN_(SparseLinear_legacyZeroGradParameters)( THNNState *state, THTensor *gradWeight, THTensor *gradBias, THTensor *lastInput); -TH_API void THNN_(SparseLinear_updateParameters)( +TH_API void THNN_(SparseLinear_legacyUpdateParameters)( THNNState *state, THTensor *weight, THTensor *bias, @@ -384,7 +397,6 @@ TH_API void THNN_(SparseLinear_updateParameters)( THTensor *gradBias, THTensor *lastInput, real learningRate); -TH_API void THNN_(SparseLinear_cudaClearState)(THNNState *state); TH_API void THNN_(Sqrt_updateOutput)( THNNState *state, From eb8db0a42de3cce44159d83266d7668448fd6de0 Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Wed, 23 Mar 2016 16:25:29 -0700 Subject: [PATCH 071/101] Fixed Sparse Linear Bugs --- generic/SparseLinear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index a3be92a9d68..0f426ba5b9b 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -439,7 +439,7 @@ void THNN_(SparseLinear_zeroGradParameters)( continue; } - long offset = (long)(THNN_(get2d)(lastInput, i, 0)) - 1; + long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); if (gradWeight->stride[0] == 1) { From 29c91c80a287f3dae71ac5584fabee4fc9150e7c Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 26 Mar 2016 21:19:28 +0100 Subject: [PATCH 072/101] Fix SpatialFullConvolutionMap bug #521 --- generic/SpatialFullConvolutionMap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/generic/SpatialFullConvolutionMap.c b/generic/SpatialFullConvolutionMap.c index b1ebcb89a89..bbb0282b77b 100644 --- a/generic/SpatialFullConvolutionMap.c +++ b/generic/SpatialFullConvolutionMap.c @@ -3,7 +3,7 @@ #else void 
THNN_(SpatialFullConvolutionMap_updateOutput)( - THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias, THTensor *connTable, int nInputPlane, int nOutputPlane, int dW, int dH) { @@ -20,14 +20,14 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); THTensor_(resize3d)( - output, nOutputPlane, + output_, nOutputPlane, (input->size[1] - 1) * dH + kH, (input->size[2] - 1) * dW + kW ); /* contiguous */ input = THTensor_(newContiguous)(input); - output = THTensor_(newContiguous)(output); + THTensor* output = THTensor_(newContiguous)(output_); /* get raw pointers */ real *input_data = THTensor_(data)(input); @@ -80,11 +80,11 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( /* clean up */ THTensor_(free)(input); - THTensor_(free)(output); + THTensor_(freeCopyTo)(output, output_); } void THNN_(SpatialFullConvolutionMap_updateGradInput)( - THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias, THTensor *connTable, int nInputPlane, int nOutputPlane, int dW, int dH) { @@ -95,7 +95,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( ); /* contiguous */ - gradInput = THTensor_(newContiguous)(gradInput); + THTensor* gradInput = THTensor_(newContiguous)(gradInput_); gradOutput = THTensor_(newContiguous)(gradOutput); /* Resize/Zero */ @@ -142,7 +142,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( } /* clean up */ - THTensor_(free)(gradInput); + THTensor_(freeCopyTo)(gradInput, gradInput_); THTensor_(free)(gradOutput); } From 9665854c9e95b2009561acdff499ef5020d586b6 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 28 Mar 2016 11:52:28 -0700 Subject: [PATCH 073/101] Add missing 'omp' to 
pragma --- generic/BatchNormalization.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/BatchNormalization.c b/generic/BatchNormalization.c index 9bcee2106de..418860981b5 100644 --- a/generic/BatchNormalization.c +++ b/generic/BatchNormalization.c @@ -12,7 +12,7 @@ void THNN_(BatchNormalization_updateOutput)( long nInput = THTensor_(size)(input, 1); long n = THTensor_(nElement)(input) / nInput; - #pragma parallel for + #pragma omp parallel for for (long f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *out = THTensor_(newSelect)(output, 1, f); @@ -75,7 +75,7 @@ void THNN_(BatchNormalization_backward)( // Y = Q(X) / σ ; i.e. BN output before weight and bias // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w - #pragma parallel for + #pragma omp parallel for for (long f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); From 0bf2fc46501e64ab5ebda729d14d4f87687d8887 Mon Sep 17 00:00:00 2001 From: Xianjie Chen Date: Mon, 11 Apr 2016 13:40:19 -0700 Subject: [PATCH 074/101] [LookupTable] Add Max-norm constraints to LookupTable (#739) --- generic/LookupTable.c | 94 +++++++++++++++++++++++++++++++++++++++++++ generic/THNN.h | 7 ++++ 2 files changed, 101 insertions(+) diff --git a/generic/LookupTable.c b/generic/LookupTable.c index 852598253e6..a35ff8496ff 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -116,4 +116,98 @@ void THNN_(LookupTable_accGradParameters)( THTensor_(free)(gradOutput); } +/* + * Keep the norm of weight smaller than maxNorm + */ + +static void THNN_(LookupTable_renormRow)( + real *row_data, + long stride, + real maxNorm, + real normType) +{ + real norm = 0; + real new_norm; + long j; + for (j=0; j maxNorm) + { + new_norm = maxNorm / (norm + 1e-7); + for (j=0; j numw) + THError("input out of range"); + // get unique indices + qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); 
+ long ptr = 0; + for (i=0; i 1000) + { + // The strategy is to parallelize over the rows that appear in + // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads]. + // This distributes the work evenly to each thread. + #pragma omp parallel for private(i) + for (i=0; i Date: Sat, 16 Apr 2016 15:23:48 +0100 Subject: [PATCH 075/101] Modify the backward equation in the comment --- generic/BatchNormalization.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/BatchNormalization.c b/generic/BatchNormalization.c index 418860981b5..cf2dc1811a4 100644 --- a/generic/BatchNormalization.c +++ b/generic/BatchNormalization.c @@ -73,7 +73,7 @@ void THNN_(BatchNormalization_backward)( // Q(X) = X - E[x] ; i.e. input centered to zero mean // Y = Q(X) / σ ; i.e. BN output before weight and bias - // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y / n) / σ * w #pragma omp parallel for for (long f = 0; f < nInput; ++f) { From 4f985e9244d049671150096a7d96e655fa54769e Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Thu, 14 Apr 2016 16:49:53 +0800 Subject: [PATCH 076/101] BatchNormalization: add evaluation mode, add doc for nn.Jacobian --- generic/BatchNormalization.c | 47 +++++++++++++++++++++++++----------- generic/THNN.h | 6 ++++- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/generic/BatchNormalization.c b/generic/BatchNormalization.c index cf2dc1811a4..2a4823c7d9e 100644 --- a/generic/BatchNormalization.c +++ b/generic/BatchNormalization.c @@ -66,22 +66,26 @@ void THNN_(BatchNormalization_updateOutput)( void THNN_(BatchNormalization_backward)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, - THTensor *save_mean, THTensor *save_std, double scale) + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double scale, double eps) { long nInput = 
THTensor_(size)(input, 1); long n = THTensor_(nElement)(input) / nInput; - // Q(X) = X - E[x] ; i.e. input centered to zero mean - // Y = Q(X) / σ ; i.e. BN output before weight and bias - // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y / n) / σ * w - #pragma omp parallel for for (long f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); - real mean = THTensor_(get1d)(save_mean, f); - real invstd = THTensor_(get1d)(save_std, f); real w = weight ? THTensor_(get1d)(weight, f) : 1; + real mean, invstd; + if (train) { + mean = THTensor_(get1d)(save_mean, f); + invstd = THTensor_(get1d)(save_std, f); + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } // sum over all gradOutput in feature plane accreal sum = 0; @@ -95,14 +99,29 @@ void THNN_(BatchNormalization_backward)( if (gradInput) { THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f); - // projection of gradOutput on to output scaled by std - real k = (real) dotp * invstd * invstd / n; - TH_TENSOR_APPLY2(real, gradIn, real, in, - *gradIn_data = (*in_data - mean) * k;); + if (train) { + // when in training mode + // Q(X) = X - E[x] ; i.e. input centered to zero mean + // Y = Q(X) / σ ; i.e. BN output before weight and bias + // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w - accreal gradMean = sum / n; - TH_TENSOR_APPLY2(real, gradIn, real, gradOut, - *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); + // projection of gradOutput on to output scaled by std + real k = (real) dotp * invstd * invstd / n; + TH_TENSOR_APPLY2(real, gradIn, real, in, + *gradIn_data = (*in_data - mean) * k;); + + accreal gradMean = sum / n; + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;); + + } else { + // when in evaluation mode + // Q(X) = X - running_mean ; i.e. 
input centered to zero mean + // Y = Q(X) / running_std ; i.e. BN output before weight and bias + // dL/dX = w / running_std + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = *gradOut_data * invstd * w;); + } THTensor_(free)(gradIn); } diff --git a/generic/THNN.h b/generic/THNN.h index 9c74622a874..337c4314a31 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -535,9 +535,13 @@ TH_API void THNN_(BatchNormalization_backward)( THTensor *gradWeight, THTensor *gradBias, THTensor *weight, + THTensor *running_mean, + THTensor *running_var, THTensor *save_mean, THTensor *save_std, - double scale); + bool train, + double scale, + double eps); TH_API void THNN_(SpatialConvolutionMap_updateOutput)( THNNState *state, // library state From cdd7fbf1324a8e5b9e421e40c56546b1d61ba8a5 Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Mon, 18 Apr 2016 13:14:54 +0800 Subject: [PATCH 077/101] add noBias for nn.Linear and nn.SpatialConvolution remove files add noBias for nn.Linear and nn.SpatialConvolution do not allocate new buffers when in noBias fix a typo fix a typo add noBias for nn.Linear and nn.SpatialConvolution --- generic/SpatialConvolutionMM.c | 28 ++++++++++++++++------------ generic/THNN.h | 1 - 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index e3eec5120d6..a549a373c39 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -29,9 +29,12 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)( output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, nOutputPlane, -1, outputHeight*outputWidth, -1); - - for(i = 0; i < nOutputPlane; i++) - THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); + if (bias) { + for(i = 0; i < nOutputPlane; i++) + THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, 
i), outputHeight*outputWidth); + } else { + THTensor_(zero)(output); + } THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); @@ -154,7 +157,6 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, THTensor *weight, - THTensor *bias, THTensor *finput, THTensor *fgradInput, int kW, @@ -217,14 +219,16 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); THTensor_(transpose)(finput, finput, 0, 1); - for(i = 0; i < gradBias->size[0]; i++) - { - long k; - real sum = 0; - real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for(k = 0; k < gradOutput2d->size[1]; k++) - sum += data[k]; - (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + if (gradBias) { + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + } } THTensor_(free)(gradOutput2d); diff --git a/generic/THNN.h b/generic/THNN.h index 9c74622a874..1916769777f 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -589,7 +589,6 @@ TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, THTensor *weight, - THTensor *bias, THTensor *finput, THTensor *fgradInput, int kW, int kH, From b3a4c61b02f3c6976c1271dcc4d4200eb54e8f68 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 26 Apr 2016 18:34:24 +0200 Subject: [PATCH 078/101] Add SpatialClassNLLCriterion --- generic/SpatialClassNLLCriterion.c | 123 +++++++++++++++++++++++++++++ generic/THNN.h | 17 ++++ init.c | 3 + 3 files changed, 143 insertions(+) create mode 100644 generic/SpatialClassNLLCriterion.c diff --git a/generic/SpatialClassNLLCriterion.c 
b/generic/SpatialClassNLLCriterion.c new file mode 100644 index 00000000000..7d7c862543d --- /dev/null +++ b/generic/SpatialClassNLLCriterion.c @@ -0,0 +1,123 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c" +#else + +#define INITIAL_CHECK \ + THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \ + "only batches of spatial targets supported (3D tensors)"); \ + THArgCheck(THTensor_(nDimension)(input) == 4, 2, \ + "only batches of spatial inputs supported (4D tensors)"); \ + \ + { \ + long input0 = THTensor_(size)(input, 0); \ + long input1 = THTensor_(size)(input, 1); \ + long input2 = THTensor_(size)(input, 2); \ + long input3 = THTensor_(size)(input, 3); \ + long target0 = THIndexTensor_(size)(target, 0); \ + long target1 = THIndexTensor_(size)(target, 1); \ + long target2 = THIndexTensor_(size)(target, 2); \ + THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \ + "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \ + input0, input1, input2, input3, target0, target1, target2); \ + } + +void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) +{ + INITIAL_CHECK; + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? 
THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + long batch_size = THTensor_(size)(input, 0); + long n_classes = THTensor_(size)(input, 1); + long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + long sample_size = map_size * n_classes; + + real total_weight_acc = 0; + real output_acc = 0; + for (int b = 0; b < batch_size; b++) { + for (int elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - 1; + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? weights_data[cur_target] : 1.0f; + total_weight_acc += cur_weight; + output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight; + } + } + *total_weight_data = total_weight_acc; + *output_data = output_acc; + + if (sizeAverage && *total_weight_data) + *output_data /= *total_weight_data; + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) +{ + INITIAL_CHECK; + THArgCheck(THTensor_(isContiguous)(gradInput), 4, + "gradInput must be contiguous"); + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) + return; + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? 
THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + long batch_size = THTensor_(size)(input, 0); + long n_classes = THTensor_(size)(input, 1); + long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + long sample_size = map_size * n_classes; + + real normalize = sizeAverage ? *total_weight_data : 1.0f; + +#pragma omp parallel for private(i) + for (int b = 0; b < batch_size; b++) { + for (int elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - 1; + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[b * sample_size + cur_target * map_size + elem] = + -(weights ? weights_data[cur_target] : 1.0f) / normalize; + } + } + + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +#undef INITIAL_CHECK + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index fc1f9027baf..45691d93228 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -42,6 +42,23 @@ TH_API void THNN_(ClassNLLCriterion_updateGradInput)( THTensor *weights, // [OPTIONAL] class weights THTensor *total_weight); // [BUFFER] +TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] +TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] + TH_API void THNN_(ELU_updateOutput)( THNNState *state, // library's state THTensor *input, // input tensor diff --git a/init.c b/init.c index 6367b20e9b5..022d1c3ad05 100644 --- a/init.c +++ b/init.c @@ -13,6 +13,9 @@ #include "generic/ClassNLLCriterion.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialClassNLLCriterion.c" +#include "THGenerateFloatTypes.h" + #include "generic/DistKLDivCriterion.c" #include "THGenerateFloatTypes.h" From 95a5e252286b214446fec7e675ce7d280c5df749 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 26 Apr 2016 18:38:28 +0200 Subject: [PATCH 079/101] Fix ClassNLLCriterion buffer --- generic/SpatialClassNLLCriterion.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/SpatialClassNLLCriterion.c b/generic/SpatialClassNLLCriterion.c index 7d7c862543d..011021a00c1 100644 --- a/generic/SpatialClassNLLCriterion.c +++ b/generic/SpatialClassNLLCriterion.c @@ -102,7 +102,7 @@ void THNN_(SpatialClassNLLCriterion_updateGradInput)( real normalize = sizeAverage ? 
*total_weight_data : 1.0f; -#pragma omp parallel for private(i) +#pragma omp parallel for for (int b = 0; b < batch_size; b++) { for (int elem = 0; elem < map_size; elem++) { int cur_target = target_data[b * map_size + elem] - 1; From 100c45ac69e1d394397327ec30dcc939e62d3a53 Mon Sep 17 00:00:00 2001 From: soumith Date: Wed, 27 Apr 2016 12:04:59 -0700 Subject: [PATCH 080/101] MultiLabelMarginCriterion fixes for CUDA --- generic/LogSoftMax.c | 2 +- generic/MultiLabelMarginCriterion.c | 54 ++++++++++++++++------------- generic/THNN.h | 2 ++ 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/generic/LogSoftMax.c b/generic/LogSoftMax.c index 73d96232d96..20891184994 100644 --- a/generic/LogSoftMax.c +++ b/generic/LogSoftMax.c @@ -46,7 +46,7 @@ void THNN_(LogSoftMax_updateOutput)( maxInput = THMax(maxInput, input_data[d]); for (d = 0; d < dim; d++) - logsum += THExpMinusApprox(maxInput-input_data[d]); + logsum += exp(input_data[d] - maxInput); logsum = maxInput + log(logsum); for (d = 0; d < dim; d++) diff --git a/generic/MultiLabelMarginCriterion.c b/generic/MultiLabelMarginCriterion.c index 633e062be67..4cbb0004ea1 100644 --- a/generic/MultiLabelMarginCriterion.c +++ b/generic/MultiLabelMarginCriterion.c @@ -7,9 +7,10 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( THTensor *input, THTensor *target, THTensor *output, + THTensor *isTarget, bool sizeAverage) { - real *input_data, *target_data; + real *input_data, *target_data, *isTarget_data; long nframe, dim; long t, d, dt, ddt; real sum; @@ -37,9 +38,20 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( input_data = THTensor_(data)(input); target_data = THTensor_(data)(target); + THTensor_(resizeAs)(isTarget, target); + THTensor_(zero)(isTarget); + isTarget_data = THTensor_(data)(isTarget); + sum = 0; for (t = 0; t < nframe; t++) { + for (ddt = 0; ddt < dim; ddt++) + { + long target_idx = (long)target_data[ddt]-1; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } for (dt = 0; 
dt < dim; dt++) { long target_idx = (long)target_data[dt]-1; @@ -50,16 +62,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { - int istarget = 0; - for(ddt = 0; ddt < dim; ddt++) - { - if (!target_data[ddt]) - break; - if (((long)target_data[ddt])-1 == d) - istarget = 1; - } - - if (!istarget) + if (!isTarget_data[d]) { real z = 1 - input_target + input_data[d]; if (z > 0) @@ -69,10 +72,11 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } input_data += dim; target_data += dim; + isTarget_data += dim; } sum /= dim; - if(sizeAverage) + if (sizeAverage) sum /= nframe; THTensor_(set1d)(output, 0, sum); @@ -86,13 +90,15 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THTensor *input, THTensor *target, THTensor *gradInput, + THTensor *isTarget, bool sizeAverage) { real *input_data; real *gradInput_data; real *target_data; + real *isTarget_data; long nframe, dim; - long t, d, dt, ddt; + long t, d, dt; real g; THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected"); @@ -102,23 +108,30 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( nframe = 1; dim = input->size[0]; THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size"); + THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, "inconsistent isTarget size"); } else { nframe = input->size[0]; dim = input->size[1]; THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size"); + THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) && (isTarget->size[1] == dim), 3, "inconsistent isTarget size"); } THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range"); THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range"); + THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range"); + 
THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range"); + target = THTensor_(newContiguous)(target); input = THTensor_(newContiguous)(input); + isTarget = THTensor_(newContiguous)(isTarget); input_data = THTensor_(data)(input); target_data = THTensor_(data)(target); + isTarget_data = THTensor_(data)(isTarget); - g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)nframe)); + g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) ); THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); @@ -136,16 +149,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { - int istarget = 0; - for (ddt = 0; ddt < dim; ddt++) - { - if (!target_data[ddt]) - break; - if (((long)target_data[ddt])-1 == d) - istarget = 1; - } - - if (!istarget) + if (!isTarget_data[d]) { real z = 1 - input_target + input_data[d]; if (z > 0) @@ -158,11 +162,13 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( } input_data += dim; target_data += dim; + isTarget_data += dim; gradInput_data += dim; } THTensor_(free)(input); THTensor_(free)(target); + THTensor_(free)(isTarget); } #endif diff --git a/generic/THNN.h b/generic/THNN.h index fc1f9027baf..dde95904b73 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -211,12 +211,14 @@ TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( THTensor *input, THTensor *target, THTensor *output, + THTensor *isTarget, bool sizeAverage); TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, + THTensor *isTarget, bool sizeAverage); TH_API void THNN_(MultiMarginCriterion_updateOutput)( From b1abb4a311141f04cc9e0b7d4ca42bddef6c24b9 Mon Sep 17 00:00:00 2001 From: soumith Date: Wed, 27 Apr 2016 14:21:19 -0700 Subject: [PATCH 081/101] adding SpatialDilatedConvolution + tests + doc --- generic/SpatialDilatedConvolution.c | 337 ++++++++++++++++++++++++++++ 
generic/SpatialFullConvolution.c | 90 ++++---- generic/THNN.h | 39 ++++ init.c | 3 + 4 files changed, 426 insertions(+), 43 deletions(-) create mode 100644 generic/SpatialDilatedConvolution.c diff --git a/generic/SpatialDilatedConvolution.c b/generic/SpatialDilatedConvolution.c new file mode 100644 index 00000000000..3f75016dcbd --- /dev/null +++ b/generic/SpatialDilatedConvolution.c @@ -0,0 +1,337 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c" +#else + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + if (outputWidth < 1 || outputHeight < 1) + 
THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + long m_ = nOutputPlane; + long n_ = outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = columns->size[1]; + long k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + 
THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + 
THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + long m = nInputPlane*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(gradColumns), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } +} + + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + real scale) +{ + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be 
greater than zero"); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize2d)(ones, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = nInputPlane*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix 
A and B + long m_ = nOutputPlane; + long k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + } +} + +#endif diff --git a/generic/SpatialFullConvolution.c b/generic/SpatialFullConvolution.c index de2c18fc06a..20dd1268528 100644 --- a/generic/SpatialFullConvolution.c +++ b/generic/SpatialFullConvolution.c @@ -2,55 +2,56 @@ #define TH_GENERIC_FILE "generic/SpatialFullConvolution.c" #else - static void THNN_(im2col)(const real* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - real* data_col) { - int c, h, w; - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int channels_col = channels * kernel_h * kernel_w; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % kernel_w; - int h_offset = (c / kernel_w) % kernel_h; - int c_im = c / kernel_h / kernel_w; - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = 0; + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + 
const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + real* data_col) { + const int height_col = (height + 2 * pad_h - + (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_col = (width + 2 * pad_w - + (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channels_col = channels * kernel_h * kernel_w; + for (int c_col = 0; c_col < channels_col; ++c_col) { + int w_offset = c_col % kernel_w; + int h_offset = (c_col / kernel_w) % kernel_h; + int c_im = c_col / kernel_h / kernel_w; + for (int h_col = 0; h_col < height_col; ++h_col) { + for (int w_col = 0; w_col < width_col; ++w_col) { + int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? + data_im[(c_im * height + h_im) * width + w_im] : 0; } } } } static void THNN_(col2im)(const real* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - real* data_im) { - int c, h, w; - memset(data_im, 0, sizeof(real)*height * width * channels); - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_h * patch_w; - for (c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int c_im = c / patch_h / patch_w; - for (h = 0; h < height_col; ++h) { - for (w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; + const int height, const int width, const int 
kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + real* data_im) { + memset(data_im, 0, sizeof(real) * height * width * channels); + const int height_col = (height + 2 * pad_h - + (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_col = (width + 2 * pad_w - + (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channels_col = channels * kernel_h * kernel_w; + for (int c_col = 0; c_col < channels_col; ++c_col) { + int w_offset = c_col % kernel_w; + int h_offset = (c_col / kernel_w) % kernel_h; + int c_im = c_col / kernel_h / kernel_w; + for (int h_col = 0; h_col < height_col; ++h_col) { + for (int w_col = 0; w_col < width_col; ++w_col) { + int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; } } } @@ -68,7 +69,7 @@ void THNN_(SpatialFullConvolution_updateOutput)( int dW, int dH, int padW, int padH, int adjW, int adjH) -{ +{ int nInputPlane = THTensor_(size)(weight,0); int nOutputPlane = THTensor_(size)(weight,1); @@ -139,6 +140,7 @@ void THNN_(SpatialFullConvolution_updateOutput)( THNN_(col2im)( THTensor_(data)(columns), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THTensor_(data)(output_n) ); @@ -227,6 +229,7 @@ void THNN_(SpatialFullConvolution_updateGradInput)( THNN_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THTensor_(data)(gradColumns) ); @@ -323,6 +326,7 @@ void THNN_(SpatialFullConvolution_accGradParameters)( THNN_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THTensor_(data)(columns) ); diff --git 
a/generic/THNN.h b/generic/THNN.h index fc1f9027baf..88a91d4250d 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -772,6 +772,45 @@ TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( int dW, int dH, // stride real scale); // scaling factor +TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + real scale); + TH_API void THNN_(SpatialMaxPooling_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index 6367b20e9b5..8f65fc7d74d 100644 --- a/init.c +++ b/init.c @@ -121,6 +121,9 @@ #include "generic/SpatialFullConvolutionMap.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + #include "generic/SpatialAdaptiveMaxPooling.c" #include "THGenerateFloatTypes.h" From 684f4e2b1835864704964dd81c7bf555631175c8 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Fri, 29 Apr 2016 21:03:59 +0200 Subject: [PATCH 082/101] Fix CMakeLists for Intel compilers --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cc1960943e..807b9907e22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 
+17,7 @@ IF(MSVC) ENDIF(MSVC) IF (CMAKE_VERSION VERSION_LESS "3.1") - SET(CMAKE_C_FLAGS "--std=c99 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}") ELSE () SET(CMAKE_C_STANDARD 99) ENDIF () From 06f1c8a2e192fed2d62750a64bb6bfa220f71dca Mon Sep 17 00:00:00 2001 From: fsuzanomassa Date: Mon, 2 May 2016 18:59:48 +0200 Subject: [PATCH 083/101] Remove THExpMinusApprox from SoftMax Should address #804 --- generic/SoftMax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/SoftMax.c b/generic/SoftMax.c index 78bec4dbba6..8bccefdba3c 100644 --- a/generic/SoftMax.c +++ b/generic/SoftMax.c @@ -64,7 +64,7 @@ void THNN_(SoftMax_updateOutput)( sum = 0; for (d = 0; d < dim; d++) { - real z = THExpMinusApprox(inputMax - input_ptr[d*stride]); + real z = exp(input_ptr[d*stride] - inputMax); output_ptr[d*stride] = z; sum += z; } From d424e34e376ba0b238f8efdb642d3218b891f253 Mon Sep 17 00:00:00 2001 From: Zeming Lin Date: Tue, 10 May 2016 01:30:09 -0700 Subject: [PATCH 084/101] Fixing sparse linear race condition --- generic/SparseLinear.c | 104 +++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index 0f426ba5b9b..b7bf8abd537 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -48,7 +48,7 @@ void THNN_(SparseLinear_updateOutput)( THTensor *weight, THTensor *bias) { - long h, i; + long h, i, j, hp0, hp1; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); long batchSize = THTensor_(size)(output, 0); @@ -59,25 +59,43 @@ void THNN_(SparseLinear_updateOutput)( long nnz = THTensor_(size)(input, 0); + THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); + THLongTensor_zero(csr); + +//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; i 10000) - for (i = 0; i < nnz; i++) { - real val = THNN_(get2d)(input, i, 2); - if (val == 0) { - continue; - } 
+#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) + for (h = 0; h < batchSize; h++) { + long i_start = THLongTensor_get1d(csr, h); + long i_end = THLongTensor_get1d(csr, h+1); + for (i = i_start; i < i_end; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } - long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; - long h = (long)(THNN_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) { - THBlas_(axpy)(outDim, - val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); - } else { - THError("index out of bound. updateOutput: %d not between 1 and %d", - offset + 1, inDim); + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } } } @@ -151,7 +169,7 @@ void THNN_(SparseLinear_accGradParameters)( real weightDecay, real scale) { - long h, i; + long h, i, col, hp0, hp1; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); @@ -165,26 +183,42 @@ void THNN_(SparseLinear_accGradParameters)( "gradOutput must be contiguous"); long nnz = THTensor_(size)(input, 0); - // THTensor_(resize2d)(gradOutput, batchSize, outDim); + + THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); + THLongTensor_zero(csc); + +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i = 0; i < nnz; i++) { + hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1; + hp1 = (i+1 == nnz) ? 
+ inDim : + (long)(THNN_(get2d)(input, i+1, 1)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csc, h+1, i+1); + } + } // gradWeight += gradOutput * input -#pragma omp parallel for private(h, i) schedule(static) if (\ - nnz * outDim > 10000) - for (i = 0; i < nnz; i++) { - real val = scale * THNN_(get2d)(input, i, 2); +#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) + for (col = 0; col < inDim; col++) { + long i_start = THLongTensor_get1d(csc, col); + long i_end = THLongTensor_get1d(csc, col+1); + for (i = i_start; i < i_end; i++) { + real val = scale * THNN_(get2d)(input, i, 2); - long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; - long h = (long)(THNN_(get2d)(input, i, 0)) - 1; - if (offset >= 0 && offset < inDim) { - THBlas_(axpy)(outDim, - val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); - } else { - THError( - "index out of bound. accGradParameters: %d not between 1 and %d", - offset + 1, - inDim); + h = (long)(THNN_(get2d)(input, i, 0)) - 1; + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } } } From 922e2a8772be342e72a3b15d594fe546d91aa4f5 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 24 May 2016 17:28:23 -0400 Subject: [PATCH 085/101] logsoftmax non-contiguous gradOutput fix --- generic/LogSoftMax.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/generic/LogSoftMax.c b/generic/LogSoftMax.c index 20891184994..3160d8a88b5 100644 --- a/generic/LogSoftMax.c +++ b/generic/LogSoftMax.c @@ -63,6 +63,8 @@ void THNN_(LogSoftMax_updateGradInput)( THTensor *gradInput, THTensor *output) { + + gradOutput = THTensor_(newContiguous)(gradOutput); real *gradInput_data, *gradOutput_data, *output_data; long nframe = 0, dim = 0; long t, d; @@ -101,6 +103,8 @@ void THNN_(LogSoftMax_updateGradInput)( for (d = 0; d < dim; d++) gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum; } + + THTensor_(free)(gradOutput); } #endif From 7e5a1d91405b509e143b24a2f62ae7245dcaf512 Mon Sep 17 00:00:00 2001 From: Eric Cosatto Date: Fri, 27 May 2016 10:24:47 -0400 Subject: [PATCH 086/101] Visual Studio doesn't allow in-loop declaration in the 'omp parallel for' construct --- generic/BatchNormalization.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/BatchNormalization.c b/generic/BatchNormalization.c index 2a4823c7d9e..bf36d30035a 100644 --- a/generic/BatchNormalization.c +++ b/generic/BatchNormalization.c @@ -10,10 +10,10 @@ void THNN_(BatchNormalization_updateOutput)( bool train, double momentum, double eps) { long nInput = THTensor_(size)(input, 1); - long n = THTensor_(nElement)(input) / nInput; + long f,n = THTensor_(nElement)(input) / nInput; #pragma omp parallel for - for (long f = 0; f < nInput; ++f) { + for (f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *out = THTensor_(newSelect)(output, 1, f); @@ -71,10 +71,10 @@ void THNN_(BatchNormalization_backward)( bool train, double scale, double eps) { long 
nInput = THTensor_(size)(input, 1); - long n = THTensor_(nElement)(input) / nInput; + long f,n = THTensor_(nElement)(input) / nInput; #pragma omp parallel for - for (long f = 0; f < nInput; ++f) { + for (f = 0; f < nInput; ++f) { THTensor *in = THTensor_(newSelect)(input, 1, f); THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f); real w = weight ? THTensor_(get1d)(weight, f) : 1; From 801c62e9a06a46d05890c7d9d654774406d5f19a Mon Sep 17 00:00:00 2001 From: Eric Cosatto Date: Fri, 27 May 2016 10:26:21 -0400 Subject: [PATCH 087/101] Visual Studio doesn't allow in-loop declaration in the 'omp parallel for' construct --- generic/SpatialClassNLLCriterion.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/generic/SpatialClassNLLCriterion.c b/generic/SpatialClassNLLCriterion.c index 011021a00c1..3121c307c26 100644 --- a/generic/SpatialClassNLLCriterion.c +++ b/generic/SpatialClassNLLCriterion.c @@ -102,9 +102,10 @@ void THNN_(SpatialClassNLLCriterion_updateGradInput)( real normalize = sizeAverage ? 
*total_weight_data : 1.0f; + int b,elem; #pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int elem = 0; elem < map_size; elem++) { + for (b = 0; b < batch_size; b++) { + for (elem = 0; elem < map_size; elem++) { int cur_target = target_data[b * map_size + elem] - 1; THAssert(cur_target >= 0 && cur_target < n_classes); From 360c14b2c5386fb00fe7a27bbaefe065858ccc71 Mon Sep 17 00:00:00 2001 From: Eric Cosatto Date: Fri, 27 May 2016 10:56:36 -0400 Subject: [PATCH 088/101] lib prefix for libTHNN.dll is missing on Windows --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 807b9907e22..b221d595d04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,10 @@ LINK_DIRECTORIES("${Torch_INSTALL_LIB}") SET(src init.c) ADD_LIBRARY(THNN MODULE init.c) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +### Torch packages supposes libraries prefix is "lib" +SET_TARGET_PROPERTIES(THNN PROPERTIES + PREFIX "lib" + IMPORT_PREFIX "lib") TARGET_LINK_LIBRARIES(THNN TH) INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR}) From 445117e6a45b30468a112cfd57aa9c92773178ef Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Sat, 4 Jun 2016 18:56:41 -0500 Subject: [PATCH 089/101] fix memory leak in SparseLinear (#844) --- generic/SparseLinear.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/generic/SparseLinear.c b/generic/SparseLinear.c index b7bf8abd537..807280e01ec 100644 --- a/generic/SparseLinear.c +++ b/generic/SparseLinear.c @@ -105,6 +105,7 @@ void THNN_(SparseLinear_updateOutput)( THTensor_(cadd)(output_row, bias, 1.0, output_row); } THTensor_(free)(output_row); + THLongTensor_free(csr); } void THNN_(SparseLinear_legacyUpdateOutput)( @@ -227,6 +228,7 @@ void THNN_(SparseLinear_accGradParameters)( THTensor_(sum)(buf, gradOutput, 0); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); + THLongTensor_free(csc); if (weightDecay != 0) { 
THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); From 1882a097e2198fcaa4dd7bcbc07ec605051dd460 Mon Sep 17 00:00:00 2001 From: Jonathan Tompson Date: Tue, 14 Jun 2016 11:22:00 -0700 Subject: [PATCH 090/101] Added ReLU6 layer, test and doc. --- generic/ReLU6.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ generic/THNN.h | 12 ++++++++++ init.c | 3 +++ 3 files changed, 73 insertions(+) create mode 100644 generic/ReLU6.c diff --git a/generic/ReLU6.c b/generic/ReLU6.c new file mode 100644 index 00000000000..2dc53f1bf2f --- /dev/null +++ b/generic/ReLU6.c @@ -0,0 +1,58 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ReLU6.c" +#else + +void THNN_(ReLU6_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + bool inplace) +{ + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + *input_data = 0; + else if (*input_data >= 6) + *input_data = 6; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = + (*input_data > 0) ? ((*input_data < 6) ? 
*input_data : 6) : 0; + ); + } +} + +void THNN_(ReLU6_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + bool inplace) +{ + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if ((*input_data) <= 0 || (*input_data) >= 6) + *gradOutput_data = 0; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > 0 && (*input_data) < 6) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); + } +} + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index 1600fb1d399..06ac2cad6f2 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -472,6 +472,18 @@ TH_API void THNN_(Threshold_updateGradInput)( real threshold, bool inplace); +TH_API void THNN_(ReLU6_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + bool inplace); +TH_API void THNN_(ReLU6_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + bool inplace); + TH_API void THNN_(TemporalConvolution_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index 7c0de94fa6a..b2b84fc7e3c 100644 --- a/init.c +++ b/init.c @@ -94,6 +94,9 @@ #include "generic/Threshold.c" #include "THGenerateFloatTypes.h" +#include "generic/ReLU6.c" +#include "THGenerateFloatTypes.h" + #include "generic/TemporalConvolution.c" #include "THGenerateFloatTypes.h" From dd439d77b8bbc3e287f242920c2c2032f2cee602 Mon Sep 17 00:00:00 2001 From: Jonathan Tompson Date: Fri, 17 Jun 2016 13:24:08 -0700 Subject: [PATCH 091/101] Added VolumetricReplicationPadding. 
--- generic/THNN.h | 9 + generic/VolumetricReplicationPadding.c | 301 +++++++++++++++++++++++++ init.c | 4 + 3 files changed, 314 insertions(+) create mode 100644 generic/VolumetricReplicationPadding.c diff --git a/generic/THNN.h b/generic/THNN.h index 06ac2cad6f2..fa38f243e7a 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -1105,4 +1105,13 @@ TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, int pad_l, int pad_r, int pad_t, int pad_b); +TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, int pleft, + int pright, int ptop, int pbottom, int pfront, int pback); + +TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, + THTensor *gradInput, int pleft, int pright, int ptop, int pbottom, + int pfront, int pback); + #endif diff --git a/generic/VolumetricReplicationPadding.c b/generic/VolumetricReplicationPadding.c new file mode 100644 index 00000000000..c4ab02e5bca --- /dev/null +++ b/generic/VolumetricReplicationPadding.c @@ -0,0 +1,301 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c" +#else + +static void THNN_(VolumetricReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, long idepth, + long owidth, long oheight, long odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + long k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + long i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth 
+ pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *dest_p = output_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *src_p = input_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long idepth; + long iheight; + long iwidth; + long odepth; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THArgCheck(input->nDimension == 4 || input->nDimension == 5, + 2, "input must be 4 or 5-dimensional"); + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1 , 2, + "input is too small"); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 4) + { + THTensor_(resize4d)(output, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + 
THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data, output_data, nslices, iwidth, iheight, idepth, + owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront, + pback); + } + else + { + long p; + + THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data + p * nslices * iwidth * iheight * idepth, + output_data + p * nslices * owidth * oheight * odepth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, long idepth, + long owidth, long oheight, long odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + long k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + long i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; 
+ } + ip_z = ip_z - oStartZ + iStartZ; + + real *src_p = goutput_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *dest_p = ginput_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long idepth; + long iheight; + long iwidth; + long odepth; + long oheight; + long owidth; + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected"); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3, + "gradOutput depth unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 4) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + 
THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/init.c b/init.c index b2b84fc7e3c..c805f33c6db 100644 --- a/init.c +++ b/init.c @@ -174,3 +174,7 @@ #include "generic/SpatialReplicationPadding.c" #include "THGenerateFloatTypes.h" + +#include "generic/VolumetricReplicationPadding.c" +#include "THGenerateFloatTypes.h" + From f2b2ad6fbcae3090cc1ff4e1e830b753293c7a69 Mon Sep 17 00:00:00 2001 From: Sergey Zagoruyko Date: Wed, 15 Jun 2016 18:31:27 +0200 Subject: [PATCH 092/101] inplace HardTanh, subclass ReLU6 --- generic/HardTanh.c | 113 +++++++++++++++++++++++++++++++-------------- generic/ReLU6.c | 58 ----------------------- generic/THNN.h | 18 ++------ init.c | 3 -- 4 files changed, 82 insertions(+), 110 deletions(-) delete mode 100644 generic/ReLU6.c diff --git a/generic/HardTanh.c b/generic/HardTanh.c index 9764ec09c50..3b7ba3dd169 100644 --- a/generic/HardTanh.c +++ b/generic/HardTanh.c @@ -7,37 +7,59 @@ void THNN_(HardTanh_updateOutput)( THTensor *input, THTensor *output, real min_val, - real max_val) + real max_val, + bool inplace) { - THTensor_(resizeAs)(output, input); + if (inplace) + THTensor_(set)(output, input); + else + THTensor_(resizeAs)(output, input); if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) { - TH_TENSOR_APPLY2(real, output, real, input, - if (*input_data < min_val) - *output_data = min_val; - else if (*input_data <= max_val) - *output_data = *input_data; - else - *output_data = max_val; - ); + if (inplace) + TH_TENSOR_APPLY(real, input, + if (*input_data < min_val) + *input_data = min_val; + else if (*input_data > max_val) + *input_data = max_val; + ); + TH_TENSOR_APPLY2(real, output, real, input, + if 
(*input_data < min_val) + *output_data = min_val; + else if (*input_data <= max_val) + *output_data = *input_data; + else + *output_data = max_val; + ); } else { - real* ptr_output = THTensor_(data)(output); real* ptr_input = THTensor_(data)(input); + real* ptr_output = THTensor_(data)(output); long i; + long n = THTensor_(nElement)(input); + if (inplace) #pragma omp parallel for private(i) - for (i = 0; i < THTensor_(nElement)(input); i++) - { - if (ptr_input[i] < min_val) - ptr_output[i] = min_val; - else if (ptr_input[i] <= max_val) - ptr_output[i] = ptr_input[i]; - else - ptr_output[i] = max_val; - } + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_input[i] = min_val; + else if (ptr_input[i] > max_val) + ptr_input[i] = max_val; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_output[i] = min_val; + else if (ptr_input[i] <= max_val) + ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = max_val; + } } } @@ -47,21 +69,33 @@ void THNN_(HardTanh_updateGradInput)( THTensor *gradOutput, THTensor *gradInput, real min_val, - real max_val) + real max_val, + bool inplace) { - THTensor_(resizeAs)(gradInput, input); + if (inplace) + THTensor_(set)(gradInput, gradOutput); + else + THTensor_(resizeAs)(gradInput, input); if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(gradOutput) || !THTensor_(isContiguous)(gradInput)) { - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, - if (*input_data < min_val || *input_data > max_val) - *gradInput_data = 0; - else - *gradInput_data = *gradOutput_data; - ); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data < min_val || *input_data > max_val) + *gradOutput_data = 0; + ); + } + else + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data < min_val || *input_data > max_val) + *gradInput_data = 0; + else + *gradInput_data = 
*gradOutput_data; + ); } else { @@ -69,15 +103,24 @@ void THNN_(HardTanh_updateGradInput)( real* ptr_gradInput = THTensor_(data)(gradInput); real* ptr_input = THTensor_(data)(input); long i; + long n = THTensor_(nElement)(input); + if (inplace) #pragma omp parallel for private(i) - for (i = 0; i < THTensor_(nElement)(input); i++) - { - if (ptr_input[i] < min_val || ptr_input[i] > max_val) - ptr_gradInput[i] = 0; - else - ptr_gradInput[i] = ptr_gradOutput[i]; - } + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val || ptr_input[i] > max_val) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } } } diff --git a/generic/ReLU6.c b/generic/ReLU6.c deleted file mode 100644 index 2dc53f1bf2f..00000000000 --- a/generic/ReLU6.c +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/ReLU6.c" -#else - -void THNN_(ReLU6_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - bool inplace) -{ - if (inplace) - { - TH_TENSOR_APPLY(real, input, - if (*input_data <= 0) - *input_data = 0; - else if (*input_data >= 6) - *input_data = 6; - ); - THTensor_(set)(output, input); - } - else - { - THTensor_(resizeAs)(output, input); - TH_TENSOR_APPLY2(real, output, real, input, - *output_data = - (*input_data > 0) ? ((*input_data < 6) ? 
*input_data : 6) : 0; - ); - } -} - -void THNN_(ReLU6_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - bool inplace) -{ - if (inplace) - { - TH_TENSOR_APPLY2(real, gradOutput, real, input, - if ((*input_data) <= 0 || (*input_data) >= 6) - *gradOutput_data = 0; - ); - THTensor_(set)(gradInput, gradOutput); - } - else - { - THTensor_(resizeAs)(gradInput, input); - TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, - if ((*input_data) > 0 && (*input_data) < 6) - *gradInput_data = *gradOutput_data; - else - *gradInput_data = 0; - ); - } -} - -#endif diff --git a/generic/THNN.h b/generic/THNN.h index 06ac2cad6f2..724bf30983f 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -106,14 +106,16 @@ TH_API void THNN_(HardTanh_updateOutput)( THTensor *input, // input tensor THTensor *output, // [OUT] output tensor real min_val, // lower threshold - real max_val); // upper threshold + real max_val, + bool inplace); // upper threshold TH_API void THNN_(HardTanh_updateGradInput)( THNNState *state, // library's state THTensor *input, // input tensor THTensor *gradOutput, // gradient w.r.t. module's output THTensor *gradInput, // [OUT] gradient w.r.t. 
the input real min_val, // lower threshold - real max_val); // upper threshold + real max_val, + bool inplace); // upper threshold TH_API void THNN_(L1Cost_updateOutput)( THNNState *state, // library's state @@ -472,18 +474,6 @@ TH_API void THNN_(Threshold_updateGradInput)( real threshold, bool inplace); -TH_API void THNN_(ReLU6_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - bool inplace); -TH_API void THNN_(ReLU6_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - bool inplace); - TH_API void THNN_(TemporalConvolution_updateOutput)( THNNState *state, THTensor *input, diff --git a/init.c b/init.c index b2b84fc7e3c..7c0de94fa6a 100644 --- a/init.c +++ b/init.c @@ -94,9 +94,6 @@ #include "generic/Threshold.c" #include "THGenerateFloatTypes.h" -#include "generic/ReLU6.c" -#include "THGenerateFloatTypes.h" - #include "generic/TemporalConvolution.c" #include "THGenerateFloatTypes.h" From 3af4b939567b050b971fd2f4d7c83c030ce4544c Mon Sep 17 00:00:00 2001 From: PraveerSINGH Date: Thu, 23 Jun 2016 14:53:23 +0200 Subject: [PATCH 093/101] nobias in spatial full conv --- generic/SpatialFullConvolution.c | 41 +++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/generic/SpatialFullConvolution.c b/generic/SpatialFullConvolution.c index 20dd1268528..637cffa3d36 100644 --- a/generic/SpatialFullConvolution.c +++ b/generic/SpatialFullConvolution.c @@ -152,16 +152,17 @@ void THNN_(SpatialFullConvolution_updateOutput)( long k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THBlas_(gemm)( - 't', 'n', - n_, m_, k_, - 1, - THTensor_(data)(ones), k_, - THTensor_(data)(bias), k_, - 1, - THTensor_(data)(output_n), n_ - ); - + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } } // Free @@ -355,15 +356,17 @@ void 
THNN_(SpatialFullConvolution_accGradParameters)( long k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - THBlas_(gemv)( - 't', - k_, m_, - scale, - THTensor_(data)(gradOutput_n), k_, - THTensor_(data)(ones), 1, - 1, - THTensor_(data)(gradBias), 1 - ); + if (gradBias) { + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } } // Free From eaf03c8c4a04df0dfa3d326f4d577021a535ca29 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Jul 2016 09:02:46 -0700 Subject: [PATCH 094/101] Fix SpatialClassNLLCriterion when using OMP Although the `b` variable is automatically private, the `elem` loop variable is not private. Test: change test.lua to run with more than one thread --- generic/SpatialClassNLLCriterion.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/generic/SpatialClassNLLCriterion.c b/generic/SpatialClassNLLCriterion.c index 3121c307c26..c569d699ad7 100644 --- a/generic/SpatialClassNLLCriterion.c +++ b/generic/SpatialClassNLLCriterion.c @@ -102,9 +102,10 @@ void THNN_(SpatialClassNLLCriterion_updateGradInput)( real normalize = sizeAverage ? 
*total_weight_data : 1.0f; - int b,elem; -#pragma omp parallel for + int b; + #pragma omp parallel for for (b = 0; b < batch_size; b++) { + int elem; for (elem = 0; elem < map_size; elem++) { int cur_target = target_data[b * map_size + elem] - 1; THAssert(cur_target >= 0 && cur_target < n_classes); From 820ce3f2df3e21c83fd9a208ace781210833d655 Mon Sep 17 00:00:00 2001 From: Martin Simonovsky Date: Mon, 25 Jul 2016 12:00:52 +0200 Subject: [PATCH 095/101] added bound checking for weights --- generic/ClassNLLCriterion.c | 7 +++++++ generic/SpatialClassNLLCriterion.c | 3 +++ 2 files changed, 10 insertions(+) diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index eb02f7c6b19..49cb57495c1 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -20,6 +20,9 @@ void THNN_(ClassNLLCriterion_updateOutput)( if (THTensor_(nDimension)(input) > 2) { THError("input tensor should be 1D or 2D"); } + if (weights && THTensor_(nElement)(weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } input = THTensor_(newContiguous)(input); target = THIndexTensor_(newContiguous)(target); @@ -95,6 +98,10 @@ void THNN_(ClassNLLCriterion_updateGradInput)( if (THTensor_(nDimension)(input) > 2) { THError("input tensor should be 1D or 2D"); } + + if (weights && THTensor_(nElement)(weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } target = THIndexTensor_(newContiguous)(target); weights = weights ? 
THTensor_(newContiguous)(weights) : NULL; diff --git a/generic/SpatialClassNLLCriterion.c b/generic/SpatialClassNLLCriterion.c index 3121c307c26..b146016952f 100644 --- a/generic/SpatialClassNLLCriterion.c +++ b/generic/SpatialClassNLLCriterion.c @@ -7,6 +7,9 @@ "only batches of spatial targets supported (3D tensors)"); \ THArgCheck(THTensor_(nDimension)(input) == 4, 2, \ "only batches of spatial inputs supported (4D tensors)"); \ + if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \ + THError("weight tensor should be defined either for all or no classes"); \ + } \ \ { \ long input0 = THTensor_(size)(input, 0); \ From 891114bf54920483b97a82b542e7f1d14716d0db Mon Sep 17 00:00:00 2001 From: Pauline Luc Date: Wed, 27 Jul 2016 12:03:29 +0200 Subject: [PATCH 096/101] Adding SpatialUpSamplingBilinear --- generic/SpatialUpSamplingBilinear.c | 127 ++++++++++++++++++++++++++++ generic/THNN.h | 9 ++ init.c | 3 + 3 files changed, 139 insertions(+) create mode 100644 generic/SpatialUpSamplingBilinear.c diff --git a/generic/SpatialUpSamplingBilinear.c b/generic/SpatialUpSamplingBilinear.c new file mode 100644 index 00000000000..78290b65d16 --- /dev/null +++ b/generic/SpatialUpSamplingBilinear.c @@ -0,0 +1,127 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c" +#else + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output){ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + int channels = THTensor_(size)(input, 0) * THTensor_(size)(input, 1); + int height1 = THTensor_(size)(input, 2); + int width1 = THTensor_(size)(input, 3); + int height2 = THTensor_(size)(output, 2); + int width2 = THTensor_(size)(output, 
3); + THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0); + // special case: just copy + if (height1 == height2 && width1 == width2) { + for (int h2 = 0; h2 < height2; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < width2; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * width1 + w1]; + real* pos2 = &odata[h2 * width2 + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += width1 * height1; + pos2 += width2 * height2; + } + } + } + return; + } + const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f; + const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f; + for (int h2 = 0; h2 < height2; ++h2) { + const float h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < width2; ++w2) { + const float w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + const real* pos1 = &idata[h1 * width1 + w1]; + real* pos2 = &odata[h2 * width2 + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * width1] + + w1lambda * pos1[h1p * width1 + w1p]); + pos1 += width1 * height1; + pos2 += width2 * height2; + } + } + } +} + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput){ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + THTensor_(zero)(gradInput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + int channels = THTensor_(size)(gradInput, 0) * THTensor_(size)(gradInput, 1); + int height1 = THTensor_(size)(gradInput, 2); + int width1 = THTensor_(size)(gradInput, 3); + int height2 = THTensor_(size)(gradOutput, 2); + int width2 = THTensor_(size)(gradOutput, 3); + THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0); + // special case: same-size matching grids + if (height1 == height2 && width1 == width2) { + for (int h2 = 0; h2 < height2; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < width2; ++w2) { + const int w1 = w2; + real* pos1 = &data1[h1 * width1 + w1]; + const real* pos2 = &data2[h2 * width2 + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += width1 * height1; + pos2 += width2 * height2; + } + } + } + return; + } + const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f; + const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f; + for (int h2 = 0; h2 < height2; ++h2) { + const float h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. 
- h1lambda; + for (int w2 = 0; w2 < width2; ++w2) { + const float w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + real* pos1 = &data1[h1 * width1 + w1]; + const real* pos2 = &data2[h2 * width2 + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += h0lambda * w0lambda * pos2[0]; + pos1[w1p] += h0lambda * w1lambda * pos2[0]; + pos1[h1p * width1] += h1lambda * w0lambda * pos2[0]; + pos1[h1p * width1 + w1p] += h1lambda * w1lambda * pos2[0]; + pos1 += width1 * height1; + pos2 += width2 * height2; + } + } + } +} + +#endif diff --git a/generic/THNN.h b/generic/THNN.h index 82181631f60..cda1029dbc2 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -904,6 +904,15 @@ TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( THTensor *gradInput, int scale_factor); +TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput); + TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, diff --git a/init.c b/init.c index e5bb8fcccb9..77fe8da3145 100644 --- a/init.c +++ b/init.c @@ -148,6 +148,9 @@ #include "generic/SpatialUpSamplingNearest.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialUpSamplingBilinear.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" From 491173b7d9647b9230c240fc75310fe69f002592 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Mon, 1 Aug 2016 11:01:48 -0400 Subject: [PATCH 097/101] Use TH_INDEX_BASE in THNN --- generic/ClassNLLCriterion.c | 8 ++++---- generic/LookupTable.c | 16 ++++++++-------- generic/MultiLabelMarginCriterion.c | 6 +++--- generic/MultiMarginCriterion.c | 6 +++--- generic/SpatialAdaptiveMaxPooling.c | 18 +++++++++--------- 
generic/SpatialClassNLLCriterion.c | 4 ++-- generic/SpatialConvolutionMap.c | 18 +++++++++--------- generic/SpatialFractionalMaxPooling.c | 6 +++--- generic/SpatialFullConvolutionMap.c | 18 +++++++++--------- generic/SpatialMaxPooling.c | 6 +++--- generic/SpatialMaxUnpooling.c | 16 ++++++++-------- 11 files changed, 61 insertions(+), 61 deletions(-) diff --git a/generic/ClassNLLCriterion.c b/generic/ClassNLLCriterion.c index eb02f7c6b19..d156d41f7a8 100644 --- a/generic/ClassNLLCriterion.c +++ b/generic/ClassNLLCriterion.c @@ -34,7 +34,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( output_data[0] = total_weight_data[0] = 0.0; if (THTensor_(nDimension)(input) == 1) { - int cur_target = target_data[0] - 1; + int cur_target = target_data[0] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; output_data[0] = -input_data[cur_target] * total_weight_data[0]; @@ -46,7 +46,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( int i; for (i = 0; i < batch_size; i++) { - int cur_target = target_data[i] - 1; + int cur_target = target_data[i] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); real cur_weight = weights ? 
weights_data[cur_target] : 1.0f; @@ -104,7 +104,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( real *gradInput_data = THTensor_(data)(gradInput); if (THTensor_(nDimension)(input) == 1) { - int cur_target = target_data[0] - 1; + int cur_target = target_data[0] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); gradInput_data[cur_target] = @@ -118,7 +118,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( int i; for (i = 0; i < batch_size; i++){ - int cur_target = target_data[i] - 1; + int cur_target = target_data[i] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); diff --git a/generic/LookupTable.c b/generic/LookupTable.c index a35ff8496ff..378d1c33eb8 100644 --- a/generic/LookupTable.c +++ b/generic/LookupTable.c @@ -12,12 +12,12 @@ static void THNN_(LookupTable_resetCount)( for (i = 0; i numw) + if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) THError("input out of range"); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -86,7 +86,7 @@ void THNN_(LookupTable_accGradParameters)( { if (input_data[i] != paddingValue) { - long k = input_data[i] - 1; + long k = input_data[i] - TH_INDEX_BASE; if (k >= start && k < end) { real scale_ = scale; @@ -106,7 +106,7 @@ void THNN_(LookupTable_accGradParameters)( { if (input_data[i] != paddingValue) { - long k = input_data[i] - 1; + long k = input_data[i] - TH_INDEX_BASE; real scale_ = scale; if (count_data) scale_ /= count_data[k]; THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); @@ -178,7 +178,7 @@ void THNN_(LookupTable_renorm)( long stride = THTensor_(stride)(weight, 0); real *gw = THTensor_(data)(weight); for (i=0; i numw) + if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) THError("input out of range"); // get unique indices qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); @@ -197,7 +197,7 @@ void THNN_(LookupTable_renorm)( #pragma omp parallel for private(i) for (i=0; i= 1) && (idx <= dim), 
3, "target out of range"); + THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range"); } input = THTensor_(newContiguous)(input); @@ -47,7 +47,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( sum = 0; for (t = 0; t < nframe; t++) { - long target_idx = (long)(target_data[t]-1); + long target_idx = (long)(target_data[t] - TH_INDEX_BASE); real input_target = input_data[target_idx]; for (d = 0; d < dim; d++) { @@ -124,7 +124,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( for (t = 0; t < nframe; t++) { - long target_idx = (long)(target_data[t])-1; + long target_idx = (long)(target_data[t]) - TH_INDEX_BASE; real input_target = input_data[target_idx]; real gradInput_target = 0; for (d = 0; d < dim; d++) diff --git a/generic/SpatialAdaptiveMaxPooling.c b/generic/SpatialAdaptiveMaxPooling.c index 61afc40734f..5d6d995ad94 100644 --- a/generic/SpatialAdaptiveMaxPooling.c +++ b/generic/SpatialAdaptiveMaxPooling.c @@ -30,7 +30,7 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( for(j = 0; j < owidth; j++) { - + int x_start = (int)floor((float)j / owidth * iwidth); int x_end = (int)ceil((float)(j + 1) / owidth * iwidth); int kW = x_end-x_start; @@ -64,8 +64,8 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( *op = maxval; /* store location of max (x,y) */ - *indyp = (int)(maxindex / kW)+1; - *indxp = (maxindex % kW) +1; + *indyp = (int)(maxindex / kW) + TH_INDEX_BASE; + *indxp = (maxindex % kW) + TH_INDEX_BASE; } } } @@ -85,7 +85,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( long nslices; long iheight; long iwidth; - + long istride_d; long istride_h; long istride_w; @@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); - if (input->nDimension == 4) + if (input->nDimension == 4) { istride_b = input->stride[0]; nbatch = input->size[0]; @@ -179,7 +179,7 @@ static void 
THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; real *indx_p_k = indx_p + k*owidth*oheight; real *indy_p_k = indy_p + k*owidth*oheight; - + /* calculate max points */ long i, j; for(i = 0; i < oheight; i++) @@ -189,9 +189,9 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( { int x_start = (int)floor((float) j / owidth * iwidth); /* retrieve position of max */ - long maxi = indy_p_k[i*owidth + j] - 1 + y_start; - long maxj = indx_p_k[i*owidth + j] - 1 + x_start; - + long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start; + long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start; + /* update gradient */ gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j]; } diff --git a/generic/SpatialClassNLLCriterion.c b/generic/SpatialClassNLLCriterion.c index c569d699ad7..f043411d829 100644 --- a/generic/SpatialClassNLLCriterion.c +++ b/generic/SpatialClassNLLCriterion.c @@ -51,7 +51,7 @@ void THNN_(SpatialClassNLLCriterion_updateOutput)( real output_acc = 0; for (int b = 0; b < batch_size; b++) { for (int elem = 0; elem < map_size; elem++) { - int cur_target = target_data[b * map_size + elem] - 1; + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); real cur_weight = weights ? 
weights_data[cur_target] : 1.0f; @@ -107,7 +107,7 @@ void THNN_(SpatialClassNLLCriterion_updateGradInput)( for (b = 0; b < batch_size; b++) { int elem; for (elem = 0; elem < map_size; elem++) { - int cur_target = target_data[b * map_size + elem] - 1; + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; THAssert(cur_target >= 0 && cur_target < n_classes); gradInput_data[b * sample_size + cur_target * map_size + elem] = diff --git a/generic/SpatialConvolutionMap.c b/generic/SpatialConvolutionMap.c index aef0b1e2ee7..82886c28391 100644 --- a/generic/SpatialConvolutionMap.c +++ b/generic/SpatialConvolutionMap.c @@ -10,7 +10,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)( THArgCheck( weight != NULL && weight->nDimension == 3 && connTable != NULL && connTable->size[0] == weight->size[0], 4, - "3D weight tensor expected (connTable:size(1) x kH x kW)" + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); real *weight_data = THTensor_(data)(weight); @@ -75,8 +75,8 @@ void THNN_(SpatialConvolutionMap_updateOutput)( for (k = 0; k < nweight; k++) { /* get offsets for input/output */ - int o = (int)connTable_data[k*2+1]-1; - int i = (int)connTable_data[k*2+0]-1; + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; if (o == p) { @@ -106,7 +106,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( THArgCheck( weight != NULL && weight->nDimension == 3 && connTable != NULL && connTable->size[0] == weight->size[0], 5, - "3D weight tensor expected (connTable:size(1) x kH x kW)" + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); real *weight_data = THTensor_(data)(weight); @@ -154,8 +154,8 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( int nkernel = connTable->size[0]; for (k = 0; k < nkernel; k++) { - int o = (int)connTable_data[k*2+1]-1; - int i = (int)connTable_data[k*2+0]-1; + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = 
(int)connTable_data[k*2+0] - TH_INDEX_BASE; if (i == p) { /* gradient to input */ @@ -182,7 +182,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( THArgCheck( gradWeight != NULL && gradWeight->nDimension == 3 && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, - "3D gradWeight tensor expected (connTable:size(1) x kH x kW)" + "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); real *gradWeight_data = THTensor_(data)(gradWeight); @@ -237,8 +237,8 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( long m; for (m = 0; m < nbatch; m++) { - int o = (int)THTensor_(get2d)(connTable,k,1)-1; - int i = (int)THTensor_(get2d)(connTable,k,0)-1; + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; /* gradient to kernel */ THTensor_(validXCorr2DRevptr)( diff --git a/generic/SpatialFractionalMaxPooling.c b/generic/SpatialFractionalMaxPooling.c index 1c2b6ab1900..c0a9384788b 100644 --- a/generic/SpatialFractionalMaxPooling.c +++ b/generic/SpatialFractionalMaxPooling.c @@ -79,7 +79,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( outputForPlane[h * outputW + w] = maxVal; /* +1 to lua index */ - indicesForPlane[h * outputW + w] = (real) maxIndex + 1; + indicesForPlane[h * outputW + w] = (real) maxIndex + TH_INDEX_BASE; } } @@ -96,7 +96,7 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)( int poolSizeW, int poolSizeH, THTensor *indices, THTensor *randomSamples) { - + long numBatch = 1; int planeDim = 0; int heightDim = 1; @@ -177,7 +177,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( for (h = 0; h < outputH; ++h) { for (w = 0; w < outputW; ++w) { long outputIndex = h * outputW + w; - long index = indicesForPlane[outputIndex] - 1; + long index = indicesForPlane[outputIndex] - TH_INDEX_BASE; THAssert(index >= 0 && index < inputW * inputH); gradInputForPlane[index] += gradOutputForPlane[outputIndex]; 
diff --git a/generic/SpatialFullConvolutionMap.c b/generic/SpatialFullConvolutionMap.c index bbb0282b77b..1bd3455d756 100644 --- a/generic/SpatialFullConvolutionMap.c +++ b/generic/SpatialFullConvolutionMap.c @@ -10,7 +10,7 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( THArgCheck( weight != NULL && weight->nDimension == 3 && connTable != NULL && connTable->size[0] == weight->size[0], 4, - "3D weight tensor expected (connTable:size(1) x kH x kW)" + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); const int kH = (int)weight->size[1]; @@ -62,8 +62,8 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( for (k = 0; k < nweight; k++) { /* get offsets for input/output */ - int o = (int)connTable_data[k*2+1]-1; - int i = (int)connTable_data[k*2+0]-1; + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; if (o == p) { @@ -91,7 +91,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( THArgCheck( weight != NULL && weight->nDimension == 3 && connTable != NULL && connTable->size[0] == weight->size[0], 5, - "3D weight tensor expected (connTable:size(1) x kH x kW)" + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); /* contiguous */ @@ -125,8 +125,8 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( int nkernel = connTable->size[0]; for (k = 0; k < nkernel; k++) { - int o = (int)connTable_data[k*2+1]-1; - int i = (int)connTable_data[k*2+0]-1; + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; if (i == p) { /* gradient to input */ @@ -154,7 +154,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( THArgCheck( gradWeight != NULL && gradWeight->nDimension == 3 && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, - "3D gradWeight tensor expected (connTable:size(1) x kH x kW)" + "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); /* contiguous 
*/ @@ -191,8 +191,8 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( #pragma omp parallel for private(k) for (k = 0; k < nkernel; k++) { - int o = (int)THTensor_(get2d)(connTable,k,1)-1; - int i = (int)THTensor_(get2d)(connTable,k,0)-1; + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; /* gradient to kernel */ THTensor_(validXCorr2DRevptr)( diff --git a/generic/SpatialMaxPooling.c b/generic/SpatialMaxPooling.c index d28fe85f17c..829f3f0eec9 100644 --- a/generic/SpatialMaxPooling.c +++ b/generic/SpatialMaxPooling.c @@ -63,7 +63,7 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)( *op = maxval; /* store location of max */ - *indp = maxindex + 1; + *indp = maxindex + TH_INDEX_BASE; } } } @@ -97,7 +97,7 @@ void THNN_(SpatialMaxPooling_updateOutput)( THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); - if (input->nDimension == 4) + if (input->nDimension == 4) { nbatch = input->size[0]; dimw++; @@ -209,7 +209,7 @@ static void THNN_(SpatialMaxPooling_updateGradInput_frame)( for(j = 0; j < owidth; j++) { /* retrieve position of max */ - long maxp = ind_p_k[i*owidth + j] - 1; + long maxp = ind_p_k[i*owidth + j] - TH_INDEX_BASE; /* update gradient */ gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j]; } diff --git a/generic/SpatialMaxUnpooling.c b/generic/SpatialMaxUnpooling.c index 6e7a76e9880..cd1739b4cf4 100644 --- a/generic/SpatialMaxUnpooling.c +++ b/generic/SpatialMaxUnpooling.c @@ -11,7 +11,7 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o long k; #pragma omp parallel for private(k) for (k = 0; k < nslices; k++) - { + { real *output_p_k = output_p + k*owidth*oheight; real *input_p_k = input_p + k*iwidth*iheight; real *ind_p_k = ind_p + k*iwidth*iheight; @@ -21,7 +21,7 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o { for(j = 0; j < iwidth; j++) 
{ - maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max */ + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ if(maxp<0 || maxp>=owidth*oheight){ THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight); } @@ -52,9 +52,9 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected"); if (!THTensor_(isSameSizeAs)(input, indices)){ THError("Invalid input size w.r.t current indices size"); - } + } - if (input->nDimension == 4) + if (input->nDimension == 4) { nbatch = input->size[0]; dimw++; @@ -131,11 +131,11 @@ static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, for(i = 0; i < iheight; i++) { for(j = 0; j < iwidth; j++) - { - maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max */ + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ if(maxp<0 || maxp>=owidth*oheight){ THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight); - } + } gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ } } @@ -162,7 +162,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)( if (!THTensor_(isSameSizeAs)(input, indices)){ THError("Invalid input size w.r.t current indices size"); - } + } /* get contiguous gradOutput and indices */ gradOutput = THTensor_(newContiguous)(gradOutput); From a23ee8574c481e26c30e81cc6eb377d7a91bae84 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Tue, 2 Aug 2016 08:50:14 -0400 Subject: [PATCH 098/101] Fix THNN.h formatting --- generic/THNN.h | 142 +++++++++++++++++++++++++++---------------------- 1 file changed, 77 insertions(+), 65 deletions(-) diff --git a/generic/THNN.h b/generic/THNN.h index cda1029dbc2..4b88c5e3b91 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -794,43 +794,43 @@ TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( real scale); // scaling factor TH_API void 
THNN_(SpatialDilatedConvolution_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THTensor *weight, - THTensor *bias, - THTensor *columns, - THTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH); + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THTensor *weight, - THTensor *gradColumns, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH); + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *columns, - THTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - real scale); + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + real scale); TH_API void THNN_(SpatialMaxPooling_updateOutput)( THNNState *state, @@ -905,13 +905,13 @@ TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( int scale_factor); TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output); + THNNState *state, + THTensor *input, + THTensor *output); TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - 
THNNState *state, - THTensor *gradOutput, - THTensor *gradInput); + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput); TH_API void THNN_(unfolded_acc)( THTensor *finput, @@ -1078,39 +1078,51 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int pT, int pW, int pH); -TH_API void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, - THTensor *input, - THTensor *output, - int pad_l, int pad_r, - int pad_t, int pad_b); +TH_API void THNN_(SpatialReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); -TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - int pad_l, int pad_r, - int pad_t, int pad_b); +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); -TH_API void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, - THTensor *input, - THTensor *output, - int pad_l, int pad_r, - int pad_t, int pad_b); +TH_API void THNN_(SpatialReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); -TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - int pad_l, int pad_r, - int pad_t, int pad_b); +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( - THNNState *state, THTensor *input, THTensor *output, int pleft, - int pright, int ptop, int pbottom, int pfront, int pback); + THNNState *state, + THTensor *input, + 
THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( - THNNState *state, THTensor *input, THTensor *gradOutput, - THTensor *gradInput, int pleft, int pright, int ptop, int pbottom, + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, int pfront, int pback); #endif From a78d909afa02f97711e0f81654bea29aaf5612fb Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Tue, 2 Aug 2016 23:48:34 -0400 Subject: [PATCH 099/101] memset the result buffers sent into mm (when beta = 0) to zero to fix subtle BLAS behavior based bugs --- generic/SpatialConvolutionMM.c | 4 ++++ generic/SpatialDilatedConvolution.c | 2 ++ generic/SpatialFullConvolution.c | 2 ++ generic/VolumetricConvolutionMM.c | 4 ++++ generic/VolumetricFullConvolution.c | 6 ++++-- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/generic/SpatialConvolutionMM.c b/generic/SpatialConvolutionMM.c index a549a373c39..e7460c800ab 100644 --- a/generic/SpatialConvolutionMM.c +++ b/generic/SpatialConvolutionMM.c @@ -174,6 +174,10 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); THTensor_(transpose)(weight, weight, 0, 1); if(input->nDimension == 3) diff --git a/generic/SpatialDilatedConvolution.c b/generic/SpatialDilatedConvolution.c index 3f75016dcbd..3928af01481 100644 --- a/generic/SpatialDilatedConvolution.c +++ b/generic/SpatialDilatedConvolution.c @@ -49,6 +49,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + 
THTensor_(zero)(output); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); @@ -171,6 +172,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( // Resize temporary columns THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + THTensor_(zero)(gradColumns); // Helpers THTensor *gradInput_n = THTensor_(new)(); diff --git a/generic/SpatialFullConvolution.c b/generic/SpatialFullConvolution.c index 637cffa3d36..a82477db885 100644 --- a/generic/SpatialFullConvolution.c +++ b/generic/SpatialFullConvolution.c @@ -98,6 +98,7 @@ void THNN_(SpatialFullConvolution_updateOutput)( // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + THTensor_(zero)(columns); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, @@ -211,6 +212,7 @@ void THNN_(SpatialFullConvolution_updateGradInput)( // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + THTensor_(zero)(gradInput); // Resize temporary columns THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); diff --git a/generic/VolumetricConvolutionMM.c b/generic/VolumetricConvolutionMM.c index a226350bf17..8fef1cf7e9e 100644 --- a/generic/VolumetricConvolutionMM.c +++ b/generic/VolumetricConvolutionMM.c @@ -395,6 +395,10 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)( THTensor_(resizeAs)(gradInput, input); THTensor_(resizeAs)(fgradInput, finput); + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); THTensor_(transpose)(weight, weight, 0, 1); if (input->nDimension == 4) diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c index 5a6a1a74705..dcae8decd1c 100644 --- 
a/generic/VolumetricFullConvolution.c +++ b/generic/VolumetricFullConvolution.c @@ -137,7 +137,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)( // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); - + THTensor_(zero)(columns); + // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. @@ -268,7 +269,8 @@ void THNN_(VolumetricFullConvolution_updateGradInput)( // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); - + THTensor_(zero)(gradInput); + // Resize temporary columns THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); From d920bf148833147769b8fc75262a92f6fe65b010 Mon Sep 17 00:00:00 2001 From: soumith Date: Wed, 3 Aug 2016 16:04:43 -0700 Subject: [PATCH 100/101] volumetric dilated convolution --- generic/THNN.h | 39 +++ generic/VolumetricDilatedConvolution.c | 356 +++++++++++++++++++++++++ generic/VolumetricFullConvolution.c | 41 +-- init.c | 4 +- 4 files changed, 423 insertions(+), 17 deletions(-) create mode 100644 generic/VolumetricDilatedConvolution.c diff --git a/generic/THNN.h b/generic/THNN.h index 4b88c5e3b91..c7487cc2e4b 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -1042,6 +1042,45 @@ TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( int aT, int aW, int aH, // extra output adjustment real scale); // scaling factor +TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor 
*gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + real scale); + TH_API void THNN_(VolumetricMaxPooling_updateOutput)( THNNState *state, THTensor *input, diff --git a/generic/VolumetricDilatedConvolution.c b/generic/VolumetricDilatedConvolution.c new file mode 100644 index 00000000000..1a9cc932b3e --- /dev/null +++ b/generic/VolumetricDilatedConvolution.c @@ -0,0 +1,356 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c" +#else + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension); + THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + 
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[0]); + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[1]); + } + + long inputDepth = input->size[2]; + long inputHeight = input->size[3]; + long inputWidth = input->size[4]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || + ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = columns->size[1]; + long k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int 
padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputDepth = input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in 
batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + long m = nInputPlane*kT*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + real scale) +{ + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than 
zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputDepth = input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = nInputPlane*kT*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +#endif diff --git a/generic/VolumetricFullConvolution.c b/generic/VolumetricFullConvolution.c index dcae8decd1c..4eb36c425e4 100644 --- 
a/generic/VolumetricFullConvolution.c +++ b/generic/VolumetricFullConvolution.c @@ -8,12 +8,13 @@ static void THNN_(vol2col)( const int kT, const int kH, const int kW, const int pT, const int pH, const int pW, const int dT, const int dH, const int dW, + const int dilationT, const int dilationH, const int dilationW, real *data_col) { int c, t, h, w; - int depth_col = (depth + 2 * pT - kT) / dT + 1; - int height_col = (height + 2 * pH - kH) / dH + 1; - int width_col = (width + 2 * pW - kW) / dW + 1; + int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1; + int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1; + int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1; int channels_col = channels * kT * kH * kW; for (c = 0; c < channels_col; ++c) { @@ -27,10 +28,12 @@ static void THNN_(vol2col)( { for (w = 0; w < width_col; ++w) { - int t_pad = t * dT - pT + t_offset; - int h_pad = h * dH - pH + h_offset; - int w_pad = w * dW - pW + w_offset; - if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + int t_pad = t * dT - pT + t_offset * dilationT; + int h_pad = h * dH - pH + h_offset * dilationH; + int w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) data_col[((c * depth_col + t) * height_col + h) * width_col + w] = data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; else @@ -47,13 +50,14 @@ static void THNN_(col2vol)( const int kT, const int kH, const int kW, const int pT, const int pH, const int pW, const int dT, const int dH, const int dW, + const int dilationT, const int dilationH, const int dilationW, real* data_vol) { int c, t, h, w; memset(data_vol, 0, sizeof(real) * depth * height * width * channels); - int depth_col = (depth + 2 * pT - kT) / dT + 1; - int height_col = (height + 2 * pH - kH) / dH + 1; - int width_col = (width + 2 * pW - kW) / dW + 1; + int 
depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1; + int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1; + int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1; int channels_col = channels * kT * kH * kW; for (c = 0; c < channels_col; ++c) { @@ -67,10 +71,12 @@ static void THNN_(col2vol)( { for (w = 0; w < width_col; ++w) { - int t_pad = t * dT - pT + t_offset; - int h_pad = h * dH - pH + h_offset; - int w_pad = w * dW - pW + w_offset; - if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + int t_pad = t * dT - pT + t_offset * dilationT; + int h_pad = h * dH - pH + h_offset * dilationH; + int w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += data_col[((c * depth_col + t) * height_col + h) * width_col + w]; } @@ -138,7 +144,7 @@ void THNN_(VolumetricFullConvolution_updateOutput)( // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); THTensor_(zero)(columns); - + // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. 
@@ -185,6 +191,7 @@ void THNN_(VolumetricFullConvolution_updateOutput)( kT, kH, kW, pT, pH, pW, dT, dH, dW, + 1, 1, 1, THTensor_(data)(output_n) ); @@ -270,7 +277,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)( // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); THTensor_(zero)(gradInput); - + // Resize temporary columns THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); @@ -293,6 +300,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)( kT, kH, kW, pT, pH, pW, dT, dH, dW, + 1, 1, 1, THTensor_(data)(gradColumns) ); @@ -407,6 +415,7 @@ void THNN_(VolumetricFullConvolution_accGradParameters)( kT, kH, kW, pT, pH, pW, dT, dH, dW, + 1, 1, 1, THTensor_(data)(columns) ); diff --git a/init.c b/init.c index 77fe8da3145..739706cd7d5 100644 --- a/init.c +++ b/init.c @@ -163,6 +163,9 @@ #include "generic/VolumetricFullConvolution.c" #include "THGenerateFloatTypes.h" +#include "generic/VolumetricDilatedConvolution.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricMaxPooling.c" #include "THGenerateFloatTypes.h" @@ -177,4 +180,3 @@ #include "generic/VolumetricReplicationPadding.c" #include "THGenerateFloatTypes.h" - From 4fe7059a315d156ecd080ff7bd5b4fe3d3a9efad Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Thu, 4 Aug 2016 08:43:13 -0700 Subject: [PATCH 101/101] Mark optional arguments in THNN.h --- generic/THNN.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/generic/THNN.h b/generic/THNN.h index c7487cc2e4b..974f56c62b4 100644 --- a/generic/THNN.h +++ b/generic/THNN.h @@ -124,7 +124,7 @@ TH_API void THNN_(L1Cost_updateOutput)( TH_API void THNN_(L1Cost_updateGradInput)( THNNState *state, // library's state THTensor *input, // input tensor - THTensor *gradOutput, // gradient w.r.t module's output + THTensor *gradOutput, // [OPTIONAL] gradient w.r.t module's output THTensor *gradInput); // 
[OUT] gradient w.r.t the input TH_API void THNN_(LeakyReLU_updateOutput)( @@ -170,8 +170,8 @@ TH_API void THNN_(LookupTable_accGradParameters)( THTensor *gradOutput, THTensor *gradWeight, THIntegerTensor *count, - THTensor *sorted, - THTensor *indices, + THTensor *sorted, // [OPTIONAL] + THTensor *indices, // [OPTIONAL] bool scaleGradByFreq, int paddingValue, real scale); @@ -247,7 +247,7 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)( THTensor *output, bool sizeAverage, int p, - THTensor* weights, + THTensor* weights, // [OPTIONAL] real margin); TH_API void THNN_(MultiMarginCriterion_updateGradInput)( THNNState *state, @@ -256,7 +256,7 @@ TH_API void THNN_(MultiMarginCriterion_updateGradInput)( THTensor *gradInput, bool sizeAverage, int p, - THTensor *weights, + THTensor *weights, // [OPTIONAL] real margin); TH_API void THNN_(PReLU_updateOutput)( @@ -539,8 +539,8 @@ TH_API void THNN_(BatchNormalization_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - THTensor *weight, - THTensor *bias, + THTensor *weight, // [OPTIONAL] + THTensor *bias, // [OPTIONAL] THTensor *running_mean, THTensor *running_var, THTensor *save_mean, @@ -552,10 +552,10 @@ TH_API void THNN_(BatchNormalization_backward)( THNNState *state, THTensor *input, THTensor *gradOutput, - THTensor *gradInput, - THTensor *gradWeight, - THTensor *gradBias, - THTensor *weight, + THTensor *gradInput, // [OPTIONAL] + THTensor *gradWeight, // [OPTIONAL] + THTensor *gradBias, // [OPTIONAL] + THTensor *weight, // [OPTIONAL] THTensor *running_mean, THTensor *running_var, THTensor *save_mean, @@ -602,7 +602,7 @@ TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THTensor *input, THTensor *output, THTensor *weight, - THTensor *bias, + THTensor *bias, // [OPTIONAL] THTensor *finput, THTensor *fgradInput, int kW, int kH, @@ -624,7 +624,7 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( THTensor *input, THTensor *gradOutput, THTensor *gradWeight, - THTensor *gradBias, + 
THTensor *gradBias, // [OPTIONAL] THTensor *finput, THTensor *fgradInput, int kW, int kH, @@ -728,7 +728,7 @@ TH_API void THNN_(SpatialFullConvolution_updateOutput)( THTensor *input, THTensor *output, THTensor *weight, - THTensor *bias, + THTensor *bias, // [OPTIONAL] THTensor *columns, THTensor *ones, int kW, int kH, @@ -751,7 +751,7 @@ TH_API void THNN_(SpatialFullConvolution_accGradParameters)( THTensor *input, THTensor *gradOutput, THTensor *gradWeight, - THTensor *gradBias, + THTensor *gradBias, // [OPTIONAL] THTensor *columns, THTensor *ones, int kW, int kH, @@ -798,7 +798,7 @@ TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( THTensor *input, THTensor *output, THTensor *weight, - THTensor *bias, + THTensor *bias, // [OPTIONAL] THTensor *columns, THTensor *ones, int kW, int kH, @@ -823,7 +823,7 @@ TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( THTensor *input, THTensor *gradOutput, THTensor *gradWeight, - THTensor *gradBias, + THTensor *gradBias, // [OPTIONAL] THTensor *columns, THTensor *ones, int kW, int kH,