Add 'torch/lib/THNN/' from commit '4fe7059a315d156ecd080ff7bd5b4fe3d3a9efad'

git-subtree-dir: torch/lib/THNN
git-subtree-mainline: c3f0c1e2e0
git-subtree-split: 4fe7059a31
Adam Paszke 2016-08-04 10:58:50 -07:00
commit 035eb28e18
67 changed files with 14062 additions and 0 deletions

torch/lib/THNN/CMakeLists.txt Normal file
@@ -0,0 +1,65 @@
CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)
IF(NOT Torch_FOUND)
FIND_PACKAGE(Torch REQUIRED)
ENDIF()
IF(NOT THNN_INSTALL_LIB_SUBDIR)
SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
ENDIF()
# Flags
# When using MSVC
IF(MSVC)
# we want to respect the standard, and we are bored of those **** .
ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
ENDIF(MSVC)
IF (CMAKE_VERSION VERSION_LESS "3.1")
SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
ELSE ()
SET(CMAKE_C_STANDARD 99)
ENDIF ()
# OpenMP support?
SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
IF (DARWIN_VERSION GREATER 9)
SET(APPLE_OPENMP_SUCKS 1)
ENDIF (DARWIN_VERSION GREATER 9)
EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
OUTPUT_VARIABLE GCC_VERSION)
IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
ENDIF ()
ENDIF ()
IF (WITH_OPENMP)
FIND_PACKAGE(OpenMP)
IF(OPENMP_FOUND)
MESSAGE(STATUS "Compiling with OpenMP support")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
ENDIF(OPENMP_FOUND)
ENDIF (WITH_OPENMP)
LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
SET(src init.c)
ADD_LIBRARY(THNN MODULE ${src})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
### Torch packages expect the library prefix to be "lib"
SET_TARGET_PROPERTIES(THNN PROPERTIES
PREFIX "lib"
IMPORT_PREFIX "lib")
TARGET_LINK_LIBRARIES(THNN TH)
INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR})

torch/lib/THNN/README.md Normal file
@@ -0,0 +1,32 @@
# THNN
THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of any Lua dependency and can therefore be used in any application that has a C FFI. Please note that it only contains quite low-level functions; an object-oriented C/C++ wrapper will be created soon as another library.
There is also a CUDA counterpart of THNN (THCUNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN).
## Links
* [API reference](doc/api_reference.md)
* [Style guidelines](doc/style_guidelines.md)
## Motivation
Torch's neural network package (nn) provided many optimized C implementations of modules, but the source files contained Lua-specific code and headers, so they couldn't easily be compiled and included anywhere else.
THNN is based on the same code, but is written in pure C, so it can easily be included in other code. **Future C implementations should be committed to THNN.**
## API
THNN is a purely functional library. It provides two to three functions for each module, which perform the most important operations:
* **updateOutput** - applies the module to an input
* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input
* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters
For information on argument types, please check the [API reference](doc/api_reference.md).
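As a quick illustration, here is a minimal, hypothetical C program using the functional API. It assumes TH and THNN are built and linked; the `THNNState *` argument is unused by the current C implementations, so `NULL` is passed:
```C
#include <TH.h>
#include <THNN.h>

int main(void)
{
  /* Hypothetical usage sketch: apply the Abs module to a float vector. */
  THFloatTensor *input  = THFloatTensor_newWithSize1d(3);
  THFloatTensor *output = THFloatTensor_new();

  THFloatTensor_set1d(input, 0, -1.5f);
  THFloatTensor_set1d(input, 1,  0.0f);
  THFloatTensor_set1d(input, 2,  2.0f);

  /* updateOutput resizes `output` to match `input` and fills it. */
  THNN_FloatAbs_updateOutput(NULL, input, output);
  /* output now holds {1.5, 0.0, 2.0} */

  THFloatTensor_free(output);
  THFloatTensor_free(input);
  return 0;
}
```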
## Developer docs
* [Style guidelines](doc/style_guidelines.md)
This section will be expanded once the FFI refactoring is finished.

torch/lib/THNN/THNN.h Normal file
@@ -0,0 +1,25 @@
#ifndef THNN_H
#define THNN_H
#include <stdbool.h>
#include <TH.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME)
#define THIndexTensor THLongTensor
#define THIndexTensor_(NAME) THLongTensor_ ## NAME
#define THIntegerTensor THIntTensor
#define THIntegerTensor_(NAME) THIntTensor_ ## NAME
typedef long THIndex_t;
typedef int THInteger_t;
typedef void THNNState;
#include "generic/THNN.h"
#include <THGenerateFloatTypes.h>
#endif
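The last two includes instantiate every generic declaration once per floating-point type: `THNN_(NAME)` concatenates to `THNN_Float<NAME>` or `THNN_Double<NAME>`, and `THTensor` resolves to the matching tensor type. Schematically, a generic `Abs_updateOutput` declaration expands to:
```C
/* Sketch of the declarations generated from
 *   TH_API void THNN_(Abs_updateOutput)(
 *             THNNState *state, THTensor *input, THTensor *output);
 * when generic/THNN.h is included through THGenerateFloatTypes.h: */
void THNN_FloatAbs_updateOutput(THNNState *state,
                                THFloatTensor *input, THFloatTensor *output);
void THNN_DoubleAbs_updateOutput(THNNState *state,
                                 THDoubleTensor *input, THDoubleTensor *output);
```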

File diff suppressed because it is too large

@@ -0,0 +1,106 @@
--[[
This script regenerates api_reference.md based on comments placed in THNN.h.
]]--
local header = [[
# API docs
This document only describes the THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc).
### Note on function names
Please remember that because C doesn't support function overloading, functions taking different tensor types have different names. For example, for the Abs module there are actually two updateOutput functions:
* `void THNN_FloatAbs_updateOutput(...)`
* `void THNN_DoubleAbs_updateOutput(...)`
In these docs such functions will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to the developer to add a type prefix. `real` is an alias for that type.
### Argument types
Some arguments have additional tags placed in square brackets:
* **[OUT]** - This is the output argument. It will be reshaped if needed.
* **[OPTIONAL]** - This argument is optional and can be safely set to NULL.
* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in the `updateOutput` call.
* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output).
## Module list
These are all modules implemented in THNN:
]]
local hfile = io.open('../generic/THNN.h', 'r')
local lines = hfile:read('*a'):split('\n')
hfile:close()
-- Parse input
local declarations = {}
local current_declaration
local declaration_module
for i,line in ipairs(lines) do
if line:sub(1, 6) == 'TH_API' then
current_declaration = ''
declaration_module = line:match('THNN_%((.+)_.+%)')
end
if current_declaration then
current_declaration = current_declaration .. line .. '\n'
end
if line:match('%);') then
current_declaration = current_declaration:sub(1, -2) -- remove a trailing newline
declarations[declaration_module] = declarations[declaration_module] or {}
table.insert(declarations[declaration_module], current_declaration)
current_declaration = nil
declaration_module = nil
end
end
declarations["unfolded"] = nil
-- Sort modules
local modules = {}
for k,_ in pairs(declarations) do table.insert(modules, k) end
table.sort(modules)
-- Create an index
local outfile = io.open('api_reference.md', 'w')
outfile:write(header)
for i, name in ipairs(modules) do
outfile:write(string.format('* [%s](#%s)\n', name, name:lower()))
end
outfile:write('\n')
-- Write proper docs
for i,name in ipairs(modules) do
outfile:write('## ' .. name ..'\n')
for i,declaration in ipairs(declarations[name]) do
-- Write source code
outfile:write('```C' .. '\n')
local declaration_lines = declaration:split('\n')
for i, line in ipairs(declaration_lines) do
if i == 1 then
line = line:gsub('TH_API ', ''):gsub('%(', ''):gsub('%)', '') .. '(' -- remove macro junk
else
line = line:gsub('%s*//.*$', '') -- remove the comment
end
outfile:write(line .. '\n')
end
outfile:write('```' .. '\n')
-- Describe arguments
table.remove(declaration_lines, 1)
for i,line in ipairs(declaration_lines) do
local param, comment = line:match('^%s*(.*),%s*// (.*)$')
if param == nil then param, comment = line:match('^%s*(.*)%);%s*// (.*)$') end
if param ~= nil then
comment = comment:gsub('%[', '%*%*%['):gsub('%]', '%]%*%*') -- use bold font for tags
outfile:write(string.format('`%s` - %s\n<br/>\n', param, comment))
end
end
end
end
outfile:close()
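Note: `split` is not part of stock Lua's string library; this script relies on an environment (such as torch's `th` interpreter) that extends strings with a `split` method, and it is presumably run from the `doc/` directory so that the relative paths `../generic/THNN.h` and `api_reference.md` resolve correctly.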

torch/lib/THNN/doc/style_guidelines.md Normal file
@@ -0,0 +1,59 @@
## API design guidelines
Functions should return `void`.
All functions should accept arguments in the following order. `...` represents any module-specific parameters or buffers, disregarding whether they are used for writing or reading. Arguments in `...` below should be ordered like this:
```
[weight], [bias], [any buffers], [additional arguments], [optional arguments]
```
### Modules
```
updateOutput: state, input, output, ...
updateGradInput: state, input, gradOutput, gradInput, ...
accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ...
```
e.g.
```C
void THNN_(HardShrink_updateGradInput)(
THNNState* state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real lambda)
```
### Criterions
```
updateOutput: state, input, target, output, ...
updateGradInput: state, input, target, gradInput, ...
```
e.g.
```C
void THNN_(ClassNLLCriterion_updateOutput)(
THNNState* state,
THTensor *input,
THLongTensor *target,
THTensor *output,
THTensor *weights,
THTensor *total_weight,
bool sizeAverage)
```
## Code style guide
```C
void THNN_Linear_updateOutput(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias);
//<- 10 ->
```
All arguments should start on a new line after the function name, and they should be indented using 10 spaces.
Use 2 spaces for block indentation.

torch/lib/THNN/generic/Abs.c Normal file
@@ -0,0 +1,27 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Abs.c"
#else
void THNN_(Abs_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
THTensor_(resizeAs)(output, input);
THTensor_(abs)(output, input);
}
void THNN_(Abs_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput)
{
THTensor_(resizeAs)(gradInput, input);
// d|x|/dx = sign(x); sign(0) is taken to be +1 here
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
real z = *input_data;
*gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1);
);
}
#endif

torch/lib/THNN/generic/AbsCriterion.c Normal file
@@ -0,0 +1,39 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/AbsCriterion.c"
#else
void THNN_(AbsCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
sum += fabs(*input_data - *target_data);
);
if (sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(AbsCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage)
{
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
*gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm;
);
}
#endif

torch/lib/THNN/generic/BatchNormalization.c Normal file
@@ -0,0 +1,144 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/BatchNormalization.c"
#else
void THNN_(BatchNormalization_updateOutput)(
THNNState *state, THTensor *input, THTensor *output,
THTensor *weight, THTensor *bias,
THTensor *running_mean, THTensor *running_var,
THTensor *save_mean, THTensor *save_std,
bool train, double momentum, double eps)
{
long nInput = THTensor_(size)(input, 1);
long f;
long n = THTensor_(nElement)(input) / nInput;
#pragma omp parallel for
for (f = 0; f < nInput; ++f) {
THTensor *in = THTensor_(newSelect)(input, 1, f);
THTensor *out = THTensor_(newSelect)(output, 1, f);
real mean, invstd;
if (train) {
// compute mean per input
accreal sum = 0;
TH_TENSOR_APPLY(real, in, sum += *in_data;);
mean = (real) sum / n;
THTensor_(set1d)(save_mean, f, (real) mean);
// compute variance per input
sum = 0;
TH_TENSOR_APPLY(real, in,
sum += (*in_data - mean) * (*in_data - mean););
if (sum == 0 && eps == 0.0) {
invstd = 0;
} else {
invstd = (real) (1 / sqrt(sum/n + eps));
}
THTensor_(set1d)(save_std, f, (real) invstd);
// update running averages
THTensor_(set1d)(running_mean, f,
(real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
accreal unbiased_var = sum / (n - 1);
THTensor_(set1d)(running_var, f,
(real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
} else {
mean = THTensor_(get1d)(running_mean, f);
invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
}
// compute output
real w = weight ? THTensor_(get1d)(weight, f) : 1;
real b = bias ? THTensor_(get1d)(bias, f) : 0;
TH_TENSOR_APPLY2(real, in, real, out,
*out_data = (real) (((*in_data - mean) * invstd) * w + b););
THTensor_(free)(out);
THTensor_(free)(in);
}
}
void THNN_(BatchNormalization_backward)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
THTensor *running_mean, THTensor *running_var,
THTensor *save_mean, THTensor *save_std,
bool train, double scale, double eps)
{
long nInput = THTensor_(size)(input, 1);
long f;
long n = THTensor_(nElement)(input) / nInput;
#pragma omp parallel for
for (f = 0; f < nInput; ++f) {
THTensor *in = THTensor_(newSelect)(input, 1, f);
THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
real w = weight ? THTensor_(get1d)(weight, f) : 1;
real mean, invstd;
if (train) {
mean = THTensor_(get1d)(save_mean, f);
invstd = THTensor_(get1d)(save_std, f);
} else {
mean = THTensor_(get1d)(running_mean, f);
invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
}
// sum over all gradOutput in feature plane
accreal sum = 0;
TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
// dot product of Q(X) and gradOutput
accreal dotp = 0;
TH_TENSOR_APPLY2(real, in, real, gradOut,
dotp += (*in_data - mean) * (*gradOut_data););
if (gradInput) {
THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
if (train) {
// when in training mode
// Q(X) = X - E[x] ; i.e. input centered to zero mean
// Y = Q(X) / σ ; i.e. BN output before weight and bias
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
// projection of gradOutput on to output scaled by std
real k = (real) dotp * invstd * invstd / n;
TH_TENSOR_APPLY2(real, gradIn, real, in,
*gradIn_data = (*in_data - mean) * k;);
accreal gradMean = sum / n;
TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
} else {
// when in evaluation mode
// Q(X) = X - running_mean ; i.e. input centered to zero mean
// Y = Q(X) / running_std ; i.e. BN output before weight and bias
// dL/dX = dL/dY * w / running_std
TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
*gradIn_data = *gradOut_data * invstd * w;);
}
THTensor_(free)(gradIn);
}
if (gradWeight) {
real val = THTensor_(get1d)(gradWeight, f);
THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
}
if (gradBias) {
real val = THTensor_(get1d)(gradBias, f);
THTensor_(set1d)(gradBias, f, val + scale * sum);
}
THTensor_(free)(gradOut);
THTensor_(free)(in);
}
}
#endif
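For reference, the training-mode branch of `BatchNormalization_backward` above computes the standard batch-norm input gradient. A sketch in math form, with $\hat{x}_i = (x_i - \mu)\,\mathrm{invstd}$, $g_i = \partial L/\partial y_i$, and $n$ elements per feature plane:
```latex
\frac{\partial L}{\partial x_i}
  = w \cdot \mathrm{invstd}
    \left( g_i - \frac{1}{n}\sum_j g_j
           - \hat{x}_i \cdot \frac{1}{n}\sum_j g_j \hat{x}_j \right)
```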

torch/lib/THNN/generic/ClassNLLCriterion.c Normal file
@@ -0,0 +1,147 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
#else
void THNN_(ClassNLLCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THIndexTensor *target,
THTensor *output,
bool sizeAverage,
THTensor *weights,
THTensor *total_weight)
{
int n_dims = THTensor_(nDimension)(input);
int n_classes = THTensor_(size)(input, n_dims - 1);
if (THIndexTensor_(nDimension)(target) > 1) {
THError("multi-target not supported");
}
if (THTensor_(nDimension)(input) > 2) {
THError("input tensor should be 1D or 2D");
}
if (weights && THTensor_(nElement)(weights) != n_classes) {
THError("weight tensor should be defined either for all or no classes");
}
input = THTensor_(newContiguous)(input);
target = THIndexTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
real *input_data = THTensor_(data)(input);
THIndex_t *target_data = THIndexTensor_(data)(target);
real *weights_data = weights ? THTensor_(data)(weights) : NULL;
real *output_data = THTensor_(data)(output);
real *total_weight_data = THTensor_(data)(total_weight);
output_data[0] = total_weight_data[0] = 0.0;
if (THTensor_(nDimension)(input) == 1) {
int cur_target = target_data[0] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
output_data[0] = -input_data[cur_target] * total_weight_data[0];
} else if (THTensor_(nDimension)(input) == 2) {
int batch_size = THTensor_(size)(input, 0);
THAssert(THIndexTensor_(size)(target, 0) == batch_size);
int n_target = THTensor_(size)(input, 1);
int i;
for (i = 0; i < batch_size; i++) {
int cur_target = target_data[i] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
real cur_weight = weights ? weights_data[cur_target] : 1.0f;
total_weight_data[0] += cur_weight;
output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
}
}
if (sizeAverage && total_weight_data[0]) {
output_data[0] /= total_weight_data[0];
}
if (weights) {
THTensor_(free)(weights);
}
THTensor_(free)(input);
THIndexTensor_(free)(target);
}
void THNN_(ClassNLLCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THIndexTensor *target,
THTensor *gradInput,
bool sizeAverage,
THTensor *weights,
THTensor *total_weight)
{
int n_dims = THTensor_(nDimension)(input);
int n_classes = THTensor_(size)(input, n_dims - 1);
if (!THTensor_(isContiguous)(gradInput)) {
THError("gradInput must be contiguous");
}
real *total_weight_data = THTensor_(data)(total_weight);
if (!(*total_weight_data > 0)) {
return;
}
if (THIndexTensor_(nDimension)(target) > 1) {
THError("multi-target not supported");
}
if (THTensor_(nDimension)(input) > 2) {
THError("input tensor should be 1D or 2D");
}
if (weights && THTensor_(nElement)(weights) != n_classes) {
THError("weight tensor should be defined either for all or no classes");
}
target = THIndexTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
THIndex_t *target_data = THIndexTensor_(data)(target);
real *weights_data = weights ? THTensor_(data)(weights) : NULL;
real *gradInput_data = THTensor_(data)(gradInput);
if (THTensor_(nDimension)(input) == 1) {
int cur_target = target_data[0] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
gradInput_data[cur_target] =
(!sizeAverage && weights) ? -weights_data[cur_target] : -1;
} else if (THTensor_(nDimension)(input) == 2) {
int batch_size = THTensor_(size)(input, 0);
THAssert(THIndexTensor_(size)(target, 0) == batch_size);
int n_target = THTensor_(size)(input, 1);
int i;
for (i = 0; i < batch_size; i++){
int cur_target = target_data[i] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
gradInput_data[i * n_target + cur_target] =
-(weights ? weights_data[cur_target] : 1.0f);
if (sizeAverage && *total_weight_data) {
gradInput_data[i * n_target + cur_target] /= *total_weight_data;
}
}
}
THIndexTensor_(free)(target);
if (weights) {
THTensor_(free)(weights);
}
}
#endif
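In math form, `ClassNLLCriterion_updateOutput` above computes the weighted negative log-likelihood over a batch of log-probabilities $x$ with targets $y$ (weights $w$ default to 1; the division happens only when `sizeAverage` is set and the total weight is nonzero):
```latex
\ell(x, y) = -\sum_{i=1}^{N} w_{y_i}\, x_{i,\,y_i},
\qquad
\ell_{\mathrm{avg}}(x, y) = \frac{\ell(x, y)}{\sum_{i=1}^{N} w_{y_i}}
```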

torch/lib/THNN/generic/DistKLDivCriterion.c Normal file
@@ -0,0 +1,39 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
#else
void THNN_(DistKLDivCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
);
if (sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(DistKLDivCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage)
{
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
*gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;
);
}
#endif

torch/lib/THNN/generic/ELU.c Normal file
@@ -0,0 +1,51 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/ELU.c"
#else
void THNN_(ELU_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real alpha,
bool inplace)
{
if(inplace) {
TH_TENSOR_APPLY(real, input,
if(*input_data <= 0) {
*input_data = (exp(*input_data) - 1) * alpha;
}
);
THTensor_(set)(output, input);
} else {
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, input, real, output,
*output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data;
);
}
}
void THNN_(ELU_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output,
real alpha,
bool inplace)
{
if(inplace) {
TH_TENSOR_APPLY2(real, gradOutput, real, output,
if(*output_data <= 0) {
*gradOutput_data *= *output_data + alpha;
}
);
THTensor_(set)(gradInput, gradOutput);
} else {
THTensor_(resizeAs)(gradInput, output);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
*gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data;
);
}
}
#endif
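The backward pass above avoids recomputing `exp` by exploiting a property of ELU: for $x \le 0$ the output is $y = \alpha(e^{x} - 1)$, hence
```latex
\frac{dy}{dx} = \alpha e^{x} = y + \alpha
```
which is exactly why the gradient kernels multiply `*gradOutput_data` by `*output_data + alpha` for non-positive outputs.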

torch/lib/THNN/generic/HardShrink.c Normal file
@@ -0,0 +1,39 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/HardShrink.c"
#else
void THNN_(HardShrink_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real lambda)
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, output, real, input,
if (*input_data > lambda)
*output_data = *input_data;
else if (*input_data < -lambda)
*output_data = *input_data;
else
*output_data = 0;
);
}
void THNN_(HardShrink_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real lambda)
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if (*input_data > lambda || *input_data < -lambda)
*gradInput_data = *gradOutput_data;
else
*gradInput_data = 0;
);
}
#endif

torch/lib/THNN/generic/HardTanh.c Normal file
@@ -0,0 +1,127 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/HardTanh.c"
#else
void THNN_(HardTanh_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real min_val,
real max_val,
bool inplace)
{
if (inplace)
THTensor_(set)(output, input);
else
THTensor_(resizeAs)(output, input);
if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
{
if (inplace)
TH_TENSOR_APPLY(real, input,
if (*input_data < min_val)
*input_data = min_val;
else if (*input_data > max_val)
*input_data = max_val;
);
else
TH_TENSOR_APPLY2(real, output, real, input,
if (*input_data < min_val)
*output_data = min_val;
else if (*input_data <= max_val)
*output_data = *input_data;
else
*output_data = max_val;
);
}
else
{
real* ptr_input = THTensor_(data)(input);
real* ptr_output = THTensor_(data)(output);
long i;
long n = THTensor_(nElement)(input);
if (inplace)
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
if (ptr_input[i] < min_val)
ptr_input[i] = min_val;
else if (ptr_input[i] > max_val)
ptr_input[i] = max_val;
}
else
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
if (ptr_input[i] < min_val)
ptr_output[i] = min_val;
else if (ptr_input[i] <= max_val)
ptr_output[i] = ptr_input[i];
else
ptr_output[i] = max_val;
}
}
}
void THNN_(HardTanh_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real min_val,
real max_val,
bool inplace)
{
if (inplace)
THTensor_(set)(gradInput, gradOutput);
else
THTensor_(resizeAs)(gradInput, input);
if (input->nDimension == 1 ||
!THTensor_(isContiguous)(input) ||
!THTensor_(isContiguous)(gradOutput) ||
!THTensor_(isContiguous)(gradInput))
{
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
if (*input_data < min_val || *input_data > max_val)
*gradOutput_data = 0;
);
}
else
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if (*input_data < min_val || *input_data > max_val)
*gradInput_data = 0;
else
*gradInput_data = *gradOutput_data;
);
}
else
{
real* ptr_gradOutput = THTensor_(data)(gradOutput);
real* ptr_gradInput = THTensor_(data)(gradInput);
real* ptr_input = THTensor_(data)(input);
long i;
long n = THTensor_(nElement)(input);
if (inplace)
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
ptr_gradInput[i] = 0;
}
else
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
if (ptr_input[i] < min_val || ptr_input[i] > max_val)
ptr_gradInput[i] = 0;
else
ptr_gradInput[i] = ptr_gradOutput[i];
}
}
}
#endif

torch/lib/THNN/generic/L1Cost.c Normal file
@@ -0,0 +1,36 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/L1Cost.c"
#else
void THNN_(L1Cost_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
accreal sum = 0;
TH_TENSOR_APPLY(real, input,
sum += fabs(*input_data);
);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(L1Cost_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput)
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY2(real, gradInput, real, input,
if (*input_data > 0)
*gradInput_data = 1;
else if (*input_data < 0)
*gradInput_data = -1;
else
*gradInput_data = 0;
);
}
#endif

torch/lib/THNN/generic/LeakyReLU.c Normal file
@@ -0,0 +1,54 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/LeakyReLU.c"
#else
void THNN_(LeakyReLU_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real negval,
bool inplace)
{
if (inplace)
{
TH_TENSOR_APPLY(real, input,
if (*input_data <= 0)
*input_data *= negval;
);
THTensor_(set)(output, input);
}
else
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = *input_data > 0 ? *input_data : *input_data * negval;
);
}
}
void THNN_(LeakyReLU_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real negval,
bool inplace)
{
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
if (*input_data <= 0)
*gradOutput_data *= negval;
);
THTensor_(set)(gradInput, gradOutput);
}
else
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
*gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval;
);
}
}
#endif

torch/lib/THNN/generic/LogSigmoid.c Normal file
@@ -0,0 +1,35 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/LogSigmoid.c"
#else
void THNN_(LogSigmoid_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *buffer)
{
THTensor_(resizeAs)(output, input);
THTensor_(resizeAs)(buffer, input);
TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
real z = exp(-*input_data);
*buffer_data = z;
*output_data = -log(1. + z);
);
}
void THNN_(LogSigmoid_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *buffer)
{
THTensor_(resizeAs)(gradInput, buffer);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
real z = *buffer_data;
*gradInput_data = *gradOutput_data * z / (1. + z);
);
}
#endif

torch/lib/THNN/generic/LogSoftMax.c Normal file
@@ -0,0 +1,110 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/LogSoftMax.c"
#else
void THNN_(LogSoftMax_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
real *input_data, *output_data;
long nframe = 0, dim = 0;
long t, d;
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
}
else if (input->nDimension == 2)
{
nframe = input->size[0];
dim = input->size[1];
}
else
{
THArgCheck(0, 2, "vector or matrix expected");
}
input = THTensor_(newContiguous)(input);
THTensor_(resizeAs)(output, input);
real *input_data0 = THTensor_(data)(input);
real *output_data0 = THTensor_(data)(output);
accreal logsum;
real maxInput;
#pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
for (t = 0; t < nframe; t++)
{
logsum = 0;
maxInput = -THInf;
input_data = input_data0 + dim*t;
output_data = output_data0 + dim*t;
for (d = 0; d < dim; d++)
maxInput = THMax(maxInput, input_data[d]);
for (d = 0; d < dim; d++)
logsum += exp(input_data[d] - maxInput);
logsum = maxInput + log(logsum);
for (d = 0; d < dim; d++)
output_data[d] = input_data[d] - logsum;
}
THTensor_(free)(input);
}
void THNN_(LogSoftMax_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output)
{
gradOutput = THTensor_(newContiguous)(gradOutput);
real *gradInput_data, *gradOutput_data, *output_data;
long nframe = 0, dim = 0;
long t, d;
if (output->nDimension == 1)
{
nframe = 1;
dim = output->size[0];
}
else if (output->nDimension == 2)
{
nframe = output->size[0];
dim = output->size[1];
}
else
{
THError("vector or matrix expected");
}
THTensor_(resizeAs)(gradInput, output);
real *gradInput_data0 = THTensor_(data)(gradInput);
real *output_data0 = THTensor_(data)(output);
real *gradOutput_data0 = THTensor_(data)(gradOutput);
accreal sum;
#pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
for (t = 0; t < nframe; t++)
{
sum = 0;
gradInput_data = gradInput_data0 + dim*t;
output_data = output_data0 + dim*t;
gradOutput_data = gradOutput_data0 + dim*t;
for (d = 0; d < dim; d++)
sum += gradOutput_data[d];
for (d = 0; d < dim; d++)
gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum;
}
THTensor_(free)(gradOutput);
}
#endif
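Both loops above use the standard log-sum-exp stabilization: the row maximum is subtracted before exponentiating so `exp` never overflows. In math form, the forward and backward passes compute:
```latex
y_i = x_i - \Bigl(\max_j x_j + \log\!\sum_j e^{\,x_j - \max_k x_k}\Bigr),
\qquad
\frac{\partial L}{\partial x_i} = g_i - e^{\,y_i} \sum_j g_j
```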

torch/lib/THNN/generic/LookupTable.c Normal file
@@ -0,0 +1,213 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/LookupTable.c"
#else
static void THNN_(LookupTable_resetCount)(
THInteger_t *count_data,
THIndexTensor *input)
{
long i;
THIndex_t *input_data = THIndexTensor_(data)(input);
long numel = THIndexTensor_(nElement)(input);
for (i = 0; i<numel; i++)
{
long k = input_data[i] - TH_INDEX_BASE;
count_data[k] = 0;
}
for (i = 0; i<numel; i++)
{
long k = input_data[i] - TH_INDEX_BASE;
count_data[k]++;
}
}
void THNN_(LookupTable_accGradParameters)(
THNNState *state,
THIndexTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THIntegerTensor *count,
THTensor *sorted,
THTensor *indices,
bool scaleGradByFreq,
int paddingValue,
real scale)
{
long i;
THInteger_t *count_data = NULL;
if (scaleGradByFreq)
{
THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
count_data = THIntegerTensor_(data)(count);
}
if (!THTensor_(isContiguous)(gradWeight))
THError("gradWeight must be contiguous");
if (!THIndexTensor_(isContiguous)(input))
THError("input must be contiguous");
if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)
THError("input must be a vector or matrix");
THIndex_t *input_data = THIndexTensor_(data)(input);
long numel = THIndexTensor_(nElement)(input);
long numw = THTensor_(size)(gradWeight, 0);
// check that inputs are all within range
for (i=0; i<numel; i++)
if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE)
THError("input out of range");
gradOutput = THTensor_(newContiguous)(gradOutput);
real *gw = THTensor_(data)(gradWeight);
real *go = THTensor_(data)(gradOutput);
long stride = THTensor_(stride)(gradWeight, 0);
if (count_data)
THNN_(LookupTable_resetCount)(count_data, input);
#ifdef _OPENMP
if (numel > 1000)
{
// The strategy is to parallelize over sections of the vocabulary, so that
// thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
// has to traverse the entire input, but the dominating factor is the axpy
// BLAS call.
#pragma omp parallel private(i)
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
long start = tid * (numw/nthreads + 1);
long end = start + (numw/nthreads + 1);
for (i=0; i<numel; i++)
{
if (input_data[i] != paddingValue)
{
long k = input_data[i] - TH_INDEX_BASE;
if (k >= start && k < end)
{
real scale_ = scale;
if (count_data) scale_ /= count_data[k];
THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
}
}
}
}
THTensor_(free)(gradOutput);
return;
}
#endif
for (i=0; i<numel; i++)
{
if (input_data[i] != paddingValue)
{
long k = input_data[i] - TH_INDEX_BASE;
real scale_ = scale;
if (count_data) scale_ /= count_data[k];
THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
}
}
THTensor_(free)(gradOutput);
}
/*
* Keep the norm of weight smaller than maxNorm
*/
static void THNN_(LookupTable_renormRow)(
real *row_data,
long stride,
real maxNorm,
real normType)
{
real norm = 0;
real new_norm;
long j;
for (j=0; j<stride; j++)
{
if (normType == 1) {
norm += fabs(row_data[j]);
} else if (normType == 2) {
norm += row_data[j] * row_data[j];
} else {
norm += pow(fabs(row_data[j]), normType);
}
}
norm = pow(norm, 1.0 / normType);
if (norm > maxNorm)
{
new_norm = maxNorm / (norm + 1e-7);
for (j=0; j<stride; j++) {
row_data[j] *= new_norm;
}
}
}
static int THNN_(compare_THIndex)(const void* a, const void* b)
{
return *(const THIndex_t*)a < *(const THIndex_t*)b ? -1 : 1;
}
void THNN_(LookupTable_renorm)(
THNNState *state,
THIndexTensor *idx,
THTensor *weight,
real maxNorm,
real normType)
{
if (!THTensor_(isContiguous)(weight))
THError("weight must be contiguous");
if (!THIndexTensor_(isContiguous)(idx))
THError("input must be contiguous");
if (THIndexTensor_(nDimension)(idx) != 1)
THError("idx must be a vector");
if (normType <= 0)
THError("non-positive-norm not supported");
long i;
THIndex_t *row_idx = THIndexTensor_(data)(idx);
long numel = THIndexTensor_(nElement)(idx);
long numw = THTensor_(size)(weight, 0);
long stride = THTensor_(stride)(weight, 0);
real *gw = THTensor_(data)(weight);
for (i=0; i<numel; i++)
if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE)
THError("input out of range");
// get unique indices
qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
long ptr = 0;
for (i=0; i<numel; i++)
if (i == 0 || row_idx[i] != row_idx[i-1])
row_idx[ptr++] = row_idx[i];
numel = ptr;
#ifdef _OPENMP
if (numel > 1000)
{
// The strategy is to parallelize over the rows that appear in
// row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
// This distributes the work evenly to each thread.
#pragma omp parallel for private(i)
for (i=0; i<numel; i++)
{
long k = row_idx[i] - TH_INDEX_BASE;
THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
}
return;
}
#endif
for (i=0; i<numel; i++)
{
long k = row_idx[i] - TH_INDEX_BASE;
THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
}
}
#endif

torch/lib/THNN/generic/MSECriterion.c Normal file
@@ -0,0 +1,40 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/MSECriterion.c"
#else
void THNN_(MSECriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
real z = (*input_data - *target_data);
sum += z*z;
);
if (sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(MSECriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage)
{
real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
*gradInput_data = norm * (*input_data - *target_data);
);
}
#endif

torch/lib/THNN/generic/MarginCriterion.c Normal file
@@ -0,0 +1,42 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/MarginCriterion.c"
#else
void THNN_(MarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage,
real margin)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
real z = (margin - *input_data * *target_data);
sum += z>0 ? z : 0;
);
if (sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(MarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage,
real margin)
{
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
*gradInput_data = (*input_data * *target_data) < margin ? -norm * *target_data : 0;
);
}
#endif

torch/lib/THNN/generic/MultiLabelMarginCriterion.c Normal file
@@ -0,0 +1,174 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
#else
void THNN_(MultiLabelMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
THTensor *isTarget,
bool sizeAverage)
{
real *input_data, *target_data, *isTarget_data;
long nframe, dim;
long t, d, dt, ddt;
real sum;
THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
}
else
{
nframe = input->size[0];
dim = input->size[1];
THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
}
THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
target = THTensor_(newContiguous)(target);
input = THTensor_(newContiguous)(input);
input_data = THTensor_(data)(input);
target_data = THTensor_(data)(target);
THTensor_(resizeAs)(isTarget, target);
THTensor_(zero)(isTarget);
isTarget_data = THTensor_(data)(isTarget);
sum = 0;
for (t = 0; t < nframe; t++)
{
for (ddt = 0; ddt < dim; ddt++)
{
long target_idx = (long)target_data[ddt] - TH_INDEX_BASE;
if (target_idx < 0)
break;
isTarget_data[target_idx] = 1;
}
for (dt = 0; dt < dim; dt++)
{
long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
real input_target;
if (target_idx < 0)
break;
input_target = input_data[target_idx];
for (d = 0; d < dim; d++)
{
if (!isTarget_data[d])
{
real z = 1 - input_target + input_data[d];
if (z > 0)
sum += z;
}
}
}
input_data += dim;
target_data += dim;
isTarget_data += dim;
}
sum /= dim;
if (sizeAverage)
sum /= nframe;
THTensor_(set1d)(output, 0, sum);
THTensor_(free)(input);
THTensor_(free)(target);
}
void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
THTensor *isTarget,
bool sizeAverage)
{
real *input_data;
real *gradInput_data;
real *target_data;
real *isTarget_data;
long nframe, dim;
long t, d, dt;
real g;
THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, "inconsistent isTarget size");
}
else
{
nframe = input->size[0];
dim = input->size[1];
THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
}
THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
target = THTensor_(newContiguous)(target);
input = THTensor_(newContiguous)(input);
isTarget = THTensor_(newContiguous)(isTarget);
input_data = THTensor_(data)(input);
target_data = THTensor_(data)(target);
isTarget_data = THTensor_(data)(isTarget);
g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
gradInput_data = THTensor_(data)(gradInput);
for (t = 0; t < nframe; t++)
{
for (dt = 0; dt < dim; dt++)
{
long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
real input_target;
if (target_idx < 0)
break;
input_target = input_data[target_idx];
for (d = 0; d < dim; d++)
{
if (!isTarget_data[d])
{
real z = 1 - input_target + input_data[d];
if (z > 0)
{
gradInput_data[target_idx] -= g;
gradInput_data[d] += g;
}
}
}
}
input_data += dim;
target_data += dim;
isTarget_data += dim;
gradInput_data += dim;
}
THTensor_(free)(input);
THTensor_(free)(target);
THTensor_(free)(isTarget);
}
#endif

torch/lib/THNN/generic/MultiMarginCriterion.c Normal file
@@ -0,0 +1,159 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
#else
void THNN_(MultiMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage,
int p,
THTensor *weights,
real margin)
{
real *input_data, *target_data, *weights_data;
long nframe, dim;
long t, d;
real sum;
THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
}
else
{
nframe = input->size[0];
dim = input->size[1];
THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
}
for (t = 0; t < nframe; t++)
{
real idx = THTensor_(get1d)(target, t);
THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range");
}
input = THTensor_(newContiguous)(input);
target = THTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
input_data = THTensor_(data)(input);
target_data = THTensor_(data)(target);
weights_data = weights ? THTensor_(data)(weights) : NULL;
sum = 0;
for (t = 0; t < nframe; t++)
{
long target_idx = (long)(target_data[t] - TH_INDEX_BASE);
real input_target = input_data[target_idx];
for (d = 0; d < dim; d++)
{
real z = margin - input_target + input_data[d];
if (d == target_idx)
continue;
if (z > 0) {
real h = (p==1) ? z : z*z;
if(weights_data)
h *= weights_data[target_idx];
sum += h;
}
}
input_data += dim;
}
sum /= dim;
if(sizeAverage)
sum /= nframe;
THTensor_(set1d)(output, 0, sum);
THTensor_(free)(input);
THTensor_(free)(target);
if(weights)
THTensor_(free)(weights);
}
void THNN_(MultiMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage,
int p,
THTensor *weights,
real margin)
{
real *input_data;
real *gradInput_data;
real *target_data;
real *weights_data;
long nframe, dim;
long t, d;
real g;
THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
}
else
{
nframe = input->size[0];
dim = input->size[1];
THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
}
g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
input = THTensor_(newContiguous)(input);
target = THTensor_(newContiguous)(target);
input_data = THTensor_(data)(input);
THTensor_(resizeAs)(gradInput, input);
gradInput_data = THTensor_(data)(gradInput);
target_data = THTensor_(data)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
weights_data = weights ? THTensor_(data)(weights) : NULL;
for (t = 0; t < nframe; t++)
{
long target_idx = (long)(target_data[t]) - TH_INDEX_BASE;
real input_target = input_data[target_idx];
real gradInput_target = 0;
for (d = 0; d < dim; d++)
{
real z = margin - input_target + input_data[d];
if (d == target_idx)
continue;
if (z > 0)
{
real h = (p == 1) ? g : 2*g*z;
if(weights_data)
h *= weights_data[target_idx];
gradInput_target -= h;
gradInput_data[d] = h;
}
else
gradInput_data[d] = 0;
}
gradInput_data[target_idx] = gradInput_target;
input_data += dim;
gradInput_data += dim;
}
THTensor_(free)(input);
THTensor_(free)(target);
if(weights)
THTensor_(free)(weights);
}
#endif
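Per sample, `MultiMarginCriterion_updateOutput` above implements the multi-class margin loss with target class $y$, optional per-class weights $w$ (applied at the target index), and $p \in \{1, 2\}$:
```latex
\ell(x, y) = \frac{1}{\mathrm{dim}}
  \sum_{d \ne y} w_{y} \cdot \max\bigl(0,\ \mathrm{margin} - x_{y} + x_{d}\bigr)^{p}
```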

torch/lib/THNN/generic/PReLU.c Normal file
@@ -0,0 +1,228 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/PReLU.c"
#else
void THNN_(PReLU_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THIndex_t nOutputPlane)
{
THTensor_(resizeAs)(output, input);
if (nOutputPlane == 0)
{
// handle shared parameter case
real w = *THTensor_(data)(weight);
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = (*input_data > 0) ? *input_data : w*(*input_data);
);
}
else
{
long bs, ks;
{
long input_ndim = THTensor_(nDimension)(input);
switch (input_ndim)
{
case 1:
bs = 1;
ks = 1;
break;
case 2:
bs = input->size[0];
ks = 1;
break;
case 3:
bs = 1;
ks = input->size[1] * input->size[2];
break;
case 4:
bs = input->size[0];
ks = input->size[2] * input->size[3];
break;
}
if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
THError("wrong number of input planes");
}
real *output_data = THTensor_(data)(output);
real *input_data = THTensor_(data)(input);
real *weight_data = THTensor_(data)(weight);
THIndex_t i, j, k;
#pragma omp parallel for private(j,k)
for (i = 0; i < bs; ++i)
{
real* n_input_data = input_data + i*nOutputPlane*ks;
real* n_output_data = output_data + i*nOutputPlane*ks;
for (j = 0; j < nOutputPlane; ++j)
{
for (k = 0; k < ks; ++k)
n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
n_input_data += ks;
n_output_data += ks;
}
}
}
}
void THNN_(PReLU_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THIndex_t nOutputPlane)
{
THTensor_(resizeAs)(gradInput, input);
if (nOutputPlane == 0)
{
real w = THTensor_(data)(weight)[0];
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if ((*input_data) > 0)
*gradInput_data = *gradOutput_data;
else
*gradInput_data = w * (*gradOutput_data);
);
}
else
{
const real *input_data = THTensor_(data)(input);
const real *gradOutput_data = THTensor_(data)(gradOutput);
const real *weight_data = THTensor_(data)(weight);
real *gradInput_data = THTensor_(data)(gradInput);
long bs, ks;
{
long input_ndim = THTensor_(nDimension)(input);
switch (input_ndim)
{
case 1:
bs = 1;
ks = 1;
break;
case 2:
bs = input->size[0];
ks = 1;
break;
case 3:
bs = 1;
ks = input->size[1] * input->size[2];
break;
case 4:
bs = input->size[0];
ks = input->size[2] * input->size[3];
break;
}
if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
THError("wrong number of input planes");
}
THIndex_t i, j, k;
#pragma omp parallel for private(j,k)
for (i = 0; i < bs; ++i)
{
const real *n_input_data = input_data + i*nOutputPlane*ks;
const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
for (j = 0; j < nOutputPlane; ++j)
{
real w = weight_data[j];
for (k = 0; k < ks; ++k)
{
if (n_input_data[k] > 0)
n_gradInput_data[k] = n_gradOutput_data[k];
else
n_gradInput_data[k] = n_gradOutput_data[k] * w;
}
n_input_data += ks;
n_gradInput_data += ks;
n_gradOutput_data += ks;
}
}
}
}
void THNN_(PReLU_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *gradWeight,
THTensor *gradWeightBuf,
THTensor *gradWeightBuf2,
THIndex_t nOutputPlane,
real scale)
{
real *gradWeight_data = THTensor_(data)(gradWeight);
if (nOutputPlane == 0)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, gradOutput,
if ((*input_data) <= 0)
sum += (*input_data) * (*gradOutput_data);
);
gradWeight_data[0] += scale * sum;
}
else
{
long bs, ks;
{
long input_ndim = THTensor_(nDimension)(input);
switch (input_ndim)
{
case 1:
bs = 1;
ks = 1;
break;
case 2:
bs = input->size[0];
ks = 1;
break;
case 3:
bs = 1;
ks = input->size[1] * input->size[2];
break;
case 4:
bs = input->size[0];
ks = input->size[2] * input->size[3];
break;
}
if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
THError("wrong number of input planes");
}
const real *input_data = THTensor_(data)(input);
const real *gradOutput_data = THTensor_(data)(gradOutput);
const real *weight_data = THTensor_(data)(weight);
real *gradWeight_data = THTensor_(data)(gradWeight);
THIndex_t i, j, k;
for (i = 0; i < bs; ++i)
{
const real *n_input_data = input_data + i*nOutputPlane*ks;
const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
for (j = 0; j < nOutputPlane; ++j)
{
real sum = 0;
for (k = 0; k < ks; ++k)
if (n_input_data[k] <= 0)
sum += n_gradOutput_data[k] * n_input_data[k];
gradWeight_data[j] += scale * sum;
n_input_data += ks;
n_gradOutput_data += ks;
}
}
}
}
#endif

torch/lib/THNN/generic/RReLU.c Normal file
@@ -0,0 +1,127 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/RReLU.c"
#else
void THNN_(RReLU_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *noise,
real lower,
real upper,
bool train,
bool inplace,
THGenerator *generator)
{
if (train)
{
// draw slopes from the random generator supplied by the caller
THTensor_(resizeAs)(noise, input);
if (inplace)
{
TH_TENSOR_APPLY2(real, input, real, noise,
if (*input_data <= 0)
{
const real r = (real)THRandom_uniform(generator, lower, upper);
*input_data = (*input_data) * r;
*noise_data = r;
}
else
{
*noise_data = 1;
}
);
THTensor_(set)(output, input);
}
else
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY3(real, input, real, output, real, noise,
if (*input_data <= 0)
{
const real r = (real)THRandom_uniform(generator, lower, upper);
*output_data = (*input_data) * r;
*noise_data = r;
}
else
{
*output_data = *input_data;
*noise_data = 1;
}
);
}
}
else
{
const real negSlope = (lower + upper) / 2;
if (inplace)
{
TH_TENSOR_APPLY(real, input,
if (*input_data <= 0)
{
*input_data = *input_data * negSlope;
}
);
THTensor_(set)(output, input);
}
else
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, input, real, output,
const real r = (*input_data) <= 0 ? negSlope : 1;
*output_data = *input_data * r;
);
}
}
}
void THNN_(RReLU_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *noise,
real lower,
real upper,
bool train,
bool inplace)
{
if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
{
// multiply the gradient by the noise tensor
if (inplace)
{
THTensor_(cmul)(gradOutput, gradOutput, noise);
THTensor_(set)(gradInput, gradOutput);
}
else
{
THTensor_(resizeAs)(gradInput, input);
THTensor_(cmul)(gradInput, gradOutput, noise);
}
}
else
{
// use constant factor for negative input values
const real negSlope = (lower + upper) / 2;
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
if (*input_data <= 0)
{
*gradOutput_data = (*gradOutput_data) * negSlope;
}
);
THTensor_(set)(gradInput, gradOutput);
}
else
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
*gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data);
);
}
}
}
#endif

torch/lib/THNN/generic/Sigmoid.c Normal file
@@ -0,0 +1,31 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Sigmoid.c"
#else
void THNN_(Sigmoid_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = 1./(1.+ exp(- *input_data));
);
}
void THNN_(Sigmoid_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output)
{
THTensor_(resizeAs)(gradInput, output);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
real z = *output_data;
*gradInput_data = *gradOutput_data * (1. - z) * z;
);
}
#endif

torch/lib/THNN/generic/SmoothL1Criterion.c Normal file
@@ -0,0 +1,45 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
#else
void THNN_(SmoothL1Criterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
real z = fabs(*input_data - *target_data);
sum += z < 1 ? 0.5*z*z : z - 0.5;
);
if (sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(SmoothL1Criterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage)
{
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
real x = *input_data - *target_data;
if (x < -1.)
*gradInput_data = - norm;
else if (x > 1.)
*gradInput_data = norm;
else
*gradInput_data = norm * x;
);
}
#endif
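The element-wise loss above is the Huber-style smooth L1 on the difference $d = x - t$; its gradient, as computed in `updateGradInput`, saturates at $\pm\,\mathrm{norm}$:
```latex
\ell(d) = \begin{cases} \tfrac{1}{2}\,d^2 & \text{if } |d| < 1 \\ |d| - \tfrac{1}{2} & \text{otherwise} \end{cases}
```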

torch/lib/THNN/generic/SoftMarginCriterion.c Normal file
@@ -0,0 +1,40 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
#else
void THNN_(SoftMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *output,
bool sizeAverage)
{
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
real z = log(1. + exp(-*input_data * *target_data));
sum += z;
);
if(sizeAverage)
sum /= THTensor_(nElement)(input);
THTensor_(set1d)(output, 0, sum);
}
void THNN_(SoftMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage)
{
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
real z = exp(-*target_data * *input_data);
*gradInput_data = -norm*(*target_data)*z/(1. + z);
);
}
#endif

torch/lib/THNN/generic/SoftMax.c Normal file
@@ -0,0 +1,149 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SoftMax.c"
#else
void THNN_(SoftMax_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
real *input_data, *output_data;
long nframe = 0, dim = 0, stride = 0;
long t;
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
stride = 1;
}
else if (input->nDimension == 2)
{
nframe = input->size[0];
dim = input->size[1];
stride = 1;
}
else if (input->nDimension == 3)
{
nframe = 1;
dim = input->size[0];
stride = input->size[1]*input->size[2];
}
else if (input->nDimension == 4)
{
nframe = input->size[0];
dim = input->size[1];
stride = input->size[2]*input->size[3];
}
else
{
THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
}
input = THTensor_(newContiguous)(input);
THTensor_(resizeAs)(output, input);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(t)
for (t = 0; t < stride*nframe; t++)
{
real *input_ptr = input_data + (t/stride)*dim*stride + t % stride;
real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
real inputMax = -THInf;
accreal sum;
long d;
for (d = 0; d < dim; d++)
{
if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
}
sum = 0;
for (d = 0; d < dim; d++)
{
real z = exp(input_ptr[d*stride] - inputMax);
output_ptr[d*stride] = z;
sum += z;
}
for (d = 0; d < dim; d++)
{
output_ptr[d*stride] *= 1/sum;
}
}
THTensor_(free)(input);
}
void THNN_(SoftMax_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output)
{
real *gradInput_data, *gradOutput_data, *output_data;
long nframe = 0, dim = 0, stride = 0;
long t;
if (output->nDimension == 1)
{
nframe = 1;
dim = output->size[0];
stride = 1;
}
else if (output->nDimension == 2)
{
nframe = output->size[0];
dim = output->size[1];
stride = 1;
}
else if (output->nDimension == 3)
{
nframe = 1;
dim = output->size[0];
stride = output->size[1]*output->size[2];
}
else if (output->nDimension == 4)
{
nframe = output->size[0];
dim = output->size[1];
stride = output->size[2]*output->size[3];
}
else
{
THError("1D, 2D, 3D or 4D tensor expected");
}
gradOutput = THTensor_(newContiguous)(gradOutput);
output = THTensor_(newContiguous)(output);
THTensor_(resizeAs)(gradInput, output);
gradInput_data = THTensor_(data)(gradInput);
output_data = THTensor_(data)(output);
gradOutput_data = THTensor_(data)(gradOutput);
#pragma omp parallel for private(t)
for (t = 0; t < stride*nframe; t++)
{
real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride;
real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
long d;
accreal sum = 0;
for (d = 0; d < dim; d++)
sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
for (d = 0; d < dim; d++)
gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum);
}
THTensor_(free)(gradOutput);
THTensor_(free)(output);
}
#endif
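As in LogSoftMax, the forward pass subtracts the per-slice maximum before exponentiating for numerical stability, and the backward pass applies the softmax Jacobian as a single vector product:
```latex
y_i = \frac{e^{\,x_i - \max_k x_k}}{\sum_j e^{\,x_j - \max_k x_k}},
\qquad
\frac{\partial L}{\partial x_i} = y_i \Bigl( g_i - \sum_j g_j\, y_j \Bigr)
```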

torch/lib/THNN/generic/SoftPlus.c Normal file
@@ -0,0 +1,42 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SoftPlus.c"
#else
void THNN_(SoftPlus_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real beta,
real threshold)
{
THTensor_(resizeAs)(output, input);
// f(x) = 1/beta * log(1 + exp(beta * x))
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;
);
}
void THNN_(SoftPlus_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output,
real beta,
real threshold)
{
THTensor_(resizeAs)(gradInput, output);
// d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
// SINCE
// y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
// THEREFORE:
// d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
real z = exp(*output_data * beta);
*gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;
);
}
#endif

torch/lib/THNN/generic/SoftShrink.c Normal file
@@ -0,0 +1,39 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SoftShrink.c"
#else
void THNN_(SoftShrink_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real lambda)
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, output, real, input,
if ((*input_data) > lambda)
*output_data = *input_data - lambda;
else if ((*input_data) < -lambda)
*output_data = *input_data + lambda;
else
*output_data = 0;
);
}
void THNN_(SoftShrink_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real lambda)
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if ((*input_data) > lambda || (*input_data) < -lambda)
*gradInput_data = (*gradOutput_data);
else
*gradInput_data = 0;
);
}
#endif
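/*
 * Illustration, not part of the diff above: soft shrinkage maps x to
 * x - lambda above lambda, x + lambda below -lambda, and 0 in the dead zone,
 * so its derivative is 1 outside [-lambda, lambda] and 0 inside -- exactly the
 * two branches of the kernels above. A hedged one-liner sketch:
 */
static double softshrink_sketch(double x, double lambda)
{
  return x > lambda ? x - lambda : (x < -lambda ? x + lambda : 0);
}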

View File

@ -0,0 +1,550 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SparseLinear.c"
#else
#ifdef _OPENMP
#include <omp.h>
#endif
#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
static bool THNN_(checkLegacyInput)(THTensor* t)
{
return t->nDimension == 3 && t->size[2] == 2;
}
static bool THNN_(checkInput)(THTensor* t)
{
return t->nDimension == 2 && t->size[1] == 3;
}
static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
{
return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
}
static bool THNN_(checkSize1D)(THTensor* t, long size0)
{
return t->nDimension == 1 && t->size[0] == size0;
}
static void THNN_(set1d)(THTensor *t, long x0, real value) {
THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
}
static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
return THStorage_(get)(t->storage, t->storageOffset +
x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
}
static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
return THStorage_(get)(t->storage, t->storageOffset +
x0*t->stride[0] + x1*t->stride[1]);
}
void THNN_(SparseLinear_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias)
{
long h, i, j, hp0, hp1;
long outDim = THTensor_(size)(weight, 0);
long inDim = THTensor_(size)(weight, 1);
long batchSize = THTensor_(size)(output, 0);
THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
long nnz = THTensor_(size)(input, 0);
THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
THLongTensor_zero(csr);
//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
for (i=0; i<nnz; i++) {
hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
hp1 = (i+1 == nnz) ?
batchSize :
(long)(THNN_(get2d)(input, i+1, 0)) - 1;
if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
THLongTensor_set1d(csr, h+1, i+1);
}
}
// output = weight * input + bias
THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
for (h = 0; h < batchSize; h++) {
long i_start = THLongTensor_get1d(csr, h);
long i_end = THLongTensor_get1d(csr, h+1);
for (i = i_start; i < i_end; i++) {
real val = THNN_(get2d)(input, i, 2);
if (val == 0) {
continue;
}
long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
if (offset >= 0 && offset < inDim) {
THBlas_(axpy)(outDim,
val,
COL_PTR2(weight, offset), weight->stride[0],
ROW_PTR2(output, h), output->stride[1]);
} else {
THError("index out of bound. updateOutput: %d not between 1 and %d",
offset + 1, inDim);
}
}
}
THTensor* output_row = THTensor_(new)();
for (h = 0; h < batchSize; h++) {
THTensor_(select)(output_row, output, 0, h);
THTensor_(cadd)(output_row, bias, 1.0, output_row);
}
THTensor_(free)(output_row);
THLongTensor_free(csr);
}
void THNN_(SparseLinear_legacyUpdateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias)
{
long h, i;
long outDim = THTensor_(size)(weight, 0);
long inDim = THTensor_(size)(weight, 1);
THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
long batchSize = THTensor_(size)(input, 0);
long nnz = THTensor_(size)(input, 1);
THTensor_(resize2d)(output, batchSize, outDim);
// output = weight * input + bias
THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if ( \
batchSize > 1 && batchSize * nnz * outDim > 10000)
for (h = 0; h < batchSize; h++) {
for (i = 0; i < nnz; i++) {
real val = THNN_(get3d)(input, h, i, 1);
if (val == 0) {
continue;
}
long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
if (offset >= 0 && offset < inDim) {
THBlas_(axpy)(outDim,
val,
COL_PTR2(weight, offset), weight->stride[0],
ROW_PTR2(output, h), output->stride[1]);
} else {
THError("index out of bound. updateOutput: %d not between 1 and %d",
offset + 1, inDim);
}
}
}
THTensor* output_row = THTensor_(new)();
for (h = 0; h < batchSize; h++) {
THTensor_(select)(output_row, output, 0, h);
THTensor_(cadd)(output_row, bias, 1.0, output_row);
}
THTensor_(free)(output_row);
}
void THNN_(SparseLinear_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *weight,
THTensor *bias,
real weightDecay,
real scale)
{
long h, i, col, hp0, hp1;
long outDim = THTensor_(size)(weight, 0);
long inDim = THTensor_(size)(weight, 1);
THArgCheck(THNN_(checkInput)(input), 2,
"input must be in coo format, nnz x 3");
THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
"gradWeight size wrong");
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
"gradBias size wrong");
THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
"gradOutput must be contiguous");
long nnz = THTensor_(size)(input, 0);
THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
THLongTensor_zero(csc);
#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
for (i = 0; i < nnz; i++) {
hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
hp1 = (i+1 == nnz) ?
inDim :
(long)(THNN_(get2d)(input, i+1, 1)) - 1;
if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
THLongTensor_set1d(csc, h+1, i+1);
}
}
// gradWeight += gradOutput * input
#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
for (col = 0; col < inDim; col++) {
long i_start = THLongTensor_get1d(csc, col);
long i_end = THLongTensor_get1d(csc, col+1);
for (i = i_start; i < i_end; i++) {
real val = scale * THNN_(get2d)(input, i, 2);
h = (long)(THNN_(get2d)(input, i, 0)) - 1;
long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
if (offset >= 0 && offset < inDim) {
THBlas_(axpy)(outDim,
val,
ROW_PTR2(gradOutput, h), gradOutput->stride[1],
COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
} else {
THError(
"index out of bounds. accGradParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
}
// gradBias += gradOutput
THTensor* buf = THTensor_(new)();
THTensor_(sum)(buf, gradOutput, 0);
THTensor_(cadd)(gradBias, gradBias, scale, buf);
THTensor_(free)(buf);
THLongTensor_free(csc);
if (weightDecay != 0) {
THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
}
}
void THNN_(SparseLinear_legacyAccGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *weight,
THTensor *bias,
real weightDecay,
real scale)
{
long h, i;
long outDim = THTensor_(size)(weight, 0);
long inDim = THTensor_(size)(weight, 1);
THArgCheck(THNN_(checkLegacyInput)(input), 2,
"input size must be batchsize x nnz x 2");
THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
"gradWeight size wrong");
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
"gradBias size wrong");
THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
"gradOutput must be contiguous");
long batchSize = THTensor_(size)(input, 0);
long nnz = THTensor_(size)(input, 1);
THTensor_(resize2d)(gradOutput, batchSize, outDim);
// gradWeight += gradOutput * input
#pragma omp parallel for private(h, i) schedule(static) if (\
batchSize * nnz * outDim > 10000)
for (i = 0; i < nnz; i++) {
for (h = 0; h < batchSize; h++) {
real val = scale * THNN_(get3d)(input, h, i, 1);
if (val == 0) {
continue;
}
long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
if (offset >= 0 && offset < inDim) {
THBlas_(axpy)(outDim,
val,
ROW_PTR2(gradOutput, h), gradOutput->stride[1],
COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
} else {
THError(
"index out of bounds. accGradParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
}
// gradBias += gradOutput
THTensor* gradOutput_row = THTensor_(new)();
for (h = 0; h < batchSize; h++) {
THTensor_(select)(gradOutput_row, gradOutput, 0, h);
THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
}
THTensor_(free)(gradOutput_row);
if (weightDecay != 0) {
THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
}
}
void THNN_(SparseLinear_updateParameters)(
THNNState *state,
THTensor *weight,
THTensor *bias,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *lastInput,
real learningRate)
{
long h, i;
long outDim = weight->size[0];
long inDim = weight->size[1];
THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
"gradWeight size wrong");
THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
THArgCheck(THNN_(checkInput)(lastInput), 6,
"input must be in coo format, nnz x 3");
long nnz = THTensor_(size)(lastInput, 0);
// collect unique offsets of non-0 val in input
THTensor* offsets = THTensor_(newWithSize1d)(nnz);
long cnt = 0;
for (i = 0; i < nnz; i++) {
real val = THNN_(get2d)(lastInput, i, 2);
if (val == 0) {
continue;
}
long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
if (offset >= 0 && offset < inDim) {
THNN_(set1d)(offsets, cnt++, offset);
} else {
THError(
"index out of bounds. updateParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
if (cnt == 0) {
THTensor_(free)(offsets);
return;
}
THTensor_(resize1d)(offsets, cnt);
THTensor* uniqueOffsets = THTensor_(new)();
THLongTensor* ri = THLongTensor_new();
THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
THLongTensor_free(ri);
THTensor_(free)(offsets);
cnt = 1;
real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
}
}
THTensor_(resize1d)(uniqueOffsets, cnt);
// weight += -learningRate * gradWeight
THTensor_(cadd)(bias, bias, -learningRate, gradBias);
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
for (i = 0; i < cnt; i++) {
long offset = (long)uniqueOffsets_p[i];
THBlas_(axpy)(outDim,
-learningRate,
COL_PTR2(gradWeight, offset), gradWeight->stride[0],
COL_PTR2(weight, offset), weight->stride[0]);
}
THTensor_(free)(uniqueOffsets);
}
void THNN_(SparseLinear_legacyUpdateParameters)(
THNNState *state,
THTensor *weight,
THTensor *bias,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *lastInput,
real learningRate)
{
long h, i;
long outDim = weight->size[0];
long inDim = weight->size[1];
THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
"gradWeight size wrong");
THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
"input size must be batchsize x nnz x 2");
long batchSize = THTensor_(size)(lastInput, 0);
long nnz = THTensor_(size)(lastInput, 1);
// collect unique offsets of non-0 val in input
THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
long cnt = 0;
for (h = 0; h < batchSize; h++) {
for (i = 0; i < nnz; i++) {
real val = THNN_(get3d)(lastInput, h, i, 1);
if (val == 0 ) {
continue;
}
long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
if (offset >= 0 && offset < inDim) {
THNN_(set1d)(offsets, cnt++, offset);
} else {
THError(
"index out of bounds. updateParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
}
if (cnt == 0) {
THTensor_(free)(offsets);
return;
}
THTensor_(resize1d)(offsets, cnt);
THTensor* uniqueOffsets = THTensor_(new)();
THLongTensor* ri = THLongTensor_new();
THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
THLongTensor_free(ri);
THTensor_(free)(offsets);
cnt = 1;
real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
}
}
THTensor_(resize1d)(uniqueOffsets, cnt);
// weight += -learningRate * gradWeight
THTensor_(cadd)(bias, bias, -learningRate, gradBias);
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
for (i = 0; i < cnt; i++) {
long offset = (long)uniqueOffsets_p[i];
THBlas_(axpy)(outDim,
-learningRate,
COL_PTR2(gradWeight, offset), gradWeight->stride[0],
COL_PTR2(weight, offset), weight->stride[0]);
}
THTensor_(free)(uniqueOffsets);
}
void THNN_(SparseLinear_zeroGradParameters)(
THNNState *state,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *lastInput)
{
long h, i, j;
long outDim = gradWeight->size[0];
long inDim = gradWeight->size[1];
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
THArgCheck(THNN_(checkInput)(lastInput), 4,
"input must be in coo format, nnz x 3");
THTensor_(zero)(gradBias);
long nnz = THTensor_(size)(lastInput, 0);
#pragma omp parallel for private(i, j) schedule(static) if ( \
nnz * outDim > 10000)
for (i = 0; i < nnz; i++) {
if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
continue;
}
long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
if (offset >= 0 && offset < inDim) {
real* pGradWeight = COL_PTR2(gradWeight, offset);
if (gradWeight->stride[0] == 1) {
THVector_(fill)(pGradWeight, 0, outDim);
} else {
long stride = gradWeight->stride[0];
for (j = 0; j < outDim; ++j) {
pGradWeight[j * stride] = 0;
}
}
} else {
THError(
"index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
}
void THNN_(SparseLinear_legacyZeroGradParameters)(
THNNState *state,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *lastInput)
{
long h, i, j;
long outDim = gradWeight->size[0];
long inDim = gradWeight->size[1];
THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
"input size must be batchsize x nnz x 2");
THTensor_(zero)(gradBias);
long batchSize = THTensor_(size)(lastInput, 0);
long nnz = THTensor_(size)(lastInput, 1);
#pragma omp parallel for private(h, i, j) schedule(static) if ( \
batchSize > 1 && batchSize * nnz * outDim > 10000)
for (h = 0; h < batchSize; h++) {
for (i = 0; i < nnz; i++) {
if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
continue;
}
long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
if (offset >= 0 && offset < inDim) {
real* pGradWeight = COL_PTR2(gradWeight, offset);
if (gradWeight->stride[0] == 1) {
THVector_(fill)(pGradWeight, 0, outDim);
} else {
long stride = gradWeight->stride[0];
for (j = 0; j < outDim; ++j) {
pGradWeight[j * stride] = 0;
}
}
} else {
THError(
"index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
offset + 1,
inDim);
}
}
}
}
#undef ROW_PTR2
#undef COL_PTR2
#endif
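/*
 * Illustration, not part of the diff above: the non-legacy SparseLinear entry
 * points take `input` in COO form, an nnz x 3 tensor whose rows are
 * [sampleIndex, featureIndex, value] with 1-based indices (hence the "- 1"
 * conversions above). A hedged sketch of the same y = W*x + b accumulation
 * over hypothetical plain arrays, W stored row-major as outDim x inDim:
 */
static void sparse_linear_sketch(const double (*coo)[3], long nnz,
                                 const double *W, const double *b, double *y,
                                 long outDim, long inDim, long batchSize)
{
  long i, o, h;
  for (h = 0; h < batchSize; h++)
    for (o = 0; o < outDim; o++)
      y[h * outDim + o] = b[o];            /* start every row from the bias */
  for (i = 0; i < nnz; i++) {
    long sample  = (long)coo[i][0] - 1;    /* 1-based -> 0-based */
    long feature = (long)coo[i][1] - 1;
    double val   = coo[i][2];
    if (val == 0) continue;                /* explicit zeros contribute nothing */
    for (o = 0; o < outDim; o++)           /* y[sample] += val * W[:, feature] */
      y[sample * outDim + o] += val * W[o * inDim + feature];
  }
}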

View File

@ -0,0 +1,274 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
#else
static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
real *indx_p,
real *indy_p,
long nslices,
long iwidth,
long iheight,
long owidth,
long oheight,
long stridew,
long strideh,
long strided)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
/* loop over output */
long i, j;
for(i = 0; i < oheight; i++)
{
int y_start = (int)floor((float)i / oheight * iheight);
int y_end = (int)ceil((float)(i + 1) / oheight * iheight);
int kH = y_end-y_start;
for(j = 0; j < owidth; j++)
{
int x_start = (int)floor((float)j / owidth * iwidth);
int x_end = (int)ceil((float)(j + 1) / owidth * iwidth);
int kW = x_end-x_start;
/* local pointers */
real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
real *op = output_p + k*owidth*oheight + i*owidth + j;
real *indyp = indy_p + k*owidth*oheight + i*owidth + j;
real *indxp = indx_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
long maxindex = -1;
real maxval = -FLT_MAX;
long tcntr = 0;
int x,y;
for(y = 0; y < kH; y++)
{
for(x = 0; x < kW; x++)
{
real val = *(ip + y*strideh + x*stridew);
if (val > maxval)
{
maxval = val;
maxindex = tcntr;
}
tcntr++;
}
}
/* set output to local max */
*op = maxval;
/* store location of max (x,y) */
*indyp = (int)(maxindex / kW) + TH_INDEX_BASE;
*indxp = (maxindex % kW) + TH_INDEX_BASE;
}
}
}
}
void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int owidth,
int oheight)
{
int dimw = 2;
int dimh = 1;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long istride_d;
long istride_h;
long istride_w;
long istride_b;
real *input_data;
real *output_data;
real *indices_data;
THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
if (input->nDimension == 4)
{
istride_b = input->stride[0];
nbatch = input->size[0];
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
/* strides */
istride_d = input->stride[dimh-1];
istride_h = input->stride[dimh];
istride_w = input->stride[dimw];
/* resize output */
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
/* indices will contain i,j locations for each output point */
THTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
indices_data+nslices*owidth*oheight, indices_data,
nslices,
iwidth, iheight,
owidth, oheight,
istride_w,istride_h,
istride_d);
}
else
{
long p;
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
/* indices will contain i,j locations for each output point */
THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight,
istride_w,istride_h,
istride_d);
}
}
}
static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
real *indx_p,
real *indy_p,
long nslices,
long iwidth,
long iheight,
long owidth,
long oheight)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
real *indx_p_k = indx_p + k*owidth*oheight;
real *indy_p_k = indy_p + k*owidth*oheight;
/* calculate max points */
long i, j;
for(i = 0; i < oheight; i++)
{
int y_start = (int)floor((float) i / oheight * iheight);
for(j = 0; j < owidth; j++)
{
int x_start = (int)floor((float) j / owidth * iwidth);
/* retrieve position of max */
long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
/* update gradient */
gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
}
}
}
}
void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices)
{
int dimw = 2;
int dimh = 1;
long nbatch = 1;
int nslices;
int iheight;
int iwidth;
int oheight;
int owidth;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = gradOutput->size[dimh];
owidth = gradOutput->size[dimw];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
{
THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
indices_data+nslices*owidth*oheight, indices_data,
nslices,
iwidth, iheight,
owidth, oheight);
}
else
{
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif
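/*
 * Illustration, not part of the diff above: adaptive pooling derives each
 * output cell's window from the size ratio alone -- start = floor(i*in/out),
 * end = ceil((i+1)*in/out) -- so any input extent maps onto the requested
 * output extent, with overlapping windows where the ratio is fractional.
 * A hedged sketch that prints the windows; names are hypothetical.
 */
#include <math.h>
#include <stdio.h>
static void print_adaptive_windows_sketch(int isize, int osize)
{
  int i;
  for (i = 0; i < osize; i++) {
    int start = (int)floor((float)i / osize * isize);
    int end   = (int)ceil((float)(i + 1) / osize * isize);
    printf("output %d <- input [%d, %d)\n", i, start, end);
  }
}
/* print_adaptive_windows_sketch(10, 4) gives [0,3) [2,5) [5,8) [7,10) */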

View File

@ -0,0 +1,258 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
#else
void THNN_(SpatialAveragePooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool ceil_mode,
bool count_include_pad)
{
real *output_data;
real *input_data;
int dimw = 2;
int dimh = 1;
int dimc = 0;
long nbatch = 1;
long inputWidth;
long inputHeight;
long outputWidth;
long outputHeight;
long nInputPlane; // number of channels (or colors)
long k;
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be at most half of the kernel size");
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
dimc++;
}
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
nInputPlane = input->size[dimc];
if(ceil_mode)
{
outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
}
else
{
outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
}
if (padW || padH)
{
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputHeight - 1)*dH >= inputHeight + padH)
--outputHeight;
if ((outputWidth - 1)*dW >= inputWidth + padW)
--outputWidth;
}
THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size");
if (input->nDimension == 3)
THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
else
THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
input = THTensor_(newContiguous)(input);
THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
{
long p;
for(p = 0; p < nbatch; p++)
{
long xx, yy;
/* For all output pixels... */
real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
long i;
for(i = 0; i < outputWidth*outputHeight; i++)
ptr_output[i] = 0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
long hstart = yy * dH - padH;
long wstart = xx * dW - padW;
long hend = fminf(hstart + kH, inputHeight + padH);
long wend = fminf(wstart + kW, inputWidth + padW);
int pool_size = (hend - hstart) * (wend - wstart);
hstart = fmaxf(hstart, 0);
wstart = fmaxf(wstart, 0);
hend = fminf(hend, inputHeight);
wend = fminf(wend, inputWidth);
real sum = 0;
int divide_factor;
if(count_include_pad)
divide_factor = pool_size;
else
divide_factor = (hend - hstart) * (wend - wstart);
long kx, ky;
for(ky = hstart; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
sum += ptr_input[ky*inputWidth + kx];
}
/* Update output */
*ptr_output++ += sum/divide_factor;
}
}
}
}
THTensor_(free)(input);
}
void THNN_(SpatialAveragePooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool ceil_mode,
bool count_include_pad)
{
int dimw = 2;
int dimh = 1;
int dimc = 0;
long nbatch = 1;
long inputWidth;
long inputHeight;
long outputWidth;
long outputHeight;
long nInputPlane; // number of channels (or colors)
real *gradOutput_data;
real *gradInput_data;
long k;
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
dimc++;
}
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
nInputPlane = input->size[dimc];
if(ceil_mode)
{
outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
}
else
{
outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
}
if (padW || padH)
{
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputHeight - 1)*dH >= inputHeight + padH)
--outputHeight;
if ((outputWidth - 1)*dW >= inputWidth + padW)
--outputWidth;
}
THTensor_(resizeAs)(gradInput, input);
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
{
long p;
for(p = 0; p < nbatch; p++)
{
real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
long xx, yy;
real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
long i;
/* zero this gradInput plane before accumulating gradients into it */
for(i = 0; i < inputWidth*inputHeight; i++)
ptr_gradInput[i] = 0.0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
long hstart = yy * dH - padH;
long wstart = xx * dW - padW;
long hend = fminf(hstart + kH, inputHeight + padH);
long wend = fminf(wstart + kW, inputWidth + padW);
int pool_size = (hend - hstart) * (wend - wstart);
hstart = fmaxf(hstart, 0);
wstart = fmaxf(wstart, 0);
hend = fminf(hend, inputHeight);
wend = fminf(wend, inputWidth);
real z = *ptr_gradOutput++;
int divide_factor;
if(count_include_pad)
divide_factor = pool_size;
else
divide_factor = (hend - hstart) * (wend - wstart);
long kx, ky;
for(ky = hstart ; ky < hend; ky++)
{
for(kx = wstart; kx < wend; kx++)
ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
}
}
}
}
}
THTensor_(free)(input);
THTensor_(free)(gradOutput);
}
#endif
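/*
 * Illustration, not part of the diff above: the output extent used by both
 * kernels is floor((in + 2*pad - k) / d) + 1 (ceil() in ceil_mode), clipped so
 * the last window still starts inside the image; `count_include_pad` then
 * decides whether each window's sum is divided by the full window count or
 * only by the cells that fell inside the image. A hedged sketch of the rule:
 */
#include <math.h>
static long pooled_size_sketch(long in, int k, int d, int pad, int ceil_mode)
{
  long out = ceil_mode
    ? (long)ceil((float)(in - k + 2*pad) / d) + 1
    : (long)floor((float)(in - k + 2*pad) / d) + 1;
  if (pad && (out - 1) * d >= in + pad)
    --out;  /* last window must start inside the image, as in the code above */
  return out;
}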

View File

@ -0,0 +1,128 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
#else
#define INITIAL_CHECK \
THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \
"only batches of spatial targets supported (3D tensors)"); \
THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
"only batches of spatial inputs supported (4D tensors)"); \
if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
THError("weight tensor should be defined either for all or no classes"); \
} \
\
{ \
long input0 = THTensor_(size)(input, 0); \
long input1 = THTensor_(size)(input, 1); \
long input2 = THTensor_(size)(input, 2); \
long input3 = THTensor_(size)(input, 3); \
long target0 = THIndexTensor_(size)(target, 0); \
long target1 = THIndexTensor_(size)(target, 1); \
long target2 = THIndexTensor_(size)(target, 2); \
THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \
"size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
input0, input1, input2, input3, target0, target1, target2); \
}
void THNN_(SpatialClassNLLCriterion_updateOutput)(
THNNState *state,
THTensor *input,
THIndexTensor *target,
THTensor *output,
bool sizeAverage,
THTensor *weights,
THTensor *total_weight)
{
INITIAL_CHECK;
input = THTensor_(newContiguous)(input);
target = THIndexTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
real *input_data = THTensor_(data)(input);
THIndex_t *target_data = THIndexTensor_(data)(target);
real *weights_data = weights ? THTensor_(data)(weights) : NULL;
real *output_data = THTensor_(data)(output);
real *total_weight_data = THTensor_(data)(total_weight);
long batch_size = THTensor_(size)(input, 0);
long n_classes = THTensor_(size)(input, 1);
long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
long sample_size = map_size * n_classes;
real total_weight_acc = 0;
real output_acc = 0;
for (int b = 0; b < batch_size; b++) {
for (int elem = 0; elem < map_size; elem++) {
int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
real cur_weight = weights ? weights_data[cur_target] : 1.0f;
total_weight_acc += cur_weight;
output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight;
}
}
*total_weight_data = total_weight_acc;
*output_data = output_acc;
if (sizeAverage && *total_weight_data)
*output_data /= *total_weight_data;
THTensor_(free)(input);
THIndexTensor_(free)(target);
if (weights)
THTensor_(free)(weights);
}
void THNN_(SpatialClassNLLCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THIndexTensor *target,
THTensor *gradInput,
bool sizeAverage,
THTensor *weights,
THTensor *total_weight)
{
INITIAL_CHECK;
THArgCheck(THTensor_(isContiguous)(gradInput), 4,
"gradInput must be contiguous");
real *total_weight_data = THTensor_(data)(total_weight);
if (*total_weight_data <= 0)
return;
target = THIndexTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
THIndex_t *target_data = THIndexTensor_(data)(target);
real *weights_data = weights ? THTensor_(data)(weights) : NULL;
real *gradInput_data = THTensor_(data)(gradInput);
long batch_size = THTensor_(size)(input, 0);
long n_classes = THTensor_(size)(input, 1);
long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
long sample_size = map_size * n_classes;
real normalize = sizeAverage ? *total_weight_data : 1.0f;
int b;
#pragma omp parallel for
for (b = 0; b < batch_size; b++) {
int elem;
for (elem = 0; elem < map_size; elem++) {
int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
THAssert(cur_target >= 0 && cur_target < n_classes);
gradInput_data[b * sample_size + cur_target * map_size + elem] =
-(weights ? weights_data[cur_target] : 1.0f) / normalize;
}
}
THIndexTensor_(free)(target);
if (weights)
THTensor_(free)(weights);
}
#undef INITIAL_CHECK
#endif
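/*
 * Illustration, not part of the diff above: per pixel the criterion reads the
 * (log-)probability of the target class and accumulates
 * loss -= w[target] * input[b][target][y][x], normalizing by the summed class
 * weights when sizeAverage is set. A hedged sketch of one accumulation step
 * over hypothetical arrays, with input_b laid out class-major:
 */
static void spatial_nll_step_sketch(const double *input_b, long n_classes,
                                    long map_size, long pixel, long target,
                                    const double *weights /* may be NULL */,
                                    double *loss, double *total_weight)
{
  double w;
  if (target < 0 || target >= n_classes)
    return;                                /* the real code asserts instead */
  w = weights ? weights[target] : 1.0;
  *total_weight += w;
  *loss -= w * input_b[target * map_size + pixel];
}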

View File

@ -0,0 +1,241 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
#else
static void THNN_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
int kW, int kH, int dW, int dH, int padW, int padH,
long nInputPlane, long inputWidth, long inputHeight,
long nOutputPlane, long outputWidth, long outputHeight)
{
long i;
THTensor *output3d, *finput3d;
THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
THTensor_(copy)(output, bias);
output3d = THTensor_(newWithStorage3d)(output->storage, output->storageOffset,
outputHeight*outputWidth, 1,
nOutputPlane, outputHeight*outputWidth,
1, nOutputPlane*outputHeight*outputWidth);
finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
outputHeight*outputWidth, 1,
kW*kH*nInputPlane, outputHeight*outputWidth,
1, kW*kH*nInputPlane*outputHeight*outputWidth);
// weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
// finput3d: oH*oW x nInputPlane*kH*kW x 1
THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
// output3d: oH*oW x nOutputPlane x 1
THTensor_(free)(output3d);
THTensor_(free)(finput3d);
}
void THNN_(SpatialConvolutionLocal_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
THTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight)
{
long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
long nOutputPlane = THTensor_(size)(weight,1);
if(input->nDimension == 3)
{
THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
THNN_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
}
else
{
long T = input->size[0];
long t;
THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
#pragma omp parallel for private(t)
for(t = 0; t < T; t++)
{
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(input_t);
THTensor_(free)(output_t);
THTensor_(free)(finput_t);
}
}
}
static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
int kW, int kH, int dW, int dH, int padW, int padH,
long nInputPlane, long inputWidth, long inputHeight,
long nOutputPlane, long outputWidth, long outputHeight)
{
THTensor *gradOutput3d, *fgradInput3d;
gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
outputHeight*outputWidth, 1,
nOutputPlane, outputHeight*outputWidth,
1, nOutputPlane*outputHeight*outputWidth);
fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
outputHeight*outputWidth, 1,
kW*kH*nInputPlane, outputHeight*outputWidth,
1, kW*kH*nInputPlane*outputHeight*outputWidth);
// weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
// gradOutput3d: oH*oW x nOutputPlane x 1
THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
// fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
THTensor_(free)(gradOutput3d);
THTensor_(free)(fgradInput3d);
THTensor_(zero)(gradInput);
THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
}
void THNN_(SpatialConvolutionLocal_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight)
{
long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
long nOutputPlane = THTensor_(size)(weight,1);
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
THTensor_(transpose)(weight, weight, 1, 2);
if(input->nDimension == 3)
{
THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
}
else
{
long T = input->size[0];
long t;
#pragma omp parallel for private(t)
for(t = 0; t < T; t++)
{
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(gradInput_t);
THTensor_(free)(gradOutput_t);
THTensor_(free)(fgradInput_t);
}
}
THTensor_(transpose)(weight, weight, 1, 2);
}
static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale,
int kW, int kH, int dW, int dH, int padW, int padH,
long nInputPlane, long inputWidth, long inputHeight,
long nOutputPlane, long outputWidth, long outputHeight)
{
THTensor *gradOutput3d, *finput3d;
gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
outputHeight*outputWidth, 1,
nOutputPlane, outputHeight*outputWidth,
1, nOutputPlane*outputHeight*outputWidth);
finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
outputHeight*outputWidth, 1,
1, kW*kH*nInputPlane*outputHeight*outputWidth,
kW*kH*nInputPlane, outputHeight*outputWidth);
// gradOutput3d: oH*oW x nOutputPlane x 1
// finput3d: oH*oW x 1 x kW*kH*nInputPlane
THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
// gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
THTensor_(free)(gradOutput3d);
THTensor_(free)(finput3d);
}
void THNN_(SpatialConvolutionLocal_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
THTensor *fgradInput,
int kW, int kH,
int dW, int dH,
int padW, int padH,
long inputWidth, long inputHeight,
long outputWidth, long outputHeight,
real scale)
{
long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
long nOutputPlane = THTensor_(size)(gradWeight,1);
if(input->nDimension == 3)
{
THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
}
else
{
long T = input->size[0];
long t;
for(t = 0; t < T; t++)
{
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(gradOutput_t);
THTensor_(free)(finput_t);
}
}
}
#endif
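/*
 * Illustration, not part of the diff above: a locally-connected layer keeps a
 * separate kernel per output location, which is why the batched product above
 * pairs a weight tensor of shape oH*oW x nOutputPlane x nInputPlane*kH*kW with
 * unfolded input columns of shape oH*oW x nInputPlane*kH*kW x 1. A hedged
 * sketch of the resulting parameter count:
 */
static long local_conv_weights_sketch(long nInputPlane, long nOutputPlane,
                                      int kW, int kH,
                                      long outputWidth, long outputHeight)
{
  /* every output pixel owns its own nOutputPlane x (nInputPlane*kH*kW) matrix */
  return outputHeight * outputWidth * nOutputPlane * nInputPlane * kH * kW;
}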

View File

@ -0,0 +1,284 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
#else
static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
long nInputPlane,
long inputWidth,
long inputHeight,
long nOutputPlane,
long outputWidth,
long outputHeight)
{
long i;
THTensor *output2d;
THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
nOutputPlane, -1,
outputHeight*outputWidth, -1);
if (bias) {
for(i = 0; i < nOutputPlane; i++)
THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth);
} else {
THTensor_(zero)(output);
}
THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
THTensor_(free)(output2d);
}
void THNN_(SpatialConvolutionMM_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
THTensor *fgradInput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH)
{
int dimf = 0;
int dimw = 2;
int dimh = 1;
long nInputPlane;
long inputWidth;
long inputHeight;
long nOutputPlane;
long outputWidth;
long outputHeight;
THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
if (input->nDimension == 4) {
dimf++;
dimw++;
dimh++;
}
nInputPlane = input->size[dimf];
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
nOutputPlane = weight->size[0];
outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
if (outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
if (nInputPlane*kW*kH != weight->size[1])
THError("Wrong number of input channels! Input has %d channels, expected %d",nInputPlane,weight->size[1]/(kW*kH));
if(input->nDimension == 3)
{
THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
}
else
{
long T = input->size[0];
long t;
THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
#pragma omp parallel for private(t)
for(t = 0; t < T; t++)
{
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
kW, kH, dW, dH, padW, padH,
nInputPlane, inputWidth, inputHeight,
nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(input_t);
THTensor_(free)(output_t);
THTensor_(free)(finput_t);
}
}
}
static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
THTensor *gradInput,
THTensor *gradOutput,
THTensor *weight,
THTensor *fgradInput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH)
{
THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
gradOutput->size[0], -1,
gradOutput->size[1]*gradOutput->size[2], -1);
THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
THTensor_(free)(gradOutput2d);
THTensor_(zero)(gradInput);
THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]);
}
void THNN_(SpatialConvolutionMM_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH)
{
long nOutputPlane = weight->size[0];
THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
// depending on the BLAS library, fgradInput (result tensor) might
// be left uninitialized on zero alpha, which might lead to weird behavior
// hence, to be safe, zero it
THTensor_(zero)(fgradInput);
THTensor_(transpose)(weight, weight, 0, 1);
if(input->nDimension == 3)
{
THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH);
}
else
{
long T = input->size[0];
long t;
#pragma omp parallel for private(t)
for(t = 0; t < T; t++)
{
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH);
THTensor_(free)(gradInput_t);
THTensor_(free)(gradOutput_t);
THTensor_(free)(fgradInput_t);
}
}
THTensor_(transpose)(weight, weight, 0, 1);
}
static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
real scale)
{
long i;
THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
gradOutput->size[0], -1,
gradOutput->size[1]*gradOutput->size[2], -1);
THTensor_(transpose)(finput, finput, 0, 1);
THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
THTensor_(transpose)(finput, finput, 0, 1);
if (gradBias) {
for(i = 0; i < gradBias->size[0]; i++)
{
long k;
real sum = 0;
real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
for(k = 0; k < gradOutput2d->size[1]; k++)
sum += data[k];
(gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
}
}
THTensor_(free)(gradOutput2d);
}
void THNN_(SpatialConvolutionMM_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
THTensor *fgradInput,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
real scale)
{
long nOutputPlane = gradWeight->size[0];
THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
if(input->nDimension == 3)
{
THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
}
else
{
long T = input->size[0];
long t;
for(t = 0; t < T; t++)
{
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
THTensor_(free)(gradOutput_t);
THTensor_(free)(finput_t);
}
}
}
#endif
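/*
 * Illustration, not part of the diff above: the MM variant lowers convolution
 * to matrix multiplication -- unfolded_copy lays every kW x kH input patch out
 * as a column of finput (nInputPlane*kH*kW x oH*oW), so one GEMM against the
 * nOutputPlane x nInputPlane*kH*kW weight matrix produces all output pixels.
 * A hedged standalone sketch of the patch-to-column indexing ("im2col"):
 */
static void im2col_sketch(const double *im, double *col,
                          int channels, int height, int width,
                          int kH, int kW, int dH, int dW, int padH, int padW,
                          int outH, int outW)
{
  int c, kh, kw, oh, ow;
  for (c = 0; c < channels; c++)
    for (kh = 0; kh < kH; kh++)
      for (kw = 0; kw < kW; kw++)
        for (oh = 0; oh < outH; oh++)
          for (ow = 0; ow < outW; ow++) {
            int ih = oh * dH - padH + kh;   /* source row; may land in padding */
            int iw = ow * dW - padW + kw;   /* source col; may land in padding */
            long row = ((long)c * kH + kh) * kW + kw;
            col[row * outH * outW + (long)oh * outW + ow] =
              (ih >= 0 && ih < height && iw >= 0 && iw < width)
                ? im[((long)c * height + ih) * width + iw]
                : 0;                        /* zero padding */
          }
}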

View File

@ -0,0 +1,259 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
#else
void THNN_(SpatialConvolutionMap_updateOutput)(
THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH)
{
THArgCheck(
weight != NULL && weight->nDimension == 3
&& connTable != NULL && connTable->size[0] == weight->size[0], 4,
"3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
real *weight_data = THTensor_(data)(weight);
real *bias_data = THTensor_(data)(bias);
real *connTable_data = THTensor_(data)(connTable);
int dimw = 2;
int dimh = 1;
int dimc = 0;
long nbatch = 1;
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimc++;
dimw++;
dimh++;
}
const long kH = weight->size[1];
const long kW = weight->size[2];
THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");
const long input_w = input->size[dimw];
const long input_h = input->size[dimh];
const long output_w = (input_w - kW) / dW + 1;
const long output_h = (input_h - kH) / dH + 1;
if (input->nDimension == 3)
THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
else
THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);
/* contiguous */
input = THTensor_(newContiguous)(input);
output = THTensor_(newContiguous)(output);
/* get raw pointers */
real *input_data = THTensor_(data)(input);
real *output_data = THTensor_(data)(output);
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nOutputPlane; p++)
{
long m;
for (m = 0; m < nbatch; m++)
{
/* add bias */
real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
long j, k;
real z = bias_data[p];
for (j = 0; j < output_h*output_w; j++)
ptr_output[j] = z;
/* convolve all maps */
int nweight = connTable->size[0];
for (k = 0; k < nweight; k++)
{
/* get offsets for input/output */
int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
if (o == p)
{
THTensor_(validXCorr2Dptr)(
output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
1.0,
input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
weight_data + k*kW*kH,
kH, kW,
dH, dW
);
}
}
}
}
/* clean up */
THTensor_(free)(input);
THTensor_(free)(output);
}
void THNN_(SpatialConvolutionMap_updateGradInput)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH)
{
THArgCheck(
weight != NULL && weight->nDimension == 3
&& connTable != NULL && connTable->size[0] == weight->size[0], 5,
"3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
real *weight_data = THTensor_(data)(weight);
real *connTable_data = THTensor_(data)(connTable);
/* and dims */
int dimw = 2;
int dimh = 1;
long nbatch = 1;
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
}
const long input_h = input->size[dimh];
const long input_w = input->size[dimw];
const long output_h = gradOutput->size[dimh];
const long output_w = gradOutput->size[dimw];
const long kH = weight->size[1];
const long kW = weight->size[2];
/* contiguous */
gradInput = THTensor_(newContiguous)(gradInput);
gradOutput = THTensor_(newContiguous)(gradOutput);
/* Resize/Zero */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* get raw pointers */
real *gradInput_data = THTensor_(data)(gradInput);
real *gradOutput_data = THTensor_(data)(gradOutput);
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nInputPlane; p++)
{
long m;
for (m = 0; m < nbatch; m++)
{
long k;
/* backward all */
int nkernel = connTable->size[0];
for (k = 0; k < nkernel; k++)
{
int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
if (i == p)
{
/* gradient to input */
THTensor_(fullConv2Dptr)(
gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0,
gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w,
weight_data + k*kW*kH, kH, kW, dH, dW
);
}
}
}
}
/* clean up */
THTensor_(free)(gradInput);
THTensor_(free)(gradOutput);
}
void THNN_(SpatialConvolutionMap_accGradParameters)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH, real scale)
{
THArgCheck(
gradWeight != NULL && gradWeight->nDimension == 3
&& connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
"3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
real *gradWeight_data = THTensor_(data)(gradWeight);
real *gradBias_data = THTensor_(data)(gradBias);
/* and dims */
int dimw = 2;
int dimh = 1;
long nbatch = 1;
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
}
const long input_h = input->size[dimh];
const long input_w = input->size[dimw];
const long output_h = gradOutput->size[dimh];
const long output_w = gradOutput->size[dimw];
const long kH = gradWeight->size[1];
const long kW = gradWeight->size[2];
/* contiguous */
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
/* get raw pointers */
real *input_data = THTensor_(data)(input);
real *gradOutput_data = THTensor_(data)(gradOutput);
long k;
/* gradients wrt bias */
#pragma omp parallel for private(k)
for (k = 0; k < nOutputPlane; k++)
{
long m;
for (m = 0; m < nbatch; m++)
{
real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
long l;
for (l = 0; l < output_h*output_w; l++)
gradBias_data[k] += scale*ptr_gradOutput[l];
}
}
/* gradients wrt weight */
const int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
for (k = 0; k < nkernel; k++)
{
long m;
for (m = 0; m < nbatch; m++)
{
int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
/* gradient to kernel */
THTensor_(validXCorr2DRevptr)(
gradWeight_data + k*kW*kH,
scale,
input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
dH, dW
);
}
}
/* clean up */
THTensor_(free)(input);
THTensor_(free)(gradOutput);
}
#endif
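/*
 * Illustration, not part of the diff above: `connTable` is an nkernel x 2
 * table of 1-based [inputPlane, outputPlane] pairs -- kernel k correlates
 * input map connTable[k][0] into output map connTable[k][1], which is how
 * partial connectivity (as in LeNet-style maps) is expressed. A hedged sketch
 * that builds the fully-connected table for nIn x nOut maps:
 */
static void full_conn_table_sketch(double *connTable /* (nIn*nOut) x 2 */,
                                   int nIn, int nOut)
{
  int i, o, k = 0;
  for (i = 0; i < nIn; i++)
    for (o = 0; o < nOut; o++) {
      connTable[k*2 + 0] = i + 1;  /* 1-based input plane */
      connTable[k*2 + 1] = o + 1;  /* 1-based output plane */
      k++;
    }
}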

View File

@ -0,0 +1,339 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
#else
void THNN_(SpatialDilatedConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *columns,
THTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH)
{
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params:
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 3) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
if (outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
THTensor_(zero)(output);
// Resize temporary columns
THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules, it only ever gets increased,
// and always contains ones.
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize2d)(ones, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *output_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(output_n, output, 0, elt);
// Do Bias first:
// M,N,K are dims of matrix A and B
long m_ = nOutputPlane;
long n_ = outputHeight * outputWidth;
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
if (bias) {
THBlas_(gemm)(
't', 'n',
n_, m_, k_,
1,
THTensor_(data)(ones), k_,
THTensor_(data)(bias), k_,
0,
THTensor_(data)(output_n), n_
);
} else {
THTensor_(zero)(output_n);
}
// Extract columns:
THNN_(im2col)(
THTensor_(data)(input_n),
nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW,
THTensor_(data)(columns)
);
// M,N,K are dims of matrix A and B
long m = nOutputPlane;
long n = columns->size[1];
long k = nInputPlane*kH*kW;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 'n',
n, m, k,
1,
THTensor_(data)(columns), n,
THTensor_(data)(weight), k,
1,
THTensor_(data)(output_n), n
);
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(output_n);
// Resize output
if (batch == 0) {
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
}
}
void THNN_(SpatialDilatedConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *gradColumns,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH)
{
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
// Params
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Resize gradInput
THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
// Resize temporary columns
THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
THTensor_(zero)(gradColumns);
// Helpers
THTensor *gradInput_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per sample:
THTensor_(select)(gradInput_n, gradInput, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// M,N,K are dims of matrix A and B
long m = nInputPlane*kW*kH;
long n = gradColumns->size[1];
long k = nOutputPlane;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 't',
n, m, k,
1,
THTensor_(data)(gradOutput_n), n,
THTensor_(data)(weight), m,
0,
THTensor_(data)(gradColumns), n
);
// Unpack columns back into input:
THNN_(col2im)(
THTensor_(data)(gradColumns),
nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW,
THTensor_(data)(gradInput_n)
);
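// col2im is the adjoint of im2col: gradients of overlapping receptive fields
// are summed back into gradInput at the positions they were read from.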
}
// Free
THTensor_(free)(gradInput_n);
THTensor_(free)(gradOutput_n);
// Restore non-batch sizes
if (batch == 0) {
THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
}
}
void THNN_(SpatialDilatedConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *columns,
THTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH,
real scale)
{
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params
int nInputPlane = gradWeight->size[1];
int nOutputPlane = gradWeight->size[0];
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize2d)(ones, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Resize temporary columns
THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(im2col)(
THTensor_(data)(input_n),
nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW,
THTensor_(data)(columns)
);
// M,N,K are dims of matrix A and B
long m = nOutputPlane;
long n = nInputPlane*kW*kH;
long k = columns->size[1];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
't', 'n',
n, m, k,
scale,
THTensor_(data)(columns), k,
THTensor_(data)(gradOutput_n), k,
1,
THTensor_(data)(gradWeight), n
);
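// Row-major, this accumulates gradWeight += scale * gradOutput_n * columns^T,
// correlating each output-gradient plane with the input patches that fed it.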
// Do Bias:
// M,N,K are dims of matrix A and B
long m_ = nOutputPlane;
long k_ = outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
if (gradBias) {
THBlas_(gemv)(
't',
k_, m_,
scale,
THTensor_(data)(gradOutput_n), k_,
THTensor_(data)(ones), 1,
1,
THTensor_(data)(gradBias), 1
);
}
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(gradOutput_n);
// Resize
if (batch == 0) {
THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
}
}
#endif

251 torch/lib/THNN/generic/SpatialFractionalMaxPooling.c Normal file
View File

@ -0,0 +1,251 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
#else
static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
real sample,
long inputSize,
long outputSize,
int poolSize) {
real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
long i;
for (i = 0; i < outputSize - 1; ++i) {
sequence[i] =
(long) ((i + sample) * alpha) - (long) (sample * alpha);
}
sequence[outputSize - 1] = inputSize - poolSize;
return sequence;
}
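/* Example: inputSize=9, outputSize=4, poolSize=2 gives alpha=7/3; for
 * sample u=0.5 the window starts are 0,2,4 and the last window is pinned at
 * inputSize-poolSize=7, so size-2 windows tile 9 inputs into 4 outputs. */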
static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
real* input,
real* output,
real* indices,
real* randomSamples,
long numPlanes,
long inputW, long inputH,
long outputW, long outputH,
int poolSizeW, int poolSizeH) {
long plane;
#pragma omp parallel for private(plane)
for (plane = 0; plane < numPlanes; ++plane) {
/* each plane contains 2 random samples, one for W and one for H */
real* randomSamplesForPlane = randomSamples + plane * 2;
/* Generate interval sequence */
long* sequenceW =
THNN_(SpatialFractionalMaxPooling_generateIntervals)(
randomSamplesForPlane[0], inputW, outputW, poolSizeW);
long* sequenceH =
THNN_(SpatialFractionalMaxPooling_generateIntervals)(
randomSamplesForPlane[1], inputH, outputH, poolSizeH);
/* loop over output */
long h, w;
real* inputForPlane = input + plane * inputW * inputH;
real* outputForPlane = output + plane * outputW * outputH;
real* indicesForPlane = indices + plane * outputW * outputH;
for (h = 0; h < outputH; ++h) {
long inputHStart = sequenceH[h];
for (w = 0; w < outputW; ++w) {
long inputWStart = sequenceW[w];
real maxVal = -THInf;
long maxIndex = -1;
long h2, w2;
for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
THAssert(h2 >= 0 && h2 < inputH);
THAssert(w2 >= 0 && w2 < inputW);
long planeIndex = h2 * inputW + w2;
real val = inputForPlane[planeIndex];
if (val > maxVal) {
maxVal = val;
maxIndex = planeIndex;
}
}
}
THAssert(maxVal != -THInf);
THAssert(maxIndex != -1);
outputForPlane[h * outputW + w] = maxVal;
/* +1 to lua index */
indicesForPlane[h * outputW + w] = (real) maxIndex + TH_INDEX_BASE;
}
}
THFree(sequenceW);
THFree(sequenceH);
}
}
void THNN_(SpatialFractionalMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
THTensor *indices,
THTensor *randomSamples) {
long numBatch = 1;
int planeDim = 0;
int heightDim = 1;
int widthDim = 2;
long numInputDims = THTensor_(nDimension)(input);
THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
"3D or 4D (batch mode) tensor expected");
if (numInputDims == 4) {
numBatch = THTensor_(size)(input, 0);
planeDim++;
heightDim++;
widthDim++;
}
/* sizes */
long numPlanes = THTensor_(size)(input, planeDim);
long inputH = THTensor_(size)(input, heightDim);
long inputW = THTensor_(size)(input, widthDim);
THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
"poolSizeH too large relative to input height");
THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
"poolSizeW too large relative to input width");
/* get contiguous input */
input = THTensor_(newContiguous)(input);
if (numInputDims == 3) {
/* resize output */
THTensor_(resize3d)(output, numPlanes, outputH, outputW);
/* indices will contain the locations for each output point */
THTensor_(resize3d)(indices, numPlanes, outputH, outputW);
THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
THTensor_(data)(input),
THTensor_(data)(output),
THTensor_(data)(indices),
THTensor_(data)(randomSamples),
numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
} else {
THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
/* indices will contain the locations for each output point */
THTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
long batch;
#pragma omp parallel for private(batch)
for (batch = 0; batch < numBatch; ++batch) {
THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
THTensor_(data)(randomSamples) + batch * numPlanes * 2,
numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
real* gradInput,
real* gradOutput,
real* indices,
long numPlanes,
long inputW, long inputH,
long outputW, long outputH) {
long plane;
#pragma omp parallel for private(plane)
for (plane = 0; plane < numPlanes; plane++) {
real* gradInputForPlane = gradInput + plane * inputW * inputH;
real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
real* indicesForPlane = indices + plane * outputW * outputH;
long h, w;
for (h = 0; h < outputH; ++h) {
for (w = 0; w < outputW; ++w) {
long outputIndex = h * outputW + w;
long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
THAssert(index >= 0 && index < inputW * inputH);
gradInputForPlane[index] += gradOutputForPlane[outputIndex];
}
}
}
}
void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
THTensor *indices) {
long numBatch = 1;
int planeDim = 0;
int heightDim = 1;
int widthDim = 2;
long numInputDims = THTensor_(nDimension)(input);
if (numInputDims == 4) {
numBatch = THTensor_(size)(input, 0);
planeDim = 1;
heightDim++;
widthDim++;
}
/* sizes */
long numPlanes = THTensor_(size)(input, planeDim);
long inputH = THTensor_(size)(input, heightDim);
long inputW = THTensor_(size)(input, widthDim);
THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
"gradOutput width unexpected");
THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
"gradOutput height unexpected");
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* backprop */
if (numInputDims == 3) {
THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
THTensor_(data)(gradInput),
THTensor_(data)(gradOutput),
THTensor_(data)(indices),
numPlanes, inputW, inputH, outputW, outputH);
} else {
long batch;
#pragma omp parallel for private(batch)
for (batch = 0; batch < numBatch; ++batch) {
THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
numPlanes, inputW, inputH, outputW, outputH);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

385 torch/lib/THNN/generic/SpatialFullConvolution.c Normal file
View File

@ -0,0 +1,385 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
#else
static void THNN_(im2col)(const real* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
real* data_col) {
const int height_col = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_col = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const int channels_col = channels * kernel_h * kernel_w;
for (int c_col = 0; c_col < channels_col; ++c_col) {
int w_offset = c_col % kernel_w;
int h_offset = (c_col / kernel_w) % kernel_h;
int c_im = c_col / kernel_h / kernel_w;
for (int h_col = 0; h_col < height_col; ++h_col) {
for (int w_col = 0; w_col < width_col; ++w_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
data_col[(c_col * height_col + h_col) * width_col + w_col] =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im[(c_im * height + h_im) * width + w_im] : 0;
}
}
}
}
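// im2col linearizes each kernel-sized patch into one column of data_col,
// writing zeros for taps that fall in the padding, so convolution becomes
// a GEMM over the column matrix.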
static void THNN_(col2im)(const real* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
real* data_im) {
memset(data_im, 0, sizeof(real) * height * width * channels);
const int height_col = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_col = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const int channels_col = channels * kernel_h * kernel_w;
for (int c_col = 0; c_col < channels_col; ++c_col) {
int w_offset = c_col % kernel_w;
int h_offset = (c_col / kernel_w) % kernel_h;
int c_im = c_col / kernel_h / kernel_w;
for (int h_col = 0; h_col < height_col; ++h_col) {
for (int w_col = 0; w_col < width_col; ++w_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
data_im[(c_im * height + h_im) * width + w_im] +=
data_col[(c_col * height_col + h_col) * width_col + w_col];
}
}
}
}
void THNN_(SpatialFullConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *columns,
THTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH)
{
int nInputPlane = THTensor_(size)(weight,0);
int nOutputPlane = THTensor_(size)(weight,1);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
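// Transposed-convolution sizing: e.g. inputWidth=16, dW=2, padW=1, kW=4,
// adjW=0 gives (16-1)*2 - 2*1 + 4 + 0 = 32; adjW/adjH disambiguate among the
// output sizes that a forward convolution would map onto the same input size.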
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
// Resize temporary columns
THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
THTensor_(zero)(columns);
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules; it is only ever enlarged
// and always contains ones.
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize2d)(ones, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *output_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(output_n, output, 0, elt);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = weight->size[1] * weight->size[2] * weight->size[3];
long n = columns->size[1];
long k = weight->size[0];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 't',
n, m, k,
1,
THTensor_(data)(input_n), n,
THTensor_(data)(weight), m,
0,
THTensor_(data)(columns), n
);
// Unpack columns back into input:
THNN_(col2im)(
THTensor_(data)(columns),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1,
THTensor_(data)(output_n)
);
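// A full convolution runs the ordinary convolution's backward pass forward:
// the GEMM expands each input frame into nOutputPlane*kW*kH columns and
// col2im scatter-adds them into the larger output plane.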
// Do Bias after:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long n_ = outputHeight * outputWidth;
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
if (bias) {
THBlas_(gemm)(
't', 'n',
n_, m_, k_,
1,
THTensor_(data)(ones), k_,
THTensor_(data)(bias), k_,
1,
THTensor_(data)(output_n), n_
);
}
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(output_n);
// Resize output
if (batch == 0) {
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
}
}
void THNN_(SpatialFullConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *gradColumns,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH)
{
int nInputPlane = THTensor_(size)(weight,0);
int nOutputPlane = THTensor_(size)(weight,1);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
// Batch size + input planes
long batchSize = input->size[0];
// Resize gradInput
THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
THTensor_(zero)(gradInput);
// Resize temporary columns
THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
// Helpers
THTensor *gradInput_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; elt++) {
// Matrix multiply per sample:
THTensor_(select)(gradInput_n, gradInput, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(im2col)(
THTensor_(data)(gradOutput_n),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1,
THTensor_(data)(gradColumns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m = weight->size[0];
long n = gradColumns->size[1];
long k = weight->size[1] * weight->size[2] * weight->size[3];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 'n',
n, m, k,
1,
THTensor_(data)(gradColumns), n,
THTensor_(data)(weight), k,
0,
THTensor_(data)(gradInput_n), n
);
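// Symmetrically, the input gradient of a full convolution is an ordinary
// convolution of gradOutput with weight: im2col on gradOutput, then one
// GEMM per frame.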
}
// Free
THTensor_(free)(gradInput_n);
THTensor_(free)(gradOutput_n);
// Restore non-batch sizes
if (batch == 0) {
THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
}
}
void THNN_(SpatialFullConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *columns,
THTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int adjW, int adjH,
real scale)
{
int nInputPlane = THTensor_(size)(gradWeight,0);
int nOutputPlane = THTensor_(size)(gradWeight,1);
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
int batch = 1;
if (input->nDimension == 3) {
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
// Batch size + input planes
long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize2d)(ones, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Resize temporary columns
THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; elt++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(im2col)(
THTensor_(data)(gradOutput_n),
nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
1, 1,
THTensor_(data)(columns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long n = columns->size[0]; // nOutputPlane * kh * kw
long m = input_n->size[0]; // nInputPlane
long k = columns->size[1]; // inputHeight * inputWidth
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
't', 'n',
n, m, k,
scale,
THTensor_(data)(columns), k,
THTensor_(data)(input_n), k,
1,
THTensor_(data)(gradWeight), n
);
// Do Bias:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
long m_ = nOutputPlane;
long k_ = outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
if (gradBias) {
THBlas_(gemv)(
't',
k_, m_,
scale,
THTensor_(data)(gradOutput_n), k_,
THTensor_(data)(ones), 1,
1,
THTensor_(data)(gradBias), 1
);
}
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(gradOutput_n);
// Resize
if (batch == 0) {
THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
}
}
#endif

212 torch/lib/THNN/generic/SpatialFullConvolutionMap.c Normal file
View File

@ -0,0 +1,212 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
#else
void THNN_(SpatialFullConvolutionMap_updateOutput)(
THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH)
{
THArgCheck(
weight != NULL && weight->nDimension == 3
&& connTable != NULL && connTable->size[0] == weight->size[0], 4,
"3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
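/* connTable is an nkernel x 2 table of (input plane, output plane) pairs,
 * offset by TH_INDEX_BASE; kernel k maps plane connTable[k][0] to plane
 * connTable[k][1], allowing sparse, hand-specified connectivity. */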
const int kH = (int)weight->size[1];
const int kW = (int)weight->size[2];
THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
THTensor_(resize3d)(
output_, nOutputPlane,
(input->size[1] - 1) * dH + kH,
(input->size[2] - 1) * dW + kW
);
/* contiguous */
input = THTensor_(newContiguous)(input);
THTensor* output = THTensor_(newContiguous)(output_);
/* get raw pointers */
real *input_data = THTensor_(data)(input);
real *output_data = THTensor_(data)(output);
real *weight_data = THTensor_(data)(weight);
real *bias_data = THTensor_(data)(bias);
real *connTable_data = THTensor_(data)(connTable);
/* and dims */
const long input_h = input->size[1];
const long input_w = input->size[2];
const long output_h = output->size[1];
const long output_w = output->size[2];
const long weight_h = weight->size[1];
const long weight_w = weight->size[2];
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nOutputPlane; p++)
{
/* add bias */
real *ptr_output = output_data + p*output_w*output_h;
long j;
int nweight;
long k;
for (j = 0; j < output_h*output_w; j++)
ptr_output[j] = bias_data[p];
/* convolve all maps */
nweight = connTable->size[0];
for (k = 0; k < nweight; k++)
{
/* get offsets for input/output */
int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
if (o == p)
{
THTensor_(fullConv2Dptr)(
output_data + o*output_w*output_h,
1.0,
input_data + i*input_w*input_h, input_h, input_w,
weight_data + k*weight_w*weight_h, weight_h, weight_w,
dH, dW
);
}
}
}
/* clean up */
THTensor_(free)(input);
THTensor_(freeCopyTo)(output, output_);
}
void THNN_(SpatialFullConvolutionMap_updateGradInput)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH)
{
THArgCheck(
weight != NULL && weight->nDimension == 3
&& connTable != NULL && connTable->size[0] == weight->size[0], 5,
"3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
/* contiguous */
THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
gradOutput = THTensor_(newContiguous)(gradOutput);
/* Resize/Zero */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* get raw pointers */
real *gradInput_data = THTensor_(data)(gradInput);
real *gradOutput_data = THTensor_(data)(gradOutput);
real *weight_data = THTensor_(data)(weight);
real *connTable_data = THTensor_(data)(connTable);
/* and dims */
const long input_h = input->size[1];
const long input_w = input->size[2];
const long output_h = gradOutput->size[1];
const long output_w = gradOutput->size[2];
const long kH = weight->size[1];
const long kW = weight->size[2];
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nInputPlane; p++)
{
long k;
/* backward all */
int nkernel = connTable->size[0];
for (k = 0; k < nkernel; k++)
{
int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
if (i == p)
{
/* gradient to input */
THTensor_(validXCorr2Dptr)(
gradInput_data + i*input_w*input_h,
1.0,
gradOutput_data + o*output_w*output_h, output_h, output_w,
weight_data + k*kW*kH, kH, kW,
dH, dW
);
}
}
}
/* clean up */
THTensor_(freeCopyTo)(gradInput, gradInput_);
THTensor_(free)(gradOutput);
}
void THNN_(SpatialFullConvolutionMap_accGradParameters)(
THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
THTensor *connTable, int nInputPlane, int nOutputPlane,
int dW, int dH, real scale)
{
THArgCheck(
gradWeight != NULL && gradWeight->nDimension == 3
&& connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
"3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
);
/* contiguous */
input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);
/* get raw pointers */
real *input_data = THTensor_(data)(input);
real *gradOutput_data = THTensor_(data)(gradOutput);
real *gradWeight_data = THTensor_(data)(gradWeight);
real *gradBias_data = THTensor_(data)(gradBias);
/* and dims */
const long input_h = input->size[1];
const long input_w = input->size[2];
const long output_h = gradOutput->size[1];
const long output_w = gradOutput->size[2];
const long weight_h = gradWeight->size[1];
const long weight_w = gradWeight->size[2];
/* gradients wrt bias */
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nOutputPlane; k++)
{
real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
long l;
for (l = 0; l < output_h*output_w; l++)
gradBias_data[k] += scale*ptr_gradOutput[l];
}
/* gradients wrt weight */
int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
for (k = 0; k < nkernel; k++)
{
int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
/* gradient to kernel */
THTensor_(validXCorr2DRevptr)(
gradWeight_data + k*weight_w*weight_h,
scale,
gradOutput_data + o*output_w*output_h, output_h, output_w,
input_data + i*input_w*input_h, input_h, input_w,
dH, dW
);
}
/* clean up */
THTensor_(free)(input);
THTensor_(free)(gradOutput);
}
#endif

300 torch/lib/THNN/generic/SpatialMaxPooling.c Normal file
View File

@ -0,0 +1,300 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
#else
static void THNN_(SpatialMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
real *ind_p,
long nslices,
long iwidth,
long iheight,
long owidth,
long oheight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
/* loop over output */
long i, j;
real *ip = input_p + k*iwidth*iheight;
for(i = 0; i < oheight; i++)
{
for(j = 0; j < owidth; j++)
{
long hstart = i * dH - padH;
long wstart = j * dW - padW;
/* clip the pooling window to the image; integer comparisons avoid the
precision loss of fminf/fmaxf on long values */
long hend = hstart + kH < iheight ? hstart + kH : iheight;
long wend = wstart + kW < iwidth ? wstart + kW : iwidth;
hstart = hstart > 0 ? hstart : 0;
wstart = wstart > 0 ? wstart : 0;
/* local pointers */
real *op = output_p + k*owidth*oheight + i*owidth + j;
real *indp = ind_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
long maxindex = -1;
real maxval = -THInf;
long tcntr = 0;
long x,y;
for(y = hstart; y < hend; y++)
{
for(x = wstart; x < wend; x++)
{
tcntr = y*iwidth + x;
real val = *(ip + tcntr);
if (val > maxval)
{
maxval = val;
maxindex = tcntr;
}
}
}
/* set output to local max */
*op = maxval;
/* store location of max */
*indp = maxindex + TH_INDEX_BASE;
}
}
}
}
void THNN_(SpatialMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool ceil_mode)
{
int dimw = 2;
int dimh = 1;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long oheight;
long owidth;
real *input_data;
real *output_data;
real *indices_data;
THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
}
THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad must be at most half of the kernel size");
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
if (ceil_mode)
{
oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
owidth = (long)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1;
}
else
{
oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
owidth = (long)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1;
}
if (padW || padH)
{
// ensure that the last pooling starts inside the image
if ((oheight - 1)*dH >= iheight + padH)
--oheight;
if ((owidth - 1)*dW >= iwidth + padW)
--owidth;
}
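/* Example: iwidth=6, kW=3, dW=2, padW=0 gives floor((6-3)/2)+1 = 2 outputs in
floor mode but ceil((6-3)/2)+1 = 3 in ceil mode, the extra window being
clipped at the image edge; the check above drops windows that would start
entirely inside the padding. */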
/* get contiguous input */
input = THTensor_(newContiguous)(input);
/* resize output */
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
/* indices will contain the locations for each output point */
THTensor_(resize3d)(indices, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data,
indices_data,
nslices,
iwidth, iheight,
owidth, oheight,
kW, kH, dW, dH,
padW, padH);
}
else
{
long p;
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
/* indices will contain the locations for each output point */
THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
indices_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight,
kW, kH, dW, dH,
padW, padH);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
real *ind_p,
long nslices,
long iwidth,
long iheight,
long owidth,
long oheight,
int dW,
int dH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
real *ind_p_k = ind_p + k*owidth*oheight;
/* calculate max points */
long i, j;
for(i = 0; i < oheight; i++)
{
for(j = 0; j < owidth; j++)
{
/* retrieve position of max */
long maxp = ind_p_k[i*owidth + j] - TH_INDEX_BASE;
/* update gradient */
gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j];
}
}
}
}
void THNN_(SpatialMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
bool ceil_mode)
{
int dimw = 2;
int dimh = 1;
long nbatch = 1;
int nslices;
int iheight;
int iwidth;
int oheight;
int owidth;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = gradOutput->size[dimh];
owidth = gradOutput->size[dimw];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
{
THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
indices_data,
nslices,
iwidth, iheight,
owidth, oheight,
dW, dH);
}
else
{
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
indices_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight,
dW, dH);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

223 torch/lib/THNN/generic/SpatialMaxUnpooling.c Normal file
View File

@ -0,0 +1,223 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
#else
static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
real *ind_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *output_p_k = output_p + k*owidth*oheight;
real *input_p_k = input_p + k*iwidth*iheight;
real *ind_p_k = ind_p + k*iwidth*iheight;
long i, j, maxp;
for(i = 0; i < iheight; i++)
{
for(j = 0; j < iwidth; j++)
{
maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
if(maxp<0 || maxp>=owidth*oheight){
THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
}
output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
}
}
}
}
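/* Unpooling is a scatter: the output starts at zero and each input value is
 * written to the location recorded in indices by the paired max-pooling. */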
void THNN_(SpatialMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int owidth, int oheight)
{
int dimw = 2;
int dimh = 1;
int nbatch = 1;
int nslices;
int iheight;
int iwidth;
real *input_data;
real *output_data;
real *indices_data;
THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
if (!THTensor_(isSameSizeAs)(input, indices)){
THError("Invalid input size w.r.t current indices size");
}
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
/* get contiguous input and indices */
input = THTensor_(newContiguous)(input);
indices = THTensor_(newContiguous)(indices);
/* resize output */
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
THTensor_(zero)(output);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
indices_data,
nslices,
iwidth, iheight,
owidth, oheight);
}
else
{
long p;
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
THTensor_(zero)(output);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
indices_data+p*nslices*iwidth*iheight,
nslices,
iwidth, iheight,
owidth, oheight);
}
}
/* cleanup */
THTensor_(free)(input);
THTensor_(free)(indices);
}
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
real *ind_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
real *ind_p_k = ind_p + k*iwidth*iheight;
long i, j, maxp;
for(i = 0; i < iheight; i++)
{
for(j = 0; j < iwidth; j++)
{
maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
if(maxp<0 || maxp>=owidth*oheight){
THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
}
gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
}
}
}
}
void THNN_(SpatialMaxUnpooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices,
int owidth, int oheight)
{
int dimw = 2;
int dimh = 1;
int nbatch = 1;
int nslices;
int iheight;
int iwidth;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
if (!THTensor_(isSameSizeAs)(input, indices)){
THError("Invalid input size w.r.t current indices size");
}
/* get contiguous gradOutput and indices */
gradOutput = THTensor_(newContiguous)(gradOutput);
indices = THTensor_(newContiguous)(indices);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimh-1];
iheight = input->size[dimh];
iwidth = input->size[dimw];
if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
}
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
{
THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
indices_data,
nslices,
iwidth, iheight,
owidth, oheight);
}
else
{
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
indices_data+p*nslices*iwidth*iheight,
nslices,
iwidth, iheight,
owidth, oheight);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
THTensor_(free)(indices);
}
#endif

255 torch/lib/THNN/generic/SpatialReflectionPadding.c Normal file
View File

@ -0,0 +1,255 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
#else
static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
real *input_p, real *output_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int iStartX = fmax(0, -pad_l);
int iStartY = fmax(0, -pad_t);
int oStartX = fmax(0, pad_l);
int oStartY = fmax(0, pad_t);
long k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
for (k = 0; k < nslices; k++)
{
long i, j;
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pad_l) {
ip_x = pad_l * 2 - j;
} else if (j >= pad_l && j < iwidth + pad_l) {
ip_x = j;
} else {
ip_x = (iwidth + pad_l - 1) * 2 - j;
}
ip_x = ip_x - oStartX + iStartX;
if (i < pad_t) {
ip_y = pad_t * 2 - i;
} else if (i >= pad_t && i < iheight + pad_t) {
ip_y = i;
} else {
ip_y = (iheight + pad_t - 1) * 2 - i;
}
ip_y = ip_y - oStartY + iStartY;
real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
*dest_p = *src_p;
}
}
}
}
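/* Reflection mirrors without repeating the border pixel: with pad_l=2 and
 * iwidth=5, output columns 0..8 read input columns 2,1,0,1,2,3,4,3,2. */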
void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
THTensor *input,
THTensor *output,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int dimw = 2;
int dimh = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long oheight;
long owidth;
real *input_data;
real *output_data;
THArgCheck(input->nDimension == 3 ||
input->nDimension == 4 , 2, "input must be 3 or 4-dimensional");
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth >= 1 && oheight >= 1, 2, "input is too small");
/* get contiguous input */
input = THTensor_(newContiguous)(input);
/* resize output */
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
else
{
long p;
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialReflectionPadding_updateOutput_frame)(
input_data+p*nslices*iwidth*iheight,
output_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
real *ginput_p, real *goutput_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int iStartX = fmax(0, -pad_l);
int iStartY = fmax(0, -pad_t);
int oStartX = fmax(0, pad_l);
int oStartY = fmax(0, pad_t);
long k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
for (k = 0; k < nslices; k++)
{
long i, j;
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pad_l) {
ip_x = pad_l * 2 - j;
} else if (j >= pad_l && j < iwidth + pad_l) {
ip_x = j;
} else {
ip_x = (iwidth + pad_l - 1) * 2 - j;
}
ip_x = ip_x - oStartX + iStartX;
if (i < pad_t) {
ip_y = pad_t * 2 - i;
} else if (i >= pad_t && i < iheight + pad_t) {
ip_y = i;
} else {
ip_y = (iheight + pad_t - 1) * 2 - i;
}
ip_y = ip_y - oStartY + iStartY;
real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
*dest_p += *src_p;
}
}
}
}
void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int dimw = 2;
int dimh = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long oheight;
long owidth;
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
"gradOutput width unexpected");
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
"gradOutput height unexpected");
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* backprop */
if (input->nDimension == 3) {
THNN_(SpatialReflectionPadding_updateGradInput_frame)(
THTensor_(data)(gradInput),
THTensor_(data)(gradOutput),
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
} else {
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++) {
THNN_(SpatialReflectionPadding_updateGradInput_frame)(
THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

254 torch/lib/THNN/generic/SpatialReplicationPadding.c Normal file
View File

@ -0,0 +1,254 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
#else
static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
real *input_p, real *output_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int iStartX = fmax(0, -pad_l);
int iStartY = fmax(0, -pad_t);
int oStartX = fmax(0, pad_l);
int oStartY = fmax(0, pad_t);
long k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
for (k = 0; k < nslices; k++)
{
long i, j;
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pad_l) {
ip_x = pad_l;
} else if (j >= pad_l && j < iwidth + pad_l) {
ip_x = j;
} else {
ip_x = iwidth + pad_l - 1;
}
ip_x = ip_x - oStartX + iStartX;
if (i < pad_t) {
ip_y = pad_t;
} else if (i >= pad_t && i < iheight + pad_t) {
ip_y = i;
} else {
ip_y = iheight + pad_t - 1;
}
ip_y = ip_y - oStartY + iStartY;
real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
*dest_p = *src_p;
}
}
}
}
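/* Replication clamps to the border pixel: with pad_l=2 and iwidth=5, output
 * columns 0..8 read input columns 0,0,0,1,2,3,4,4,4. */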
void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
THTensor *input,
THTensor *output,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int dimw = 2;
int dimh = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long oheight;
long owidth;
real *input_data;
real *output_data;
THArgCheck(input->nDimension == 3 || input->nDimension == 4,
2, "input must be 3 or 4-dimensional");
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth >= 1 && oheight >= 1, 2, "input is too small");
/* get contiguous input */
input = THTensor_(newContiguous)(input);
/* resize output */
if (input->nDimension == 3)
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
else
{
long p;
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(SpatialReplicationPadding_updateOutput_frame)(
input_data+p*nslices*iwidth*iheight,
output_data+p*nslices*owidth*oheight,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
real *ginput_p, real *goutput_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int iStartX = fmax(0, -pad_l);
int iStartY = fmax(0, -pad_t);
int oStartX = fmax(0, pad_l);
int oStartY = fmax(0, pad_t);
long k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
for (k = 0; k < nslices; k++)
{
long i, j;
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pad_l) {
ip_x = pad_l;
} else if (j >= pad_l && j < iwidth + pad_l) {
ip_x = j;
} else {
ip_x = iwidth + pad_l - 1;
}
ip_x = ip_x - oStartX + iStartX;
if (i < pad_t) {
ip_y = pad_t;
} else if (i >= pad_t && i < iheight + pad_t) {
ip_y = i;
} else {
ip_y = iheight + pad_t - 1;
}
ip_y = ip_y - oStartY + iStartY;
real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
*dest_p += *src_p;
}
}
}
}
void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int pad_l, int pad_r,
int pad_t, int pad_b)
{
int dimw = 2;
int dimh = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long iheight;
long iwidth;
long oheight;
long owidth;
if (input->nDimension == 4)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
iheight = input->size[dimh];
iwidth = input->size[dimw];
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
"gradOutput width unexpected");
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
"gradOutput height unexpected");
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* backprop */
if (input->nDimension == 3) {
THNN_(SpatialReplicationPadding_updateGradInput_frame)(
THTensor_(data)(gradInput),
THTensor_(data)(gradOutput),
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
} else {
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++) {
THNN_(SpatialReplicationPadding_updateGradInput_frame)(
THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
nslices,
iwidth, iheight,
owidth, oheight,
pad_l, pad_r,
pad_t, pad_b);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

267 torch/lib/THNN/generic/SpatialSubSampling.c Normal file
View File

@ -0,0 +1,267 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
#else
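/* SpatialSubSampling sums each kW x kH window (stepping by dW, dH) and applies
 * one learned gain and bias per plane: out = bias[k] + weight[k] * sum(window),
 * i.e. the classic LeNet-style subsampling layer: average pooling with a
 * trainable scale. */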
void THNN_(SpatialSubSampling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
int kW, int kH,
int dW, int dH)
{
real *weight_data = THTensor_(data)(weight);
real *bias_data = THTensor_(data)(bias);
real *output_data;
real *input_data;
int dimw = 2;
int dimh = 1;
long nbatch = 1;
long inputWidth;
long inputHeight;
long outputWidth;
long outputHeight;
int nInputPlane = THTensor_(size)(weight,0);
long k;
THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
}
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
outputWidth = (inputWidth - kW) / dW + 1;
outputHeight = (inputHeight - kH) / dH + 1;
THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
if (input->nDimension == 3)
THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
else
THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
input = THTensor_(newContiguous)(input);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
{
long p;
for(p = 0; p < nbatch; p++)
{
long xx, yy;
/* For all output pixels... */
real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
/* Get the good mask for (k,i) (k out, i in) */
real the_weight = weight_data[k];
/* Initialize to the bias */
real z = bias_data[k];
long i;
for(i = 0; i < outputWidth*outputHeight; i++)
ptr_output[i] = z;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real sum = 0;
long kx, ky;
for(ky = 0; ky < kH; ky++)
{
for(kx = 0; kx < kW; kx++)
sum += ptr_input[kx];
ptr_input += inputWidth; /* next input line */
}
/* Update output */
*ptr_output++ += the_weight*sum;
}
}
}
}
THTensor_(free)(input);
}
void THNN_(SpatialSubSampling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
int kW, int kH,
int dW, int dH)
{
int dimw = 2;
int dimh = 1;
long nbatch = 1;
long inputWidth;
long inputHeight;
long outputWidth;
long outputHeight;
int nInputPlane = THTensor_(size)(weight,0);
real *weight_data;
real *gradOutput_data;
real *gradInput_data;
long k;
if (input->nDimension == 4) {
nbatch = input->size[0];
dimw++;
dimh++;
}
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
outputWidth = (inputWidth - kW) / dW + 1;
outputHeight = (inputHeight - kH) / dH + 1;
weight_data = THTensor_(data)(weight);
gradOutput_data = THTensor_(data)(gradOutput);
THTensor_(resizeAs)(gradInput, input);
gradInput_data = THTensor_(data)(gradInput);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
{
long p;
for(p = 0; p < nbatch; p++)
{
real the_weight = weight_data[k];
real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
long xx, yy;
real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
long i;
for(i=0; i<inputWidth*inputHeight; i++)
ptr_gi[i] = 0.0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real z = *ptr_gradOutput++ * the_weight;
long kx, ky;
for(ky = 0; ky < kH; ky++)
{
for(kx = 0; kx < kW; kx++)
ptr_gradInput[kx] += z;
ptr_gradInput += inputWidth;
}
}
}
}
}
}
void THNN_(SpatialSubSampling_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
int kW, int kH,
int dW, int dH,
real scale)
{
long nbatch = 1;
long dimw = 2;
long dimh = 1;
long inputWidth;
long inputHeight;
long outputWidth;
long outputHeight;
int nInputPlane = THTensor_(size)(gradWeight,0);
real *gradWeight_data;
real *gradBias_data;
real *gradOutput_data;
real *input_data;
long k;
if (input->nDimension == 4) {
dimw++;
dimh++;
nbatch = input->size[0];
}
inputWidth = input->size[dimw];
inputHeight = input->size[dimh];
outputWidth = (inputWidth - kW) / dW + 1;
outputHeight = (inputHeight - kH) / dH + 1;
gradWeight_data = THTensor_(data)(gradWeight);
gradBias_data = THTensor_(data)(gradBias);
gradOutput_data = THTensor_(data)(gradOutput);
input = THTensor_(newContiguous)(input);
input_data = THTensor_(data)(input);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
{
long p;
for(p = 0; p < nbatch; p++)
{
real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
real sum;
long xx, yy;
long i;
sum = 0;
for(i = 0; i < outputWidth*outputHeight; i++)
sum += ptr_gradOutput[i];
gradBias_data[k] += scale*sum;
sum = 0;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
real z = *ptr_gradOutput++;
long kx, ky;
for(ky = 0; ky < kH; ky++)
{
for(kx = 0; kx < kW; kx++)
sum += z * ptr_input[kx];
ptr_input += inputWidth;
}
}
}
gradWeight_data[k] += scale*sum;
}
}
THTensor_(free)(input);
}
#endif

127 torch/lib/THNN/generic/SpatialUpSamplingBilinear.c Normal file
View File

@ -0,0 +1,127 @@
// Adapted from interp.cpp from Caffe util by Pauline Luc
// Originally developed by George Papandreou
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
#else
void THNN_(SpatialUpSamplingBilinear_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output){
input = THTensor_(newContiguous)(input);
// work on a contiguous copy; keep a handle so results are copied back below
THTensor *output_ = output;
output = THTensor_(newContiguous)(output);
THTensor_(zero)(output);
real *idata = THTensor_(data)(input);
real *odata = THTensor_(data)(output);
int channels = THTensor_(size)(input, 0) * THTensor_(size)(input, 1);
int height1 = THTensor_(size)(input, 2);
int width1 = THTensor_(size)(input, 3);
int height2 = THTensor_(size)(output, 2);
int width2 = THTensor_(size)(output, 3);
THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
// special case: just copy
if (height1 == height2 && width1 == width2) {
for (int h2 = 0; h2 < height2; ++h2) {
const int h1 = h2;
for (int w2 = 0; w2 < width2; ++w2) {
const int w1 = w2;
const real* pos1 = &idata[h1 * width1 + w1];
real* pos2 = &odata[h2 * width2 + w2];
for (int c = 0; c < channels; ++c) {
pos2[0] = pos1[0];
pos1 += width1 * height1;
pos2 += width2 * height2;
}
}
}
/* release the local contiguous references */
THTensor_(free)(input);
THTensor_(free)(output);
return;
}
const float rheight = (height2 > 1) ? (float)(height1 - 1) / (height2 - 1) : 0.f;
const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
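// align-corners ratio: upsampling height 4 -> 7 gives rheight = 3/6 = 0.5, so output row h2 samples input row 0.5 * h2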
for (int h2 = 0; h2 < height2; ++h2) {
const float h1r = rheight * h2;
const int h1 = h1r;
const int h1p = (h1 < height1 - 1) ? 1 : 0;
const real h1lambda = h1r - h1;
const real h0lambda = (real)1. - h1lambda;
for (int w2 = 0; w2 < width2; ++w2) {
const float w1r = rwidth * w2;
const int w1 = w1r;
const int w1p = (w1 < width1 - 1) ? 1 : 0;
const real w1lambda = w1r - w1;
const real w0lambda = (real)1. - w1lambda;
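// the four bilinear weights h{0,1}lambda * w{0,1}lambda are non-negative and sum to 1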
const real* pos1 = &idata[h1 * width1 + w1];
real* pos2 = &odata[h2 * width2 + w2];
for (int c = 0; c < channels; ++c) {
pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
+ h1lambda * (w0lambda * pos1[h1p * width1]
+ w1lambda * pos1[h1p * width1 + w1p]);
pos1 += width1 * height1;
pos2 += width2 * height2;
}
}
}
THTensor_(free)(input);
THTensor_(free)(output);
}
void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
THNNState *state,
THTensor *gradOutput,
THTensor *gradInput){
gradInput = THTensor_(newContiguous)(gradInput);
gradOutput = THTensor_(newContiguous)(gradOutput);
THTensor_(zero)(gradInput);
real *data1 = THTensor_(data)(gradInput);
real *data2 = THTensor_(data)(gradOutput);
int channels = THTensor_(size)(gradInput, 0) * THTensor_(size)(gradInput, 1);
int height1 = THTensor_(size)(gradInput, 2);
int width1 = THTensor_(size)(gradInput, 3);
int height2 = THTensor_(size)(gradOutput, 2);
int width2 = THTensor_(size)(gradOutput, 3);
THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
// special case: same-size matching grids
if (height1 == height2 && width1 == width2) {
for (int h2 = 0; h2 < height2; ++h2) {
const int h1 = h2;
for (int w2 = 0; w2 < width2; ++w2) {
const int w1 = w2;
real* pos1 = &data1[h1 * width1 + w1];
const real* pos2 = &data2[h2 * width2 + w2];
for (int c = 0; c < channels; ++c) {
pos1[0] += pos2[0];
pos1 += width1 * height1;
pos2 += width2 * height2;
}
}
}
/* release the local contiguous references */
THTensor_(free)(gradOutput);
THTensor_(free)(gradInput);
return;
}
const float rheight = (height2 > 1) ? (float)(height1 - 1) / (height2 - 1) : 0.f;
const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
for (int h2 = 0; h2 < height2; ++h2) {
const float h1r = rheight * h2;
const int h1 = h1r;
const int h1p = (h1 < height1 - 1) ? 1 : 0;
const real h1lambda = h1r - h1;
const real h0lambda = (real)1. - h1lambda;
for (int w2 = 0; w2 < width2; ++w2) {
const float w1r = rwidth * w2;
const int w1 = w1r;
const int w1p = (w1 < width1 - 1) ? 1 : 0;
const real w1lambda = w1r - w1;
const real w0lambda = (real)1. - w1lambda;
real* pos1 = &data1[h1 * width1 + w1];
const real* pos2 = &data2[h2 * width2 + w2];
for (int c = 0; c < channels; ++c) {
pos1[0] += h0lambda * w0lambda * pos2[0];
pos1[w1p] += h0lambda * w1lambda * pos2[0];
pos1[h1p * width1] += h1lambda * w0lambda * pos2[0];
pos1[h1p * width1 + w1p] += h1lambda * w1lambda * pos2[0];
pos1 += width1 * height1;
pos2 += width2 * height2;
}
}
}
THTensor_(free)(gradOutput);
THTensor_(free)(gradInput);
}
#endif

View File

@ -0,0 +1,143 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
#else
void THNN_(SpatialUpSamplingNearest_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
int scale_factor)
{
int dW = scale_factor;
int dH = scale_factor;
int xDim = input->nDimension-2;
int yDim = input->nDimension-1;
// dims
int idim = input->nDimension; // Guaranteed to be between 3 and 5
int osz0 = output->size[0];
int osz1 = output->size[1];
int osz2 = output->size[2];
int osz3 = 1;
if (idim > 3) {
osz3 = output->size[3];
}
// get strides
long *is = input->stride;
long *os = output->stride;
// get raw pointers
real *pin = THTensor_(data)(input);
real *pout = THTensor_(data)(output);
// perform the upsampling
int i0, i1, i2, i3, isrc, idst;
int iout[4]; // Output indices
int iin[4]; // Input indices
for (i0 = 0; i0 < osz0; i0++) {
iout[0] = i0;
iin[0] = i0;
for (i1 = 0; i1 < osz1; i1++) {
iout[1] = i1;
iin[1] = i1;
for (i2 = 0; i2 < osz2; i2++) {
iout[2] = i2;
iin[2] = i2;
for (i3 = 0; i3 < osz3; i3++) {
iout[3] = i3;
iin[3] = i3;
// set the indices for the upsampled dimensions
iin[xDim] = iout[xDim] / dW;
iin[yDim] = iout[yDim] / dH;
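// nearest-neighbour rule: output pixel (x, y) copies input pixel (x / dW, y / dH), integer division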
idst = i0*os[0] + i1*os[1] + i2*os[2];
isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
if (idim > 3) {
idst += i3*os[3];
isrc += iin[3]*is[3];
}
pout[idst] = pin[isrc];
}
}
}
}
}
void THNN_(SpatialUpSamplingNearest_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int scale_factor)
{
int dW = scale_factor;
int dH = scale_factor;
int xDim = gradInput->nDimension-2;
int yDim = gradInput->nDimension-1;
// dims
int idim = gradInput->nDimension; // Guaranteed to be between 3 and 5
int isz0 = gradInput->size[0];
int isz1 = gradInput->size[1];
int isz2 = gradInput->size[2];
int isz3 = 1;
if (idim > 3) {
isz3 = gradInput->size[3];
}
// get strides
long *is = gradInput->stride;
long *os = gradOutput->stride;
// get raw pointers
real *pin = THTensor_(data)(gradInput);
real *pout = THTensor_(data)(gradOutput);
// perform the upsampling
int i0, i1, i2, i3, isrc, idst, x, y;
int iin[4]; // Input indices
int iout[4]; // Output indices
THTensor_(zero)(gradInput);
for (i0 = 0; i0 < isz0; i0++) {
iin[0] = i0;
iout[0] = i0;
for (i1 = 0; i1 < isz1; i1++) {
iin[1] = i1;
iout[1] = i1;
for (i2 = 0; i2 < isz2; i2++) {
iin[2] = i2;
iout[2] = i2;
for (i3 = 0; i3 < isz3; i3++) {
iin[3] = i3;
iout[3] = i3;
idst = i0*is[0] + i1*is[1] + i2*is[2];
if (idim > 3) {
idst += i3*is[3];
}
// Now accumulate the gradients from gradOutput
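// (each input cell owns a dW x dH block of output pixels, so dW*dH gradient values are summed into it)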
for (y = 0; y < dH; y++) {
for (x = 0; x < dW; x++) {
iout[xDim] = dW * iin[xDim] + x;
iout[yDim] = dH * iin[yDim] + y;
isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
if (idim > 3) {
isrc += iout[3]*os[3];
}
pin[idst] += pout[isrc];
}
}
}
}
}
}
}
#endif

View File

@ -0,0 +1,50 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Sqrt.c"
#else
void THNN_(Sqrt_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real eps)
{
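/* note: eps is unused by this CPU implementation */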
THTensor_(resizeAs)(output, input);
THTensor_(sqrt)(output, input);
}
void THNN_(Sqrt_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output)
{
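/* d/dx sqrt(x) = 1/(2*sqrt(x)) = 1/(2*output); output == 0 is guarded to avoid dividing by zero */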
THTensor_(resizeAs)(gradInput, input);
if (output->nDimension == 1 ||
!THTensor_(isContiguous)(output) ||
!THTensor_(isContiguous)(gradOutput) ||
!THTensor_(isContiguous)(gradInput))
{
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
*gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data));
);
}
else
{
real *gradOutput_data = THTensor_(data)(gradOutput);
real *gradInput_data = THTensor_(data)(gradInput);
real *output_data = THTensor_(data)(output);
long i;
#pragma omp parallel for private(i)
for(i = 0; i < THTensor_(nElement)(output); i++)
{
if (output_data[i] == 0.0)
gradInput_data[i] = 0.0;
else
gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]);
}
}
}
#endif

View File

@ -0,0 +1,58 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Square.c"
#else
void THNN_(Square_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
THTensor_(resizeAs)(output, input);
if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
{
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = (*input_data) * (*input_data);
);
}
else
{
real *output_data = THTensor_(data)(output);
real *input_data = THTensor_(data)(input);
long i;
#pragma omp parallel for private(i)
for (i = 0; i < THTensor_(nElement)(input); i++)
output_data[i] = input_data[i]*input_data[i];
}
}
void THNN_(Square_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput)
{
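/* d/dx x^2 = 2x, hence gradInput = 2 * gradOutput * input */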
THTensor_(resizeAs)(gradInput, input);
if (input->nDimension == 1 ||
!THTensor_(isContiguous)(input) ||
!THTensor_(isContiguous)(gradOutput) ||
!THTensor_(isContiguous)(gradInput))
{
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
*gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);
);
}
else
{
real *gradOutput_data = THTensor_(data)(gradOutput);
real *gradInput_data = THTensor_(data)(gradInput);
real *input_data = THTensor_(data)(input);
long i;
#pragma omp parallel for private(i)
for (i = 0; i < THTensor_(nElement)(gradInput); i++)
gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i];
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Tanh.c"
#else
void THNN_(Tanh_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output)
{
THTensor_(resizeAs)(output, input);
THTensor_(tanh)(output, input);
}
void THNN_(Tanh_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *output)
{
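/* d/dx tanh(x) = 1 - tanh(x)^2, evaluated from the saved output rather than recomputing tanh */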
THTensor_(resizeAs)(gradInput, output);
if (output->nDimension == 1 ||
!THTensor_(isContiguous)(output) ||
!THTensor_(isContiguous)(gradOutput) ||
!THTensor_(isContiguous)(gradInput))
{
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
real z = *output_data;
*gradInput_data = *gradOutput_data * (1. - z*z);
);
}
else
{
real* ptr_gradOutput = THTensor_(data)(gradOutput);
real* ptr_gradInput = THTensor_(data)(gradInput);
real* ptr_output = THTensor_(data)(output);
long i;
#pragma omp parallel for private(i)
for (i = 0; i < THTensor_(nElement)(gradInput); i++)
{
real z = ptr_output[i];
ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);
}
}
}
#endif

View File

@ -0,0 +1,349 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/TemporalConvolution.c"
#else
void THNN_(TemporalConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
int kW,
int dW,
int inputFrameSize,
int outputFrameSize)
{
THTensor *outputWindow, *inputWindow;
int nInputFrame, nOutputFrame;
long k, i;
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D (batch mode) tensor expected");
if (input->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
THArgCheck(input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
input = THTensor_(newContiguous)(input);
outputWindow = THTensor_(new)();
inputWindow = THTensor_(new)();
nInputFrame = input->size[dimS];
nOutputFrame = (nInputFrame - kW) / dW + 1;
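/* nOutputFrame = (nInputFrame - kW)/dW + 1, e.g. 10 frames with kW = 3, dW = 2 give 4 output frames */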
if (input->nDimension == 2)
{
THTensor_(resize2d)(output,
nOutputFrame,
outputFrameSize);
/* bias first */
for(k = 0; k < nOutputFrame; k++)
{
THTensor_(select)(outputWindow, output, 0, k);
THTensor_(copy)(outputWindow, bias);
}
/* ouch: pack as many output frames as possible into each matrix multiply */
for(k = 0; nOutputFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
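/* frames k, k+outputFrameStride, ... read disjoint input windows, so all nFrame of them are handled by the single addmm below */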
nOutputFrame -= nFrame;
THTensor_(setStorage2d)(inputWindow, input->storage,
input->storageOffset+k*dW*input->size[1],
nFrame, inputFrameStride*input->size[1],
kW*input->size[1], 1);
THTensor_(setStorage2d)(outputWindow, output->storage,
output->storageOffset + k*output->size[1],
nFrame, outputFrameStride*output->size[1],
output->size[1], 1);
THTensor_(transpose)(weight, NULL, 0, 1);
THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
THTensor_(transpose)(weight, NULL, 0, 1);
}
}
else
{
THTensor *outputSample = THTensor_(new)();
THTensor *inputSample = THTensor_(new)();
int nBatchFrame = input->size[0];
THTensor_(resize3d)(output,
nBatchFrame,
nOutputFrame,
outputFrameSize);
for(i = 0; i < nBatchFrame; i++)
{
THTensor_(select)(outputSample, output, 0, i);
THTensor_(select)(inputSample, input, 0, i);
long nOutputSampleFrame = nOutputFrame;
/* bias first */
for(k = 0; k < nOutputFrame; k++)
{
THTensor_(select)(outputWindow, outputSample, 0, k);
THTensor_(copy)(outputWindow, bias);
}
/* ouch */
for(k = 0; nOutputSampleFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
nOutputSampleFrame -= nFrame;
THTensor_(setStorage2d)(inputWindow, inputSample->storage,
inputSample->storageOffset+k*dW*inputSample->size[1],
nFrame, inputFrameStride*inputSample->size[1],
kW*inputSample->size[1], 1);
THTensor_(setStorage2d)(outputWindow, outputSample->storage,
outputSample->storageOffset + k*outputSample->size[1],
nFrame, outputFrameStride*outputSample->size[1],
outputSample->size[1], 1);
THTensor_(transpose)(weight, NULL, 0, 1);
THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
THTensor_(transpose)(weight, NULL, 0, 1);
}
}
THTensor_(free)(outputSample);
THTensor_(free)(inputSample);
}
THTensor_(free)(outputWindow);
THTensor_(free)(inputWindow);
THTensor_(free)(input);
}
void THNN_(TemporalConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
int kW,
int dW)
{
long nInputFrame;
long nOutputFrame;
THTensor *gradOutputWindow;
THTensor *gradInputWindow;
long k, i;
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
if (gradOutput->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
nInputFrame = input->size[dimS];
nOutputFrame = gradOutput->size[dimS];
gradOutputWindow = THTensor_(new)();
gradInputWindow = THTensor_(new)();
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (gradOutput->nDimension == 2)
{
/* ouch */
for(k = 0; nOutputFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
nOutputFrame -= nFrame;
THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
gradOutput->storageOffset + k*gradOutput->size[1],
nFrame, outputFrameStride*gradOutput->size[1],
gradOutput->size[1], 1);
THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
gradInput->storageOffset+k*dW*gradInput->size[1],
nFrame, inputFrameStride*gradInput->size[1],
kW*gradInput->size[1], 1);
THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
}
}
else
{
THTensor *gradOutputSample = THTensor_(new)();
THTensor *gradInputSample = THTensor_(new)();
int nBatchFrame = input->size[0];
for(i = 0; i < nBatchFrame; i++)
{
THTensor_(select)(gradOutputSample, gradOutput, 0, i);
THTensor_(select)(gradInputSample, gradInput, 0, i);
long nOutputSampleFrame = nOutputFrame;
/* ouch */
for(k = 0; nOutputSampleFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
nOutputSampleFrame -= nFrame;
THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
gradOutputSample->storageOffset + k*gradOutputSample->size[1],
nFrame, outputFrameStride*gradOutputSample->size[1],
gradOutputSample->size[1], 1);
THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
nFrame, inputFrameStride*gradInputSample->size[1],
kW*gradInputSample->size[1], 1);
THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
}
}
THTensor_(free)(gradOutputSample);
THTensor_(free)(gradInputSample);
}
THTensor_(free)(gradOutputWindow);
THTensor_(free)(gradInputWindow);
}
void THNN_(TemporalConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
int kW,
int dW,
real scale)
{
long nInputFrame;
long nOutputFrame;
THTensor *gradOutputWindow;
THTensor *inputWindow;
long k, i;
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
if (gradOutput->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
nInputFrame = input->size[dimS];
nOutputFrame = gradOutput->size[dimS];
input = THTensor_(newContiguous)(input);
gradOutputWindow = THTensor_(new)();
inputWindow = THTensor_(new)();
if (input->nDimension == 2)
{
/* bias first */
for(k = 0; k < nOutputFrame; k++)
{
THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
}
/* ouch */
for(k = 0; nOutputFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
nOutputFrame -= nFrame;
THTensor_(setStorage2d)(inputWindow, input->storage,
input->storageOffset+k*dW*input->size[1],
nFrame, inputFrameStride*input->size[1],
kW*input->size[1], 1);
THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
gradOutput->storageOffset + k*gradOutput->size[1],
nFrame, outputFrameStride*gradOutput->size[1],
gradOutput->size[1], 1);
THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
}
}
else
{
THTensor *gradOutputSample = THTensor_(new)();
THTensor *inputSample = THTensor_(new)();
int nBatchFrame = input->size[0];
for(i = 0; i < nBatchFrame; i++)
{
THTensor_(select)(gradOutputSample, gradOutput, 0, i);
THTensor_(select)(inputSample, input, 0, i);
long nOutputSampleFrame = nOutputFrame;
/* bias first */
for(k = 0; k < nOutputFrame; k++)
{
THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
}
/* ouch */
for(k = 0; nOutputSampleFrame > 0; k++)
{
long outputFrameStride = (kW-1)/dW+1;
long inputFrameStride = outputFrameStride*dW;
long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
nOutputSampleFrame -= nFrame;
THTensor_(setStorage2d)(inputWindow, inputSample->storage,
inputSample->storageOffset+k*dW*inputSample->size[1],
nFrame, inputFrameStride*inputSample->size[1],
kW*inputSample->size[1], 1);
THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
gradOutputSample->storageOffset + k*gradOutputSample->size[1],
nFrame, outputFrameStride*gradOutputSample->size[1],
gradOutputSample->size[1], 1);
THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
}
}
THTensor_(free)(gradOutputSample);
THTensor_(free)(inputSample);
}
THTensor_(free)(gradOutputWindow);
THTensor_(free)(inputWindow);
THTensor_(free)(input);
}
#endif

View File

@ -0,0 +1,235 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
#else
void THNN_(TemporalMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int kW,
int dW)
{
long niframe;
long framesize;
long noframe;
real *input_data;
real *output_data;
real *indices_data;
long t, y;
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D (batch mode) tensor expected");
if (input->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
/* sizes */
niframe = input->size[dimS];
framesize = input->size[dimF];
noframe = (niframe - kW) / dW + 1;
/* get contiguous input */
input = THTensor_(newContiguous)(input);
if (input->nDimension == 2)
{
/* resize output */
THTensor_(resize2d)(output, noframe, framesize);
/* indices will contain index locations for each output point */
THTensor_(resize2d)(indices, noframe, framesize);
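/* the argmax position within each window is stored as a real and cast back to long in updateGradInput */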
/* get raw pointers */
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
for(t = 0; t < noframe; t++)
{
real *ip = input_data + t*framesize*dW;
real *op = output_data + t*framesize;
real *xp = indices_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
/* compute local max: */
long maxindex = -1;
real maxval = -THInf;
long x;
for(x = 0; x < kW; x++)
{
real val = ip[x*framesize+y];
if (val > maxval)
{
maxval = val;
maxindex = x;
}
}
/* set output to local max */
op[y] = maxval;
xp[y] = (real)maxindex;
}
}
}
else
{
/* number of batch frames */
long nbframe = input->size[0];
long i;
/* resize output */
THTensor_(resize3d)(output, nbframe, noframe, framesize);
/* indices will contain index locations for each output point */
THTensor_(resize3d)(indices, nbframe, noframe, framesize);
/* get raw pointers */
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
for(i = 0; i < nbframe; i++)
{
real *inputSample_data = input_data + i*niframe*framesize;
real *outputSample_data = output_data + i*noframe*framesize;
real *indicesSample_data = indices_data + i*noframe*framesize;
for(t = 0; t < noframe; t++)
{
real *ip = inputSample_data + t*framesize*dW;
real *op = outputSample_data + t*framesize;
real *xp = indicesSample_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
/* compute local max: */
long maxindex = -1;
real maxval = -THInf;
long x;
for(x = 0; x < kW; x++)
{
real val = ip[x*framesize+y];
if (val > maxval)
{
maxval = val;
maxindex = x;
}
}
/* set output to local max */
op[y] = maxval;
xp[y] = (real)maxindex;
}
}
}
}
/* cleanup */
THTensor_(free)(input);
}
void THNN_(TemporalMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices,
int kW,
int dW)
{
long niframe;
long noframe;
long framesize;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
long t, y;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize and zero */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
if (input->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
/* sizes */
niframe = input->size[dimS];
noframe = gradOutput->size[dimS];
framesize = gradOutput->size[dimF];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
if (input->nDimension == 2)
{
for(t = 0; t < noframe; t++)
{
real *gip = gradInput_data + t*framesize*dW;
real *gop = gradOutput_data + t*framesize;
real *xp = indices_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
/* compute local max: */
long maxindex = (long)xp[y];
gip[maxindex*framesize+y] += gop[y];
}
}
}
else
{
/* number of batch frames */
long nbframe = input->size[0];
long i;
for(i = 0; i < nbframe; i++)
{
real *gradInputSample_data = gradInput_data + i*niframe*framesize;
real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
real *indicesSample_data = indices_data + i*noframe*framesize;
for(t = 0; t < noframe; t++)
{
real *gip = gradInputSample_data + t*framesize*dW;
real *gop = gradOutputSample_data + t*framesize;
real *xp = indicesSample_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
/* compute local max: */
long maxindex = (long)xp[y];
gip[maxindex*framesize+y] += gop[y];
}
}
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

View File

@ -0,0 +1,116 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
#else
void THNN_(TemporalSubSampling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
int kW,
int dW,
int inputFrameSize)
{
THTensor *outputFrame, *inputWindow;
int nInputFrame, nOutputFrame;
long k;
THArgCheck( input->nDimension == 2, 2, "2D tensor expected");
THArgCheck( input->size[1] == inputFrameSize, 2, "invalid input frame size");
THArgCheck( input->size[0] >= kW, 2, "input sequence smaller than kernel size");
outputFrame = THTensor_(new)();
inputWindow = THTensor_(new)();
nInputFrame = input->size[0];
nOutputFrame = (nInputFrame - kW) / dW + 1;
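/* each output frame is an elementwise affine pool: output[k] = weight .* (input[k*dW] + ... + input[k*dW + kW - 1]) + bias */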
THTensor_(resize2d)(output,
nOutputFrame,
inputFrameSize);
for(k = 0; k < nOutputFrame; k++)
{
THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
THTensor_(select)(outputFrame, output, 0, k);
THTensor_(sum)(outputFrame, inputWindow, 0);
THTensor_(cmul)(outputFrame, outputFrame, weight);
THTensor_(cadd)(outputFrame, outputFrame, 1, bias);
}
THTensor_(free)(outputFrame);
THTensor_(free)(inputWindow);
}
void THNN_(TemporalSubSampling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
int kW,
int dW)
{
THTensor *gradOutputFrame;
THTensor *gradInputWindow, *buffer, *kwunit;
long k;
gradOutputFrame = THTensor_(new)();
gradInputWindow = THTensor_(new)();
buffer = THTensor_(new)();
kwunit = THTensor_(newWithSize1d)(kW);
THTensor_(fill)(kwunit, 1);
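/* addr adds the outer product kwunit x (weight .* gradOutput[k]); with kwunit all ones this replicates the row into each of the kW frames of the window */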
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
for(k = 0; k < gradOutput->size[0]; k++)
{
THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW);
THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
THTensor_(cmul)(buffer, weight, gradOutputFrame);
THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer);
}
THTensor_(free)(gradOutputFrame);
THTensor_(free)(gradInputWindow);
THTensor_(free)(buffer);
THTensor_(free)(kwunit);
}
void THNN_(TemporalSubSampling_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
int kW,
int dW,
real scale)
{
THTensor *gradOutputFrame;
THTensor *inputWindow, *buffer;
long k;
gradOutputFrame = THTensor_(new)();
inputWindow = THTensor_(new)();
buffer = THTensor_(new)();
for(k = 0; k < gradOutput->size[0]; k++)
{
THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
THTensor_(sum)(buffer, inputWindow, 0);
THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame);
THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame);
}
THTensor_(free)(gradOutputFrame);
THTensor_(free)(inputWindow);
THTensor_(free)(buffer);
}
#endif

View File

@ -0,0 +1,58 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Threshold.c"
#else
void THNN_(Threshold_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
real threshold,
real val,
bool inplace)
{
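/* out = x if x > threshold, else val; the in-place path overwrites input and makes output an alias of it */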
if (inplace)
{
TH_TENSOR_APPLY(real, input,
if (*input_data <= threshold)
*input_data = val;
);
THTensor_(set)(output, input);
}
else
{
THTensor_(resizeAs)(output, input);
TH_TENSOR_APPLY2(real, output, real, input,
*output_data = (*input_data > threshold) ? *input_data : val;
);
}
}
void THNN_(Threshold_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
real threshold,
bool inplace)
{
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
if ((*input_data) <= threshold)
*gradOutput_data = 0;
);
THTensor_(set)(gradInput, gradOutput);
}
else
{
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if ((*input_data) > threshold)
*gradInput_data = *gradOutput_data;
else
*gradInput_data = 0;
);
}
}
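/* Illustrative usage sketch (not part of the library): with real = float the
macro expands to THNN_FloatThreshold_updateOutput, and ReLU is the special
case threshold = 0, val = 0:
THNN_FloatThreshold_updateOutput(NULL, input, output, 0, 0, false);
*/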
#endif

View File

@ -0,0 +1,309 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
#else
static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
real *input_p,
real *output_p,
long nslices,
long itime,
long iwidth,
long iheight,
long otime,
long owidth,
long oheight,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
/* loop over output */
long i, j, ti;
for (ti = 0; ti < otime; ti++)
{
for (i = 0; i < oheight; i++)
{
for (j = 0; j < owidth; j++)
{
/* local pointers */
real *ip = input_p + k * itime * iwidth * iheight
+ ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
real *op = output_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
/* compute local sum: */
real sum = 0.0;
int x, y, z;
for (z=0; z < kT; z++)
{
for (y = 0; y < kH; y++)
{
for (x = 0; x < kW; x++)
{
sum += *(ip + z * iwidth * iheight + y * iwidth + x);
}
}
}
/* set output to local average */
*op = sum / (kT * kW * kH);
}
}
}
}
}
void THNN_(VolumetricAveragePooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH)
{
long nslices;
long itime;
long iheight;
long iwidth;
long otime;
long oheight;
long owidth;
real *input_data;
real *output_data;
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch-mode) tensor expected"
);
int dimN = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
if (input->nDimension == 5)
{
dimN++;
dimt++;
dimh++;
dimw++;
}
THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
"input image smaller than kernel size"
);
/* sizes */
nslices = input->size[dimN];
itime = input->size[dimt];
iheight = input->size[dimh];
iwidth = input->size[dimw];
otime = (itime - kT) / dT + 1;
oheight = (iheight - kH) / dH + 1;
owidth = (iwidth - kW) / dW + 1;
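/* e.g. itime = 8 with kT = 2, dT = 2 gives otime = (8 - 2)/2 + 1 = 4 */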
/* get contiguous input */
input = THTensor_(newContiguous)(input);
if (input->nDimension == 4) /* non-batch mode */
{
/* resize output */
THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
THNN_(VolumetricAveragePooling_updateOutput_frame)(
input_data, output_data, nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH
);
}
else /* batch mode */
{
long p;
long nBatch = input->size[0];
long istride = nslices * itime * iwidth * iheight;
long ostride = nslices * otime * owidth * oheight;
/* resize output */
THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(p)
for (p=0; p < nBatch; p++)
{
THNN_(VolumetricAveragePooling_updateOutput_frame)(
input_data + p * istride, output_data + p * ostride, nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH
);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
long nslices,
long itime,
long iwidth,
long iheight,
long otime,
long owidth,
long oheight,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
/* loop over output */
long i, j, ti;
for (ti = 0; ti < otime; ti++)
{
for (i = 0; i < oheight; i++)
{
for (j = 0; j < owidth; j++)
{
/* local pointers */
real *ip = gradInput_p + k * itime * iwidth * iheight
+ ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
real *op = gradOutput_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
/* scatter gradients out to footprint: */
real val = *op / (kT * kW * kH);
int x,y,z;
for (z=0; z < kT; z++)
{
for (y = 0; y < kH; y++)
{
for (x = 0; x < kW; x++)
{
*(ip + z * iwidth * iheight + y * iwidth + x) += val;
}
}
}
}
}
}
}
}
void THNN_(VolumetricAveragePooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH)
{
int nslices;
int itime;
int iheight;
int iwidth;
int otime;
int oheight;
int owidth;
real *gradInput_data;
real *gradOutput_data;
int dimN = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 5)
{
dimN++;
dimt++;
dimh++;
dimw++;
}
/* sizes */
nslices = input->size[dimN];
itime = input->size[dimt];
iheight = input->size[dimh];
iwidth = input->size[dimw];
otime = gradOutput->size[dimt];
oheight = gradOutput->size[dimh];
owidth = gradOutput->size[dimw];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
/* backprop */
if (input->nDimension == 4) /* non-batch mode*/
{
THNN_(VolumetricAveragePooling_updateGradInput_frame)(
gradInput_data, gradOutput_data, nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH
);
}
else /* batch mode */
{
long p;
long nBatch = input->size[0];
long istride = nslices * itime * iwidth * iheight;
long ostride = nslices * otime * owidth * oheight;
#pragma omp parallel for private(p)
for (p = 0; p < nBatch; p++)
{
THNN_(VolumetricAveragePooling_updateGradInput_frame)(
gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH
);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif

View File

@ -0,0 +1,247 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
#else
void THNN_(VolumetricConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput, // only used by cuda impl
THTensor *fgradInput, // only used by cuda impl
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch-mode) tensor expected"
);
int dimt = 1;
int dimh = 2;
int dimw = 3;
if (input->nDimension == 5)
{
dimt++;
dimh++;
dimw++;
}
long nOutputPlane = weight->size[0];
long kT = weight->size[2];
long kH = weight->size[3];
long kW = weight->size[4];
long inputDepth = input->size[dimt];
long inputHeight = input->size[dimh];
long inputWidth = input->size[dimw];
long outputDepth = (inputDepth - kT) / dT + 1;
long outputWidth = (inputWidth - kW) / dW + 1;
long outputHeight = (inputHeight - kH) / dH + 1;
THTensor *outn = THTensor_(new)();
long i, j;
if (input->nDimension == 4) /* non-batch mode */
{
THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
/* add bias */
for (i = 0; i < bias->size[0]; i++)
{
THTensor_(select)(outn, output, 0, i);
THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
}
/* do convolutions */
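/* "V" requests a valid (no padding) convolution, "X" cross-correlation, i.e. the kernels are not flipped */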
THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
}
else /* batch mode */
{
long nBatch = input->size[0];
THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor *inb = THTensor_(new)();
THTensor *outb = THTensor_(new)();
/* loop over batches */
for (j = 0; j < nBatch; j++)
{
THTensor_(select)(inb, input, 0, j);
THTensor_(select)(outb, output, 0, j);
/* add bias */
for (i = 0; i < bias->size[0]; i++)
{
THTensor_(select)(outn, outb, 0, i);
THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
}
/* do convolutions */
THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
}
THTensor_(free)(inb);
THTensor_(free)(outb);
}
THTensor_(free)(outn);
}
void THNN_(VolumetricConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput, // only used by cuda impl
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
THArgCheck(weight->nDimension == 5, 4,
"5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
);
int nOutputPlane = (int)weight->size[0];
THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
"4D or 5D (batch-mode) tensor expected"
);
int dimPlane = 0;
if (gradOutput->nDimension == 5)
{
dimPlane++;
}
THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
"Number of output features is not equal to nOutputPlane"
);
/* gradient to input */
THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
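/* the backward pass is a full ("F") true convolution ("C") with the transposed kernels, spreading each gradOutput value back over its receptive field */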
if (gradOutput->nDimension == 4) /* non-batch mode */
{
THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
}
else /* batch mode */
{
long nBatch = gradOutput->size[0];
THTensor *ginpb = THTensor_(new)();
THTensor *goutb = THTensor_(new)();
long j;
THTensor_(resize5d)(gradInput,
input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
);
/* loop over batches */
for (j = 0; j < nBatch; j++)
{
THTensor_(select)(ginpb, gradInput, 0, j);
THTensor_(select)(goutb, gradOutput, 0, j);
THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
}
THTensor_(free)(ginpb);
THTensor_(free)(goutb);
}
THTensor_(free)(tweight);
}
void THNN_(VolumetricConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput, // only used by cuda impl
THTensor *fgradInput, // only used by cuda impl
int dT,
int dW,
int dH,
int pT,
int pW,
int pH,
real scale)
{
THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
THArgCheck(gradWeight->nDimension == 5, 4,
"5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
);
int nOutputPlane = (int)gradWeight->size[0];
THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
"gradBias tensor has wrong size"
);
long k;
real *gradBias_data;
THTensor *gradOutSlice;
int dimPlane = 0;
if (gradOutput->nDimension == 5)
{
dimPlane++;
}
THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
"Number of output features is not equal to nOutputPlane"
);
if (gradOutput->nDimension == 4) /* non-batch mode */
{
/* gradient to bias */
gradBias_data = THTensor_(data)(gradBias);
gradOutSlice = THTensor_(new)();
for (k = 0; k < nOutputPlane; k++)
{
THTensor_(select)(gradOutSlice, gradOutput, 0, k);
gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
}
THTensor_(free)(gradOutSlice);
/* gradient to kernels */
THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
}
else /* batch mode */
{
long nBatch = gradOutput->size[0];
THTensor *inpb = THTensor_(new)();
THTensor *goutb = THTensor_(new)();
long j;
/* loop over batches */
for (j = 0; j < nBatch; j++)
{
THTensor_(select)(inpb, input, 0, j);
THTensor_(select)(goutb, gradOutput, 0, j);
/* gradient to bias */
gradBias_data = THTensor_(data)(gradBias);
gradOutSlice = THTensor_(new)();
for (k = 0; k < nOutputPlane; k++)
{
THTensor_(select)(gradOutSlice, goutb, 0, k);
gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
}
THTensor_(free)(gradOutSlice);
/* gradient to kernels */
THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
}
THTensor_(free)(inpb);
THTensor_(free)(goutb);
}
}
#endif

View File

@ -0,0 +1,518 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
#else
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
static void THNN_(unfolded_acc_vol)(
THTensor *finput,
THTensor *input,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH,
int nInputPlane,
int inputDepth,
int inputWidth,
int inputHeight,
int outputDepth,
int outputWidth,
int outputHeight)
{
int nip;
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
//#pragma omp parallel for private(nip)
for (nip = 0; nip < nInputPlane; nip++)
{
int kt, kw, kh, t, y, x, it, ix, iy;
for (kt = 0; kt < kT; kt++)
{
for (kh = 0; kh < kH; kh++)
{
for (kw = 0; kw < kW; kw++)
{
real *src = finput_data
+ nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ kh * (kW*outputDepth*outputHeight*outputWidth)
+ kw * (outputDepth*outputHeight*outputWidth);
real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
if (pT > 0 || pH > 0 || pW > 0)
{
for (t = 0; t < outputDepth; t++)
{
it = t*dT - pT + kt;
for (y = 0; y < outputHeight; y++)
{
iy = y*dH - pH + kh;
for (x = 0; x < outputWidth; x++)
{
ix = x*dW - pW + kw;
if (it >= 0 && it < inputDepth && iy >= 0 && iy < inputHeight && ix >= 0 && ix < inputWidth)
{
THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
}
}
}
}
}
else
{
for (t = 0; t < outputDepth; t++)
{
it = t*dT + kt;
for (y = 0; y < outputHeight; y++)
{
iy = y*dH + kh;
for(x = 0; x < outputWidth; x++)
{
ix = x*dW + kw;
THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
}
}
}
}
}
}
}
}
}
static void THNN_(unfolded_copy_vol)(
THTensor *finput,
THTensor *input,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH,
int nInputPlane,
int inputDepth,
int inputWidth,
int inputHeight,
int outputDepth,
int outputWidth,
int outputHeight)
{
long k;
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
// #pragma omp parallel for private(k)
for (k = 0; k < nInputPlane*kT*kH*kW; k++)
{
int nip = k / (kT*kH*kW);
int rest = k % (kT*kH*kW);
int kt = rest / (kH*kW);
rest = rest % (kH*kW);
int kh = rest / kW;
int kw = rest % kW;
int t,x,y,it,ix,iy;
real *dst = finput_data
+ nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ kh * (kW*outputDepth*outputHeight*outputWidth)
+ kw * (outputDepth*outputHeight*outputWidth);
real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);
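/* vol2col layout: row (nip, kt, kh, kw) of finput holds, for every output voxel, the input value seen by that kernel tap */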
if (pT > 0 || pH > 0 || pW > 0)
{
for (t = 0; t < outputDepth; t++)
{
it = t*dT - pT + kt;
for (y = 0; y < outputHeight; y++)
{
iy = y*dH - pH + kh;
for (x = 0; x < outputWidth; x++)
{
ix = x*dW - pW + kw;
if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
else
memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
}
}
}
}
else
{
for (t = 0; t < outputDepth; t++)
{
it = t*dT + kt;
for (y = 0; y < outputHeight; y++)
{
iy = y*dH + kh;
for(x = 0; x < outputWidth; x++)
{
ix = x*dW + kw;
memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
}
}
}
}
}
}
static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH,
long nInputPlane,
long inputDepth,
long inputWidth,
long inputHeight,
long nOutputPlane,
long outputDepth,
long outputWidth,
long outputHeight)
{
long i;
THTensor *output2d;
THNN_(unfolded_copy_vol)(
finput, input,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH,
nInputPlane,
inputDepth, inputWidth, inputHeight,
outputDepth, outputWidth, outputHeight
);
output2d = THTensor_(newWithStorage2d)(
output->storage, output->storageOffset, nOutputPlane, -1,
outputDepth*outputHeight*outputWidth, -1
);
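/* with output viewed as nOutputPlane x (outputDepth*outputHeight*outputWidth), the convolution reduces to a single GEMM on top of the bias fill: output2d += weight * finput */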
for (i = 0; i < nOutputPlane; i++)
{
THVector_(fill)(
output->storage->data+output->storageOffset+output->stride[0]*i,
THTensor_(get1d)(bias, i),
outputDepth*outputHeight*outputWidth
);
}
THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
THTensor_(free)(output2d);
}
void THNN_(VolumetricConvolutionMM_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *finput,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
int dimf = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
long nInputPlane;
long inputDepth;
long inputHeight;
long inputWidth;
long nOutputPlane;
long outputDepth;
long outputHeight;
long outputWidth;
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D(batch mode) tensor expected"
);
if (input->nDimension == 5)
{
dimf++;
dimt++;
dimh++;
dimw++;
}
nInputPlane = input->size[dimf];
inputDepth = input->size[dimt];
inputHeight = input->size[dimh];
inputWidth = input->size[dimw];
nOutputPlane = weight->size[0];
outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
{
THError(
"Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
nInputPlane, inputDepth, inputHeight, inputWidth,
nOutputPlane, outputDepth, outputHeight, outputWidth
);
}
if (input->nDimension == 4)
{
THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
THNN_(VolumetricConvolutionMM_updateOutput_frame)(
input, output, weight, bias, finput,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH,
nInputPlane, inputDepth, inputWidth, inputHeight,
nOutputPlane, outputDepth, outputWidth, outputHeight
);
}
else
{
long T = input->size[0];
long t;
THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
// #pragma omp parallel for private(t)
for (t = 0; t < T; t++)
{
THTensor *input_t = THTensor_(newSelect)(input, 0, t);
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(VolumetricConvolutionMM_updateOutput_frame)(
input_t, output_t, weight, bias, finput_t,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH,
nInputPlane, inputDepth, inputWidth, inputHeight,
nOutputPlane, outputDepth, outputWidth, outputHeight
);
THTensor_(free)(input_t);
THTensor_(free)(output_t);
THTensor_(free)(finput_t);
}
}
}
static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
THTensor *gradInput,
THTensor *gradOutput,
THTensor *weight,
THTensor *fgradInput,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
gradOutput->storage, gradOutput->storageOffset,
gradOutput->size[0], -1,
gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
);
THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
THTensor_(free)(gradOutput2d);
THTensor_(zero)(gradInput);
THNN_(unfolded_acc_vol)(
fgradInput, gradInput,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH,
gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
);
}
void THNN_(VolumetricConvolutionMM_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
// number of input/output planes and kernel size is indirectly defined by the weight tensor
THArgCheck(weight->nDimension == 2, 4,
"2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
);
int nOutputPlane = (int)weight->size[0];
THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
"Number of output features is not equal to nOutputPlane"
);
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
// depending on the BLAS library, fgradInput (result tensor) might
// be left uninitialized on zero alpha, which might lead to weird behavior
// hence, to be safe, zero it
THTensor_(zero)(fgradInput);
THTensor_(transpose)(weight, weight, 0, 1);
if (input->nDimension == 4)
{
THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
gradInput, gradOutput, weight, fgradInput,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH
);
}
else
{
long T = input->size[0];
long t;
//#pragma omp parallel for private(t)
for (t = 0; t < T; t++)
{
THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
gradInput_t, gradOutput_t, weight, fgradInput_t,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH
);
THTensor_(free)(gradInput_t);
THTensor_(free)(gradOutput_t);
THTensor_(free)(fgradInput_t);
}
}
THTensor_(transpose)(weight, weight, 0, 1);
}
static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
real scale)
{
long i;
THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
gradOutput->storage, gradOutput->storageOffset,
gradOutput->size[0], -1,
gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
);
THTensor_(transpose)(finput, finput, 0, 1);
THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
THTensor_(transpose)(finput, finput, 0, 1);
for (i = 0; i < gradBias->size[0]; i++)
{
long k;
real sum = 0;
real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
for (k = 0; k < gradOutput2d->size[1]; k++)
sum += data[k];
(gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
}
THTensor_(free)(gradOutput2d);
}
void THNN_(VolumetricConvolutionMM_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
real scale)
{
THArgCheck(gradWeight->nDimension == 2, 4,
"2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
);
int nOutputPlane = (int)gradWeight->size[0];
THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
"gradBias tensor has wrong size"
);
THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
"Number of output features is not equal to nOutputPlane"
);
if (input->nDimension == 4) // non-batch mode
{
THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
}
else // batch mode
{
long T = input->size[0];
long t;
for (t = 0; t < T; t++)
{
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
THTensor_(free)(gradOutput_t);
THTensor_(free)(finput_t);
}
}
}
#endif

View File

@ -0,0 +1,356 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
#else
void THNN_(VolumetricDilatedConvolution_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *weight,
THTensor *bias,
THTensor *columns,
THTensor *ones,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH)
{
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params:
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 4) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[0]);
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match. Expected: %d, got %ld", nInputPlane, input->size[1]);
}
long inputDepth = input->size[2];
long inputHeight = input->size[3];
long inputWidth = input->size[4];
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
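// effective kernel extent per axis is dilation * (k - 1) + 1, e.g. kT = 3 with dilationT = 2 covers 5 frames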
if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(zero)(output);
// Resize temporary columns
THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules, it only ever gets increased,
// and always contains ones.
if (ones->nDimension != 3 ||
ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *output_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(output_n, output, 0, elt);
// Do Bias first:
// M,N,K are dims of matrix A and B
long m_ = nOutputPlane;
long n_ = outputDepth * outputHeight * outputWidth;
long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
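// row-major C = A*B is obtained as column-major C^T = B^T * A^T, hence the swapped operand order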
if (bias) {
THBlas_(gemm)(
't', 'n',
n_, m_, k_,
1,
THTensor_(data)(ones), k_,
THTensor_(data)(bias), k_,
0,
THTensor_(data)(output_n), n_
);
} else {
THTensor_(zero)(output_n);
}
// Extract columns:
THNN_(vol2col)(
THTensor_(data)(input_n),
nInputPlane, inputDepth, inputHeight, inputWidth,
kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THTensor_(data)(columns)
);
// M,N,K are dims of matrix A and B
long m = nOutputPlane;
long n = columns->size[1];
long k = nInputPlane*kT*kH*kW;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 'n',
n, m, k,
1,
THTensor_(data)(columns), n,
THTensor_(data)(weight), k,
1,
THTensor_(data)(output_n), n
);
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(output_n);
// Resize output
if (batch == 0) {
THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_(VolumetricDilatedConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *gradColumns,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH)
{
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
// Params
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int batch = 1;
if (input->nDimension == 4) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputDepth = input->size[2];
long inputWidth = input->size[4];
long inputHeight = input->size[3];
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Resize output
THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
// Resize temporary columns
THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
THTensor_(zero)(gradColumns);
// Helpers
THTensor *gradInput_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per sample:
THTensor_(select)(gradInput_n, gradInput, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
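// Viewed row-major, the GEMM below computes
//   gradColumns (nInputPlane*kT*kH*kW x oD*oH*oW) = weight^T * gradOutput_n,
// and col2vol then scatter-adds the columns back into gradInput_n
// (overlapping receptive fields sum up).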
// M,N,K are dims of matrix A and B
long m = nInputPlane*kT*kW*kH;
long n = gradColumns->size[1];
long k = nOutputPlane;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 't',
n, m, k,
1,
THTensor_(data)(gradOutput_n), n,
THTensor_(data)(weight), m,
0,
THTensor_(data)(gradColumns), n
);
// Unpack columns back into input:
THNN_(col2vol)(
THTensor_(data)(gradColumns),
nInputPlane, inputDepth, inputHeight, inputWidth,
kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THTensor_(data)(gradInput_n)
);
}
// Free
THTensor_(free)(gradInput_n);
THTensor_(free)(gradOutput_n);
// Resize output
if (batch == 0) {
THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_(VolumetricDilatedConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *columns,
THTensor *ones,
int kT, int kW, int kH,
int dT, int dW, int dH,
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH,
real scale)
{
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
// Params
int nInputPlane = gradWeight->size[1];
int nOutputPlane = gradWeight->size[0];
int batch = 1;
if (input->nDimension == 4) {
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
} else {
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
long inputDepth = input->size[2];
long inputWidth = input->size[4];
long inputHeight = input->size[3];
long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
// Batch size + input planes
long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
// Resize plane and fill with ones...
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Resize temporary columns
THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
// For each elt in batch, do:
for (int elt = 0; elt < batchSize; elt ++) {
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(vol2col)(
THTensor_(data)(input_n),
nInputPlane, inputDepth, inputHeight, inputWidth,
kT, kH, kW, padT, padH, padW, dT, dH, dW,
dilationT, dilationH, dilationW,
THTensor_(data)(columns)
);
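// Viewed row-major, the GEMM below accumulates
//   gradWeight (nOutputPlane x nInputPlane*kT*kH*kW) += scale * gradOutput_n * columns^T,
// summing the per-sample weight gradients over the batch (beta = 1).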
// M,N,K are dims of matrix A and B
long m = nOutputPlane;
long n = nInputPlane*kT*kW*kH;
long k = columns->size[1];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
't', 'n',
n, m, k,
scale,
THTensor_(data)(columns), k,
THTensor_(data)(gradOutput_n), k,
1,
THTensor_(data)(gradWeight), n
);
// Do Bias:
// M,N,K are dims of matrix A and B
long m_ = nOutputPlane;
long k_ = outputDepth * outputHeight * outputWidth;
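// gradBias[p] += scale * (spatial sum of gradOutput_n plane p): multiplying
// the transposed column-major view of gradOutput_n by the ones vector
// reduces each plane to its spatial sum.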
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
if (gradBias) {
THBlas_(gemv)(
't',
k_, m_,
scale,
THTensor_(data)(gradOutput_n), k_,
THTensor_(data)(ones), 1,
1,
THTensor_(data)(gradBias), 1
);
}
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(gradOutput_n);
// Resize
if (batch == 0) {
THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
#endif


@ -0,0 +1,469 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
#else
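// vol2col lays out every kT x kH x kW receptive field of a 3D volume as one
// column of a 2D buffer: row c of data_col corresponds to input channel
// c / (kT*kH*kW) at kernel offset (t_offset, h_offset, w_offset), and entry
// (t, h, w) of that row reads the voxel at (t*dT - pT + t_offset*dilationT, ...),
// or 0 where the window hangs over the padding. Convolution then reduces to
// a single GEMM against the flattened weights.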
static void THNN_(vol2col)(
const real *data_vol, const int channels,
const int depth, const int height, const int width,
const int kT, const int kH, const int kW,
const int pT, const int pH, const int pW,
const int dT, const int dH, const int dW,
const int dilationT, const int dilationH, const int dilationW,
real *data_col)
{
int c, t, h, w;
int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
int channels_col = channels * kT * kH * kW;
for (c = 0; c < channels_col; ++c)
{
int w_offset = c % kW;
int h_offset = (c / kW) % kH;
int t_offset = (c / kW / kH) % kT;
int c_vol = c / kT / kH / kW;
for (t = 0; t < depth_col; ++t)
{
for (h = 0; h < height_col; ++h)
{
for (w = 0; w < width_col; ++w)
{
int t_pad = t * dT - pT + t_offset * dilationT;
int h_pad = h * dH - pH + h_offset * dilationH;
int w_pad = w * dW - pW + w_offset * dilationW;
if (t_pad >= 0 && t_pad < depth &&
h_pad >= 0 && h_pad < height &&
w_pad >= 0 && w_pad < width)
data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
else
data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0;
}
}
}
}
}
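// col2vol is the adjoint of vol2col: it zeroes the volume and scatter-adds
// every column entry back to the voxel it was read from, so voxels covered
// by several overlapping windows accumulate the sum of their contributions.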
static void THNN_(col2vol)(
const real* data_col, const int channels,
const int depth, const int height, const int width,
const int kT, const int kH, const int kW,
const int pT, const int pH, const int pW,
const int dT, const int dH, const int dW,
const int dilationT, const int dilationH, const int dilationW,
real* data_vol)
{
int c, t, h, w;
memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
int channels_col = channels * kT * kH * kW;
for (c = 0; c < channels_col; ++c)
{
int w_offset = c % kW;
int h_offset = (c / kW) % kH;
int t_offset = (c / kW / kH) % kT;
int c_vol = c / kT / kH / kW;
for (t = 0; t < depth_col; ++t)
{
for (h = 0; h < height_col; ++h)
{
for (w = 0; w < width_col; ++w)
{
int t_pad = t * dT - pT + t_offset * dilationT;
int h_pad = h * dH - pH + h_offset * dilationH;
int w_pad = w * dW - pW + w_offset * dilationW;
if (t_pad >= 0 && t_pad < depth &&
h_pad >= 0 && h_pad < height &&
w_pad >= 0 && w_pad < width)
data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
data_col[((c * depth_col + t) * height_col + h) * width_col + w];
}
}
}
}
}
void THNN_(VolumetricFullConvolution_updateOutput)(
THNNState *state,
THTensor *input, // 4D or 5D (batch) tensor
THTensor *output,
THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
THTensor *bias,
THTensor *finput, // internal columns buffer
THTensor *fgradInput, // internal ones buffer
int dT, int dW, int dH, // stride of the convolution
int pT, int pW, int pH, // padding
int aT, int aW, int aH) // extra output adjustment
{
THTensor *columns = finput;
THTensor *ones = fgradInput;
// number of input & output planes and kernel size are indirectly defined by the weight tensor
THArgCheck(weight->nDimension == 5, 4,
"5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
);
const int nInputPlane = (int)weight->size[0];
const int nOutputPlane = (int)weight->size[1];
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch mode) tensor is expected"
);
int batch = 1;
if (input->nDimension == 4)
{
THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
}
else
{
THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
}
const long inputWidth = input->size[4];
const long inputHeight = input->size[3];
const long inputDepth = input->size[2];
const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
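// Transposed convolution inverts the usual size formula. For example,
// inputWidth = 4, dW = 2, pW = 1, kW = 3, aW = 1 gives
// outputWidth = (4 - 1)*2 - 2 + 3 + 1 = 8; the adjustment term selects among
// the output sizes that a forward strided convolution would have mapped to
// the same input width.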
// Batch size + input planes
const long batchSize = input->size[0];
// Resize output
THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
// Resize temporary columns
THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
THTensor_(zero)(columns);
// Define a buffer of ones, for bias accumulation
// Note: this buffer can be shared with other modules, it only ever gets increased,
// and always contains ones.
if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
{
// Resize plane and fill with ones...
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *output_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; ++elt)
{
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(output_n, output, 0, elt);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
const long n = columns->size[1];
const long k = weight->size[0];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 't',
n, m, k,
1,
THTensor_(data)(input_n), n,
THTensor_(data)(weight), m,
0,
THTensor_(data)(columns), n
);
// Unpack columns back into input:
THNN_(col2vol)(
THTensor_(data)(columns),
nOutputPlane, outputDepth, outputHeight, outputWidth,
kT, kH, kW,
pT, pH, pW,
dT, dH, dW,
1, 1, 1,
THTensor_(data)(output_n)
);
// Do Bias after:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const long m_ = nOutputPlane;
const long n_ = outputDepth * outputHeight * outputWidth;
const long k_ = 1;
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
't', 'n',
n_, m_, k_,
1,
THTensor_(data)(ones), k_,
THTensor_(data)(bias), k_,
1,
THTensor_(data)(output_n), n_
);
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(output_n);
// Resize output
if (batch == 0)
{
THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_(VolumetricFullConvolution_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *weight,
THTensor *finput,
THTensor *fgradInput, // only used by cuda impl
int dT, int dW, int dH, // stride
int pT, int pW, int pH, // padding
int aT, int aW, int aH) // extra output adjustment
{
THTensor *gradColumns = finput;
// number of input & output planes and kernel size are indirectly defined by the weight tensor
THArgCheck(weight->nDimension == 5, 4,
"5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
);
const int nInputPlane = (int)weight->size[0];
const int nOutputPlane = (int)weight->size[1];
const int kT = (int)weight->size[2];
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch mode) tensor is expected"
);
int batch = 1;
if (input->nDimension == 4)
{
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
}
const long inputWidth = input->size[4];
const long inputHeight = input->size[3];
const long inputDepth = input->size[2];
const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
// Batch size + input planes
const long batchSize = input->size[0];
// Resize output
THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
THTensor_(zero)(gradInput);
// Resize temporary columns
THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
// Helpers
THTensor *gradInput_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; ++elt)
{
// Matrix multiply per sample:
THTensor_(select)(gradInput_n, gradInput, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(vol2col)(
THTensor_(data)(gradOutput_n),
nOutputPlane, outputDepth, outputHeight, outputWidth,
kT, kH, kW,
pT, pH, pW,
dT, dH, dW,
1, 1, 1,
THTensor_(data)(gradColumns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const long m = weight->size[0];
const long n = gradColumns->size[1];
const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
'n', 'n',
n, m, k,
1,
THTensor_(data)(gradColumns), n,
THTensor_(data)(weight), k,
0,
THTensor_(data)(gradInput_n), n
);
}
// Free
THTensor_(free)(gradInput_n);
THTensor_(free)(gradOutput_n);
// Resize output
if (batch == 0)
{
THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
void THNN_(VolumetricFullConvolution_accGradParameters)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradWeight,
THTensor *gradBias,
THTensor *finput,
THTensor *fgradInput,
int dT, int dW, int dH, // stride
int pT, int pW, int pH, // padding
int aT, int aW, int aH, // extra output adjustment
real scale)
{
// number of input & output planes and kernel size are indirectly defined by the gradWeight tensor
THArgCheck(gradWeight->nDimension == 5, 4,
"5D gradWeight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
);
int nInputPlane = (int)gradWeight->size[0];
int nOutputPlane = (int)gradWeight->size[1];
int kT = (int)gradWeight->size[2];
int kH = (int)gradWeight->size[3];
int kW = (int)gradWeight->size[4];
THTensor *columns = finput;
THTensor *ones = fgradInput;
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch mode) tensor is expected"
);
int batch = 1;
if (input->nDimension == 4)
{
// Force batch
batch = 0;
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
}
const long inputWidth = input->size[4];
const long inputHeight = input->size[3];
const long inputDepth = input->size[2];
const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
// Batch size + input planes
const long batchSize = input->size[0];
// Define a buffer of ones, for bias accumulation
if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
{
// Resize plane and fill with ones...
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
THTensor_(fill)(ones, 1);
}
// Resize temporary columns
THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
// Helpers
THTensor *input_n = THTensor_(new)();
THTensor *gradOutput_n = THTensor_(new)();
int elt;
// For each elt in batch, do:
for (elt = 0; elt < batchSize; ++elt)
{
// Matrix multiply per output:
THTensor_(select)(input_n, input, 0, elt);
THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
// Extract columns:
THNN_(vol2col)(
THTensor_(data)(gradOutput_n), nOutputPlane,
outputDepth, outputHeight, outputWidth,
kT, kH, kW,
pT, pH, pW,
dT, dH, dW,
1, 1, 1,
THTensor_(data)(columns)
);
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const long n = columns->size[0];   // nOutputPlane * kT * kH * kW
const long m = input_n->size[0];   // nInputPlane
const long k = columns->size[1];   // inputDepth * inputHeight * inputWidth
// Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
THBlas_(gemm)(
't', 'n',
n, m, k,
scale,
THTensor_(data)(columns), k,
THTensor_(data)(input_n), k,
1,
THTensor_(data)(gradWeight), n
);
// Do Bias:
// M,N,K are dims of matrix A and B
// (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
const long m_ = nOutputPlane;
const long k_ = outputDepth * outputHeight * outputWidth;
// Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
THBlas_(gemv)(
't',
k_, m_,
scale,
THTensor_(data)(gradOutput_n), k_,
THTensor_(data)(ones), 1,
1,
THTensor_(data)(gradBias), 1
);
}
// Free
THTensor_(free)(input_n);
THTensor_(free)(gradOutput_n);
// Resize
if (batch == 0)
{
THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
}
}
#endif


@ -0,0 +1,392 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
#else
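/* Each output voxel stores its max value in `output` and the argmax offsets
 * within the kT x kH x kW window in `indices`: the three offsets (mz, my, mx)
 * are packed into the first three bytes of one `real` (see the unsigned char
 * writes below), which updateGradInput unpacks to route gradients. This caps
 * kernel offsets at 255 per dimension.
 */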
static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
real *indz_p,
long nslices,
long itime,
long iwidth,
long iheight,
long otime,
long owidth,
long oheight,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
/* loop over output */
long i, j, ti;
for (ti = 0; ti < otime; ti++)
{
for (i = 0; i < oheight; i++)
{
for (j = 0; j < owidth; j++)
{
/* local pointers */
long start_t = ti * dT - pT;
long start_h = i * dH - pH;
long start_w = j * dW - pW;
long kernel_t = fminf(kT, kT + start_t);
long kernel_h = fminf(kH, kH + start_h);
long kernel_w = fminf(kW, kW + start_w);
start_t = fmaxf(start_t, 0);
start_h = fmaxf(start_h, 0);
start_w = fmaxf(start_w, 0);
real *ip = input_p + k * itime * iwidth * iheight
+ start_t * iwidth * iheight + start_h * iwidth + start_w;
real *op = output_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
real *indzp = indz_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
/* compute local max: */
real maxval = -THInf;
int x, y, z;
int mx = 0, my = 0, mz = 0; /* initialized defensively so the packed indices are always defined */
for (z = 0; z < kernel_t; z++)
{
for (y = 0; y < kernel_h; y++)
{
for (x = 0; x < kernel_w; x++)
{
if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
{
real val = *(ip + z * iwidth * iheight + y * iwidth + x);
if (val > maxval)
{
maxval = val;
// Store indices w.r.t the kernel dimension
mz = z + (kT - kernel_t);
my = y + (kH - kernel_h);
mx = x + (kW - kernel_w);
}
}
}
}
}
// set max values
((unsigned char*)(indzp))[0] = mz;
((unsigned char*)(indzp))[1] = my;
((unsigned char*)(indzp))[2] = mx;
((unsigned char*)(indzp))[3] = 0;
/* set output to local max */
*op = maxval;
}
}
}
}
}
void THNN_(VolumetricMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int kT,
int kW,
int kH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH,
bool ceilMode)
{
long nslices;
long itime;
long iheight;
long iwidth;
long otime;
long oheight;
long owidth;
real *input_data;
real *output_data;
real *indices_data;
THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
"4D or 5D (batch-mode) tensor expected"
);
int dimN = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
if (input->nDimension == 5)
{
dimN++;
dimt++;
dimh++;
dimw++;
}
THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
"input image smaller than kernel size"
);
THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
"pad should be smaller than half of kernel size"
);
/* sizes */
nslices = input->size[dimN];
itime = input->size[dimt];
iheight = input->size[dimh];
iwidth = input->size[dimw];
if (ceilMode)
{
otime = (int)(ceil((float)(itime - kT + 2 * pT) / dT) + 1);
oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
owidth = (int)(ceil((float)(iwidth - kW + 2 * pW) / dW) + 1);
}
else
{
otime = (int)(floor((float)(itime - kT + 2 * pT) / dT) + 1);
oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
owidth = (int)(floor((float)(iwidth - kW + 2 * pW) / dW) + 1);
}
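/* ceilMode only differs when the last window would hang off the input:
   e.g. itime = 16, kT = 3, dT = 2, pT = 0 gives floor((16-3)/2)+1 = 7
   in floor mode but ceil((16-3)/2)+1 = 8 in ceil mode. */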
if (pT || pW || pH)
{
// ensure that the last pooling starts inside the image
if ((otime - 1)*dT >= itime + pT)
--otime;
if ((oheight - 1)*dH >= iheight + pH)
--oheight;
if ((owidth - 1)*dW >= iwidth + pW)
--owidth;
}
/* get contiguous input */
input = THTensor_(newContiguous)(input);
if (input->nDimension == 4) /* non-batch mode */
{
/* resize output */
THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
/* indices will contain ti,i,j uchar locations packed into float/double */
THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
THNN_(VolumetricMaxPooling_updateOutput_frame)(
input_data, output_data,
indices_data,
nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH
);
}
else /* batch mode */
{
long p;
long nBatch = input->size[0];
long istride = nslices * itime * iwidth * iheight;
long ostride = nslices * otime * owidth * oheight;
/* resize output */
THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
/* indices will contain ti,i,j locations for each output point */
THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p=0; p < nBatch; p++)
{
THNN_(VolumetricMaxPooling_updateOutput_frame)(
input_data + p * istride,
output_data + p * ostride,
indices_data + p * ostride,
nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
kT, kW, kH,
dT, dW, dH,
pT, pW, pH
);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
real *indz_p,
long nslices,
long itime,
long iwidth,
long iheight,
long otime,
long owidth,
long oheight,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
real *indz_p_k = indz_p + k * otime * owidth * oheight;
/* calculate max points */
long ti, i, j;
for (ti = 0; ti < otime; ti++)
{
for (i = 0; i < oheight; i++)
{
for (j = 0; j < owidth; j++)
{
/* retrieve position of max */
real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
long maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT;
long maxi = ((unsigned char*)(indzp))[1] + i * dH - pH;
long maxj = ((unsigned char*)(indzp))[2] + j * dW - pW;
/* update gradient */
gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
}
}
}
}
}
void THNN_(VolumetricMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
int nslices;
int itime;
int iheight;
int iwidth;
int otime;
int oheight;
int owidth;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
int dimN = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 5)
{
dimN++;
dimt++;
dimh++;
dimw++;
}
/* sizes */
nslices = input->size[dimN];
itime = input->size[dimt];
iheight = input->size[dimh];
iwidth = input->size[dimw];
otime = gradOutput->size[dimt];
oheight = gradOutput->size[dimh];
owidth = gradOutput->size[dimw];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
/* backprop */
if (input->nDimension == 4) /* non-batch mode*/
{
THNN_(VolumetricMaxPooling_updateGradInput_frame)(
gradInput_data, gradOutput_data,
indices_data,
nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
dT, dW, dH,
pT, pW, pH
);
}
else /* batch mode */
{
long p;
long nBatch = input->size[0];
long istride = nslices * itime * iwidth * iheight;
long ostride = nslices * otime * owidth * oheight;
#pragma omp parallel for private(p)
for (p = 0; p < nBatch; p++)
{
THNN_(VolumetricMaxPooling_updateGradInput_frame)(
gradInput_data + p * istride,
gradOutput_data + p * ostride,
indices_data + p * ostride,
nslices,
itime, iwidth, iheight,
otime, owidth, oheight,
dT, dW, dH,
pT, pW, pH
);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif


@ -0,0 +1,325 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
#else
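/* Max unpooling inverts max pooling: each input value is written to the
 * output location recorded during pooling, i.e. (ti*dT - pT + maxz,
 * i*dH - pH + maxy, j*dW - pW + maxx) with (maxz, maxy, maxx) unpacked from
 * the byte-packed indices tensor; all other output voxels stay zero.
 */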
static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
real *input_p,
real *output_p,
real *ind_p,
long nslices,
long iT,
long iW,
long iH,
long oT,
long oW,
long oH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
long ti, i, j, maxz, maxy, maxx;
for (ti = 0; ti < iT; ti++)
{
for (i = 0; i < iH; i++)
{
for (j = 0; j < iW; j++)
{
long start_t = ti * dT - pT;
long start_h = i * dH - pH;
long start_w = j * dW - pW;
//real *output_p_k = output_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
maxy = ((unsigned char*)(ind_p_k))[1];
maxx = ((unsigned char*)(ind_p_k))[2];
if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
{
THError(
"invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
);
}
output_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */
}
}
}
}
}
void THNN_(VolumetricMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
THTensor *indices,
int oT,
int oW,
int oH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
int dimw = 3;
int dimh = 2;
int dimt = 1;
int nbatch = 1;
int nslices;
int iT;
int iH;
int iW;
real *input_data;
real *output_data;
real *indices_data;
THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2,
"4D or 5D (batch mode) tensor expected"
);
if (!THTensor_(isSameSizeAs)(input, indices))
{
THError("Invalid input size w.r.t current indices size");
}
if (input->nDimension == 5)
{
nbatch = input->size[0];
dimt++;
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimt-1];
iT = input->size[dimt];
iH = input->size[dimh];
iW = input->size[dimw];
/* get contiguous input */
input = THTensor_(newContiguous)(input);
indices = THTensor_(newContiguous)(indices);
/* resize output */
if (input->nDimension == 4)
{
THTensor_(resize4d)(output, nslices, oT, oH, oW);
THTensor_(zero)(output);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
input_data, output_data,
indices_data,
nslices,
iT, iW, iH,
oT, oW, oH,
dT, dW, dH, pT, pW, pH
);
}
else
{
long p;
THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
THTensor_(zero)(output);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
indices_data = THTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
input_data+p*nslices*iT*iW*iH,
output_data+p*nslices*oT*oW*oH,
indices_data+p*nslices*iT*iW*iH,
nslices,
iT, iW, iH,
oT, oW, oH,
dT, dW, dH,
pT, pW, pH
);
}
}
/* cleanup */
THTensor_(free)(input);
THTensor_(free)(indices);
}
static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
real *ind_p,
long nslices,
long iT,
long iW,
long iH,
long oT,
long oW,
long oH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
long k;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
long ti, i, j, maxz, maxy, maxx;
for (ti = 0; ti < iT; ti++)
{
for (i = 0; i < iH; i++)
{
for (j = 0; j < iW; j++)
{
long start_t = ti * dT - pT;
long start_h = i * dH - pH;
long start_w = j * dW - pW;
real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
//real *gradOutput_p_k = gradOutput_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
maxy = ((unsigned char*)(ind_p_k))[1];
maxx = ((unsigned char*)(ind_p_k))[2];
if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
{
THError(
"invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
);
}
*gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
}
}
}
}
}
void THNN_(VolumetricMaxUnpooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
THTensor *indices,
int oT,
int oW,
int oH,
int dT,
int dW,
int dH,
int pT,
int pW,
int pH)
{
int dimw = 3;
int dimh = 2;
int dimt = 1;
int nbatch = 1;
int nslices;
int iT;
int iH;
int iW;
real *gradInput_data;
real *gradOutput_data;
real *indices_data;
if (!THTensor_(isSameSizeAs)(input, indices))
{
THError("Invalid input size w.r.t current indices size");
}
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
indices = THTensor_(newContiguous)(indices);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
if (input->nDimension == 5)
{
nbatch = input->size[0];
dimt++;
dimw++;
dimh++;
}
/* sizes */
nslices = input->size[dimt-1];
iT = input->size[dimt];
iH = input->size[dimh];
iW = input->size[dimw];
if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
{
THError(
"Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %ldx%ldx%ld",
oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
);
}
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
indices_data = THTensor_(data)(indices);
/* backprop */
if (input->nDimension == 4)
{
THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
gradInput_data, gradOutput_data,
indices_data,
nslices,
iT, iW, iH,
oT, oW, oH,
dT, dW, dH,
pT, pW, pH
);
}
else
{
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
gradInput_data+p*nslices*iT*iW*iH,
gradOutput_data+p*nslices*oT*oW*oH,
indices_data+p*nslices*iT*iW*iH,
nslices,
iT, iW, iH,
oT, oW, oH,
dT, dW, dH,
pT, pW, pH
);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
THTensor_(free)(indices);
}
#endif


@ -0,0 +1,301 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
#else
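/* Replication padding clamps every output coordinate to the nearest valid
 * input coordinate, so border voxels are repeated. E.g. with pleft = 2 and
 * iwidth = 4, output columns 0..7 read input columns 0,0,0,1,2,3,3,3.
 */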
static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
real *input_p, real *output_p,
long nslices,
long iwidth, long iheight, long idepth,
long owidth, long oheight, long odepth,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback)
{
int iStartX = fmax(0, -pleft);
int iStartY = fmax(0, -ptop);
int iStartZ = fmax(0, -pfront);
int oStartX = fmax(0, pleft);
int oStartY = fmax(0, ptop);
int oStartZ = fmax(0, pfront);
long k, ip_x, ip_y, ip_z;
#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
for (k = 0; k < nslices; k++) {
long i, j, z;
for (z = 0; z < odepth; z++) {
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pleft) {
ip_x = pleft;
} else if (j >= pleft && j < iwidth + pleft) {
ip_x = j;
} else {
ip_x = iwidth + pleft - 1;
}
ip_x = ip_x - oStartX + iStartX;
if (i < ptop) {
ip_y = ptop;
} else if (i >= ptop && i < iheight + ptop) {
ip_y = i;
} else {
ip_y = iheight + ptop - 1;
}
ip_y = ip_y - oStartY + iStartY;
if (z < pfront) {
ip_z = pfront;
} else if (z >= pfront && z < idepth + pfront) {
ip_z = z;
} else {
ip_z = idepth + pfront - 1;
}
ip_z = ip_z - oStartZ + iStartZ;
real *dest_p = output_p + k * owidth * oheight * odepth +
z * owidth * oheight + i * owidth + j;
real *src_p = input_p + k * iwidth * iheight * idepth +
ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
*dest_p = *src_p;
}
}
}
}
}
void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
THTensor *input,
THTensor *output,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback)
{
int dimw = 3;
int dimh = 2;
int dimd = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long idepth;
long iheight;
long iwidth;
long odepth;
long oheight;
long owidth;
real *input_data;
real *output_data;
THArgCheck(input->nDimension == 4 || input->nDimension == 5,
2, "input must be 4 or 5-dimensional");
if (input->nDimension == 5)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimd++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
idepth = input->size[dimd];
iheight = input->size[dimh];
iwidth = input->size[dimw];
odepth = idepth + pfront + pback;
oheight = iheight + ptop + pbottom;
owidth = iwidth + pleft + pright;
THArgCheck(owidth >= 1 && oheight >= 1 && odepth >= 1, 2,
"input is too small");
/* get contiguous input */
input = THTensor_(newContiguous)(input);
/* resize output */
if (input->nDimension == 4)
{
THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
THNN_(VolumetricReplicationPadding_updateOutput_frame)(
input_data, output_data, nslices, iwidth, iheight, idepth,
owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
pback);
}
else
{
long p;
THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
THNN_(VolumetricReplicationPadding_updateOutput_frame)(
input_data + p * nslices * iwidth * iheight * idepth,
output_data + p * nslices * owidth * oheight * odepth,
nslices,
iwidth, iheight, idepth,
owidth, oheight, odepth,
pleft, pright,
ptop, pbottom,
pfront, pback);
}
}
/* cleanup */
THTensor_(free)(input);
}
static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
real *ginput_p, real *goutput_p,
long nslices,
long iwidth, long iheight, long idepth,
long owidth, long oheight, long odepth,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback)
{
int iStartX = fmax(0, -pleft);
int iStartY = fmax(0, -ptop);
int iStartZ = fmax(0, -pfront);
int oStartX = fmax(0, pleft);
int oStartY = fmax(0, ptop);
int oStartZ = fmax(0, pfront);
long k, ip_x, ip_y, ip_z;
#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
for (k = 0; k < nslices; k++) {
long i, j, z;
for (z = 0; z < odepth; z++) {
for (i = 0; i < oheight; i++) {
for (j = 0; j < owidth; j++) {
if (j < pleft) {
ip_x = pleft;
} else if (j >= pleft && j < iwidth + pleft) {
ip_x = j;
} else {
ip_x = iwidth + pleft - 1;
}
ip_x = ip_x - oStartX + iStartX;
if (i < ptop) {
ip_y = ptop;
} else if (i >= ptop && i < iheight + ptop) {
ip_y = i;
} else {
ip_y = iheight + ptop - 1;
}
ip_y = ip_y - oStartY + iStartY;
if (z < pfront) {
ip_z = pfront;
} else if (z >= pfront && z < idepth + pfront) {
ip_z = z;
} else {
ip_z = idepth + pfront - 1;
}
ip_z = ip_z - oStartZ + iStartZ;
real *src_p = goutput_p + k * owidth * oheight * odepth +
z * owidth * oheight + i * owidth + j;
real *dest_p = ginput_p + k * iwidth * iheight * idepth +
ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
*dest_p += *src_p;
}
}
}
}
}
void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback)
{
int dimw = 3;
int dimh = 2;
int dimd = 1;
int dimslices = 0;
long nbatch = 1;
long nslices;
long idepth;
long iheight;
long iwidth;
long odepth;
long oheight;
long owidth;
if (input->nDimension == 5)
{
nbatch = input->size[0];
dimw++;
dimh++;
dimd++;
dimslices++;
}
/* sizes */
nslices = input->size[dimslices];
idepth = input->size[dimd];
iheight = input->size[dimh];
iwidth = input->size[dimw];
odepth = idepth + pfront + pback;
oheight = iheight + ptop + pbottom;
owidth = iwidth + pleft + pright;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
"gradOutput width unexpected");
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
"gradOutput height unexpected");
THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
"gradOutput depth unexpected");
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
/* resize */
THTensor_(resizeAs)(gradInput, input);
THTensor_(zero)(gradInput);
/* backprop */
if (input->nDimension == 4) {
THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
THTensor_(data)(gradInput),
THTensor_(data)(gradOutput),
nslices,
iwidth, iheight, idepth,
owidth, oheight, odepth,
pleft, pright,
ptop, pbottom,
pfront, pback);
} else {
long p;
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++) {
THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth,
THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth,
nslices,
iwidth, iheight, idepth,
owidth, oheight, odepth,
pleft, pright,
ptop, pbottom,
pfront, pback);
}
}
/* cleanup */
THTensor_(free)(gradOutput);
}
#endif


@ -0,0 +1,158 @@
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/unfold.c"
#else
#ifdef _WIN32
# include <windows.h>
#endif
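/* unfolded_copy is the 2D im2col used by the MM-based spatial convolutions:
 * finput gets one row per (channel, kh, kw) triple, each of length
 * outputHeight*outputWidth, so the convolution becomes a single GEMM.
 * unfolded_acc is its adjoint and scatter-adds rows back into the image. */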
/* note: due to write conflicts (overlapping windows accumulate into the same input element), this one cannot be parallelized as finely as unfolded_copy */
void THNN_(unfolded_acc)(
THTensor *finput,
THTensor *input,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int nInputPlane,
int inputWidth,
int inputHeight,
int outputWidth,
int outputHeight)
{
#ifdef _WIN32
LONG_PTR nip;
#else
size_t nip;
#endif
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
#pragma omp parallel for private(nip)
for(nip = 0; nip < nInputPlane; nip++)
{
size_t kw, kh, y, x;
long long ix = 0, iy = 0;
for(kh = 0; kh < kH; kh++)
{
for(kw = 0; kw < kW; kw++)
{
real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
real *dst = input_data + nip*(inputHeight*inputWidth);
if (padW > 0 || padH > 0) {
size_t lpad,rpad;
for(y = 0; y < outputHeight; y++) {
iy = (long long)(y*dH - padH + kh);
if (iy < 0 || iy >= inputHeight) {
} else {
if (dW==1){
ix = (long long)(0 - padW + kw);
lpad = fmaxf(0,(int)(padW-kw));
rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
}
else{
for (x=0; x<outputWidth; x++){
ix = (long long)(x*dW - padW + kw);
if (ix < 0 || ix >= inputWidth){
}else
THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1);
}
}
}
}
} else {
for(y = 0; y < outputHeight; y++) {
iy = (long long)(y*dH + kh);
ix = (long long)(0 + kw);
if (dW == 1 )
THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
else{
for(x = 0; x < outputWidth; x++)
THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1);
}
}
}
}
}
}
}
void THNN_(unfolded_copy)(
THTensor *finput,
THTensor *input,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int nInputPlane,
int inputWidth,
int inputHeight,
int outputWidth,
int outputHeight)
{
long k;
real *input_data = THTensor_(data)(input);
real *finput_data = THTensor_(data)(finput);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane*kH*kW; k++) {
size_t nip = k / (kH*kW);
size_t rest = k % (kH*kW);
size_t kh = rest / kW;
size_t kw = rest % kW;
size_t x,y;
long long ix,iy;
real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
real *src = input_data + nip*(inputHeight*inputWidth);
if (padW > 0 || padH > 0) {
size_t lpad,rpad;
for(y = 0; y < outputHeight; y++) {
iy = (long long)(y*dH - padH + kh);
if (iy < 0 || iy >= inputHeight) {
memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth);
} else {
if (dW==1){
ix = (long long)(0 - padW + kw);
lpad = fmaxf(0,(int)(padW-kw));
rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
if (outputWidth-rpad-lpad <= 0) {
memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth);
} else {
if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad);
memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad));
if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
}
}
else{
for (x=0; x<outputWidth; x++){
ix = (long long)(x*dW - padW + kw);
if (ix < 0 || ix >= inputWidth)
memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1);
else
memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1));
}
}
}
}
} else {
for(y = 0; y < outputHeight; y++) {
iy = (long long)(y*dH + kh);
ix = (long long)(0 + kw);
if (dW == 1)
memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth);
else{
for (x=0; x<outputWidth; x++)
memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix+x*dW), sizeof(real)*(1));
}
}
}
}
}
#endif

182
torch/lib/THNN/init.c Normal file

@ -0,0 +1,182 @@
#include "TH.h"
#include "THNN.h"
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
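/* Each file under generic/ is written against the abstract `real` type;
   including THGenerateFloatTypes.h right after it compiles that file twice,
   once with real = float and once with real = double, producing the
   THNN_Float* and THNN_Double* entry points. */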
#include "generic/Abs.c"
#include "THGenerateFloatTypes.h"
#include "generic/AbsCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/ClassNLLCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialClassNLLCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/DistKLDivCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/ELU.c"
#include "THGenerateFloatTypes.h"
#include "generic/HardShrink.c"
#include "THGenerateFloatTypes.h"
#include "generic/HardTanh.c"
#include "THGenerateFloatTypes.h"
#include "generic/L1Cost.c"
#include "THGenerateFloatTypes.h"
#include "generic/LeakyReLU.c"
#include "THGenerateFloatTypes.h"
#include "generic/LogSigmoid.c"
#include "THGenerateFloatTypes.h"
#include "generic/LogSoftMax.c"
#include "THGenerateFloatTypes.h"
#include "generic/LookupTable.c"
#include "THGenerateFloatTypes.h"
#include "generic/MSECriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/MarginCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/SoftMarginCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/MultiLabelMarginCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/MultiMarginCriterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/PReLU.c"
#include "THGenerateFloatTypes.h"
#include "generic/RReLU.c"
#include "THGenerateFloatTypes.h"
#include "generic/Sigmoid.c"
#include "THGenerateFloatTypes.h"
#include "generic/SmoothL1Criterion.c"
#include "THGenerateFloatTypes.h"
#include "generic/SoftMax.c"
#include "THGenerateFloatTypes.h"
#include "generic/SoftPlus.c"
#include "THGenerateFloatTypes.h"
#include "generic/SoftShrink.c"
#include "THGenerateFloatTypes.h"
#include "generic/SparseLinear.c"
#include "THGenerateFloatTypes.h"
#include "generic/Sqrt.c"
#include "THGenerateFloatTypes.h"
#include "generic/Square.c"
#include "THGenerateFloatTypes.h"
#include "generic/Tanh.c"
#include "THGenerateFloatTypes.h"
#include "generic/Threshold.c"
#include "THGenerateFloatTypes.h"
#include "generic/TemporalConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/TemporalSubSampling.c"
#include "THGenerateFloatTypes.h"
#include "generic/TemporalMaxPooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/BatchNormalization.c"
#include "THGenerateFloatTypes.h"
#include "generic/unfold.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialConvolutionMap.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialConvolutionMM.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialConvolutionLocal.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialFullConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialFullConvolutionMap.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialDilatedConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialAdaptiveMaxPooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialAveragePooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialFractionalMaxPooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialMaxPooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialMaxUnpooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialSubSampling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialUpSamplingNearest.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialUpSamplingBilinear.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricAveragePooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricConvolutionMM.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricFullConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricDilatedConvolution.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricMaxPooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricMaxUnpooling.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialReflectionPadding.c"
#include "THGenerateFloatTypes.h"
#include "generic/SpatialReplicationPadding.c"
#include "THGenerateFloatTypes.h"
#include "generic/VolumetricReplicationPadding.c"
#include "THGenerateFloatTypes.h"