Add 'torch/lib/THNN/' from commit '4fe7059a315d156ecd080ff7bd5b4fe3d3a9efad'

git-subtree-dir: torch/lib/THNN git-subtree-mainline: c3f0c1e2e0 git-subtree-split: 4fe7059a31
2025-12-07 00:21:07 +01:00 · 2016-08-04 10:58:50 -07:00 · 2016-08-04 10:58:50 -07:00 · 035eb28e18
commit 035eb28e18
parent c3f0c1e2e0 4fe7059a31
67 changed files with 14062 additions and 0 deletions
--- a/torch/lib/THNN/CMakeLists.txt
+++ b/torch/lib/THNN/CMakeLists.txt
@ -0,0 +1,65 @@
 # Build script for the THNN shared module (C implementations of nn modules).
 # Requires an installed Torch (provides the TH library and Torch_INSTALL_LIB).
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
 CMAKE_POLICY(VERSION 2.6)
 IF(NOT Torch_FOUND)
  FIND_PACKAGE(Torch REQUIRED)
 ENDIF()
 # Allow the parent project to override where the library gets installed.
 IF(NOT THNN_INSTALL_LIB_SUBDIR)
  SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
 ENDIF()
 # Flags
 # When using MSVC
 IF(MSVC)
  # Silence MSVC deprecation warnings for standard C library functions.
  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
 ENDIF()
 # The sources use C99 features; CMAKE_C_STANDARD only exists from CMake 3.1 on.
 IF (CMAKE_VERSION VERSION_LESS "3.1")
  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
 ELSE ()
  SET(CMAKE_C_STANDARD 99)
 ENDIF ()
 # OpenMP support?
 SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
 IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
  # EXECUTE_PROCESS replaces the deprecated EXEC_PROGRAM command.
  EXECUTE_PROCESS (COMMAND uname -v
    OUTPUT_VARIABLE DARWIN_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  # Quote the expansion: `uname -v` output may contain ';' and spaces.
  # The first run of digits is taken as the Darwin major version.
  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION "${DARWIN_VERSION}")
  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
  IF (DARWIN_VERSION GREATER 9)
    SET(APPLE_OPENMP_SUCKS 1)
  ENDIF ()
  # Strip the trailing newline so the version comparison sees a clean string.
  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
    OUTPUT_VARIABLE GCC_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
    # OpenMP pragmas stay in the sources; keep the compiler quiet about them.
    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
  ENDIF ()
 ENDIF ()
 IF (WITH_OPENMP)
  FIND_PACKAGE(OpenMP)
  IF(OPENMP_FOUND)
    MESSAGE(STATUS "Compiling with OpenMP support")
    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
  ENDIF()
 ENDIF ()
 LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
 # Single translation unit: init.c is the only source handed to the compiler.
 SET(src init.c)
 ADD_LIBRARY(THNN MODULE ${src})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 ### Torch packages supposes libraries prefix is "lib"
 SET_TARGET_PROPERTIES(THNN PROPERTIES
  PREFIX "lib"
  IMPORT_PREFIX "lib")
 TARGET_LINK_LIBRARIES(THNN TH)
 INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR})
--- a/torch/lib/THNN/README.md
+++ b/torch/lib/THNN/README.md
@ -0,0 +1,32 @@
 # THNN
 THNN is a library that gathers nn's C implementations of neural network modules. It has no Lua dependency and can therefore be used in any application that has a C FFI. Please note that it only contains fairly low-level functions; an object-oriented C/C++ wrapper will be provided later as a separate library.
 There is also a CUDA counterpart of THNN (THCUNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN).
 ## Links
 * [API reference](doc/api_reference.md)
 * [Style guidelines](doc/style_guidelines.md)
 ## Motivation
 Torch's neural network package (nn) provided many optimized C implementations of modules, but the source files contained Lua specific code and headers so they couldn't be easily compiled and included anywhere else.
 THNN is based on the same code, but is written in pure C, so it can be easily included in other code. **Future C implementations should be committed to THNN.**
 ## API
 THNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations:
 * **updateOutput** - applies the module to an input
 * **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input
 * **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters
 For information on argument types please check the [API reference](doc/api_reference.md).
 ## Developer docs
 * [Style guidelines](doc/style_guidelines.md)
 This section will be expanded once the FFI refactoring is finished.
--- a/torch/lib/THNN/THNN.h
+++ b/torch/lib/THNN/THNN.h
@ -0,0 +1,25 @@
 // Public entry header of THNN: defines the naming macros and index typedefs,
 // then pulls in the generic declarations from "generic/THNN.h".
 #ifndef THNN_H
 #define THNN_H
 #include <stdbool.h>
 #include <TH.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 // Builds a type-specific symbol by concatenating THNN_, Real and NAME
 // (presumably Real expands to Float/Double per instantiation -- confirm in TH).
 #define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME)
 // Index tensors (e.g. class targets) are long tensors.
 #define THIndexTensor THLongTensor
 #define THIndexTensor_(NAME) THLongTensor_ ## NAME
 // Integer tensors are int tensors.
 #define THIntegerTensor THIntTensor
 #define THIntegerTensor_(NAME) THIntTensor_ ## NAME
 // Element types matching the two tensor aliases above.
 typedef long THIndex_t;
 typedef int THInteger_t;
 // Opaque state handle passed as the first argument of every THNN function.
 typedef void THNNState;
 // NOTE(review): the generic header is presumably re-included once per float
 // type by THGenerateFloatTypes.h -- confirm against TH.
 #include "generic/THNN.h"
 #include <THGenerateFloatTypes.h>
 #endif
--- a/torch/lib/THNN/doc/api_reference.md
+++ b/torch/lib/THNN/doc/api_reference.md
--- a/torch/lib/THNN/doc/generate_reference.lua
+++ b/torch/lib/THNN/doc/generate_reference.lua
@ -0,0 +1,106 @@
 --[[
  This script regenerates api_reference.md based on comments placed in THNN.h.
  It reads '../generic/THNN.h' and writes 'api_reference.md' using relative
  paths, so it has to be run from the directory that contains this script.
 ]]--
 -- Static preamble written verbatim at the top of the generated document.
 local header = [[
 # API docs
 This document only describes a THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc).
 ### Note on function names
 Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions:
 * `void THNN_FloatAbs_updateOutput(...)`
 * `void THNN_DoubleAbs_updateOutput(...)`
 In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type.
 ### Argument types
 Some arguments have additional tags placed in square brackets:
 * **[OUT]** - This is the output argument. It will be reshaped if needed.
 * **[OPTIONAL]** - This argument is optional and can be safely set to NULL
 * **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call.
 * **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output).
 ## Module list
 These are all modules implemented in THNN:
 ]]
 -- Read the generic header; fail with io.open's own message if it is missing.
 local hfile = assert(io.open('../generic/THNN.h', 'r'))
 local lines = hfile:read('*a'):split('\n')
 hfile:close()
 -- Parse input: collect every TH_API declaration, grouped by module name.
 -- declarations[module] is a list of multi-line declaration strings.
 local declarations = {}
 local current_declaration
 local declaration_module
 for _, line in ipairs(lines) do
   -- A line starting with TH_API opens a new declaration; the module name is
   -- the part of THNN_(Module_function) before the last underscore.
   if line:sub(1, 6) == 'TH_API' then
     current_declaration = ''
     declaration_module = line:match('THNN_%((.+)_.+%)')
   end
   if current_declaration then
     current_declaration = current_declaration .. line .. '\n'
     -- ');' closes the declaration. Only look for it while a declaration is
     -- open, so a stray ');' elsewhere in the header cannot index nil.
     if line:match('%);') then
       -- Skip declarations whose module name could not be extracted rather
       -- than failing with "table index is nil".
       if declaration_module then
         current_declaration = current_declaration:sub(1, -2) -- remove a trailing newline
         declarations[declaration_module] = declarations[declaration_module] or {}
         table.insert(declarations[declaration_module], current_declaration)
       end
       current_declaration = nil
       declaration_module = nil
     end
   end
 end
 -- Helper functions parsed as module "unfolded" are excluded from the docs.
 declarations["unfolded"] = nil
 -- Sort modules alphabetically so the generated document is stable.
 -- 'modules' is local so the script does not leak a global.
 local modules = {}
 for k,_ in pairs(declarations) do table.insert(modules, k) end
 table.sort(modules)
 -- Create an index: one bullet per module linking to its section anchor.
 local outfile = assert(io.open('api_reference.md', 'w'))
 outfile:write(header)
 for _, name in ipairs(modules) do
    outfile:write(string.format('* [%s](#%s)\n', name, name:lower()))
 end
 outfile:write('\n')
 -- Write proper docs: for each module, every declaration as a C code block
 -- followed by one line per documented argument.
 for _, name in ipairs(modules) do
    outfile:write('## ' .. name ..'\n')
    for _, declaration in ipairs(declarations[name]) do
        -- Write source code
        outfile:write('```C' .. '\n')
        local declaration_lines = declaration:split('\n')
        for lineno, line in ipairs(declaration_lines) do
            if lineno == 1 then
                line = line:gsub('TH_API ', ''):gsub('%(', ''):gsub('%)', '') .. '(' -- remove macro junk
            else
                line = line:gsub('%s*//.*$', '') -- remove the comment
            end
            outfile:write(line .. '\n')
        end
        outfile:write('```' .. '\n')
        -- Describe arguments: every "<param>, // <comment>" line (or the
        -- final "<param>); // <comment>" line) becomes one documented entry.
        table.remove(declaration_lines, 1)
        for _, line in ipairs(declaration_lines) do
            local param, comment = line:match('^%s*(.*),%s*// (.*)$')
            if param == nil then param, comment = line:match('^%s*(.*)%);%s*// (.*)$') end
            if param ~= nil then
                -- Use bold font for tags. Replacement strings are plain text:
                -- '%' followed by a non-digit is an error on Lua 5.2+.
                comment = comment:gsub('%[', '**['):gsub('%]', ']**')
                outfile:write(string.format('`%s` - %s\n<br/>\n', param, comment))
            end
        end
    end
 end
 outfile:close()
--- a/torch/lib/THNN/doc/style_guidelines.md
+++ b/torch/lib/THNN/doc/style_guidelines.md
@ -0,0 +1,59 @@
 ## API design guidelines
 Functions should return `void`.
 All functions should accept arguments in the following order. `...` represents any module-specific parameters or buffers, regardless of whether they are used for writing or reading. Arguments in `...` below should be ordered like this:
 ```
 [weight], [bias], [any buffers], [additional arguments], [optional arguments]
 ```
 ### Modules
 ```
 updateOutput: state, input, output, ...
 updateGradInput: state, input, gradOutput, gradInput, ...
 accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ...
 ```
 e.g.
 ```C
 void THNN_(HardShrink_updateGradInput)(
          THNNState* state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          real lambda)
 ```
 ### Criterions
 ```
 updateOutput: state, input, target, output, ...
 updateGradInput: state, input, target, gradInput, ...
 ```
 e.g.
 ```C
 void THNN_(ClassNLLCriterion_updateOutput)(
          THNNState* state,
          THTensor *input,
          THIndexTensor *target,
          THTensor *output,
          bool sizeAverage,
          THTensor *weights,
          THTensor *total_weight)
 ```
 ## Code style guide
 ```C
 void THNN_Linear_updateOutput(
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias);
 //<- 10 ->
 ```
 All arguments should start on a new line after function name, and they should be indented using 10 spaces.
 Use 2 spaces for block indentation.
--- a/torch/lib/THNN/generic/Abs.c
+++ b/torch/lib/THNN/generic/Abs.c
@ -0,0 +1,27 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Abs.c"
 #else
 // Forward pass of Abs: resizes `output` to the shape of `input` and fills it
 // with the element-wise absolute value of `input`. `state` is unused.
 void THNN_(Abs_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output)
 {
  THTensor_(resizeAs)(output, input);
  THTensor_(abs)(output, input);
 }
 // Backward pass of Abs: gradInput = gradOutput * sign(input), where the sign
 // is taken as +1 for input >= 0 and -1 otherwise. `gradInput` is resized to
 // the shape of `input`.
 void THNN_(Abs_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput)
 {
  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
    if (*input_data >= 0)
      *gradInput_data = *gradOutput_data * 1;
    else
      *gradInput_data = *gradOutput_data * -1;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/AbsCriterion.c
+++ b/torch/lib/THNN/generic/AbsCriterion.c
@ -0,0 +1,39 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/AbsCriterion.c"
 #else
 // Computes the L1 loss sum(|input - target|) and stores it in output[0].
 // When sizeAverage is true the sum is divided by the number of elements of
 // `input`. `state` is unused.
 void THNN_(AbsCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage)
 {
  // Accumulate in accreal (as L1Cost_updateOutput does) so that summing many
  // elements does not lose precision in the narrower `real` type.
  accreal sum = 0;
  TH_TENSOR_APPLY2(real, input, real, target,
    sum += fabs(*input_data - *target_data);
  );
  if (sizeAverage)
    sum /= THTensor_(nElement)(input);
  THTensor_(set1d)(output, 0, (real) sum);
 }
 // Gradient of the L1 loss with respect to `input`: +norm where
 // input - target >= 0 and -norm elsewhere, with norm = 1/nElement(input)
 // when sizeAverage is set and 1 otherwise. `gradInput` is resized to the
 // shape of `input`.
 void THNN_(AbsCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage)
 {
  real norm = 1.;
  if (sizeAverage)
    norm = 1./((real)THTensor_(nElement)(input));
  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    if ((*input_data - *target_data) >= 0)
      *gradInput_data = norm;
    else
      *gradInput_data = -norm;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/BatchNormalization.c
+++ b/torch/lib/THNN/generic/BatchNormalization.c
@ -0,0 +1,144 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/BatchNormalization.c"
 #else
 // Forward pass of batch normalization. Dimension 1 of `input` indexes the
 // feature planes; every plane is normalized independently:
 //   out = (in - mean) * invstd * w + b
 // In training mode mean/invstd come from the current plane, are stored in
 // save_mean/save_std and are blended into running_mean/running_var with
 // `momentum`. Otherwise the running statistics are used.
 // `weight` and `bias` may be NULL (treated as 1 and 0).
 // NOTE(review): `output` is not resized here -- presumably the caller has
 // already given it the shape of `input`; confirm.
 void THNN_(BatchNormalization_updateOutput)(
  THNNState *state, THTensor *input, THTensor *output,
  THTensor *weight, THTensor *bias,
  THTensor *running_mean, THTensor *running_var,
  THTensor *save_mean, THTensor *save_std,
  bool train, double momentum, double eps)
 {
  // nInput: number of feature planes; n: number of elements per plane.
  long nInput = THTensor_(size)(input, 1);
  long f,n = THTensor_(nElement)(input) / nInput;
  #pragma omp parallel for
  for (f = 0; f < nInput; ++f) {
    // Views of the f-th feature plane; released at the end of the iteration.
    THTensor *in = THTensor_(newSelect)(input, 1, f);
    THTensor *out = THTensor_(newSelect)(output, 1, f);
    real mean, invstd;
    if (train) {
      // compute mean per input
      accreal sum = 0;
      TH_TENSOR_APPLY(real, in, sum += *in_data;);
      mean = (real) sum / n;
      THTensor_(set1d)(save_mean, f, (real) mean);
      // compute variance per input
      sum = 0;
      TH_TENSOR_APPLY(real, in,
        sum += (*in_data - mean) * (*in_data - mean););
      // A constant plane with eps == 0 would divide by zero; use 0 instead.
      if (sum == 0 && eps == 0.0) {
        invstd = 0;
      } else {
        invstd = (real) (1 / sqrt(sum/n + eps));
      }
      // Despite its name, save_std receives the inverse standard deviation.
      THTensor_(set1d)(save_std, f, (real) invstd);
      // update running averages
      THTensor_(set1d)(running_mean, f,
        (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
      // NOTE(review): divides by (n - 1); with n == 1 this is a division by
      // zero -- confirm callers never pass a single element per plane.
      accreal unbiased_var = sum / (n - 1);
      THTensor_(set1d)(running_var, f,
        (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
    } else {
      mean = THTensor_(get1d)(running_mean, f);
      invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
    }
    // compute output
    real w = weight ? THTensor_(get1d)(weight, f) : 1;
    real b = bias ? THTensor_(get1d)(bias, f) : 0;
    TH_TENSOR_APPLY2(real, in, real, out,
      *out_data = (real) (((*in_data - mean) * invstd) * w + b););
    THTensor_(free)(out);
    THTensor_(free)(in);
  }
 }
 // Backward pass of batch normalization, processed per feature plane
 // (dimension 1 of `input`). Depending on which outputs are non-NULL it
 //   - writes the gradient w.r.t. the input into `gradInput`,
 //   - accumulates scale * dot(in - mean, gradOut) * invstd into `gradWeight`,
 //   - accumulates scale * sum(gradOut) into `gradBias`.
 // In training mode the statistics saved by the forward pass are used
 // (save_std holds the inverse standard deviation); otherwise they are
 // derived from running_mean/running_var and eps. `weight` may be NULL (1).
 void THNN_(BatchNormalization_backward)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
  THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
  THTensor *running_mean, THTensor *running_var,
  THTensor *save_mean, THTensor *save_std,
  bool train, double scale, double eps)
 {
  // nInput: number of feature planes; n: number of elements per plane.
  long nInput = THTensor_(size)(input, 1);
  long f,n = THTensor_(nElement)(input) / nInput;
  #pragma omp parallel for
  for (f = 0; f < nInput; ++f) {
    // Views of the f-th feature plane; released at the end of the iteration.
    THTensor *in = THTensor_(newSelect)(input, 1, f);
    THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
    real w = weight ? THTensor_(get1d)(weight, f) : 1;
    real mean, invstd;
    if (train) {
      mean = THTensor_(get1d)(save_mean, f);
      invstd = THTensor_(get1d)(save_std, f);
    } else {
      mean = THTensor_(get1d)(running_mean, f);
      invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
    }
    // sum over all gradOutput in feature plane
    accreal sum = 0;
    TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
    // dot product of the Q(X) and gradOuput
    accreal dotp = 0;
    TH_TENSOR_APPLY2(real, in, real, gradOut,
      dotp += (*in_data - mean) * (*gradOut_data););
    if (gradInput) {
      THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
      if (train) {
        // when in training mode
        // Q(X) = X - E[x] ; i.e. input centered to zero mean
        // Y = Q(X) / σ    ; i.e. BN output before weight and bias
        // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
        // projection of gradOutput on to output scaled by std
        real k = (real) dotp * invstd * invstd / n;
        // First pass stores the projection term in gradIn; the second pass
        // reads it back and overwrites it with the final gradient.
        TH_TENSOR_APPLY2(real, gradIn, real, in,
          *gradIn_data = (*in_data - mean) * k;);
        accreal gradMean = sum / n;
        TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
          *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
      } else {
        // when in evaluation mode
        // Q(X) = X - running_mean  ; i.e. input centered to zero mean
        // Y = Q(X) / running_std    ; i.e. BN output before weight and bias
        // dL/dX = w / running_std
        TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
          *gradIn_data = *gradOut_data * invstd * w;);
      }
      THTensor_(free)(gradIn);
    }
    // Parameter gradients are accumulated (added to the existing value).
    if (gradWeight) {
      real val = THTensor_(get1d)(gradWeight, f);
      THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
    }
    if (gradBias) {
      real val = THTensor_(get1d)(gradBias, f);
      THTensor_(set1d)(gradBias, f, val + scale * sum);
    }
    THTensor_(free)(gradOut);
    THTensor_(free)(in);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/ClassNLLCriterion.c
+++ b/torch/lib/THNN/generic/ClassNLLCriterion.c
@ -0,0 +1,147 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
 #else
 // Negative log-likelihood loss for a 1D input (single sample) or a 2D input
 // (batch x classes). Writes the loss to output[0] and the sum of the weights
 // of the selected target classes to total_weight[0].
 //   weights     - optional (may be NULL) per-class weights; must have one
 //                 entry per class.
 //   sizeAverage - if true and total_weight is non-zero, the loss is divided
 //                 by total_weight.
 // Raises a TH error for multi-dimensional targets, inputs with more than two
 // dimensions, or a weight tensor of the wrong size.
 // NOTE(review): `output` and `total_weight` are written through their raw
 // data pointers without being resized -- presumably the caller provides
 // one-element tensors; confirm.
 void THNN_(ClassNLLCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THIndexTensor *target,
          THTensor *output,
          bool sizeAverage,
          THTensor *weights,
          THTensor *total_weight)
 {
  // The class dimension is the last dimension of `input`.
  int n_dims = THTensor_(nDimension)(input);
  int n_classes = THTensor_(size)(input, n_dims - 1);
  if (THIndexTensor_(nDimension)(target) > 1) {
    THError("multi-target not supported");
  }
  if (THTensor_(nDimension)(input) > 2) {
    THError("input tensor should be 1D or 2D");
  }
  if (weights && THTensor_(nElement)(weights) != n_classes) {
    THError("weight tensor should be defined either for all or no classes");
  }
  // Contiguous copies/references allow flat indexing; each is freed below.
  input = THTensor_(newContiguous)(input);
  target = THIndexTensor_(newContiguous)(target);
  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
  real *input_data = THTensor_(data)(input);
  THIndex_t *target_data = THIndexTensor_(data)(target);
  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
  real *output_data = THTensor_(data)(output);
  real *total_weight_data = THTensor_(data)(total_weight);
  output_data[0] = total_weight_data[0] = 0.0;
  if (THTensor_(nDimension)(input) == 1) {
    // Targets are stored with offset TH_INDEX_BASE; convert to 0-based.
    int cur_target = target_data[0] - TH_INDEX_BASE;
    THAssert(cur_target >= 0 && cur_target < n_classes);
    total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
    output_data[0] = -input_data[cur_target] * total_weight_data[0];
  } else if (THTensor_(nDimension)(input) == 2) {
    int batch_size = THTensor_(size)(input, 0);
    THAssert(THIndexTensor_(size)(target, 0) == batch_size);
    // Row stride of the contiguous input (number of classes per sample).
    int n_target = THTensor_(size)(input, 1);
    int i;
    for (i = 0; i < batch_size; i++) {
      int cur_target = target_data[i] - TH_INDEX_BASE;
      THAssert(cur_target >= 0 && cur_target < n_classes);
      real cur_weight = weights ? weights_data[cur_target] : 1.0f;
      total_weight_data[0] += cur_weight;
      output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
    }
  }
  if (sizeAverage && total_weight_data[0]) {
    output_data[0] /= total_weight_data[0];
  }
  if (weights) {
    THTensor_(free)(weights);
  }
  THTensor_(free)(input);
  THIndexTensor_(free)(target);
 }
 // Gradient of the negative log-likelihood loss with respect to `input`.
 // Only the entry of each sample's target class is written; all other
 // entries of `gradInput` are left untouched.
 // NOTE(review): `gradInput` is neither resized nor zeroed here -- presumably
 // the caller prepares it; confirm.
 //   total_weight - value produced by updateOutput; nothing is written when
 //                  it is not strictly positive.
 // Raises a TH error if `gradInput` is not contiguous, for multi-dimensional
 // targets, inputs with more than two dimensions, or a weight tensor of the
 // wrong size.
 void THNN_(ClassNLLCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THIndexTensor *target,
          THTensor *gradInput,
          bool sizeAverage,
          THTensor *weights,
          THTensor *total_weight)
 {
  // The class dimension is the last dimension of `input`.
  int n_dims = THTensor_(nDimension)(input);
  int n_classes = THTensor_(size)(input, n_dims - 1);
  if (!THTensor_(isContiguous)(gradInput)) {
    THError("gradInput must be contiguous");
  }
  real *total_weight_data = THTensor_(data)(total_weight);
  // Early exit happens before any allocation, so nothing needs freeing.
  if (!(*total_weight_data > 0)) {
    return;
  }
  if (THIndexTensor_(nDimension)(target) > 1) {
    THError("multi-target not supported");
  }
  if (THTensor_(nDimension)(input) > 2) {
    THError("input tensor should be 1D or 2D");
  }
  if (weights && THTensor_(nElement)(weights) != n_classes) {
    THError("weight tensor should be defined either for all or no classes");
  }
  target = THIndexTensor_(newContiguous)(target);
  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
  THIndex_t *target_data = THIndexTensor_(data)(target);
  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
  real *gradInput_data = THTensor_(data)(gradInput);
  if (THTensor_(nDimension)(input) == 1) {
    // Targets are stored with offset TH_INDEX_BASE; convert to 0-based.
    int cur_target = target_data[0] - TH_INDEX_BASE;
    THAssert(cur_target >= 0 && cur_target < n_classes);
    // With sizeAverage the single-sample gradient is -w / total_weight, and
    // updateOutput sets total_weight == w for 1D input, hence -1.
    gradInput_data[cur_target] =
      (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
  } else if (THTensor_(nDimension)(input) == 2) {
    int batch_size = THTensor_(size)(input, 0);
    THAssert(THIndexTensor_(size)(target, 0) == batch_size);
    // Row stride used to index the contiguous gradInput.
    int n_target = THTensor_(size)(input, 1);
    int i;
    for (i = 0; i < batch_size; i++){
      int cur_target = target_data[i] - TH_INDEX_BASE;
      THAssert(cur_target >= 0 && cur_target < n_classes);
      gradInput_data[i * n_target + cur_target] =
        -(weights ? weights_data[cur_target] : 1.0f);
      if (sizeAverage && *total_weight_data) {
        gradInput_data[i * n_target + cur_target] /= *total_weight_data;
      }
    }
  }
  THIndexTensor_(free)(target);
  if (weights) {
    THTensor_(free)(weights);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/DistKLDivCriterion.c
+++ b/torch/lib/THNN/generic/DistKLDivCriterion.c
@ -0,0 +1,39 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
 #else
 // Computes sum(target * (log(target) - input)) over the elements where
 // target > 0 (other elements contribute 0) and stores it in output[0].
 // When sizeAverage is true the sum is divided by the number of elements of
 // `input`. `state` is unused.
 void THNN_(DistKLDivCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage)
 {
  // Accumulate in accreal (as L1Cost_updateOutput does) so that summing many
  // elements does not lose precision in the narrower `real` type.
  accreal sum = 0;
  TH_TENSOR_APPLY2(real, input, real, target,
    sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
  );
  if (sizeAverage)
    sum /= THTensor_(nElement)(input);
  THTensor_(set1d)(output, 0, (real) sum);
 }
 // Gradient of the KL-divergence criterion with respect to `input`:
 // -norm * target where target > 0 and 0 elsewhere, with
 // norm = 1/nElement(input) when sizeAverage is set and 1 otherwise.
 // `gradInput` is resized to the shape of `input`.
 void THNN_(DistKLDivCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage)
 {
  real norm = 1.;
  if (sizeAverage)
    norm = 1./((real)THTensor_(nElement)(input));
  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    if (*target_data > 0)
      *gradInput_data = norm * (-*target_data);
    else
      *gradInput_data = 0;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/ELU.c
+++ b/torch/lib/THNN/generic/ELU.c
@ -0,0 +1,51 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/ELU.c"
 #else
 // Forward pass of ELU: x for x > 0, (exp(x) - 1) * alpha otherwise.
 // With `inplace` the values are rewritten inside `input` and `output` is
 // made to refer to `input`; otherwise `output` is resized like `input`.
 void THNN_(ELU_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          real alpha,
          bool inplace)
 {
  if(!inplace) {
    THTensor_(resizeAs)(output, input);
    TH_TENSOR_APPLY2(real, input, real, output,
      if(*input_data <= 0) {
        *output_data = (exp(*input_data)-1)*alpha;
      } else {
        *output_data = *input_data;
      }
    );
    return;
  }
  // In-place: only non-positive entries need to change.
  TH_TENSOR_APPLY(real, input,
    if(*input_data <= 0) {
      *input_data = (exp(*input_data) - 1) * alpha;
    }
  );
  THTensor_(set)(output, input);
 }
 // Backward pass of ELU, expressed through the forward `output`:
 // gradOutput * (output + alpha) where output <= 0, gradOutput elsewhere.
 // With `inplace` the result is written into `gradOutput` and `gradInput` is
 // made to refer to it; otherwise `gradInput` is resized like `output`.
 void THNN_(ELU_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *output,
          real alpha,
          bool inplace)
 {
  if(!inplace) {
    THTensor_(resizeAs)(gradInput, output);
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
      if(*output_data <= 0) {
        *gradInput_data = *gradOutput_data * (*output_data + alpha);
      } else {
        *gradInput_data = *gradOutput_data;
      }
    );
    return;
  }
  // In-place: scale only the gradients of non-positive outputs.
  TH_TENSOR_APPLY2(real, gradOutput, real, output,
    if(*output_data <= 0) {
      *gradOutput_data *= *output_data + alpha;
    }
  );
  THTensor_(set)(gradInput, gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/HardShrink.c
+++ b/torch/lib/THNN/generic/HardShrink.c
@ -0,0 +1,39 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/HardShrink.c"
 #else
 // Forward pass of HardShrink: keeps values outside [-lambda, lambda] and
 // replaces everything else with 0. `output` is resized like `input`.
 void THNN_(HardShrink_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          real lambda)
 {
  THTensor_(resizeAs)(output, input);
  TH_TENSOR_APPLY2(real, output, real, input,
    if (*input_data > lambda || *input_data < -lambda)
      *output_data = *input_data;
    else
      *output_data = 0;
  );
 }
 // Backward pass of HardShrink: the gradient passes through where the input
 // lies outside [-lambda, lambda] and is 0 elsewhere. `gradInput` is resized
 // like `input`.
 void THNN_(HardShrink_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          real lambda)
 {
  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
    *gradInput_data = (*input_data > lambda || *input_data < -lambda) ? *gradOutput_data : 0;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/HardTanh.c
+++ b/torch/lib/THNN/generic/HardTanh.c
@ -0,0 +1,127 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/HardTanh.c"
 #else
 // Forward pass of HardTanh: clamps every element to [min_val, max_val].
 // With `inplace` the clamping is done inside `input` and `output` is made to
 // refer to `input`; otherwise `output` is resized like `input`.
 // 1D or non-contiguous tensors use the generic apply macros; contiguous
 // tensors use a flat loop that can be parallelized with OpenMP.
 void THNN_(HardTanh_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          real min_val,
          real max_val,
          bool inplace)
 {
  if (inplace)
    THTensor_(set)(output, input);
  else
    THTensor_(resizeAs)(output, input);
  if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
  {
    if (inplace)
    {
      TH_TENSOR_APPLY(real, input,
        if (*input_data < min_val)
          *input_data = min_val;
        else if (*input_data > max_val)
          *input_data = max_val;
      );
    }
    else
    {
      // This pass used to run unconditionally (missing `else`), repeating
      // the work after the in-place clamp above.
      TH_TENSOR_APPLY2(real, output, real, input,
        if (*input_data < min_val)
          *output_data = min_val;
        else if (*input_data <= max_val)
          *output_data = *input_data;
        else
          *output_data = max_val;
      );
    }
  }
  else
  {
    real* ptr_input  = THTensor_(data)(input);
    real* ptr_output = THTensor_(data)(output);
    long i;
    long n = THTensor_(nElement)(input);
    if (inplace)
    {
 #pragma omp parallel for private(i)
      for (i = 0; i < n; i++)
      {
        if (ptr_input[i] < min_val)
          ptr_input[i] = min_val;
        else if (ptr_input[i] > max_val)
          ptr_input[i] = max_val;
      }
    }
    else
    {
 #pragma omp parallel for private(i)
      for (i = 0; i < n; i++)
      {
        if (ptr_input[i] < min_val)
          ptr_output[i] = min_val;
        else if (ptr_input[i] <= max_val)
          ptr_output[i] = ptr_input[i];
        else
          ptr_output[i] = max_val;
      }
    }
  }
 }
 // Backward pass of HardTanh: the gradient passes through where
 // min_val <= input <= max_val and is 0 elsewhere.
 // With `inplace` the result is written into `gradOutput` and `gradInput` is
 // made to refer to it; otherwise `gradInput` is resized like `input`.
 // 1D or non-contiguous tensors use the generic apply macros; contiguous
 // tensors use a flat loop that can be parallelized with OpenMP.
 void THNN_(HardTanh_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          real min_val,
          real max_val,
          bool inplace)
 {
  if (inplace)
    THTensor_(set)(gradInput, gradOutput);
  else
    THTensor_(resizeAs)(gradInput, input);
  if (input->nDimension == 1 ||
    !THTensor_(isContiguous)(input) ||
    !THTensor_(isContiguous)(gradOutput) ||
    !THTensor_(isContiguous)(gradInput))
  {
    if (inplace)
    {
      TH_TENSOR_APPLY2(real, gradOutput, real, input,
        if (*input_data < min_val || *input_data > max_val)
          *gradOutput_data = 0;
      );
    }
    else
    {
      TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
        if (*input_data < min_val || *input_data > max_val)
          *gradInput_data = 0;
        else
          *gradInput_data = *gradOutput_data;
      );
    }
  }
  else
  {
    real* ptr_gradOutput = THTensor_(data)(gradOutput);
    real* ptr_gradInput  = THTensor_(data)(gradInput);
    real* ptr_input      = THTensor_(data)(input);
    long i;
    long n = THTensor_(nElement)(input);
    if (inplace)
    {
      // ptr_gradInput aliases gradOutput here (see THTensor_(set) above).
      // Strict comparisons, matching the three other code paths: this path
      // used <= / >=, so the gradient at exactly min_val/max_val depended on
      // memory layout and on the inplace flag.
 #pragma omp parallel for private(i)
      for (i = 0; i < n; i++)
      {
        if (ptr_input[i] < min_val || ptr_input[i] > max_val)
          ptr_gradInput[i] = 0;
      }
    }
    else
    {
 #pragma omp parallel for private(i)
      for (i = 0; i < n; i++)
      {
        if (ptr_input[i] < min_val || ptr_input[i] > max_val)
          ptr_gradInput[i] = 0;
        else
          ptr_gradInput[i] = ptr_gradOutput[i];
      }
    }
  }
 }
 #endif
--- a/torch/lib/THNN/generic/L1Cost.c
+++ b/torch/lib/THNN/generic/L1Cost.c
@ -0,0 +1,36 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/L1Cost.c"
 #else
 // Computes the L1 norm of `input` (sum of absolute values) and stores it in
 // output[0]. `state` is unused.
 void THNN_(L1Cost_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output)
 {
  accreal total = 0;
  TH_TENSOR_APPLY(real, input,
    total += fabs(*input_data);
  );
  THTensor_(set1d)(output, 0, total);
 }
 // Gradient of the L1 norm: sign(input), i.e. 1 for positive, -1 for negative
 // and 0 for every other element. `gradOutput` is not read. `gradInput` is
 // resized like `input`.
 void THNN_(L1Cost_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput)
 {
  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY2(real, gradInput, real, input,
    *gradInput_data = (*input_data > 0) ? 1 : ((*input_data < 0) ? -1 : 0);
  );
 }
 #endif
--- a/torch/lib/THNN/generic/LeakyReLU.c
+++ b/torch/lib/THNN/generic/LeakyReLU.c
@ -0,0 +1,54 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/LeakyReLU.c"
 #else
 // Forward pass of LeakyReLU: x for x > 0, x * negval otherwise.
 // With `inplace` the values are rewritten inside `input` and `output` is
 // made to refer to `input`; otherwise `output` is resized like `input`.
 void THNN_(LeakyReLU_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          real negval,
          bool inplace)
 {
  if (!inplace)
  {
    THTensor_(resizeAs)(output, input);
    TH_TENSOR_APPLY2(real, output, real, input,
      if (*input_data > 0)
        *output_data = *input_data;
      else
        *output_data = *input_data * negval;
    );
    return;
  }
  // In-place: only non-positive entries are scaled.
  TH_TENSOR_APPLY(real, input,
    if (*input_data <= 0)
      *input_data *= negval;
  );
  THTensor_(set)(output, input);
 }
 // Backward pass of LeakyReLU: gradOutput where input > 0, gradOutput * negval
 // otherwise. With `inplace` the result is written into `gradOutput` and
 // `gradInput` is made to refer to it; otherwise `gradInput` is resized like
 // `input`.
 void THNN_(LeakyReLU_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          real negval,
          bool inplace)
 {
  if (!inplace)
  {
    THTensor_(resizeAs)(gradInput, input);
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
      if (*input_data > 0)
        *gradInput_data = *gradOutput_data;
      else
        *gradInput_data = *gradOutput_data * negval;
    );
    return;
  }
  // In-place: only gradients of non-positive inputs are scaled.
  TH_TENSOR_APPLY2(real, gradOutput, real, input,
    if (*input_data <= 0)
      *gradOutput_data *= negval;
  );
  THTensor_(set)(gradInput, gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/LogSigmoid.c
+++ b/torch/lib/THNN/generic/LogSigmoid.c
@ -0,0 +1,35 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/LogSigmoid.c"
 #else
 // Forward pass of LogSigmoid: output = -log(1 + exp(-input)).
 // exp(-input) is also stored in `buffer` for reuse by the backward pass.
 // Both `output` and `buffer` are resized like `input`.
 void THNN_(LogSigmoid_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *buffer)
 {
  THTensor_(resizeAs)(output, input);
  THTensor_(resizeAs)(buffer, input);
  TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
    real expNeg = exp(-*input_data);
    *output_data = -log(1. + expNeg);
    *buffer_data = expNeg;
  );
 }
 // Backward pass of LogSigmoid using the value z stored in `buffer` by the
 // forward pass: gradInput = gradOutput * z / (1 + z).
 // `gradInput` is resized like `buffer`; `input` is not read.
 void THNN_(LogSigmoid_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *buffer)
 {
  THTensor_(resizeAs)(gradInput, buffer);
  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
    *gradInput_data = *gradOutput_data * *buffer_data / (1. + *buffer_data);
  );
 }
 #endif
--- a/torch/lib/THNN/generic/LogSoftMax.c
+++ b/torch/lib/THNN/generic/LogSoftMax.c
@ -0,0 +1,110 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/LogSoftMax.c"
 #else
 /*
  * LogSoftMax forward pass over a vector (one frame) or a matrix (one frame
  * per row).
  *
  * For each frame the maximum is subtracted before exponentiating, and
  * output[d] = input[d] - (max + log(sum_d exp(input[d] - max))).
  * Any other dimensionality is rejected through THArgCheck.
  */
 void THNN_(LogSoftMax_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output)
 {
  long nframe = 0, dim = 0;
  long t;
  real *in_base, *out_base;

  switch (input->nDimension)
  {
    case 1:
      nframe = 1;
      dim = input->size[0];
      break;
    case 2:
      nframe = input->size[0];
      dim = input->size[1];
      break;
    default:
      THArgCheck(0, 2, "vector or matrix expected");
  }

  // Work on a contiguous view of the input; released at the end.
  input = THTensor_(newContiguous)(input);
  THTensor_(resizeAs)(output, input);
  in_base = THTensor_(data)(input);
  out_base = THTensor_(data)(output);

  // Per-frame state is declared inside the loop body so each iteration
  // owns its own copies.
  #pragma omp parallel for private(t)
  for (t = 0; t < nframe; t++)
  {
    real *in_row = in_base + dim*t;
    real *out_row = out_base + dim*t;
    real maxInput = -THInf;
    accreal logsum = 0;
    long d;

    for (d = 0; d < dim; d++)
      maxInput = THMax(maxInput, in_row[d]);

    for (d = 0; d < dim; d++)
      logsum += exp(in_row[d] - maxInput);
    logsum = maxInput + log(logsum);

    for (d = 0; d < dim; d++)
      out_row[d] = in_row[d] - logsum;
  }

  THTensor_(free)(input);
 }
 /*
  * LogSoftMax backward pass.
  *
  * output is the result of the forward pass (a vector, or a matrix with one
  * frame per row). For every frame:
  *   gradInput[d] = gradOutput[d] - exp(output[d]) * sum_d gradOutput[d]
  *
  * The shape of output is validated before the contiguous copy of
  * gradOutput is created, so the error path does not leave that copy
  * unreleased.
  */
 void THNN_(LogSoftMax_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *output)
 {
  real *gradInput_data, *gradOutput_data, *output_data;
  long nframe = 0, dim = 0;
  long t, d;
  if (output->nDimension == 1)
  {
    nframe = 1;
    dim = output->size[0];
  }
  else if (output->nDimension == 2)
  {
    nframe = output->size[0];
    dim = output->size[1];
  }
  else
  {
    THError("vector or matrix expected");
  }
  // Acquire the temporary only once validation has passed; it is released
  // at the end of the function.
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THTensor_(resizeAs)(gradInput, output);
  real *gradInput_data0 = THTensor_(data)(gradInput);
  real *output_data0 = THTensor_(data)(output);
  real *gradOutput_data0 = THTensor_(data)(gradOutput);
  accreal sum;
  #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
  for (t = 0; t < nframe; t++)
  {
    sum = 0;
    gradInput_data = gradInput_data0 + dim*t;
    output_data = output_data0 + dim*t;
    gradOutput_data = gradOutput_data0 + dim*t;
    // Sum of the incoming gradient over the frame.
    for (d = 0; d < dim; d++)
      sum += gradOutput_data[d];
    for (d = 0; d < dim; d++)
      gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum;
  }
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/LookupTable.c
+++ b/torch/lib/THNN/generic/LookupTable.c
@ -0,0 +1,213 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/LookupTable.c"
 #else
 /*
  * Recompute, for every index that occurs in input, how many times it
  * occurs. Only the count slots referenced by input are touched: they are
  * first zeroed, then incremented once per occurrence.
  *
  * The loop index is a long so it has the same width as the element count
  * it is compared against.
  */
 static void THNN_(LookupTable_resetCount)(
          THInteger_t *count_data,
          THIndexTensor *input)
 {
  long i;
  THIndex_t *input_data = THIndexTensor_(data)(input);
  long numel = THIndexTensor_(nElement)(input);
  // First pass: clear the slots that will be counted.
  for (i = 0; i<numel; i++)
  {
    long k = input_data[i] - TH_INDEX_BASE;
    count_data[k] = 0;
  }
  // Second pass: one increment per occurrence.
  for (i = 0; i<numel; i++)
  {
    long k = input_data[i] - TH_INDEX_BASE;
    count_data[k]++;
  }
 }
 /*
  * Accumulate the gradient of a lookup table into gradWeight.
  *
  * For every element i of input that differs from paddingValue, row i of
  * gradOutput (scaled) is added to row (input[i] - TH_INDEX_BASE) of
  * gradWeight through an axpy call. When scaleGradByFreq is set, the scale
  * applied to a row is divided by the number of times its index occurs in
  * input (counts are stored in `count`).
  *
  * gradWeight and input must be contiguous, input must be 1D or 2D, and
  * every index must lie in [TH_INDEX_BASE, numw + TH_INDEX_BASE); violations
  * raise THError. `sorted` and `indices` are not used by this function.
  */
 void THNN_(LookupTable_accGradParameters)(
          THNNState *state,
          THIndexTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THIntegerTensor *count,
          THTensor *sorted,
          THTensor *indices,
          bool scaleGradByFreq,
          int paddingValue,
          real scale)
 {
  long i;
  // Stays NULL unless frequency scaling was requested.
  THInteger_t *count_data = NULL;
  if (scaleGradByFreq)
  {
    THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
    count_data = THIntegerTensor_(data)(count);
  }
  if (!THTensor_(isContiguous)(gradWeight))
    THError("gradWeight must be contiguous");
  if (!THIndexTensor_(isContiguous)(input))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)
    THError("input must be a vector or matrix");
  THIndex_t *input_data = THIndexTensor_(data)(input);
  long numel = THIndexTensor_(nElement)(input);
  long numw = THTensor_(size)(gradWeight, 0);
  // check that inputs are all within range
  for (i=0; i<numel; i++)
    if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE)
      THError("input out of range");
  // Contiguous view of gradOutput; freed on both exit paths below.
  gradOutput = THTensor_(newContiguous)(gradOutput);
  real *gw = THTensor_(data)(gradWeight);
  real *go = THTensor_(data)(gradOutput);
  // Row length of gradWeight; also used as the row offset into gradOutput.
  long stride = THTensor_(stride)(gradWeight, 0);
  if (count_data)
    THNN_(LookupTable_resetCount)(count_data, input);
 #ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over sections of the vocabulary, so that
    // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
    // has to traverse the entire input, but the dominating factor is the axpy
    // BLAS call.
    #pragma omp parallel private(i)
    {
      int tid = omp_get_thread_num();
      int nthreads = omp_get_num_threads();
      // Half-open range [start, end) of gradWeight rows owned by this thread.
      long start = tid * (numw/nthreads + 1);
      long end = start + (numw/nthreads + 1);
      for (i=0; i<numel; i++)
      {
        if (input_data[i] != paddingValue)
        {
            long k = input_data[i] - TH_INDEX_BASE;
            if (k >= start && k < end)
            {
                real scale_ = scale;
                if (count_data) scale_ /= count_data[k];
                THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
            }
        }
      }
    }
    THTensor_(free)(gradOutput);
    return;
  }
 #endif
  // Serial path: used without OpenMP, or when numel <= 1000.
  for (i=0; i<numel; i++)
  {
    if (input_data[i] != paddingValue)
    {
        long k = input_data[i] - TH_INDEX_BASE;
        real scale_ = scale;
        if (count_data) scale_ /= count_data[k];
        THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
     }
  }
  THTensor_(free)(gradOutput);
 }
 /*
  * Rescale one row in place so that its normType-norm does not exceed
  * maxNorm.
  *
  * row_data points at `stride` consecutive elements. The norm is
  * (sum_j |row[j]|^normType)^(1/normType), with dedicated accumulation
  * for normType 1 and 2. Rows whose norm is not greater than maxNorm are
  * left untouched; otherwise every element is multiplied by
  * maxNorm / (norm + 1e-7).
  */
 static void THNN_(LookupTable_renormRow)(
          real *row_data,
          long stride,
          real maxNorm,
          real normType)
 {
  real norm = 0;
  long j;

  // Accumulate sum |x|^p; the branch on p is taken once rather than per element.
  if (normType == 1)
  {
    for (j = 0; j < stride; j++)
      norm += fabs(row_data[j]);
  }
  else if (normType == 2)
  {
    for (j = 0; j < stride; j++)
      norm += row_data[j] * row_data[j];
  }
  else
  {
    for (j = 0; j < stride; j++)
      norm += pow(fabs(row_data[j]), normType);
  }
  norm = pow(norm, 1.0 / normType);

  if (norm > maxNorm)
  {
    // The small epsilon keeps the rescaled norm just under maxNorm.
    real shrink = maxNorm / (norm + 1e-7);
    for (j = 0; j < stride; j++)
      row_data[j] *= shrink;
  }
 }
 /*
  * qsort comparator ordering THIndex_t values ascending.
  *
  * Returns a negative value, zero, or a positive value when *a is less
  * than, equal to, or greater than *b. Equal keys must compare as 0:
  * a comparator that reports "greater" in both directions for equal
  * elements is inconsistent, which qsort does not permit.
  */
 static int THNN_(compare_THIndex)(const void* a, const void* b)
 {
   const THIndex_t lhs = *(const THIndex_t*)a;
   const THIndex_t rhs = *(const THIndex_t*)b;
   return (lhs > rhs) - (lhs < rhs);
 }
 /*
  * Renormalize the rows of weight that are referenced by idx so that the
  * normType-norm of each does not exceed maxNorm.
  *
  * weight and idx must be contiguous, idx must be 1D, normType must be
  * positive and every index must lie in [TH_INDEX_BASE, numw + TH_INDEX_BASE);
  * violations raise THError.
  *
  * Note: the data of idx is sorted and de-duplicated in place, so the
  * caller's index tensor contents are modified by this call.
  */
 void THNN_(LookupTable_renorm)(
          THNNState *state,
          THIndexTensor *idx,
          THTensor *weight,
          real maxNorm,
          real normType)
 {
  if (!THTensor_(isContiguous)(weight))
    THError("weight must be contiguous");
  if (!THIndexTensor_(isContiguous)(idx))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(idx) != 1)
    THError("idx must be a vector");
  if (normType <= 0)
    THError("non-positive-norm not supported");
  long i;
  THIndex_t *row_idx = THIndexTensor_(data)(idx);
  long numel = THIndexTensor_(nElement)(idx);
  long numw = THTensor_(size)(weight, 0);
  long stride = THTensor_(stride)(weight, 0);
  real *gw = THTensor_(data)(weight);
  for (i=0; i<numel; i++)
    if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE)
      THError("input out of range");
  // get unique indices
  qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
  // Compact the sorted array: keep the first element of each run of equal
  // values; ptr ends up as the number of distinct indices.
  long ptr = 0;
  for (i=0; i<numel; i++)
    if (i == 0 || row_idx[i] != row_idx[i-1])
      row_idx[ptr++] = row_idx[i];
  numel = ptr;
 #ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over the rows that appear in
    // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
    // This distributes the work evenly to each thread.
    #pragma omp parallel for private(i)
    for (i=0; i<numel; i++)
    {
      long k = row_idx[i] - TH_INDEX_BASE;
      THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
    }
    return;
  }
 #endif
  // Serial path: used without OpenMP, or when there are at most 1000 distinct rows.
  for (i=0; i<numel; i++)
  {
    long k = row_idx[i] - TH_INDEX_BASE;
    THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/MSECriterion.c
+++ b/torch/lib/THNN/generic/MSECriterion.c
@ -0,0 +1,40 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/MSECriterion.c"
 #else
 /*
  * Mean-squared-error loss.
  *
  * Writes sum((input - target)^2) into output[0]; when sizeAverage is set
  * the sum is divided by the number of elements of input.
  */
 void THNN_(MSECriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage)
 {
  real total = 0;

  TH_TENSOR_APPLY2(real, input, real, target,
    real diff = (*input_data - *target_data);
    total += diff*diff;
  );

  if (sizeAverage)
  {
    total /= THTensor_(nElement)(input);
  }

  THTensor_(set1d)(output, 0, total);
 }
 /*
  * Gradient of the MSE loss with respect to input:
  *   gradInput = factor * (input - target)
  * where factor is 2, or 2 / nElement(input) when sizeAverage is set.
  */
 void THNN_(MSECriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage)
 {
  real factor = 2.;
  if (sizeAverage)
    factor = 2./((real)THTensor_(nElement)(input));

  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    *gradInput_data = factor * (*input_data - *target_data);
  );
 }
 #endif
--- a/torch/lib/THNN/generic/MarginCriterion.c
+++ b/torch/lib/THNN/generic/MarginCriterion.c
@ -0,0 +1,42 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/MarginCriterion.c"
 #else
 /*
  * Hinge (margin) loss.
  *
  * Accumulates max(0, margin - input * target) over all elements and
  * stores the result in output[0]; when sizeAverage is set the sum is
  * divided by the number of elements of input.
  */
 void THNN_(MarginCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage,
          real margin)
 {
  real total = 0;

  TH_TENSOR_APPLY2(real, input, real, target,
    real slack = (margin - *input_data * *target_data);
    if (slack > 0)
      total += slack;
  );

  if (sizeAverage)
  {
    total /= THTensor_(nElement)(input);
  }

  THTensor_(set1d)(output, 0, total);
 }
 /*
  * Gradient of the hinge (margin) loss with respect to input.
  *
  * Where input * target < margin the gradient is -factor * target,
  * elsewhere it is 0. factor is 1, or 1 / nElement(input) when
  * sizeAverage is set.
  */
 void THNN_(MarginCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage,
          real margin)
 {
  real factor = 1.;
  if (sizeAverage)
    factor = 1./((real)THTensor_(nElement)(input));

  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    if ((*input_data * *target_data) < margin)
      *gradInput_data = -factor * *target_data;
    else
      *gradInput_data = 0;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/MultiLabelMarginCriterion.c
+++ b/torch/lib/THNN/generic/MultiLabelMarginCriterion.c
@ -0,0 +1,174 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
 #else
 /*
  * Multi-label margin loss.
  *
  * input is a vector (one frame) or a matrix (one frame per row); target
  * must have the same shape. Each target row lists class indices; the list
  * ends at the first entry whose value minus TH_INDEX_BASE is negative.
  *
  * For every listed target class and every class not marked in isTarget,
  * max(0, 1 - input[target] + input[other]) is accumulated. The total is
  * divided by dim, and additionally by nframe when sizeAverage is set, and
  * stored in output[0].
  *
  * isTarget is resized like target, zeroed, and filled with 1 at the
  * positions of the listed target classes of each frame.
  */
 void THNN_(MultiLabelMarginCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          THTensor *isTarget,
          bool sizeAverage)
 {
  real *input_data, *target_data, *isTarget_data;
  long nframe, dim;
  long t, d, dt, ddt;
  real sum;
  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
  if (input->nDimension == 1)
  {
    nframe = 1;
    dim = input->size[0];
    THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
  }
  else
  {
    nframe = input->size[0];
    dim = input->size[1];
    THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
  }
  THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
  THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
  // Contiguous views; both are released before returning.
  target = THTensor_(newContiguous)(target);
  input = THTensor_(newContiguous)(input);
  input_data = THTensor_(data)(input);
  target_data = THTensor_(data)(target);
  THTensor_(resizeAs)(isTarget, target);
  THTensor_(zero)(isTarget);
  isTarget_data = THTensor_(data)(isTarget);
  sum = 0;
  for (t = 0; t < nframe; t++)
  {
    // Mark the classes listed as targets for this frame.
    for (ddt = 0; ddt < dim; ddt++)
    {
      long target_idx = (long)target_data[ddt] - TH_INDEX_BASE;
      if (target_idx < 0)
        break;
      isTarget_data[target_idx] = 1;
    }
    // Accumulate the hinge term for each (target, non-target) pair.
    for (dt = 0; dt < dim; dt++)
    {
      long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
      real input_target;
      if (target_idx < 0)
        break;
      input_target = input_data[target_idx];
      for (d = 0; d < dim; d++)
      {
        if (!isTarget_data[d])
        {
          real z = 1 - input_target + input_data[d];
          if (z > 0)
            sum += z;
        }
      }
    }
    // Advance all three row pointers to the next frame.
    input_data += dim;
    target_data += dim;
    isTarget_data += dim;
  }
  sum /= dim;
  if (sizeAverage)
    sum /= nframe;
  THTensor_(set1d)(output, 0, sum);
  THTensor_(free)(input);
  THTensor_(free)(target);
 }
 /*
  * Gradient of the multi-label margin loss with respect to input.
  *
  * input, target and isTarget must share the same vector or matrix shape;
  * isTarget must only contain values in [0, 1] (it is read, not rebuilt,
  * here). For every listed target class and every class not marked in
  * isTarget whose hinge term 1 - input[target] + input[other] is positive,
  * g is subtracted from gradInput[target] and added to gradInput[other].
  * g is 1/dim, or 1/(nframe*dim) when sizeAverage is set.
  */
 void THNN_(MultiLabelMarginCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          THTensor *isTarget,
          bool sizeAverage)
 {
  real *input_data;
  real *gradInput_data;
  real *target_data;
  real *isTarget_data;
  long nframe, dim;
  long t, d, dt;
  real g;
  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
  if (input->nDimension == 1)
  {
    nframe = 1;
    dim = input->size[0];
    THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
    THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, "inconsistent isTarget size");
  }
  else
  {
    nframe = input->size[0];
    dim = input->size[1];
    THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
    THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
  }
  THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
  THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
  THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
  THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
  // Contiguous views; all three are released before returning.
  target = THTensor_(newContiguous)(target);
  input = THTensor_(newContiguous)(input);
  isTarget = THTensor_(newContiguous)(isTarget);
  input_data = THTensor_(data)(input);
  target_data = THTensor_(data)(target);
  isTarget_data = THTensor_(data)(isTarget);
  g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  gradInput_data = THTensor_(data)(gradInput);
  for (t = 0; t < nframe; t++)
  {
    for (dt = 0; dt < dim; dt++)
    {
      long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
      real input_target;
      // A negative index marks the end of this frame's target list.
      if (target_idx < 0)
        break;
      input_target = input_data[target_idx];
      for (d = 0; d < dim; d++)
      {
        if (!isTarget_data[d])
        {
          real z = 1 - input_target + input_data[d];
          if (z > 0)
          {
            gradInput_data[target_idx] -= g;
            gradInput_data[d] += g;
          }
        }
      }
    }
    // Advance all row pointers to the next frame.
    input_data += dim;
    target_data += dim;
    isTarget_data += dim;
    gradInput_data += dim;
  }
  THTensor_(free)(input);
  THTensor_(free)(target);
  THTensor_(free)(isTarget);
 }
 #endif
--- a/torch/lib/THNN/generic/MultiMarginCriterion.c
+++ b/torch/lib/THNN/generic/MultiMarginCriterion.c
@ -0,0 +1,159 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
 #else
 /*
  * Multi-class margin loss.
  *
  * input is a vector (one frame) or a matrix (one frame per row); target
  * holds one class index per frame, each in
  * [TH_INDEX_BASE, dim + TH_INDEX_BASE). For every frame and every class d
  * other than the target, z = margin - input[target] + input[d] is formed;
  * when z > 0 the term z (p == 1) or z*z (any other p) is added, multiplied
  * by weights[target] when weights is non-NULL.
  *
  * The total is divided by dim, and additionally by nframe when
  * sizeAverage is set, and stored in output[0].
  */
 void THNN_(MultiMarginCriterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage,
          int p,
          THTensor *weights,
          real margin)
 {
  real *input_data, *target_data, *weights_data;
  long nframe, dim;
  long t, d;
  real sum;
  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
  if (input->nDimension == 1)
  {
    nframe = 1;
    dim = input->size[0];
  }
  else
  {
    nframe = input->size[0];
    dim = input->size[1];
    THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
  }
  // Reject out-of-range class indices before any data is dereferenced.
  for (t = 0; t < nframe; t++)
  {
    real idx = THTensor_(get1d)(target, t);
    THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range");
  }
  // Contiguous views; released before returning.
  input = THTensor_(newContiguous)(input);
  target = THTensor_(newContiguous)(target);
  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
  input_data = THTensor_(data)(input);
  target_data = THTensor_(data)(target);
  weights_data = weights ? THTensor_(data)(weights) : NULL;
  sum = 0;
  for (t = 0; t < nframe; t++)
  {
    long target_idx = (long)(target_data[t] - TH_INDEX_BASE);
    real input_target = input_data[target_idx];
    for (d = 0; d < dim; d++)
    {
      real z = margin - input_target + input_data[d];
      // The target class does not contribute to its own loss.
      if (d == target_idx)
        continue;
      if (z > 0) {
        real h = (p==1) ? z : z*z;
        if(weights_data)
          h *= weights_data[target_idx];
        sum += h;
      }
    }
    // Advance to the next frame of input.
    input_data += dim;
  }
  sum /= dim;
  if(sizeAverage)
    sum /= nframe;
  THTensor_(set1d)(output, 0, sum);
  THTensor_(free)(input);
  THTensor_(free)(target);
  if(weights)
    THTensor_(free)(weights);
 }
 /*
  * Gradient of the multi-class margin loss with respect to input.
  *
  * input is a vector (one frame) or a matrix (one frame per row); target
  * holds one class index per frame. For every class d other than the
  * target with z = margin - input[target] + input[d] > 0, gradInput[d]
  * receives h and gradInput[target] accumulates -h, where h is g (p == 1)
  * or 2*g*z (any other p), multiplied by weights[target] when weights is
  * non-NULL. g is 1/dim, or 1/(nframe*dim) when sizeAverage is set.
  *
  * Target indices are validated against
  * [TH_INDEX_BASE, dim + TH_INDEX_BASE) before they are used to index
  * input and gradInput, mirroring the check in
  * MultiMarginCriterion_updateOutput.
  */
 void THNN_(MultiMarginCriterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage,
          int p,
          THTensor *weights,
          real margin)
 {
  real *input_data;
  real *gradInput_data;
  real *target_data;
  real *weights_data;
  long nframe, dim;
  long t, d;
  real g;
  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
  if (input->nDimension == 1)
  {
    nframe = 1;
    dim = input->size[0];
  }
  else
  {
    nframe = input->size[0];
    dim = input->size[1];
    THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
  }
  // Reject out-of-range class indices before any temporary is allocated
  // and before they are used as array offsets.
  for (t = 0; t < nframe; t++)
  {
    real idx = THTensor_(get1d)(target, t);
    THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range");
  }
  g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
  // Contiguous views; released before returning.
  input = THTensor_(newContiguous)(input);
  target = THTensor_(newContiguous)(target);
  input_data = THTensor_(data)(input);
  THTensor_(resizeAs)(gradInput, input);
  gradInput_data = THTensor_(data)(gradInput);
  target_data = THTensor_(data)(target);
  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
  weights_data = weights ? THTensor_(data)(weights) : NULL;
  for (t = 0; t < nframe; t++)
  {
    long target_idx = (long)(target_data[t]) - TH_INDEX_BASE;
    real input_target = input_data[target_idx];
    real gradInput_target = 0;
    for (d = 0; d < dim; d++)
    {
      real z = margin - input_target + input_data[d];
      // The target slot is written once, after the loop.
      if (d == target_idx)
        continue;
      if (z > 0)
      {
        real h = (p == 1) ? g : 2*g*z;
        if(weights_data)
          h *= weights_data[target_idx];
        gradInput_target -= h;
        gradInput_data[d] = h;
      }
      else
        gradInput_data[d] = 0;
    }
    gradInput_data[target_idx] = gradInput_target;
    // Advance to the next frame.
    input_data += dim;
    gradInput_data += dim;
  }
  THTensor_(free)(input);
  THTensor_(free)(target);
  if(weights)
    THTensor_(free)(weights);
 }
 #endif
--- a/torch/lib/THNN/generic/PReLU.c
+++ b/torch/lib/THNN/generic/PReLU.c
@ -0,0 +1,228 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/PReLU.c"
 #else
 /*
  * PReLU forward pass: output = input where input > 0, weight * input
  * elsewhere.
  *
  * nOutputPlane == 0 selects the shared-parameter case (a single weight for
  * all elements). Otherwise weight holds one slope per plane and input is
  * interpreted by rank:
  *   1D: (plane)                2D: (batch, plane)
  *   3D: (plane, h, w)          4D: (batch, plane, h, w)
  * The plane dimension must equal nOutputPlane. Ranks outside 1..4 are
  * rejected with THError instead of proceeding with undefined batch and
  * plane sizes. The per-plane path reads input through its raw data
  * pointer.
  */
 void THNN_(PReLU_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THIndex_t nOutputPlane)
 {
  THTensor_(resizeAs)(output, input);
  if (nOutputPlane == 0)
  {
    // handle shared parameter case
    real w = *THTensor_(data)(weight);
    TH_TENSOR_APPLY2(real, output, real, input,
      *output_data = (*input_data > 0) ? *input_data : w*(*input_data);
    );
  }
  else
  {
    // bs: number of batch entries; ks: number of elements per plane.
    long bs = 0, ks = 0;
    {
      long input_ndim = THTensor_(nDimension)(input);
      switch (input_ndim)
      {
        case 1:
          bs = 1;
          ks = 1;
          break;
        case 2:
          bs = input->size[0];
          ks = 1;
          break;
        case 3:
          bs = 1;
          ks = input->size[1] * input->size[2];
          break;
        case 4:
          bs = input->size[0];
          ks = input->size[2] * input->size[3];
          break;
        default:
          THError("input must be a 1D, 2D, 3D or 4D tensor");
      }
      // (ndim + 1) % 2 is 0 for odd ranks and 1 for even ranks, i.e. the
      // position of the plane dimension.
      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
        THError("wrong number of input planes");
    }
    real *output_data = THTensor_(data)(output);
    real *input_data = THTensor_(data)(input);
    real *weight_data = THTensor_(data)(weight);
    THIndex_t i, j, k;
 #pragma omp parallel for private(j,k)
    for (i = 0; i < bs; ++i)
    {
      real* n_input_data = input_data + i*nOutputPlane*ks;
      real* n_output_data = output_data + i*nOutputPlane*ks;
      for (j = 0; j < nOutputPlane; ++j)
      {
        for (k = 0; k < ks; ++k)
          n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
        n_input_data += ks;
        n_output_data += ks;
      }
    }
  }
 }
 /*
  * PReLU backward pass with respect to input: gradInput = gradOutput where
  * input > 0, weight * gradOutput elsewhere.
  *
  * nOutputPlane == 0 selects the shared-parameter case. Otherwise weight
  * holds one slope per plane and input is interpreted by rank:
  *   1D: (plane)                2D: (batch, plane)
  *   3D: (plane, h, w)          4D: (batch, plane, h, w)
  * The plane dimension must equal nOutputPlane. Ranks outside 1..4 are
  * rejected with THError instead of proceeding with undefined batch and
  * plane sizes. The per-plane path reads input and gradOutput through
  * their raw data pointers.
  */
 void THNN_(PReLU_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *weight,
          THIndex_t nOutputPlane)
 {
  THTensor_(resizeAs)(gradInput, input);
  if (nOutputPlane == 0)
  {
    // Shared parameter: one slope for every element.
    real w = THTensor_(data)(weight)[0];
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
       if ((*input_data) > 0)
         *gradInput_data = *gradOutput_data;
       else
         *gradInput_data = w * (*gradOutput_data);
    );
  }
  else
  {
    const real *input_data = THTensor_(data)(input);
    const real *gradOutput_data = THTensor_(data)(gradOutput);
    const real *weight_data = THTensor_(data)(weight);
    real *gradInput_data = THTensor_(data)(gradInput);
    // bs: number of batch entries; ks: number of elements per plane.
    long bs = 0, ks = 0;
    {
      long input_ndim = THTensor_(nDimension)(input);
      switch (input_ndim)
      {
        case 1:
          bs = 1;
          ks = 1;
          break;
        case 2:
          bs = input->size[0];
          ks = 1;
          break;
        case 3:
          bs = 1;
          ks = input->size[1] * input->size[2];
          break;
        case 4:
          bs = input->size[0];
          ks = input->size[2] * input->size[3];
          break;
        default:
          THError("input must be a 1D, 2D, 3D or 4D tensor");
      }
      // (ndim + 1) % 2 is 0 for odd ranks and 1 for even ranks, i.e. the
      // position of the plane dimension.
      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
        THError("wrong number of input planes");
    }
    THIndex_t i, j, k;
 #pragma omp parallel for private(j,k)
    for (i = 0; i < bs; ++i)
    {
      const real *n_input_data = input_data + i*nOutputPlane*ks;
      const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
      real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
      for (j = 0; j < nOutputPlane; ++j)
      {
        real w = weight_data[j];
        for (k = 0; k < ks; ++k)
        {
          if (n_input_data[k] > 0)
            n_gradInput_data[k] = n_gradOutput_data[k];
          else
            n_gradInput_data[k] = n_gradOutput_data[k] * w;
        }
        n_input_data += ks;
        n_gradInput_data += ks;
        n_gradOutput_data += ks;
      }
    }
  }
 }
 /*
  * Accumulate the gradient of the PReLU slopes into gradWeight.
  *
  * For every element with input <= 0 the product input * gradOutput is
  * summed (over everything in the shared case nOutputPlane == 0, per plane
  * otherwise) and scale * sum is added to the matching gradWeight entry.
  *
  * In the per-plane case input is interpreted by rank:
  *   1D: (plane)                2D: (batch, plane)
  *   3D: (plane, h, w)          4D: (batch, plane, h, w)
  * The plane dimension must equal nOutputPlane. Ranks outside 1..4 are
  * rejected with THError instead of proceeding with undefined batch and
  * plane sizes.
  *
  * gradInput, weight, gradWeightBuf and gradWeightBuf2 are not used by this
  * implementation.
  */
 void THNN_(PReLU_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *weight,
          THTensor *gradWeight,
          THTensor *gradWeightBuf,
          THTensor *gradWeightBuf2,
          THIndex_t nOutputPlane,
          real scale)
 {
  real *gradWeight_data = THTensor_(data)(gradWeight);
  if (nOutputPlane == 0)
  {
    // Shared parameter: a single accumulated slope gradient.
    real sum = 0;
    TH_TENSOR_APPLY2(real, input, real, gradOutput,
      if ((*input_data) <= 0)
        sum += (*input_data) * (*gradOutput_data);
    );
    gradWeight_data[0] += scale * sum;
  }
  else
  {
    // bs: number of batch entries; ks: number of elements per plane.
    long bs = 0, ks = 0;
    {
      long input_ndim = THTensor_(nDimension)(input);
      switch (input_ndim)
      {
        case 1:
          bs = 1;
          ks = 1;
          break;
        case 2:
          bs = input->size[0];
          ks = 1;
          break;
        case 3:
          bs = 1;
          ks = input->size[1] * input->size[2];
          break;
        case 4:
          bs = input->size[0];
          ks = input->size[2] * input->size[3];
          break;
        default:
          THError("input must be a 1D, 2D, 3D or 4D tensor");
      }
      // (ndim + 1) % 2 is 0 for odd ranks and 1 for even ranks, i.e. the
      // position of the plane dimension.
      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
        THError("wrong number of input planes");
    }
    const real *input_data = THTensor_(data)(input);
    const real *gradOutput_data = THTensor_(data)(gradOutput);
    THIndex_t i, j, k;
    for (i = 0; i < bs; ++i)
    {
      const real *n_input_data = input_data + i*nOutputPlane*ks;
      const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
      for (j = 0; j < nOutputPlane; ++j)
      {
        real sum = 0;
        for (k = 0; k < ks; ++k)
          if (n_input_data[k] <= 0)
            sum += n_gradOutput_data[k] * n_input_data[k];
        gradWeight_data[j] += scale * sum;
        n_input_data += ks;
        n_gradOutput_data += ks;
      }
    }
  }
 }
 #endif
--- a/torch/lib/THNN/generic/RReLU.c
+++ b/torch/lib/THNN/generic/RReLU.c
@ -0,0 +1,127 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/RReLU.c"
 #else
 /*
  * Randomized leaky ReLU forward pass.
  *
  * Training (train set): every element with input <= 0 is multiplied by a
  * slope drawn from THRandom_uniform(generator, lower, upper); the slope is
  * recorded in noise, and noise is 1 for positive inputs.
  * Evaluation: non-positive inputs are multiplied by the fixed slope
  * (lower + upper) / 2; noise is not touched.
  *
  * With inplace set, input is overwritten and output is made to refer to
  * it via THTensor_(set); otherwise output is resized like input and
  * written separately.
  */
 void THNN_(RReLU_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *noise,
          real lower,
          real upper,
          bool train,
          bool inplace,
          THGenerator *generator)
 {
  if (train)
  {
    // noise records the slope applied to each element, so size it like input
    THTensor_(resizeAs)(noise, input);
    if (inplace)
    {
      TH_TENSOR_APPLY2(real, input, real, noise,
        if (*input_data <= 0)
        {
          const real r = (real)THRandom_uniform(generator, lower, upper);
          *input_data = (*input_data) * r;
          *noise_data = r;
        }
        else
        {
          *noise_data = 1;
        }
      );
      THTensor_(set)(output, input);
    }
    else
    {
      THTensor_(resizeAs)(output, input);
      TH_TENSOR_APPLY3(real, input, real, output, real, noise,
        if (*input_data <= 0)
        {
          const real r = (real)THRandom_uniform(generator, lower, upper);
          *output_data = (*input_data) * r;
          *noise_data = r;
        }
        else
        {
          *output_data = *input_data;
          *noise_data = 1;
        }
      );
    }
  }
  else
  {
    // evaluation mode: deterministic slope, the midpoint of [lower, upper]
    const real negSlope = (lower + upper) / 2;
    if (inplace)
    {
      TH_TENSOR_APPLY(real, input,
        if (*input_data <= 0)
        {
          *input_data = *input_data * negSlope;
        }
      );
      THTensor_(set)(output, input);
    }
    else
    {
      THTensor_(resizeAs)(output, input);
      TH_TENSOR_APPLY2(real, input, real, output,
        const real r = (*input_data) <= 0 ? negSlope : 1;
        *output_data = *input_data * r;
      );
    }
  }  
 }
 /*
  * Randomized leaky ReLU backward pass.
  *
  * In training mode with upper - lower > 1E-6 the gradient is gradOutput
  * multiplied element-wise by the noise tensor. Otherwise (evaluation, or
  * lower and upper effectively equal) the gradient of non-positive inputs
  * is multiplied by the fixed slope (lower + upper) / 2, like a LeakyReLU.
  *
  * With inplace set, gradOutput is overwritten and gradInput is made to
  * refer to it via THTensor_(set); otherwise gradInput is resized like
  * input and written separately.
  */
 void THNN_(RReLU_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *noise,
          real lower,
          real upper,
          bool train,
          bool inplace)
 {
  // Whether the per-element slopes saved in noise apply.
  const bool useNoise = train && (upper - lower > 1E-6);
  // Constant slope for the deterministic case.
  const real negSlope = (lower + upper) / 2;

  if (inplace)
  {
    if (useNoise)
    {
      THTensor_(cmul)(gradOutput, gradOutput, noise);
    }
    else
    {
      TH_TENSOR_APPLY2(real, gradOutput, real, input,
        if (*input_data <= 0)
        {
          *gradOutput_data = (*gradOutput_data) * negSlope;
        }
      );
    }
    THTensor_(set)(gradInput, gradOutput);
  }
  else
  {
    THTensor_(resizeAs)(gradInput, input);
    if (useNoise)
    {
      THTensor_(cmul)(gradInput, gradOutput, noise);
    }
    else
    {
      TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
        if ((*input_data) <= 0)
          *gradInput_data = (*gradOutput_data) * negSlope;
        else
          *gradInput_data = (*gradOutput_data);
      );
    }
  }
 }
 #endif
--- a/torch/lib/THNN/generic/Sigmoid.c
+++ b/torch/lib/THNN/generic/Sigmoid.c
@ -0,0 +1,31 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Sigmoid.c"
 #else
 /*
  * Sigmoid forward pass: output = 1 / (1 + exp(-input)) element-wise.
  * output is resized to match input.
  */
 void THNN_(Sigmoid_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output)
 {
  THTensor_(resizeAs)(output, input);
  TH_TENSOR_APPLY2(real, output, real, input,
    const double decay = exp(- *input_data);
    *output_data = 1./(1. + decay);
  );
 }
 /*
  * Sigmoid backward pass, expressed through the forward result:
  *   gradInput = gradOutput * (1 - output) * output
  * gradInput is resized to match output.
  */
 void THNN_(Sigmoid_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *output)
 {
  THTensor_(resizeAs)(gradInput, output);
  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
    const real s = *output_data;
    *gradInput_data = *gradOutput_data * (1. - s) * s;
  );
 }
 #endif
--- a/torch/lib/THNN/generic/SmoothL1Criterion.c
+++ b/torch/lib/THNN/generic/SmoothL1Criterion.c
@ -0,0 +1,45 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
 #else
 /*
  * Smooth L1 loss.
  *
  * With z = |input - target| each element contributes 0.5 * z^2 when
  * z < 1 and z - 0.5 otherwise. The sum is stored in output[0]; when
  * sizeAverage is set it is divided by the number of elements of input.
  */
 void THNN_(SmoothL1Criterion_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *output,
          bool sizeAverage)
 {
  real total = 0;

  TH_TENSOR_APPLY2(real, input, real, target,
    real dist = fabs(*input_data - *target_data);
    if (dist < 1)
      total += 0.5*dist*dist;
    else
      total += dist - 0.5;
  );

  if (sizeAverage)
  {
    total /= THTensor_(nElement)(input);
  }

  THTensor_(set1d)(output, 0, total);
 }
 /*
  * Gradient of the smooth L1 loss with respect to input.
  *
  * With x = input - target the gradient is -factor for x < -1, factor for
  * x > 1 and factor * x in between. factor is 1, or 1 / nElement(input)
  * when sizeAverage is set.
  */
 void THNN_(SmoothL1Criterion_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *target,
          THTensor *gradInput,
          bool sizeAverage)
 {
  real factor = 1.;
  if (sizeAverage)
    factor = 1./((real)THTensor_(nElement)(input));

  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    real delta = *input_data - *target_data;
    *gradInput_data = (delta < -1.) ? - factor
                    : ((delta > 1.) ? factor : factor * delta);
  );
 }
 #endif
--- a/torch/lib/THNN/generic/SoftMarginCriterion.c
+++ b/torch/lib/THNN/generic/SoftMarginCriterion.c
@ -0,0 +1,40 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
 #else
 /*
  * Soft margin loss.
  *
  * Accumulates log(1 + exp(-input * target)) over all elements and stores
  * the sum in output[0]; when sizeAverage is set the sum is divided by the
  * number of elements of input.
  */
 void THNN_(SoftMarginCriterion_updateOutput)(
  THNNState *state,
  THTensor *input,
  THTensor *target,
  THTensor *output,
  bool sizeAverage)
 {
  real total = 0;

  TH_TENSOR_APPLY2(real, input, real, target,
    real term = log(1. + exp(-*input_data* *target_data));
    total += term;
  );

  if (sizeAverage)
  {
    total /= THTensor_(nElement)(input);
  }

  THTensor_(set1d)(output, 0, total);
 }
 /*
  * Gradient of the soft margin loss with respect to input.
  *
  * With z = exp(-target * input):
  *   gradInput = -factor * target * z / (1 + z)
  * where factor is 1, or 1 / nElement(input) when sizeAverage is set.
  */
 void THNN_(SoftMarginCriterion_updateGradInput)(
  THNNState *state,
  THTensor *input,
  THTensor *target,
  THTensor *gradInput,
  bool sizeAverage)
 {
  real factor = 1.;
  if (sizeAverage)
    factor = 1./((real)THTensor_(nElement)(input));

  THTensor_(resizeAs)(gradInput, input);
  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
    real decay = exp(-*target_data * *input_data);
    *gradInput_data = -factor*(*target_data)*decay/(1. + decay);
  );
 }
 #endif
--- a/torch/lib/THNN/generic/SoftMax.c
+++ b/torch/lib/THNN/generic/SoftMax.c
@ -0,0 +1,149 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SoftMax.c"
 #else
 /*
  * Softmax forward pass.
  *
  * The normalised dimension and the layout are derived from the rank of input:
  *   1D: one frame,        dim = size[0], stride = 1
  *   2D: size[0] frames,   dim = size[1], stride = 1
  *   3D: one frame,        dim = size[0], stride = size[1]*size[2]
  *   4D: size[0] frames,   dim = size[1], stride = size[2]*size[3]
  * Any other rank raises an argument error.
  *
  * A contiguous copy/view of input is used; output is resized to match it.
  * For every (frame, inner position) pair the maximum over `dim` is subtracted
  * before exponentiating, and the results are scaled by 1/sum.
  */
 void THNN_(SoftMax_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output)
 {
   real *input_data, *output_data;
   long nframe = 0, dim = 0, stride = 0;
   long t;

   switch (input->nDimension)
   {
     case 1:
       nframe = 1;
       dim = input->size[0];
       stride = 1;
       break;
     case 2:
       nframe = input->size[0];
       dim = input->size[1];
       stride = 1;
       break;
     case 3:
       nframe = 1;
       dim = input->size[0];
       stride = input->size[1]*input->size[2];
       break;
     case 4:
       nframe = input->size[0];
       dim = input->size[1];
       stride = input->size[2]*input->size[3];
       break;
     default:
       THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
   }

   input = THTensor_(newContiguous)(input);
   THTensor_(resizeAs)(output, input);

   input_data = THTensor_(data)(input);
   output_data = THTensor_(data)(output);

 #pragma omp parallel for private(t)
   for (t = 0; t < stride*nframe; t++)
   {
     /* start of the strided vector handled by this iteration */
     long base = (t/stride)*dim*stride + t % stride;
     real *src = input_data + base;
     real *dst = output_data + base;
     real inputMax = -THInf;
     accreal sum = 0;
     long d;

     /* pass 1: maximum along dim */
     for (d = 0; d < dim; d++)
     {
       real v = src[d*stride];
       if (v >= inputMax) inputMax = v;
     }

     /* pass 2: shifted exponentials and their sum */
     for (d = 0; d < dim; d++)
     {
       real z = exp(src[d*stride] - inputMax);
       dst[d*stride] = z;
       sum += z;
     }

     /* pass 3: normalise */
     for (d = 0; d < dim; d++)
     {
       dst[d*stride] *= 1/sum;
     }
   }

   THTensor_(free)(input);
 }
 /*
  * Softmax backward pass.
  *
  * The layout (nframe, dim, stride) is derived from the rank of `output`
  * with the same 1D/2D/3D/4D rules as the forward pass; other ranks raise
  * an error. Contiguous versions of gradOutput and output are used, and
  * gradInput is resized to the shape of output.
  *
  * For each strided vector along dim:
  *   sum       = sum_d gradOutput[d] * output[d]
  *   gradInput[d] = output[d] * (gradOutput[d] - sum)
  *
  * `input` is not read.
  */
 void THNN_(SoftMax_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output)
 {
   real *gradInput_data, *gradOutput_data, *output_data;
   long nframe = 0, dim = 0, stride = 0;
   long t;

   switch (output->nDimension)
   {
     case 1:
       nframe = 1;
       dim = output->size[0];
       stride = 1;
       break;
     case 2:
       nframe = output->size[0];
       dim = output->size[1];
       stride = 1;
       break;
     case 3:
       nframe = 1;
       dim = output->size[0];
       stride = output->size[1]*output->size[2];
       break;
     case 4:
       nframe = output->size[0];
       dim = output->size[1];
       stride = output->size[2]*output->size[3];
       break;
     default:
       THError("1D, 2D, 3D or 4D tensor expected");
   }

   gradOutput = THTensor_(newContiguous)(gradOutput);
   output = THTensor_(newContiguous)(output);

   THTensor_(resizeAs)(gradInput, output);
   gradInput_data = THTensor_(data)(gradInput);
   output_data = THTensor_(data)(output);
   gradOutput_data = THTensor_(data)(gradOutput);

 #pragma omp parallel for private(t)
   for (t = 0; t < stride*nframe; t++)
   {
     /* start of the strided vector handled by this iteration */
     long base = (t/stride)*dim*stride + t % stride;
     real *gin = gradInput_data + base;
     real *out = output_data + base;
     real *gout = gradOutput_data + base;
     accreal sum = 0;
     long d;

     for (d = 0; d < dim; d++)
     {
       sum += (accreal)gout[d*stride] * out[d*stride];
     }

     for (d = 0; d < dim; d++)
     {
       gin[d*stride] = out[d*stride] * (gout[d*stride] - sum);
     }
   }

   THTensor_(free)(gradOutput);
   THTensor_(free)(output);
 }
 #endif
--- a/torch/lib/THNN/generic/SoftPlus.c
+++ b/torch/lib/THNN/generic/SoftPlus.c
@ -0,0 +1,42 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SoftPlus.c"
 #else
 /*
  * SoftPlus forward pass: f(x) = 1/beta * log(1 + exp(beta * x)).
  *
  * output is resized to the shape of input. Where beta * x exceeds
  * `threshold` the input value is copied through unchanged instead of
  * evaluating the log/exp expression.
  */
 void THNN_(SoftPlus_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           real beta,
           real threshold)
 {
   THTensor_(resizeAs)(output, input);

   TH_TENSOR_APPLY2(real, output, real, input,
     if ((*input_data * beta) > threshold)
       *output_data = *input_data;
     else
       *output_data = THLog1p(exp(*input_data * beta)) / beta;
   );
 }
 /*
  * SoftPlus backward pass, expressed in terms of the forward output y.
  *
  * With k = beta:
  *   d/dx [log(1 + exp(k*x)) / k] = exp(k*x) / (exp(k*x) + 1)
  * and since y = (1/k) * log(1 + exp(k*x))  <=>  x = (1/k) * log(exp(k*y) - 1),
  * the derivative can be written as (exp(k*y) - 1) / exp(k*y).
  *
  * gradInput is resized to the shape of output. Where beta * y exceeds
  * `threshold` the incoming gradient is passed through unchanged.
  * `input` is not read.
  */
 void THNN_(SoftPlus_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output,
           real beta,
           real threshold)
 {
   THTensor_(resizeAs)(gradInput, output);

   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
     real z = exp(*output_data * beta);
     if ((*output_data * beta) > threshold)
       *gradInput_data = *gradOutput_data;
     else
       *gradInput_data = *gradOutput_data * (z - 1.)/z;
   );
 }
 #endif
--- a/torch/lib/THNN/generic/SoftShrink.c
+++ b/torch/lib/THNN/generic/SoftShrink.c
@ -0,0 +1,39 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SoftShrink.c"
 #else
 /*
  * SoftShrink forward pass. output is resized to the shape of input and
  * each element x maps to:
  *   x - lambda   if x >  lambda
  *   x + lambda   if x < -lambda   (checked only when the first test fails)
  *   0            otherwise
  */
 void THNN_(SoftShrink_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           real lambda)
 {
   THTensor_(resizeAs)(output, input);

   TH_TENSOR_APPLY2(real, output, real, input,
     *output_data = ((*input_data) > lambda)
                      ? (*input_data - lambda)
                      : (((*input_data) < -lambda) ? (*input_data + lambda) : 0);
   );
 }
 /*
  * SoftShrink backward pass. gradInput is resized to the shape of input;
  * the incoming gradient is copied where input lies outside
  * [-lambda, lambda] and zeroed elsewhere.
  */
 void THNN_(SoftShrink_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           real lambda)
 {
   THTensor_(resizeAs)(gradInput, input);

   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
     *gradInput_data = ((*input_data) > lambda || (*input_data) < -lambda)
                         ? (*gradOutput_data)
                         : 0;
   );
 }
 #endif
--- a/torch/lib/THNN/generic/SparseLinear.c
+++ b/torch/lib/THNN/generic/SparseLinear.c
@ -0,0 +1,550 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SparseLinear.c"
 #else
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 /* Address of row r of tensor t: the THTensor_(data) pointer advanced by r * stride[0]. */
 #define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
 /* Address of column c of tensor t: the THTensor_(data) pointer advanced by c * stride[1]. */
 #define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
 /* True when t is 3-dimensional with a last dimension of size 2
  * (the "batchsize x nnz x 2" legacy sparse layout). */
 static bool THNN_(checkLegacyInput)(THTensor* t)
 {
   if (t->nDimension != 3)
     return 0;
   return t->size[2] == 2;
 }
 /* True when t is 2-dimensional with 3 columns (the "nnz x 3" COO layout). */
 static bool THNN_(checkInput)(THTensor* t)
 {
   if (t->nDimension != 2)
     return 0;
   return t->size[1] == 3;
 }
 /* True when t is exactly a size0 x size1 two-dimensional tensor. */
 static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
 {
   if (t->nDimension != 2)
     return 0;
   if (t->size[0] != size0)
     return 0;
   return t->size[1] == size1;
 }
 /* True when t is a one-dimensional tensor of length size0. */
 static bool THNN_(checkSize1D)(THTensor* t, long size0)
 {
   if (t->nDimension != 1)
     return 0;
   return t->size[0] == size0;
 }
 /* Writes `value` at index x0 of t, addressing the storage directly as
  * storageOffset + x0 * stride[0]. No bounds check is performed here. */
 static void THNN_(set1d)(THTensor *t, long x0, real value) {
  THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
 }
 /* Reads element (x0, x1, x2) of t, addressing the storage directly via
  * storageOffset and the first three strides. No bounds check is performed here. */
 static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
  return THStorage_(get)(t->storage, t->storageOffset +
                         x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
 }
 /* Reads element (x0, x1) of t, addressing the storage directly via
  * storageOffset and the first two strides. No bounds check is performed here. */
 static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
  return THStorage_(get)(t->storage, t->storageOffset +
                         x0*t->stride[0] + x1*t->stride[1]);
 }
 /*
  * Sparse linear forward pass: output = weight * input + bias.
  *
  * input  : nnz x 3 tensor; each row is (row, column, value) with 1-based
  *          row and column numbers (1 is subtracted before use).
  * output : must be contiguous; its first dimension is taken as the batch
  *          size. It is zeroed and then filled; it is not resized here.
  * weight : outDim x inDim.
  * bias   : vector of length outDim, added to every output row.
  *
  * A row-pointer table `csr` (length batchSize + 1) is built from the row
  * column of input so that entries csr[h] .. csr[h+1]-1 belong to batch row h.
  * NOTE(review): this construction only yields correct ranges if the entries
  * of input are ordered by row -- confirm with callers.
  *
  * Raises an error for a column number outside [1, inDim].
  * NOTE(review): that error is raised from inside an OpenMP parallel loop.
  */
 void THNN_(SparseLinear_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias)
 {
   long h, i, hp0, hp1;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);
   long batchSize = THTensor_(size)(output, 0);

   THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

   long nnz = THTensor_(size)(input, 0);

   THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
   THLongTensor_zero(csr);

   /* For every entry i, mark the end position (i+1) for all rows between
    * this entry's row and the next entry's row (or batchSize after the
    * last entry). */
 //#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
   for (i=0; i<nnz; i++) {
     hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
     hp1 = (i+1 == nnz) ?
             batchSize :
             (long)(THNN_(get2d)(input, i+1, 0)) - 1;
     if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
       THLongTensor_set1d(csr, h+1, i+1);
     }
   }

   // output = weight * input + bias
   THTensor_(zero)(output);
 #pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
   for (h = 0; h < batchSize; h++) {
     long i_start = THLongTensor_get1d(csr, h);
     long i_end = THLongTensor_get1d(csr, h+1);
     for (i = i_start; i < i_end; i++) {
       real val = THNN_(get2d)(input, i, 2);
       if (val == 0) {
         continue;
       }

       long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
       if (offset >= 0 && offset < inDim) {
         /* output[h, :] += val * weight[:, offset] */
         THBlas_(axpy)(outDim,
             val,
             COL_PTR2(weight, offset), weight->stride[0],
             ROW_PTR2(output, h), output->stride[1]);
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError("index out of bound. updateOutput: %ld not between 1 and %ld",
             offset + 1, inDim);
       }
     }
   }

   /* add the bias to every output row */
   THTensor* output_row = THTensor_(new)();
   for (h = 0; h < batchSize; h++) {
     THTensor_(select)(output_row, output, 0, h);
     THTensor_(cadd)(output_row, bias, 1.0, output_row);
   }
   THTensor_(free)(output_row);
   THLongTensor_free(csr);
 }
 /*
  * Sparse linear forward pass for the legacy input layout:
  * output = weight * input + bias.
  *
  * input  : batchSize x nnz x 2 tensor; element [h][i] is (column, value)
  *          with a 1-based column number (1 is subtracted before use).
  * output : must be contiguous; resized to batchSize x outDim and zeroed.
  * weight : outDim x inDim.
  * bias   : vector of length outDim, added to every output row.
  *
  * Entries whose value is 0 are skipped. Raises an error for a column
  * number outside [1, inDim].
  * NOTE(review): that error is raised from inside an OpenMP parallel loop.
  */
 void THNN_(SparseLinear_legacyUpdateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias)
 {
   long h, i;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);

   THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

   long batchSize = THTensor_(size)(input, 0);
   long nnz = THTensor_(size)(input, 1);
   THTensor_(resize2d)(output, batchSize, outDim);

   // output = weight * input + bias
   THTensor_(zero)(output);
 #pragma omp parallel for private(h, i) schedule(static) if (   \
   batchSize > 1 && batchSize * nnz * outDim > 10000)
   for (h = 0; h < batchSize; h++) {
     for (i = 0; i < nnz; i++) {
       real val = THNN_(get3d)(input, h, i, 1);
       if (val == 0) {
         continue;
       }

       long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
       if (offset >= 0 && offset < inDim) {
         /* output[h, :] += val * weight[:, offset] */
         THBlas_(axpy)(outDim,
                       val,
                       COL_PTR2(weight, offset), weight->stride[0],
                       ROW_PTR2(output, h), output->stride[1]);
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError("index out of bound. updateOutput: %ld not between 1 and %ld",
                 offset + 1, inDim);
       }
     }
   }

   /* add the bias to every output row */
   THTensor* output_row = THTensor_(new)();
   for (h = 0; h < batchSize; h++) {
     THTensor_(select)(output_row, output, 0, h);
     THTensor_(cadd)(output_row, bias, 1.0, output_row);
   }
   THTensor_(free)(output_row);
 }
 /*
  * Accumulates parameter gradients for a COO-encoded sparse input.
  *
  * input      : nnz x 3 tensor; each row is (row, column, value) with
  *              1-based row and column numbers.
  * gradOutput : must be contiguous; row h is the gradient for batch row h.
  * gradWeight : outDim x inDim; column `col` receives
  *              scale * value * gradOutput[row, :] for every entry.
  * gradBias   : length outDim; receives scale * (sum of gradOutput over dim 0).
  * weight     : outDim x inDim; when weightDecay != 0,
  *              gradWeight += weightDecay * weight.
  * bias       : not read.
  *
  * A column-pointer table `csc` (length inDim + 1) is built from the column
  * field of input so that entries csc[c] .. csc[c+1]-1 belong to column c.
  * NOTE(review): this only yields correct ranges if the entries of input are
  * ordered by column -- confirm with callers.
  * NOTE(review): the row number read from input is used to index gradOutput
  * without a range check.
  *
  * Raises an error for a column number outside [1, inDim].
  */
 void THNN_(SparseLinear_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
           real weightDecay,
           real scale)
 {
   long h, i, col, hp0, hp1;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);

   THArgCheck(THNN_(checkInput)(input), 2,
              "input must be in coo format, nnz x 3");
   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
              "gradBias size wrong");
   /* gradOutput is the third argument (state = 1, input = 2) */
   THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
              "gradOutput must be contiguous");

   long nnz = THTensor_(size)(input, 0);
   THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
   THLongTensor_zero(csc);

   /* For every entry i, mark the end position (i+1) for all columns between
    * this entry's column and the next entry's column (or inDim after the
    * last entry). */
 #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
   for (i = 0; i < nnz; i++) {
     hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
     hp1 = (i+1 == nnz) ?
             inDim :
             (long)(THNN_(get2d)(input, i+1, 1)) - 1;
     if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
       THLongTensor_set1d(csc, h+1, i+1);
     }
   }

   // gradWeight += gradOutput * input
 #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
   for (col = 0; col < inDim; col++) {
     long i_start = THLongTensor_get1d(csc, col);
     long i_end = THLongTensor_get1d(csc, col+1);
     for (i = i_start; i < i_end; i++) {
       real val = scale * THNN_(get2d)(input, i, 2);

       h = (long)(THNN_(get2d)(input, i, 0)) - 1;
       long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
       if (offset >= 0 && offset < inDim) {
         /* gradWeight[:, offset] += val * gradOutput[h, :] */
         THBlas_(axpy)(outDim,
             val,
             ROW_PTR2(gradOutput, h), gradOutput->stride[1],
             COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError(
             "index out of bound. accGradParameters: %ld not between 1 and %ld",
             offset + 1,
             inDim);
       }
     }
   }

   // gradBias += gradOutput
   THTensor* buf = THTensor_(new)();
   THTensor_(sum)(buf, gradOutput, 0);
   THTensor_(cadd)(gradBias, gradBias, scale, buf);
   THTensor_(free)(buf);
   THLongTensor_free(csc);

   if (weightDecay != 0) {
     THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
   }
 }
 /*
  * Accumulates parameter gradients for the legacy sparse input layout.
  *
  * input      : batchSize x nnz x 2 tensor; element [h][i] is
  *              (column, value) with a 1-based column number.
  * gradOutput : must be contiguous; it is resized to batchSize x outDim here.
  * gradWeight : outDim x inDim; column `col` receives
  *              scale * value * gradOutput[h, :] for every non-zero entry.
  * gradBias   : length outDim; receives scale * gradOutput[h, :] for each h.
  * weight     : outDim x inDim; when weightDecay != 0,
  *              gradWeight += weightDecay * weight.
  * bias       : not read.
  *
  * Raises an error for a column number outside [1, inDim].
  * NOTE(review): the parallel loop is over the nnz position i, so two
  * iterations that carry the same column for different batch rows update the
  * same gradWeight column -- confirm this is safe for the inputs used.
  */
 void THNN_(SparseLinear_legacyAccGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
           real weightDecay,
           real scale)
 {
   long h, i;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);

   THArgCheck(THNN_(checkLegacyInput)(input), 2,
              "input size must be batchsize x nnz x 2");
   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
              "gradBias size wrong");
   /* gradOutput is the third argument (state = 1, input = 2) */
   THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
              "gradOutput must be contiguous");

   long batchSize = THTensor_(size)(input, 0);
   long nnz = THTensor_(size)(input, 1);
   THTensor_(resize2d)(gradOutput, batchSize, outDim);

   // gradWeight += gradOutput * input
 #pragma omp parallel for private(h, i) schedule(static) if (\
   batchSize * nnz * outDim > 10000)
   for (i = 0; i < nnz; i++) {
     for (h = 0; h < batchSize; h++) {
       real val = scale * THNN_(get3d)(input, h, i, 1);
       if (val == 0) {
         continue;
       }

       long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
       if (offset >= 0 && offset < inDim) {
         /* gradWeight[:, offset] += val * gradOutput[h, :] */
         THBlas_(axpy)(outDim,
                       val,
                       ROW_PTR2(gradOutput, h), gradOutput->stride[1],
                       COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError(
           "index out of bound. accGradParameters: %ld not between 1 and %ld",
           offset + 1,
           inDim);
       }
     }
   }

   // gradBias += gradOutput
   THTensor* gradOutput_row = THTensor_(new)();
   for (h = 0; h < batchSize; h++) {
     THTensor_(select)(gradOutput_row, gradOutput, 0, h);
     THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
   }
   THTensor_(free)(gradOutput_row);

   if (weightDecay != 0) {
     THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
   }
 }
 /*
  * Applies one gradient step, touching only the weight columns that the
  * last input actually used.
  *
  * lastInput : nnz x 3 tensor; each row is (row, column, value) with a
  *             1-based column number.
  * bias      : bias += -learningRate * gradBias.
  * weight    : for every distinct column with a non-zero value in lastInput,
  *             weight[:, col] += -learningRate * gradWeight[:, col].
  *
  * The column numbers are collected into a temporary tensor, sorted, and
  * de-duplicated in place before the update.
  *
  * If lastInput holds no non-zero entry the function returns without
  * changing weight or bias.
  * NOTE(review): skipping the bias update in that case is the existing
  * behaviour and is kept as is -- confirm it is intended.
  *
  * Raises an error for a column number outside [1, inDim].
  */
 void THNN_(SparseLinear_updateParameters)(
           THNNState *state,
           THTensor *weight,
           THTensor *bias,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
           real learningRate)
 {
   long i;
   long outDim = weight->size[0];
   long inDim = weight->size[1];

   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
   THArgCheck(THNN_(checkInput)(lastInput), 6,
              "input must be in coo format, nnz x 3");

   long nnz = THTensor_(size)(lastInput, 0);

   // collect unique offsets of non-0 val in input
   THTensor* offsets = THTensor_(newWithSize1d)(nnz);
   long cnt = 0;
   for (i = 0; i < nnz; i++) {
     real val = THNN_(get2d)(lastInput, i, 2);
     if (val == 0) {
       continue;
     }
     long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
     if (offset >= 0 && offset < inDim) {
       THNN_(set1d)(offsets, cnt++, offset);
     } else {
       /* both arguments are long, so the conversion must be %ld */
       THError(
           "index out of bound. updateParameters: %ld not between 1 and %ld",
           offset + 1,
           inDim);
     }
   }
   if (cnt == 0) {
     /* nothing to update: release the scratch tensor before leaving */
     THTensor_(free)(offsets);
     return;
   }
   THTensor_(resize1d)(offsets, cnt);

   /* sort along dim 0 (last argument 0 = ascending) so duplicates are adjacent */
   THTensor* uniqueOffsets = THTensor_(new)();
   THLongTensor* ri = THLongTensor_new();
   THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
   THLongTensor_free(ri);
   THTensor_(free)(offsets);

   /* compact the sorted values in place, keeping the first of each run */
   cnt = 1;
   real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
   for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
     if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
       uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
     }
   }
   THTensor_(resize1d)(uniqueOffsets, cnt);

   // weight += -learningRate * gradWeight
   THTensor_(cadd)(bias, bias, -learningRate, gradBias);
 #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
   for (i = 0; i < cnt; i++) {
     long offset = (long)uniqueOffsets_p[i];
     THBlas_(axpy)(outDim,
                   -learningRate,
                   COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                   COL_PTR2(weight, offset), weight->stride[0]);
   }

   THTensor_(free)(uniqueOffsets);
 }
 /*
  * Applies one gradient step for the legacy input layout, touching only the
  * weight columns that the last input actually used.
  *
  * lastInput : batchSize x nnz x 2 tensor; element [h][i] is
  *             (column, value) with a 1-based column number.
  * bias      : bias += -learningRate * gradBias.
  * weight    : for every distinct column with a non-zero value in lastInput,
  *             weight[:, col] += -learningRate * gradWeight[:, col].
  *
  * The column numbers are collected into a temporary tensor, sorted, and
  * de-duplicated in place before the update.
  *
  * If lastInput holds no non-zero entry the function returns without
  * changing weight or bias, matching SparseLinear_updateParameters. Without
  * this guard the code would sort an empty tensor and then treat
  * uniqueOffsets_p[0] as a valid column even though nothing was collected.
  *
  * Raises an error for a column number outside [1, inDim].
  */
 void THNN_(SparseLinear_legacyUpdateParameters)(
           THNNState *state,
           THTensor *weight,
           THTensor *bias,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
           real learningRate)
 {
   long h, i;
   long outDim = weight->size[0];
   long inDim = weight->size[1];

   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
   THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
              "input size must be batchsize x nnz x 2");

   long batchSize = THTensor_(size)(lastInput, 0);
   long nnz = THTensor_(size)(lastInput, 1);

   // collect unique offsets of non-0 val in input
   THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
   long cnt = 0;
   for (h = 0; h < batchSize; h++) {
     for (i = 0; i < nnz; i++) {
       real val = THNN_(get3d)(lastInput, h, i, 1);
       if (val == 0 ) {
         continue;
       }
       long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
       if (offset >= 0 && offset < inDim) {
         THNN_(set1d)(offsets, cnt++, offset);
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError(
           "index out of bound. updateParameters: %ld not between 1 and %ld",
           offset + 1,
           inDim);
       }
     }
   }
   if (cnt == 0) {
     /* nothing to update: release the scratch tensor before leaving */
     THTensor_(free)(offsets);
     return;
   }
   THTensor_(resize1d)(offsets, cnt);

   /* sort along dim 0 (last argument 0 = ascending) so duplicates are adjacent */
   THTensor* uniqueOffsets = THTensor_(new)();
   THLongTensor* ri = THLongTensor_new();
   THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
   THLongTensor_free(ri);
   THTensor_(free)(offsets);

   /* compact the sorted values in place, keeping the first of each run */
   cnt = 1;
   real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
   for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
     if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
       uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
     }
   }
   THTensor_(resize1d)(uniqueOffsets, cnt);

   // weight += -learningRate * gradWeight
   THTensor_(cadd)(bias, bias, -learningRate, gradBias);
 #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
   for (i = 0; i < cnt; i++) {
     long offset = (long)uniqueOffsets_p[i];
     THBlas_(axpy)(outDim,
                   -learningRate,
                   COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                   COL_PTR2(weight, offset), weight->stride[0]);
   }

   THTensor_(free)(uniqueOffsets);
 }
 /*
  * Clears the gradients that the last input could have touched.
  *
  * gradBias   : length outDim; zeroed entirely.
  * gradWeight : outDim x inDim; only the columns named by non-zero entries
  *              of lastInput are zeroed.
  * lastInput  : nnz x 3 tensor; each row is (row, column, value) with a
  *              1-based column number.
  *
  * A column is cleared with THVector_(fill) when gradWeight->stride[0] is 1
  * and element by element with that stride otherwise.
  *
  * Raises an error for a column number outside [1, inDim].
  * NOTE(review): that error is raised from inside an OpenMP parallel loop.
  */
 void THNN_(SparseLinear_zeroGradParameters)(
           THNNState *state,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput)
 {
   long i, j;

   long outDim = gradWeight->size[0];
   long inDim = gradWeight->size[1];

   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
   THArgCheck(THNN_(checkInput)(lastInput), 4,
              "input must be in coo format, nnz x 3");

   THTensor_(zero)(gradBias);

   long nnz = THTensor_(size)(lastInput, 0);

 #pragma omp parallel for private(i, j) schedule(static) if (   \
   nnz * outDim > 10000)
   for (i = 0; i < nnz; i++) {
     if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
       continue;
     }

     long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
     if (offset >= 0 && offset < inDim) {
       real* pGradWeight = COL_PTR2(gradWeight, offset);
       if (gradWeight->stride[0] == 1) {
         THVector_(fill)(pGradWeight, 0, outDim);
       } else {
         long stride = gradWeight->stride[0];
         for (j = 0; j < outDim; ++j) {
           pGradWeight[j * stride] = 0;
         }
       }
     } else {
       /* both arguments are long, so the conversion must be %ld */
       THError(
           "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
           offset + 1,
           inDim);
     }
   }
 }
 /*
  * Clears the gradients that the last input could have touched
  * (legacy input layout).
  *
  * gradBias   : length outDim; zeroed entirely.
  * gradWeight : outDim x inDim; only the columns named by non-zero entries
  *              of lastInput are zeroed.
  * lastInput  : batchSize x nnz x 2 tensor; element [h][i] is
  *              (column, value) with a 1-based column number.
  *
  * A column is cleared with THVector_(fill) when gradWeight->stride[0] is 1
  * and element by element with that stride otherwise.
  *
  * Raises an error for a column number outside [1, inDim].
  * NOTE(review): that error is raised from inside an OpenMP parallel loop.
  */
 void THNN_(SparseLinear_legacyZeroGradParameters)(
           THNNState *state,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput)
 {
   long h, i, j;

   long outDim = gradWeight->size[0];
   long inDim = gradWeight->size[1];

   THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
   THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
              "input size must be batchsize x nnz x 2");

   THTensor_(zero)(gradBias);

   long batchSize = THTensor_(size)(lastInput, 0);
   long nnz = THTensor_(size)(lastInput, 1);

 #pragma omp parallel for private(h, i, j) schedule(static) if (   \
   batchSize > 1 && batchSize * nnz * outDim > 10000)
   for (h = 0; h < batchSize; h++) {
     for (i = 0; i < nnz; i++) {
       if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
         continue;
       }

       long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
       if (offset >= 0 && offset < inDim) {
         real* pGradWeight = COL_PTR2(gradWeight, offset);
         if (gradWeight->stride[0] == 1) {
           THVector_(fill)(pGradWeight, 0, outDim);
         } else {
           long stride = gradWeight->stride[0];
           for (j = 0; j < outDim; ++j) {
             pGradWeight[j * stride] = 0;
           }
         }
       } else {
         /* both arguments are long, so the conversion must be %ld */
         THError(
           "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
           offset + 1,
           inDim);
       }
     }
   }
 }
 #undef ROW_PTR2
 #undef COL_PTR2
 #endif
--- a/torch/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
+++ b/torch/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@ -0,0 +1,274 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
 #else
 /*
  * Adaptive max pooling over one frame (nslices planes).
  *
  * For every plane k and every output cell (i, j) the pooling window is
  *   rows    [floor(i/oheight*iheight), ceil((i+1)/oheight*iheight))
  *   columns [floor(j/owidth*iwidth),   ceil((j+1)/owidth*iwidth))
  * The window maximum is written to output_p and the position of the maximum
  * *relative to the window start* (plus TH_INDEX_BASE) is written to
  * indy_p (row) and indx_p (column).
  *
  * input_p is read through the explicit strides (stridew, strideh, strided);
  * output_p, indx_p and indy_p are written densely as
  * k*owidth*oheight + i*owidth + j.
  *
  * NOTE(review): maxval starts at -FLT_MAX and maxindex at -1. If no element
  * of a window compares greater than -FLT_MAX (e.g. NaNs, or very negative
  * values should `real` be double), maxindex stays -1 and the stored indices
  * are derived from -1 -- confirm inputs cannot hit this.
  */
 static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           real *indx_p,
           real *indy_p,
           long nslices,
           long iwidth,
           long iheight,
           long owidth,
           long oheight,
           long stridew,
           long strideh,
           long strided)
 {
   long k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
     /* loop over output */
     long i, j;
     for(i = 0; i < oheight; i++)
     {
       /* vertical extent of the window for output row i */
       int y_start = (int)floor((float)i / oheight * iheight);
       int y_end   = (int)ceil((float)(i + 1) / oheight * iheight);
       int kH = y_end-y_start;
       for(j = 0; j < owidth; j++)
       {
         /* horizontal extent of the window for output column j */
         int x_start = (int)floor((float)j / owidth * iwidth);
         int x_end   = (int)ceil((float)(j + 1) / owidth * iwidth);
         int kW = x_end-x_start;
         /* local pointers */
         real *ip = input_p   + k*strided + y_start*strideh + x_start*stridew;
         real *op = output_p  + k*owidth*oheight + i*owidth + j;
         real *indyp = indy_p + k*owidth*oheight + i*owidth + j;
         real *indxp = indx_p + k*owidth*oheight + i*owidth + j;
         /* compute local max: */
         long maxindex = -1;
         real maxval = -FLT_MAX;
         /* tcntr is the row-major position inside the kH x kW window */
         long tcntr = 0;
         int x,y;
         for(y = 0; y < kH; y++)
         {
           for(x = 0; x < kW; x++)
           {
             real val = *(ip + y*strideh + x*stridew);
             /* strict '>' keeps the first occurrence of the maximum */
             if (val > maxval)
             {
               maxval = val;
               maxindex = tcntr;
             }
             tcntr++;
           }
         }
         /* set output to local max */
         *op = maxval;
         /* store location of max (x,y) */
         *indyp = (int)(maxindex / kW) + TH_INDEX_BASE;
         *indxp = (maxindex % kW) + TH_INDEX_BASE;
       }
     }
   }
 }
 /*
  * Adaptive max pooling forward pass.
  *
  * input   : 3D (nslices x iheight x iwidth) or 4D (batch first) tensor;
  *           read through its own strides, so it is not made contiguous here.
  * output  : resized to nslices x oheight x owidth (with a leading batch
  *           dimension in batch mode).
  * indices : resized to 2 x [nbatch x] nslices x oheight x owidth. The first
  *           plane is passed to the frame helper as the row-index buffer and
  *           the second plane as the column-index buffer.
  * owidth, oheight : requested output size.
  *
  * NOTE(review): output and indices are addressed with dense offsets after
  * the resize -- this presumes the resize leaves them contiguous; confirm.
  */
 void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *indices,
           int owidth,
           int oheight)
 {
   int dimw = 2;
   int dimh = 1;
   long nbatch = 1;
   long nslices;
   long iheight;
   long iwidth;
   long istride_d;
   long istride_h;
   long istride_w;
   /* istride_b is only assigned, and only used, in 4D (batch) mode */
   long istride_b;
   real *input_data;
   real *output_data;
   real *indices_data;
   THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
   if (input->nDimension == 4)
   {
     istride_b = input->stride[0];
     nbatch = input->size[0];
     /* shift the height/width dimensions past the batch dimension */
     dimw++;
     dimh++;
   }
   /* sizes */
   nslices = input->size[dimh-1];
   iheight = input->size[dimh];
   iwidth = input->size[dimw];
   /* strides */
   istride_d = input->stride[dimh-1];
   istride_h = input->stride[dimh];
   istride_w = input->stride[dimw];
   /* resize output */
   if (input->nDimension == 3)
   {
     THTensor_(resize3d)(output, nslices, oheight, owidth);
     /* indices will contain i,j locations for each output point */
     THTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
     input_data = THTensor_(data)(input);
     output_data = THTensor_(data)(output);
     indices_data = THTensor_(data)(indices);
     /* second indices plane -> column indices, first plane -> row indices */
     THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
                                                       indices_data+nslices*owidth*oheight, indices_data,
                                                       nslices,
                                                       iwidth, iheight,
                                                       owidth, oheight,
                                                       istride_w,istride_h,
                                                       istride_d);
   }
   else
   {
     long p;
     THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
     /* indices will contain i,j locations for each output point */
     THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
     input_data = THTensor_(data)(input);
     output_data = THTensor_(data)(output);
     indices_data = THTensor_(data)(indices);
     /* one frame per batch element; (p+nbatch) selects the second indices plane */
 #pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
       THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
                                                         indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
                                                         nslices,
                                                         iwidth, iheight,
                                                         owidth, oheight,
                                                         istride_w,istride_h,
                                                         istride_d);
     }
   }
 }
 /*
  * Adaptive max pooling backward pass over one frame (nslices planes).
  *
  * For every plane k and output cell (i, j), the window-relative max
  * position stored in indy_p / indx_p is converted back to an absolute
  * input position by removing TH_INDEX_BASE and adding the window start
  * (recomputed with the same floor expression as the forward pass). The
  * output gradient is then accumulated (+=) into gradInput at that position.
  *
  * All buffers are addressed densely: gradInput_p as k*iwidth*iheight +
  * row*iwidth + col, the others as k*owidth*oheight + i*owidth + j.
  */
 static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           real *indx_p,
           real *indy_p,
           long nslices,
           long iwidth,
           long iheight,
           long owidth,
           long oheight)
 {
   long k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
     /* per-plane base pointers */
     real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
     real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
     real *indx_p_k = indx_p + k*owidth*oheight;
     real *indy_p_k = indy_p + k*owidth*oheight;
     /* calculate max points */
     long i, j;
     for(i = 0; i < oheight; i++)
     {
       /* first input row of the window for output row i */
       int y_start = (int)floor((float) i / oheight * iheight);
       for(j = 0; j < owidth; j++)
       {
         /* first input column of the window for output column j */
         int x_start = (int)floor((float) j / owidth * iwidth);
         /* retrieve position of max */
         long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
         long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
         /* update gradient */
         gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
       }
     }
   }
 }
 /*
  * Adaptive max pooling backward pass.
  *
  * input      : 3D or 4D (batch first) tensor; only its sizes are used.
  * gradOutput : gradient w.r.t. the pooled output; a contiguous version is
  *              taken for the duration of the call and released at the end.
  * gradInput  : resized to the shape of input and zeroed, then filled by
  *              accumulating gradOutput at the recorded max positions.
  * indices    : max locations; the first plane is passed to the frame helper
  *              as row indices and the second plane as column indices.
  *
  * NOTE(review): indices and gradInput are addressed with dense offsets --
  * this presumes both are contiguous; confirm.
  */
 void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *indices)
 {
   int dimw = 2;
   int dimh = 1;
   long nbatch = 1;
   int nslices;
   int iheight;
   int iwidth;
   int oheight;
   int owidth;
   real *gradInput_data;
   real *gradOutput_data;
   real *indices_data;
   /* get contiguous gradOutput */
   gradOutput = THTensor_(newContiguous)(gradOutput);
   /* resize */
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(zero)(gradInput);
   if (input->nDimension == 4) {
     nbatch = input->size[0];
     /* shift the height/width dimensions past the batch dimension */
     dimw++;
     dimh++;
   }
   /* sizes */
   nslices = input->size[dimh-1];
   iheight = input->size[dimh];
   iwidth = input->size[dimw];
   oheight = gradOutput->size[dimh];
   owidth = gradOutput->size[dimw];
   /* get raw pointers */
   gradInput_data = THTensor_(data)(gradInput);
   gradOutput_data = THTensor_(data)(gradOutput);
   indices_data = THTensor_(data)(indices);
   /* backprop */
   if (input->nDimension == 3)
   {
     /* second indices plane -> column indices, first plane -> row indices */
     THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
                                                          indices_data+nslices*owidth*oheight, indices_data,
                                                          nslices,
                                                          iwidth, iheight,
                                                          owidth, oheight);
   }
   else
   {
     long p;
     /* one frame per batch element; (p+nbatch) selects the second indices plane */
 #pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
       THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
                                                            indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
                                                            nslices,
                                                            iwidth, iheight,
                                                            owidth, oheight);
     }
   }
   /* cleanup */
   THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialAveragePooling.c
+++ b/torch/lib/THNN/generic/SpatialAveragePooling.c
@ -0,0 +1,258 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
 #else
 /*
  * Spatial (2D) average pooling, forward pass.
  *
  *   input   3D (plane x height x width) or 4D (batch x plane x height x width)
  *           tensor; a contiguous copy is taken internally and released on exit.
  *   output  resized here to (plane x oH x oW), with a leading batch dimension
  *           for 4D input; must be contiguous after the resize.
  *   kW, kH  pooling window width / height.
  *   dW, dH  horizontal / vertical step between windows.
  *   padW, padH  implicit padding on each side; kW/2 >= padW and kH/2 >= padH
  *           are enforced.
  *   ceil_mode   use ceil() instead of floor() when deriving the output size.
  *   count_include_pad  when true each window sum is divided by the window
  *           area clipped to the padded image; otherwise by the number of
  *           real input pixels the window covers.
  *
  * `state` is not used by this implementation.
  */
 void THNN_(SpatialAveragePooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH,
           bool ceil_mode,
           bool count_include_pad)
 {
   real *output_data;
   real *input_data;
   int dimw = 2;
   int dimh = 1;
   int dimc = 0;
   long nbatch = 1;
   long inputWidth;
   long inputHeight;
   long outputWidth;
   long outputHeight;
   long nInputPlane; // number of channels (or colors)
   long k;
   THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
   THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
   /* batch mode: every dimension index shifts by one */
   if (input->nDimension == 4) {
     nbatch = input->size[0];
     dimw++;
     dimh++;
     dimc++;
   }
   inputWidth = input->size[dimw];
   inputHeight = input->size[dimh];
   nInputPlane = input->size[dimc];
   if(ceil_mode)
   {
     outputWidth  = (long)(ceil((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
     outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
   }
   else
   {
     outputWidth  = (long)(floor((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
     outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
   }
   if (padW || padH)
   {
     // ensure that the last pooling starts inside the image
     // needed to avoid problems in ceil mode
     if ((outputHeight - 1)*dH >= inputHeight + padH)
       --outputHeight;
     if ((outputWidth  - 1)*dW >= inputWidth  + padW)
       --outputWidth;
   }
   THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size");
   if (input->nDimension == 3)
     THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
   else
     THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
   /* validate before taking the contiguous copy so a failed check cannot leak it */
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   input = THTensor_(newContiguous)(input);
   input_data = THTensor_(data)(input);
   output_data = THTensor_(data)(output);
   /* planes are independent, so the outer loop is shared between threads */
 #pragma omp parallel for private(k)
   for(k = 0; k < nInputPlane; k++)
   {
     long p;
     for(p = 0; p < nbatch; p++)
     {
       long xx, yy;
       /* For all output pixels... */
       real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
       real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
       long i;
       for(i = 0; i < outputWidth*outputHeight; i++)
         ptr_output[i] = 0;
       for(yy = 0; yy < outputHeight; yy++)
       {
         for(xx = 0; xx < outputWidth; xx++)
         {
           /* Compute the mean of the input image... */
           long hstart = yy * dH - padH;
           long wstart = xx * dW - padW;
           long hend = hstart + kH;
           long wend = wstart + kW;
           int pool_size;
           int divide_factor;
           real sum = 0;
           long kx, ky;
           /* clip the window to the padded image (integer compare, no float round-trip) */
           if (hend > inputHeight + padH)
             hend = inputHeight + padH;
           if (wend > inputWidth + padW)
             wend = inputWidth + padW;
           pool_size = (hend - hstart) * (wend - wstart);
           /* then clip to the real input so only valid pixels are read */
           if (hstart < 0)
             hstart = 0;
           if (wstart < 0)
             wstart = 0;
           if (hend > inputHeight)
             hend = inputHeight;
           if (wend > inputWidth)
             wend = inputWidth;
           if(count_include_pad)
             divide_factor = pool_size;
           else
             divide_factor = (hend - hstart) * (wend - wstart);
           for(ky = hstart; ky < hend; ky++)
           {
             for(kx = wstart; kx < wend; kx++)
               sum += ptr_input[ky*inputWidth + kx];
           }
           /* Update output */
           *ptr_output++ += sum/divide_factor;
         }
       }
     }
   }
   THTensor_(free)(input);
 }
 /*
  * Spatial (2D) average pooling, backward pass.
  *
  *   input       forward input; only its dimensions are read here.
  *   gradOutput  gradient w.r.t. the forward output; a contiguous copy is
  *               taken internally and released on exit.
  *   gradInput   resized like `input`, must be contiguous afterwards, and is
  *               fully overwritten (zeroed per plane, then accumulated into).
  *   kW, kH, dW, dH, padW, padH, ceil_mode, count_include_pad
  *               same meaning as in the forward pass; the output size is
  *               recomputed from them with the same formula.
  *
  * `state` is not used by this implementation.
  */
 void THNN_(SpatialAveragePooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH,
           bool ceil_mode,
           bool count_include_pad)
 {
   int dimw = 2;
   int dimh = 1;
   int dimc = 0;
   long nbatch = 1;
   long inputWidth;
   long inputHeight;
   long outputWidth;
   long outputHeight;
   long nInputPlane; // number of channels (or colors)
   real *gradOutput_data;
   real *gradInput_data;
   long k;
   /* batch mode: every dimension index shifts by one */
   if (input->nDimension == 4) {
     nbatch = input->size[0];
     dimw++;
     dimh++;
     dimc++;
   }
   inputWidth = input->size[dimw];
   inputHeight = input->size[dimh];
   nInputPlane = input->size[dimc];
   if(ceil_mode)
   {
     outputWidth  = (long)(ceil((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
     outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
   }
   else
   {
     outputWidth  = (long)(floor((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
     outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
   }
   if (padW || padH)
   {
     // ensure that the last pooling starts inside the image
     // needed to avoid problems in ceil mode
     if ((outputHeight - 1)*dH >= inputHeight + padH)
       --outputHeight;
     if ((outputWidth  - 1)*dW >= inputWidth  + padW)
       --outputWidth;
   }
   THTensor_(resizeAs)(gradInput, input);
   /* validate before taking the contiguous copy so a failed check cannot leak it */
   THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
   /* the input values are never read, so no contiguous copy of input is needed */
   gradOutput = THTensor_(newContiguous)(gradOutput);
   gradInput_data = THTensor_(data)(gradInput);
   gradOutput_data = THTensor_(data)(gradOutput);
   /* planes are independent, so the outer loop is shared between threads */
 #pragma omp parallel for private(k)
   for(k = 0; k < nInputPlane; k++)
   {
     long p;
     for(p = 0; p < nbatch; p++)
     {
       real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
       real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
       long xx, yy;
       long i;
       /* clear this plane before scattering gradients into it */
       for(i=0; i<inputWidth*inputHeight; i++)
         ptr_gradInput[i] = 0.0;
       for(yy = 0; yy < outputHeight; yy++)
       {
         for(xx = 0; xx < outputWidth; xx++)
         {
           long hstart = yy * dH - padH;
           long wstart = xx * dW - padW;
           long hend = hstart + kH;
           long wend = wstart + kW;
           int pool_size;
           int divide_factor;
           real z;
           long kx, ky;
           /* clip the window to the padded image (integer compare, no float round-trip) */
           if (hend > inputHeight + padH)
             hend = inputHeight + padH;
           if (wend > inputWidth + padW)
             wend = inputWidth + padW;
           pool_size = (hend - hstart) * (wend - wstart);
           /* then clip to the real input so only valid pixels are written */
           if (hstart < 0)
             hstart = 0;
           if (wstart < 0)
             wstart = 0;
           if (hend > inputHeight)
             hend = inputHeight;
           if (wend > inputWidth)
             wend = inputWidth;
           z = *ptr_gradOutput++;
           if(count_include_pad)
             divide_factor = pool_size;
           else
             divide_factor = (hend - hstart) * (wend - wstart);
           /* each covered input pixel receives an equal share of this output gradient */
           for(ky = hstart ; ky < hend; ky++)
           {
             for(kx = wstart; kx < wend; kx++)
               ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
           }
         }
       }
     }
   }
   THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialClassNLLCriterion.c
+++ b/torch/lib/THNN/generic/SpatialClassNLLCriterion.c
@ -0,0 +1,128 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
 #else
 /*
  * Shared argument validation for the SpatialClassNLLCriterion entry points.
  * Expands in a scope where `input`, `target` and `weights` are visible and:
  *   - requires `target` to be 3D and `input` to be 4D;
  *   - when `weights` is non-NULL, requires its element count to equal
  *     size 1 of `input`;
  *   - asserts that input sizes 0, 2, 3 match target sizes 0, 1, 2.
  * The expansion is several statements followed by a braced block, so it must
  * be used as a stand-alone statement (not as the body of an unbraced if/else).
  * NOTE(review): THAssertMsg presumably embeds the text of its condition in
  * the failure message, so the local names below may be user-visible - confirm
  * before renaming them.
  */
 #define INITIAL_CHECK                                                            \
   THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3,                         \
               "only batches of spatial targets supported (3D tensors)");         \
   THArgCheck(THTensor_(nDimension)(input) == 4, 2,                               \
               "only batches of spatial inputs supported (4D tensors)");          \
   if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) {    \
     THError("weight tensor should be defined either for all or no classes");     \
   }                                                                              \
                                                                                  \
   {                                                                              \
     long input0 = THTensor_(size)(input, 0);                                     \
     long input1 = THTensor_(size)(input, 1);                                     \
     long input2 = THTensor_(size)(input, 2);                                     \
     long input3 = THTensor_(size)(input, 3);                                     \
     long target0 = THIndexTensor_(size)(target, 0);                              \
     long target1 = THIndexTensor_(size)(target, 1);                              \
     long target2 = THIndexTensor_(size)(target, 2);                              \
     THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2,     \
               "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
               input0, input1, input2, input3, target0, target1, target2);        \
   }
 /*
  * Forward pass of the spatial class negative log-likelihood criterion.
  *
  *   input         4D tensor (batch x class x height x width), checked by
  *                 INITIAL_CHECK; read as one value per class per location.
  *   target        3D index tensor (batch x height x width) holding class
  *                 indices offset by TH_INDEX_BASE.
  *   output        receives the loss in its first element.
  *   sizeAverage   when true (and the accumulated weight is non-zero) the
  *                 loss is divided by the accumulated weight.
  *   weights       optional per-class weights; NULL means weight 1 everywhere.
  *   total_weight  receives the accumulated weight in its first element.
  *
  * `state` is not used by this implementation.
  */
 void THNN_(SpatialClassNLLCriterion_updateOutput)(
           THNNState *state,
           THTensor *input,
           THIndexTensor *target,
           THTensor *output,
           bool sizeAverage,
           THTensor *weights,
           THTensor *total_weight)
 {
   INITIAL_CHECK;
   /* contiguous views/copies so the flat indexing below is valid */
   input = THTensor_(newContiguous)(input);
   target = THIndexTensor_(newContiguous)(target);
   if (weights)
     weights = THTensor_(newContiguous)(weights);
   real *input_data = THTensor_(data)(input);
   THIndex_t *target_data = THIndexTensor_(data)(target);
   real *weights_data = NULL;
   if (weights)
     weights_data = THTensor_(data)(weights);
   real *output_data = THTensor_(data)(output);
   real *total_weight_data = THTensor_(data)(total_weight);
   long batch_size = THTensor_(size)(input, 0);
   long n_classes = THTensor_(size)(input, 1);
   long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
   long sample_size = map_size * n_classes;
   real weight_sum = 0;
   real loss_sum = 0;
   int b;
   for (b = 0; b < batch_size; b++) {
     /* start of this sample's target map and of its class scores */
     THIndex_t *target_map = target_data + b * map_size;
     real *sample = input_data + b * sample_size;
     int elem;
     for (elem = 0; elem < map_size; elem++) {
       int cur_target = target_map[elem] - TH_INDEX_BASE;
       THAssert(cur_target >= 0 && cur_target < n_classes);
       real cur_weight = 1.0f;
       if (weights)
         cur_weight = weights_data[cur_target];
       weight_sum += cur_weight;
       /* pick the score of the target class at this location */
       loss_sum -= sample[cur_target * map_size + elem] * cur_weight;
     }
   }
   *total_weight_data = weight_sum;
   *output_data = loss_sum;
   if (sizeAverage && *total_weight_data)
     *output_data /= *total_weight_data;
   /* release the references taken above */
   if (weights)
     THTensor_(free)(weights);
   THIndexTensor_(free)(target);
   THTensor_(free)(input);
 }
 /*
  * Backward pass of the spatial class negative log-likelihood criterion.
  *
  *   input         4D tensor; only its sizes are read here.
  *   target        3D index tensor of class indices offset by TH_INDEX_BASE.
  *   gradInput     must be contiguous; only the entry of the target class at
  *                 each location is written, every other entry is left as is.
  *   sizeAverage   when true the written gradient is divided by *total_weight.
  *   weights       optional per-class weights; NULL means weight 1 everywhere.
  *   total_weight  first element is read; when it is <= 0 the function
  *                 returns without writing to gradInput.
  *
  * `state` is not used by this implementation.
  */
 void THNN_(SpatialClassNLLCriterion_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THIndexTensor *target,
           THTensor *gradInput,
           bool sizeAverage,
           THTensor *weights,
           THTensor *total_weight)
 {
   INITIAL_CHECK;
   THArgCheck(THTensor_(isContiguous)(gradInput), 4,
               "gradInput must be contiguous");
   real *total_weight_data = THTensor_(data)(total_weight);
   /* nothing contributed to the loss: leave gradInput untouched */
   if (*total_weight_data <= 0)
     return;
   target = THIndexTensor_(newContiguous)(target);
   if (weights)
     weights = THTensor_(newContiguous)(weights);
   THIndex_t *target_data = THIndexTensor_(data)(target);
   real *weights_data = NULL;
   if (weights)
     weights_data = THTensor_(data)(weights);
   real *gradInput_data = THTensor_(data)(gradInput);
   long batch_size = THTensor_(size)(input, 0);
   long n_classes = THTensor_(size)(input, 1);
   long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
   long sample_size = map_size * n_classes;
   real normalize = 1.0f;
   if (sizeAverage)
     normalize = *total_weight_data;
   int b;
   /* samples write to disjoint slices of gradInput */
   #pragma omp parallel for
   for (b = 0; b < batch_size; b++) {
     THIndex_t *target_map = target_data + b * map_size;
     real *grad_sample = gradInput_data + b * sample_size;
     int elem;
     for (elem = 0; elem < map_size; elem++) {
       int cur_target = target_map[elem] - TH_INDEX_BASE;
       THAssert(cur_target >= 0 && cur_target < n_classes);
       real cur_weight = 1.0f;
       if (weights)
         cur_weight = weights_data[cur_target];
       grad_sample[cur_target * map_size + elem] = -cur_weight / normalize;
     }
   }
   /* release the references taken above */
   if (weights)
     THTensor_(free)(weights);
   THIndexTensor_(free)(target);
 }
 #undef INITIAL_CHECK
 #endif
--- a/torch/lib/THNN/generic/SpatialConvolutionLocal.c
+++ b/torch/lib/THNN/generic/SpatialConvolutionLocal.c
@ -0,0 +1,241 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
 #else
 /*
  * Forward pass of the locally-connected convolution for a single frame.
  *
  * Unfolds `input` into `finput`, initialises `output` with a copy of `bias`,
  * then adds one small matrix product per output location using baddbmm.
  * `output` and `finput` are re-viewed (sharing storage) as batches of column
  * vectors indexed by output location; the stride layout used for the views
  * assumes both tensors are stored contiguously - TODO confirm with callers.
  */
 static void THNN_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
                                                          int kW, int kH, int dW, int dH, int padW, int padH,
                                                          long nInputPlane, long inputWidth, long inputHeight,
                                                          long nOutputPlane, long outputWidth, long outputHeight)
 {
   THTensor *output3d, *finput3d;
   THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
   /* start from the bias; the products below are accumulated on top of it */
   THTensor_(copy)(output, bias);
   /* views over the existing storage: (oH*oW) x nOutputPlane x 1 */
   output3d = THTensor_(newWithStorage3d)(output->storage, output->storageOffset,
                                          outputHeight*outputWidth, 1,
                                          nOutputPlane, outputHeight*outputWidth,
                                          1, nOutputPlane*outputHeight*outputWidth);
   /* and (oH*oW) x (kW*kH*nInputPlane) x 1 */
   finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
                                          outputHeight*outputWidth, 1,
                                          kW*kH*nInputPlane, outputHeight*outputWidth,
                                          1, kW*kH*nInputPlane*outputHeight*outputWidth);
   // weight:    oH*oW x nOutputPlane x nInputPlane*kH*kW
   // finput3d:  oH*oW x nInputPlane*kH*kW x 1  
   THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
   // output3d:  oH*oW x nOutputPlane x 1
   THTensor_(free)(output3d);
   THTensor_(free)(finput3d);
 }
 /*
  * Forward pass of the locally-connected convolution.
  *
  * The plane counts are derived from `weight` (size 1 is the number of output
  * planes, size 2 divided by kW*kH the number of input planes). `finput` and
  * `output` are resized here. A 3D `input` is handled as one frame; anything
  * else is treated as a batch along dimension 0, with the frames processed
  * in an OpenMP parallel loop. `state` and `fgradInput` are not used.
  */
 void THNN_(SpatialConvolutionLocal_updateOutput)(
     THNNState *state,
     THTensor *input,
     THTensor *output,
     THTensor *weight,
     THTensor *bias,
     THTensor *finput,
     THTensor *fgradInput,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight)
 {
   long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
   long nOutputPlane = THTensor_(size)(weight,1);
   if(input->nDimension != 3)
   {
     /* batch mode: one frame per slice along dimension 0 */
     long batchSize = input->size[0];
     long frame;
     THTensor_(resize3d)(finput, batchSize, kW*kH*nInputPlane, outputHeight*outputWidth);
     THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
 #pragma omp parallel for private(frame)
     for(frame = 0; frame < batchSize; frame++)
     {
       THTensor *inputFrame = THTensor_(newSelect)(input, 0, frame);
       THTensor *outputFrame = THTensor_(newSelect)(output, 0, frame);
       THTensor *finputFrame = THTensor_(newSelect)(finput, 0, frame);
       THNN_(SpatialConvolutionLocal_updateOutput_frame)(inputFrame, outputFrame, weight, bias, finputFrame,
                                                    kW, kH, dW, dH, padW, padH,
                                                    nInputPlane, inputWidth, inputHeight,
                                                    nOutputPlane, outputWidth, outputHeight);
       THTensor_(free)(inputFrame);
       THTensor_(free)(outputFrame);
       THTensor_(free)(finputFrame);
     }
   }
   else
   {
     /* single frame */
     THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
     THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
     THNN_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput,
                                                  kW, kH, dW, dH, padW, padH,
                                                  nInputPlane, inputWidth, inputHeight,
                                                  nOutputPlane, outputWidth, outputHeight);
   }
 }
 /*
  * Backward pass (gradient w.r.t. the input) of the locally-connected
  * convolution for a single frame.
  *
  * `weight` is expected already transposed by the caller (see the shape
  * comments below). The per-location products overwrite `fgradInput`
  * (beta = 0), after which `gradInput` is zeroed and the unfolded gradient is
  * accumulated into it through unfolded_acc.
  */
 static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
                                                             int kW, int kH, int dW, int dH, int padW, int padH, 
                                                             long nInputPlane, long inputWidth, long inputHeight,
                                                             long nOutputPlane, long outputWidth, long outputHeight)
 {
   /* number of output locations and rows of the unfolded buffer */
   const long nLocations = outputHeight*outputWidth;
   const long unfoldedRows = kW*kH*nInputPlane;
   /* storage-sharing views with the output location as the batch dimension */
   THTensor *gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
                                                        nLocations, 1,
                                                        nOutputPlane, nLocations,
                                                        1, nOutputPlane*nLocations);
   THTensor *fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
                                                        nLocations, 1,
                                                        unfoldedRows, nLocations,
                                                        1, unfoldedRows*nLocations);
   // weight:        oH*oW x nInputPlane*kH*kW x nOutputPlane
   // gradOutput3d:  oH*oW x nOutputPlane x 1         
   THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
   // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1  
   THTensor_(free)(gradOutput3d);
   THTensor_(free)(fgradInput3d);
   /* fold the unfolded gradient back onto the input layout */
   THTensor_(zero)(gradInput);
   THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
                       nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
 }
 /*
  * Backward pass (gradient w.r.t. the input) of the locally-connected
  * convolution.
  *
  * `gradInput` is resized like `input` and `fgradInput` like `finput`.
  * `weight` is transposed in place (dimensions 1 and 2) for the duration of
  * the computation and transposed back before returning. A 3D `input` is one
  * frame; otherwise dimension 0 is a batch processed in an OpenMP parallel
  * loop. `state` is not used.
  */
 void THNN_(SpatialConvolutionLocal_updateGradInput)(
     THNNState *state,
     THTensor *input,
     THTensor *gradOutput,
     THTensor *gradInput,
     THTensor *weight,
     THTensor *finput,
     THTensor *fgradInput,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight)
 {
   /* plane counts are read before the weight is transposed */
   long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
   long nOutputPlane = THTensor_(size)(weight,1);
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
   THTensor_(transpose)(weight, weight, 1, 2);
   if(input->nDimension != 3)
   {
     /* batch mode: one frame per slice along dimension 0 */
     long batchSize = input->size[0];
     long frame;
 #pragma omp parallel for private(frame)
     for(frame = 0; frame < batchSize; frame++)
     {
       THTensor *gradInputFrame = THTensor_(newSelect)(gradInput, 0, frame);
       THTensor *gradOutputFrame = THTensor_(newSelect)(gradOutput, 0, frame);
       THTensor *fgradInputFrame = THTensor_(newSelect)(fgradInput, 0, frame);
       THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInputFrame, gradOutputFrame, weight, fgradInputFrame, kW, kH, dW, dH, padW, padH,
                                                          nInputPlane, inputWidth, inputHeight,
                                                          nOutputPlane, outputWidth, outputHeight);
       THTensor_(free)(gradInputFrame);
       THTensor_(free)(gradOutputFrame);
       THTensor_(free)(fgradInputFrame);
     }
   }
   else
   {
     /* single frame */
     THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH,
                                                        nInputPlane, inputWidth, inputHeight,
                                                        nOutputPlane, outputWidth, outputHeight);
   }
   /* undo the in-place transpose so the caller sees the weight unchanged */
   THTensor_(transpose)(weight, weight, 1, 2);
 }
 /*
  * Accumulates the parameter gradients of the locally-connected convolution
  * for a single frame.
  *
  * For every output location the outer product of the output gradient and
  * the unfolded input is added to `gradWeight`, scaled by `scale`;
  * `gradBias` receives `scale` times `gradOutput`. Both are accumulated
  * into, not overwritten.
  */
 static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale, 
                                                             int kW, int kH, int dW, int dH, int padW, int padH, 
                                                             long nInputPlane, long inputWidth, long inputHeight,
                                                             long nOutputPlane, long outputWidth, long outputHeight)
 {
   /* number of output locations and rows of the unfolded buffer */
   const long nLocations = outputHeight*outputWidth;
   const long unfoldedRows = kW*kH*nInputPlane;
   /* storage-sharing views with the output location as the batch dimension */
   THTensor *gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
                                                        nLocations, 1,
                                                        nOutputPlane, nLocations,
                                                        1, nOutputPlane*nLocations);
   THTensor *finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
                                                    nLocations, 1,
                                                    1, unfoldedRows*nLocations,
                                                    unfoldedRows, nLocations);
   // gradOutput3d:  oH*oW x nOutputPlane x 1  
   // finput3d:      oH*oW x 1 x kW*kH*nInputPlane
   THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
   // gradWeight:    oH*oW x nOutputPlane x kW*kH*nInputPlane
   THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
   THTensor_(free)(gradOutput3d);
   THTensor_(free)(finput3d);
 }
 /*
  * Accumulates the parameter gradients of the locally-connected convolution.
  *
  * The plane counts are derived from `gradWeight` (size 1 is the number of
  * output planes, size 2 divided by kW*kH the number of input planes). A 3D
  * `input` is one frame; otherwise the frames along dimension 0 are processed
  * one after another, each adding into the same `gradWeight` / `gradBias`.
  * `state` and `fgradInput` are not used.
  */
 void THNN_(SpatialConvolutionLocal_accGradParameters)(
     THNNState *state,
     THTensor *input,
     THTensor *gradOutput,
     THTensor *gradWeight,
     THTensor *gradBias,
     THTensor *finput,
     THTensor *fgradInput,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight,
     real scale)
 {
   long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
   long nOutputPlane = THTensor_(size)(gradWeight,1);
   if(input->nDimension != 3)
   {
     /* batch mode: sequential, since every frame updates the same gradients */
     long batchSize = input->size[0];
     long frame;
     for(frame = 0; frame < batchSize; frame++)
     {
       THTensor *gradOutputFrame = THTensor_(newSelect)(gradOutput, 0, frame);
       THTensor *finputFrame = THTensor_(newSelect)(finput, 0, frame);
       THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutputFrame, gradWeight, gradBias, finputFrame, scale, kW, kH, dW, dH, padW, padH,
                                                            nInputPlane, inputWidth, inputHeight,
                                                            nOutputPlane, outputWidth, outputHeight);
       THTensor_(free)(gradOutputFrame);
       THTensor_(free)(finputFrame);
     }
   }
   else
   {
     /* single frame */
     THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH,
                                                          nInputPlane, inputWidth, inputHeight,
                                                          nOutputPlane, outputWidth, outputHeight);
   }
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/torch/lib/THNN/generic/SpatialConvolutionMM.c
@ -0,0 +1,284 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
 #else
 /*
  * Forward pass of the matrix-multiply convolution for a single frame.
  *
  * Unfolds `input` into `finput`, fills every output plane with its bias
  * value (or zeroes `output` when `bias` is NULL), then adds
  * weight * finput through a 2D view that shares storage with `output`.
  * NOTE(review): the -1 strides passed to newWithStorage2d presumably ask TH
  * to derive contiguous strides - confirm against the TH tensor API.
  */
 static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias,
           THTensor *finput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH,
           long nInputPlane,
           long inputWidth,
           long inputHeight,
           long nOutputPlane,
           long outputWidth,
           long outputHeight)
 {
   /* number of elements in one output plane */
   const long planeSize = outputHeight*outputWidth;
   THTensor *output2d;
   THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
   /* nOutputPlane x (oH*oW) view over the output storage */
   output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
                                          nOutputPlane, -1,
                                          planeSize, -1);
   if (!bias) {
     THTensor_(zero)(output);
   } else {
     real *outputBase = output->storage->data + output->storageOffset;
     long plane;
     for(plane = 0; plane < nOutputPlane; plane++)
       THVector_(fill)(outputBase + output->stride[0]*plane, THTensor_(get1d)(bias, plane), planeSize);
   }
   /* output2d += weight * finput */
   THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
   THTensor_(free)(output2d);
 }
 /*
  * Forward pass of the matrix-multiply (unfold + GEMM) 2D convolution.
  *
  *   input   3D (plane x height x width) or 4D (batch x plane x height x width).
  *   output  resized here to (nOutputPlane x oH x oW), with a leading batch
  *           dimension for 4D input.
  *   weight  2D: size 0 is the number of output planes, size 1 must equal
  *           nInputPlane*kW*kH.
  *   bias    forwarded to the per-frame helper, which treats NULL as "no bias".
  *   finput  scratch buffer for the unfolded input; resized here.
  *   kW, kH  kernel size (must be > 0); dW, dH stride (must be > 0);
  *   padW, padH  padding added on each side.
  *
  * Raises a TH error when the computed output size is smaller than 1 or the
  * channel count does not match the weight. In batch mode the frames are
  * processed in an OpenMP parallel loop. `state` and `fgradInput` are unused.
  */
 void THNN_(SpatialConvolutionMM_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias,
           THTensor *finput,
           THTensor *fgradInput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH)
 {
   int dimf = 0;
   int dimw = 2;
   int dimh = 1;
   long nInputPlane;
   long inputWidth;
   long inputHeight;
   long nOutputPlane;
   long outputWidth;
   long outputHeight;
   THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
   THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
   /* batch mode: every dimension index shifts by one */
   if (input->nDimension == 4) {
     dimf++;
     dimw++;
     dimh++;
   }
   nInputPlane = input->size[dimf];
   inputWidth   = input->size[dimw];
   inputHeight  = input->size[dimh];
   nOutputPlane = weight->size[0];
   outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
   outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
   /* all reported values are long, so the conversions must be %ld, not %d */
   if (outputWidth < 1 || outputHeight < 1)
     THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
         nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
   if (nInputPlane*kW*kH != weight->size[1])
     THError("Wrong number of input channels! Input has %ld channels, expected %ld",nInputPlane,(long)(weight->size[1]/(kW*kH)));
   if(input->nDimension == 3)
   {
     THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
     THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
     THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput,
                                                  kW, kH, dW, dH, padW, padH,
                                                  nInputPlane, inputWidth, inputHeight,
                                                  nOutputPlane, outputWidth, outputHeight);
   }
   else
   {
     long T = input->size[0];
     long t;
     THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
     THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
     /* each frame works on its own slices of input, output and finput */
 #pragma omp parallel for private(t)
     for(t = 0; t < T; t++)
     {
       THTensor *input_t = THTensor_(newSelect)(input, 0, t);
       THTensor *output_t = THTensor_(newSelect)(output, 0, t);
       THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
       THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
                                                    kW, kH, dW, dH, padW, padH,
                                                    nInputPlane, inputWidth, inputHeight,
                                                    nOutputPlane, outputWidth, outputHeight);
       THTensor_(free)(input_t);
       THTensor_(free)(output_t);
       THTensor_(free)(finput_t);
     }
   }
 }
 /*
  * Backward pass (gradient w.r.t. the input) of the matrix-multiply
  * convolution for a single frame.
  *
  * Overwrites `fgradInput` with weight * gradOutput (beta = 0), where
  * `gradOutput` is viewed as a 2D matrix sharing its storage, then zeroes
  * `gradInput` and accumulates the unfolded gradient into it. `weight` is
  * passed through as given; the public entry point transposes it beforehand.
  */
 static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
           THTensor *gradInput,
           THTensor *gradOutput,
           THTensor *weight,
           THTensor *fgradInput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH)
 {
   /* frame geometry, read from the 3D gradient tensors */
   const long nInputPlane = gradInput->size[0];
   const long inputHeight = gradInput->size[1];
   const long inputWidth = gradInput->size[2];
   const long outputHeight = gradOutput->size[1];
   const long outputWidth = gradOutput->size[2];
   /* size[0] x (size[1]*size[2]) view over the gradOutput storage */
   THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
                                                        gradOutput->size[0], -1,
                                                        outputHeight*outputWidth, -1);
   THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
   THTensor_(free)(gradOutput2d);
   /* fold the unfolded gradient back onto the input layout */
   THTensor_(zero)(gradInput);
   THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
                       nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
 }
 /*
  * Backward pass (gradient w.r.t. the input) of the matrix-multiply
  * convolution.
  *
  * Checks that the plane dimension of `gradOutput` matches weight size 0 and
  * that kernel and stride are positive. `gradInput` is resized like `input`,
  * `fgradInput` like `finput` and then zeroed. `weight` is transposed in
  * place (dimensions 0 and 1) for the computation and transposed back before
  * returning. A 3D `input` is one frame; otherwise dimension 0 is a batch
  * processed in an OpenMP parallel loop. `state` is not used.
  */
 void THNN_(SpatialConvolutionMM_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *weight,
           THTensor *finput,
           THTensor *fgradInput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH)
 {
   long nOutputPlane = weight->size[0];
   THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
   THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
   // depending on the BLAS library, fgradInput (result tensor) might
   // be left uninitialized on zero alpha, which might lead to weird behavior
   // hence, to be safe, zero it
   THTensor_(zero)(fgradInput); 
   THTensor_(transpose)(weight, weight, 0, 1);
   if(input->nDimension != 3)
   {
     /* batch mode: one frame per slice along dimension 0 */
     long batchSize = input->size[0];
     long frame;
 #pragma omp parallel for private(frame)
     for(frame = 0; frame < batchSize; frame++)
     {
       THTensor *gradInputFrame = THTensor_(newSelect)(gradInput, 0, frame);
       THTensor *gradOutputFrame = THTensor_(newSelect)(gradOutput, 0, frame);
       THTensor *fgradInputFrame = THTensor_(newSelect)(fgradInput, 0, frame);
       THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInputFrame, gradOutputFrame, weight, fgradInputFrame, kW, kH, dW, dH, padW, padH);
       THTensor_(free)(gradInputFrame);
       THTensor_(free)(gradOutputFrame);
       THTensor_(free)(fgradInputFrame);
     }
   }
   else
   {
     /* single frame */
     THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH);
   }
   /* undo the in-place transpose so the caller sees the weight unchanged */
   THTensor_(transpose)(weight, weight, 0, 1);
 }
 /*
  * Accumulates the parameter gradients of the matrix-multiply convolution
  * for a single frame.
  *
  * gradWeight += scale * gradOutput2d * finput^T, where `finput` is
  * transposed in place for the product and restored afterwards. When
  * `gradBias` is non-NULL, each of its entries is increased by `scale` times
  * the sum of the matching row of the 2D gradOutput view.
  */
 static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *finput,
           real scale)
 {
   /* size[0] x (size[1]*size[2]) view over the gradOutput storage */
   THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
                                                        gradOutput->size[0], -1,
                                                        gradOutput->size[1]*gradOutput->size[2], -1);
   THTensor_(transpose)(finput, finput, 0, 1);
   THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
   THTensor_(transpose)(finput, finput, 0, 1);
   if (gradBias) {
     real *gradBiasData = gradBias->storage->data + gradBias->storageOffset;
     real *gradOutputBase = gradOutput2d->storage->data + gradOutput2d->storageOffset;
     long plane;
     for(plane = 0; plane < gradBias->size[0]; plane++)
     {
       /* sum one row of the 2D view, in index order */
       real *row = gradOutputBase + plane*gradOutput2d->stride[0];
       real rowSum = 0;
       long col;
       for(col = 0; col < gradOutput2d->size[1]; col++)
         rowSum += row[col];
       gradBiasData[plane] += scale*rowSum;
     }
   }
   THTensor_(free)(gradOutput2d);
 }
 /*
  * Accumulates the parameter gradients of the matrix-multiply convolution.
  *
  * Checks that the plane dimension of `gradOutput` matches gradWeight size 0
  * and that kernel and stride are positive. A 3D `input` is one frame;
  * otherwise the frames along dimension 0 are processed one after another,
  * each adding into the same `gradWeight` / `gradBias`. `state`,
  * `fgradInput`, padW and padH are not used.
  */
 void THNN_(SpatialConvolutionMM_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *finput,
           THTensor *fgradInput,
           int kW,
           int kH,
           int dW,
           int dH,
           int padW,
           int padH,
           real scale)
 {
   long nOutputPlane = gradWeight->size[0];
   THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
   THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
   if(input->nDimension != 3)
   {
     /* batch mode: sequential, since every frame updates the same gradients */
     long batchSize = input->size[0];
     long frame;
     for(frame = 0; frame < batchSize; frame++)
     {
       THTensor *gradOutputFrame = THTensor_(newSelect)(gradOutput, 0, frame);
       THTensor *finputFrame = THTensor_(newSelect)(finput, 0, frame);
       THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutputFrame, gradWeight, gradBias, finputFrame, scale);
       THTensor_(free)(gradOutputFrame);
       THTensor_(free)(finputFrame);
     }
   }
   else
   {
     /* single frame */
     THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
   }
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialConvolutionMap.c
+++ b/torch/lib/THNN/generic/SpatialConvolutionMap.c
@ -0,0 +1,259 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
 #else
 /*
  * Forward pass of the connection-table ("map") convolution.
  *
  * connTable holds one (input plane, output plane) pair per kernel, stored as
  * two consecutive values per row and offset by TH_INDEX_BASE. Every output
  * plane is first filled with its bias, then each kernel whose output index
  * matches that plane adds a valid cross-correlation of its input plane.
  *
  * input  : 3D, or 4D in batch mode (checked below).
  * output : resized to nOutputPlane x oH x oW (with a leading batch dim in
  *          batch mode).
  * weight : 3D, connTable->size[0] x kH x kW (checked below).
  *
  * weight, bias and connTable are read through their raw data pointers, so
  * they are presumably expected to be contiguous -- this is not verified here.
  */
 void THNN_(SpatialConvolutionMap_updateOutput)(
  THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH)
 {
  THArgCheck(
    weight != NULL && weight->nDimension == 3
    && connTable != NULL && connTable->size[0] == weight->size[0], 4,
    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  real *weight_data = THTensor_(data)(weight);
  real *bias_data = THTensor_(data)(bias);
  real *connTable_data = THTensor_(data)(connTable);

  /* dimension indices; each shifts by one in batch mode */
  int dimw = 2;
  int dimh = 1;
  int dimc = 0;
  long nbatch = 1;

  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");

  if (input->nDimension == 4)
  {
    nbatch = input->size[0];
    dimc++;
    dimw++;
    dimh++;
  }

  const long kH       = weight->size[1];
  const long kW       = weight->size[2];

  THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");

  const long input_w  = input->size[dimw];
  const long input_h  = input->size[dimh];
  /* The input may legally carry more planes than nInputPlane (see the check
     above), so the per-sample stride must use the real plane count. */
  const long input_planes = input->size[dimc];
  const long output_w = (input_w - kW) / dW + 1;
  const long output_h = (input_h - kH) / dH + 1;

  if (input->nDimension == 3)
    THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
  else
    THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);

  /* contiguous working tensors; output_c may be a separate copy of output */
  input = THTensor_(newContiguous)(input);
  THTensor *output_c = THTensor_(newContiguous)(output);

  /* get raw pointers */
  real *input_data = THTensor_(data)(input);
  real *output_data = THTensor_(data)(output_c);

  long p;
 #pragma omp parallel for private(p)
  for (p = 0; p < nOutputPlane; p++)
  {
    long m;
    for (m = 0; m < nbatch; m++)
    {
      /* add bias */
      real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
      long j, k;
      real z= bias_data[p];
      for (j = 0; j < output_h*output_w; j++)
        ptr_output[j] = z;

      /* convolve all maps */
      int nweight = connTable->size[0];
      for (k = 0; k < nweight; k++)
      {
        /* get offsets for input/output */
        int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
        int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;

        /* each iteration of the parallel loop only touches its own output plane */
        if (o == p)
        {
          THTensor_(validXCorr2Dptr)(
            output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
            1.0,
            input_data + i*input_w*input_h + m*input_planes*input_w*input_h, input_h, input_w,
            weight_data + k*kW*kH,
            kH, kW,
            dH, dW
          );
        }
      }
    }
  }

  /* clean up; copy the result back in case output was not contiguous
     (freeCopyTo skips the copy when both arguments are the same tensor) */
  THTensor_(free)(input);
  THTensor_(freeCopyTo)(output_c, output);
 }
 /*
  * Backward pass of the connection-table convolution w.r.t. its input.
  *
  * gradInput is resized like input and zeroed; then, for every
  * (input plane, output plane) pair listed in connTable, the full convolution
  * of the matching gradOutput plane with that pair's kernel is added to the
  * input plane's gradient.
  *
  * bias is not read by this function. weight and connTable are accessed
  * through raw data pointers, so they are presumably expected to be
  * contiguous -- this is not verified here.
  */
 void THNN_(SpatialConvolutionMap_updateGradInput)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH)
 {
  THArgCheck(
    weight != NULL && weight->nDimension == 3
    && connTable != NULL && connTable->size[0] == weight->size[0], 5,
    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  real *weight_data = THTensor_(data)(weight);
  real *connTable_data = THTensor_(data)(connTable);

  /* and dims; each shifts by one in batch mode */
  int dimw = 2;
  int dimh = 1;
  int dimc = 0;
  long nbatch = 1;
  if (input->nDimension == 4)
  {
    nbatch = input->size[0];
    dimc++;
    dimw++;
    dimh++;
  }

  const long input_h  = input->size[dimh];
  const long input_w  = input->size[dimw];
  /* gradInput takes the shape of input, so its per-sample stride follows
     input's real plane count rather than nInputPlane */
  const long input_planes = input->size[dimc];
  const long output_h = gradOutput->size[dimh];
  const long output_w = gradOutput->size[dimw];
  const long kH       = weight->size[1];
  const long kW       = weight->size[2];

  /* Resize the caller's tensor first, so that a contiguous working copy (if
     one is needed) already has the final shape */
  THTensor_(resizeAs)(gradInput, input);

  /* contiguous working tensors; gradInput_c may be a separate copy */
  THTensor *gradInput_c = THTensor_(newContiguous)(gradInput);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* gradients are accumulated below, so start from zero */
  THTensor_(zero)(gradInput_c);

  /* get raw pointers */
  real *gradInput_data = THTensor_(data)(gradInput_c);
  real *gradOutput_data = THTensor_(data)(gradOutput);

  long p;
 #pragma omp parallel for private(p)
  for (p = 0; p < nInputPlane; p++)
  {
    long m;
    for (m = 0; m < nbatch; m++)
    {
      long k;
      /* backward all */
      int nkernel = connTable->size[0];
      for (k = 0; k < nkernel; k++)
      {
        int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
        int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;

        /* each iteration of the parallel loop only touches its own input plane */
        if (i == p)
        {
          /* gradient to input */
          THTensor_(fullConv2Dptr)(
            gradInput_data + i*input_w*input_h + m*input_planes*input_w*input_h, 1.0,
            gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,  output_h,  output_w,
            weight_data + k*kW*kH, kH, kW, dH, dW
          );
        }
      }
    }
  }

  /* clean up; copy the result back in case gradInput was not contiguous
     (freeCopyTo skips the copy when both arguments are the same tensor) */
  THTensor_(freeCopyTo)(gradInput_c, gradInput);
  THTensor_(free)(gradOutput);
 }
 /*
  * Accumulates parameter gradients of the connection-table convolution.
  *
  * gradBias[plane] receives scale * (sum of that gradOutput plane) over every
  * sample. gradWeight[kernel] receives, for each sample, the reversed valid
  * cross-correlation of the kernel's input plane with its gradOutput plane,
  * scaled by `scale`. The (input, output) plane pair of each kernel is read
  * from connTable and is offset by TH_INDEX_BASE.
  *
  * gradWeight and gradBias are written through raw data pointers, so they are
  * presumably expected to be contiguous -- this is not verified here.
  */
 void THNN_(SpatialConvolutionMap_accGradParameters)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH, real scale)
 {
  THArgCheck(
    gradWeight != NULL && gradWeight->nDimension == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  real *gradWeight_data = THTensor_(data)(gradWeight);
  real *gradBias_data = THTensor_(data)(gradBias);

  /* spatial dimension indices shift by one when a batch dimension is present */
  const int batched = (input->nDimension == 4);
  const int dimh = batched ? 2 : 1;
  const int dimw = batched ? 3 : 2;
  const long nbatch = batched ? input->size[0] : 1;

  const long input_h  = input->size[dimh];
  const long input_w  = input->size[dimw];
  const long output_h = gradOutput->size[dimh];
  const long output_w = gradOutput->size[dimw];
  const long kH       = gradWeight->size[1];
  const long kW       = gradWeight->size[2];

  /* read-only operands: work on contiguous versions */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  real *input_data = THTensor_(data)(input);
  real *gradOutput_data = THTensor_(data)(gradOutput);

  /* bias gradient: one accumulator per output plane */
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nOutputPlane; plane++)
  {
    long sample;
    for (sample = 0; sample < nbatch; sample++)
    {
      real *gradPlane = gradOutput_data + plane*output_w*output_h + sample*nOutputPlane*output_w*output_h;
      long px;
      for (px = 0; px < output_h*output_w; px++)
        gradBias_data[plane] += scale*gradPlane[px];
    }
  }

  /* weight gradient: one kernel per connection-table row */
  const int nkernel = connTable->size[0];
  long kernel;
 #pragma omp parallel for private(kernel)
  for (kernel = 0; kernel < nkernel; kernel++)
  {
    long sample;
    for (sample = 0; sample < nbatch; sample++)
    {
      /* column 1 is the output plane, column 0 the input plane */
      int o = (int)THTensor_(get2d)(connTable,kernel,1) - TH_INDEX_BASE;
      int i = (int)THTensor_(get2d)(connTable,kernel,0) - TH_INDEX_BASE;

      real *kernelGrad = gradWeight_data + kernel*kW*kH;
      real *inputPlane = input_data + i*input_w*input_h + sample*nInputPlane*input_w*input_h;
      real *gradPlane = gradOutput_data + o*output_w*output_h + sample*nOutputPlane*output_w*output_h;

      /* gradient to kernel */
      THTensor_(validXCorr2DRevptr)(
        kernelGrad,
        scale,
        inputPlane, input_h, input_w,
        gradPlane, output_h, output_w,
        dH, dW
      );
    }
  }

  /* release the contiguous versions */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/torch/lib/THNN/generic/SpatialDilatedConvolution.c
@ -0,0 +1,339 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
 #else
 // Forward pass of the dilated 2D convolution, computed per sample as
 // im2col followed by a GEMM against the weights.
 //
 // input   : 3D, or 4D in batch mode. A 3D input is temporarily resized in
 //           place to a batch of one and restored before returning.
 // output  : resized to (batch x) nOutputPlane x outputHeight x outputWidth.
 // weight  : 4D (nOutputPlane, nInputPlane, kH, kW).
 // bias    : optional; when NULL the output starts from zero instead.
 // columns : scratch buffer, resized to (nInputPlane*kW*kH) x (oH*oW).
 // ones    : scratch buffer of ones used to broadcast the bias via GEMM.
 //
 // Tensor data is read through raw pointers, so input, weight and bias are
 // presumably expected to be contiguous -- this is not verified here.
 void THNN_(SpatialDilatedConvolution_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH)
 {
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");

  // Params:
  int nInputPlane = weight->size[1];
  int nOutputPlane = weight->size[0];

  int batch = 1;
  if (input->nDimension == 3) {
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
    // Force batch
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
  } else {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
  }

  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  if (outputWidth < 1 || outputHeight < 1) {
    // Undo the temporary batch-of-one view so the caller's tensor is not
    // left reshaped when the error is raised.
    if (batch == 0) {
      THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
    }
    // The plane counts are int, the spatial sizes are long.
    THError("Given input size: (%dx%ldx%ld). Calculated output size: (%dx%ldx%ld). Output size is too small",
            nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
  }

  // Batch size + input planes
  long batchSize = input->size[0];

  // Resize output
  THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
  THTensor_(zero)(output);

  // Resize temporary columns
  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);

  // Define a buffer of ones, for bias accumulation
  // Note: this buffer can be shared with other modules, it only ever gets increased,
  // and always contains ones.
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *output_n = THTensor_(new)();

  // For each elt in batch, do:
  for (int elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(output_n, output, 0, elt);

    // Do Bias first:
    // M,N,K are dims of matrix A and B
    long m_ = nOutputPlane;
    long n_ = outputHeight * outputWidth;
    long k_ = 1;

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 0 here, so this initialises output_n with the broadcast bias.
    if (bias) {
      THBlas_(gemm)(
        't', 'n',
        n_, m_, k_,
        1,
        THTensor_(data)(ones), k_,
        THTensor_(data)(bias), k_,
        0,
        THTensor_(data)(output_n), n_
      );
    } else {
      THTensor_(zero)(output_n);
    }

    // Extract columns:
    THNN_(im2col)(
      THTensor_(data)(input_n),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      dilationH, dilationW,
      THTensor_(data)(columns)
    );

    // M,N,K are dims of matrix A and B
    long m = nOutputPlane;
    long n = columns->size[1];
    long k = nInputPlane*kH*kW;

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 1 here, so the product is added on top of the bias.
    THBlas_(gemm)(
      'n', 'n',
      n, m, k,
      1,
      THTensor_(data)(columns), n,
      THTensor_(data)(weight), k,
      1,
      THTensor_(data)(output_n), n
    );
  }

  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(output_n);

  // Drop the temporary batch dimension again
  if (batch == 0) {
    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }
 }
 // Backward pass of the dilated 2D convolution w.r.t. its input.
 //
 // For every sample, gradOutput is multiplied by the weights into the
 // gradColumns scratch buffer, and col2im folds those columns back into the
 // matching slice of gradInput.
 //
 // A 3D input (and its gradOutput) is temporarily resized in place to a batch
 // of one; input, gradOutput and gradInput are returned as 3D in that case.
 void THNN_(SpatialDilatedConvolution_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    THTensor *weight,
    THTensor *gradColumns,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH)
 {
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
  THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
  THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");

  // Plane counts are taken from the weight layout
  int nOutputPlane = weight->size[0];
  int nInputPlane = weight->size[1];

  // Remember the caller's layout before adding the temporary batch dimension
  const int unbatched = (input->nDimension == 3);
  if (unbatched) {
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
  }

  long inputHeight  = input->size[2];
  long inputWidth   = input->size[3];
  // Extent covered by the dilated kernel along each axis
  long kernelSpanH  = dilationH * (kH - 1) + 1;
  long kernelSpanW  = dilationW * (kW - 1) + 1;
  long outputHeight = (inputHeight + 2*padH - kernelSpanH) / dH + 1;
  long outputWidth  = (inputWidth + 2*padW - kernelSpanW) / dW + 1;

  long batchSize = input->size[0];

  // gradInput mirrors the (batched) input shape
  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);

  // Scratch buffer for the column gradients
  THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
  THTensor_(zero)(gradColumns);

  // GEMM dimensions, identical for every sample
  long colRows = nInputPlane*kW*kH;
  long colCols = gradColumns->size[1];
  long reduceDim = nOutputPlane;

  // Per-sample views
  THTensor *gradInputSlice = THTensor_(new)();
  THTensor *gradOutputSlice = THTensor_(new)();

  long sample;
  for (sample = 0; sample < batchSize; sample++) {
    THTensor_(select)(gradInputSlice, gradInput, 0, sample);
    THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

    // gradColumns = gradOutput x weight
    // (gemm assumes column-major matrices, hence the swapped operand order)
    THBlas_(gemm)(
        'n', 't',
        colCols, colRows, reduceDim,
        1,
        THTensor_(data)(gradOutputSlice), colCols,
        THTensor_(data)(weight), colRows,
        0,
        THTensor_(data)(gradColumns), colCols
    );

    // Fold the columns back into this sample's input gradient
    THNN_(col2im)(
      THTensor_(data)(gradColumns),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      dilationH, dilationW,
      THTensor_(data)(gradInputSlice)
    );
  }

  THTensor_(free)(gradInputSlice);
  THTensor_(free)(gradOutputSlice);

  // Drop the temporary batch dimension again
  if (unbatched) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
    THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
  }
 }
 // Accumulates the weight and bias gradients of the dilated 2D convolution.
 //
 // For every sample the input is unrolled with im2col into `columns`; one GEMM
 // then adds scale * gradOutput * columns^T into gradWeight and, when gradBias
 // is given, one GEMV adds scale * (row sums of gradOutput) into gradBias.
 //
 // A 3D input (and its gradOutput) is temporarily resized in place to a batch
 // of one and restored to 3D before returning.
 void THNN_(SpatialDilatedConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH,
    real scale)
 {
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");

  // Plane counts are taken from the gradWeight layout
  int nOutputPlane = gradWeight->size[0];
  int nInputPlane = gradWeight->size[1];

  // Remember the caller's layout before adding the temporary batch dimension
  const int unbatched = (input->nDimension == 3);
  if (unbatched) {
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
  }

  long inputHeight  = input->size[2];
  long inputWidth   = input->size[3];
  // Extent covered by the dilated kernel along each axis
  long kernelSpanH  = dilationH * (kH - 1) + 1;
  long kernelSpanW  = dilationW * (kW - 1) + 1;
  long outputHeight = (inputHeight + 2*padH - kernelSpanH) / dH + 1;
  long outputWidth  = (inputWidth + 2*padW - kernelSpanW) / dW + 1;

  long batchSize = input->size[0];

  // The ones buffer is only ever grown; refill it when it is too small
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Scratch buffer for the unrolled input
  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);

  // GEMM / GEMV dimensions, identical for every sample
  long outPlanes = nOutputPlane;
  long kernelElems = nInputPlane*kW*kH;
  long colCols = columns->size[1];
  long outPixels = outputHeight * outputWidth;

  // Per-sample views
  THTensor *inputSlice = THTensor_(new)();
  THTensor *gradOutputSlice = THTensor_(new)();

  long sample;
  for (sample = 0; sample < batchSize; sample++) {
    THTensor_(select)(inputSlice, input, 0, sample);
    THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

    // Unroll this sample into columns
    THNN_(im2col)(
      THTensor_(data)(inputSlice),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      dilationH, dilationW,
      THTensor_(data)(columns)
    );

    // gradWeight += scale * gradOutput x columns^T
    // (gemm assumes column-major matrices, hence the swapped operand order)
    THBlas_(gemm)(
        't', 'n',
        kernelElems, outPlanes, colCols,
        scale,
        THTensor_(data)(columns), colCols,
        THTensor_(data)(gradOutputSlice), colCols,
        1,
        THTensor_(data)(gradWeight), kernelElems
    );

    // gradBias += scale * gradOutput x ones
    // (gemv assumes column-major matrices as well)
    if (gradBias) {
      THBlas_(gemv)(
          't',
          outPixels, outPlanes,
          scale,
          THTensor_(data)(gradOutputSlice), outPixels,
          THTensor_(data)(ones), 1,
          1,
          THTensor_(data)(gradBias), 1
      );
    }
  }

  THTensor_(free)(inputSlice);
  THTensor_(free)(gradOutputSlice);

  // Drop the temporary batch dimension again
  if (unbatched) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialFractionalMaxPooling.c
+++ b/torch/lib/THNN/generic/SpatialFractionalMaxPooling.c
@ -0,0 +1,251 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
 #else
 /*
  * Builds the start offsets of the `outputSize` pooling windows along one axis.
  *
  * Entries 0 .. outputSize-2 are (long)((i + sample) * alpha) -
  * (long)(sample * alpha) with alpha = (inputSize - poolSize) / (outputSize - 1);
  * the last entry is pinned to inputSize - poolSize so that the final window
  * ends exactly at the input border.
  *
  * Returns a buffer obtained from THAlloc that the caller must release with
  * THFree. For outputSize <= 0 nothing is written to the buffer.
  */
 static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
  real sample,
  long inputSize,
  long outputSize,
  int poolSize) {
  long* sequence = (long*) THAlloc(sizeof(long) * outputSize);

  /* alpha is only needed when there is more than one window; skipping it for
     outputSize == 1 avoids dividing by (outputSize - 1) == 0 */
  if (outputSize > 1) {
    real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
    long i;
    for (i = 0; i < outputSize - 1; ++i) {
      sequence[i] =
        (long) ((i + sample) * alpha) - (long) (sample * alpha);
    }
  }

  /* guard against writing sequence[-1] when no window is requested */
  if (outputSize > 0) {
    sequence[outputSize - 1] = inputSize - poolSize;
  }

  return sequence;
 }
 /*
  * Fractional max pooling of one sample (numPlanes planes stored back to back).
  *
  * For every plane two window-start sequences are generated from that plane's
  * pair of random samples (W first, then H). Each output element is the
  * maximum of its poolSizeH x poolSizeW input window; `indices` receives the
  * position of that maximum inside the plane (h * inputW + w), offset by
  * TH_INDEX_BASE and stored as a real.
  *
  * The first element of a window always seeds the maximum, so a window that
  * holds only -infinity is a valid input. A NaN is never "greater" than the
  * running maximum, so it is selected only when it is the first element of
  * its window.
  */
 static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
  real* input,
  real* output,
  real* indices,
  real* randomSamples,
  long numPlanes,
  long inputW, long inputH,
  long outputW, long outputH,
  int poolSizeW, int poolSizeH) {
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < numPlanes; ++plane) {
    /* each plane contains 2 random samples, one for W and one for H */
    real* randomSamplesForPlane = randomSamples + plane * 2;

    /* Generate interval sequence */
    long* sequenceW =
      THNN_(SpatialFractionalMaxPooling_generateIntervals)(
        randomSamplesForPlane[0], inputW, outputW, poolSizeW);
    long* sequenceH =
      THNN_(SpatialFractionalMaxPooling_generateIntervals)(
        randomSamplesForPlane[1], inputH, outputH, poolSizeH);

    /* loop over output */
    long h, w;

    real* inputForPlane = input + plane * inputW * inputH;
    real* outputForPlane = output + plane * outputW * outputH;
    real* indicesForPlane = indices + plane * outputW * outputH;

    for (h = 0; h < outputH; ++h) {
      long inputHStart = sequenceH[h];

      for (w = 0; w < outputW; ++w) {
        long inputWStart = sequenceW[w];

        real maxVal = -THInf;
        long maxIndex = -1;

        long h2, w2;
        for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
          for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
            THAssert(h2 >= 0 && h2 < inputH);
            THAssert(w2 >= 0 && w2 < inputW);

            long planeIndex = h2 * inputW + w2;
            real val = inputForPlane[planeIndex];
            /* maxIndex == -1 marks "nothing picked yet": the first element
               is always taken, even when it is -infinity */
            if (maxIndex == -1 || val > maxVal) {
              maxVal = val;
              maxIndex = planeIndex;
            }
          }
        }

        /* only an empty window (non-positive pool size) can leave this unset */
        THAssert(maxIndex != -1);

        outputForPlane[h * outputW + w] = maxVal;
        /* shift to the TH_INDEX_BASE-based convention */
        indicesForPlane[h * outputW + w] = (real) maxIndex + TH_INDEX_BASE;
      }
    }

    THFree(sequenceW);
    THFree(sequenceH);
  }
 }
 /*
  * Forward pass of fractional max pooling.
  *
  * input         : 3D, or 4D in batch mode (checked below).
  * output        : resized to (batch x) numPlanes x outputH x outputW.
  * indices       : resized like output; receives the location of each maximum.
  * randomSamples : read through its raw data pointer as two values per plane
  *                 (and per sample in batch mode) -- presumably contiguous,
  *                 which is not verified here.
  *
  * The requested output and pool sizes must be positive, and the pooling
  * windows must fit inside the input (checked below).
  */
 void THNN_(SpatialFractionalMaxPooling_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    int outputW, int outputH,
    int poolSizeW, int poolSizeH,
    THTensor *indices,
    THTensor *randomSamples) {

  long numBatch = 1;
  int planeDim = 0;
  int heightDim = 1;
  int widthDim = 2;

  long numInputDims = THTensor_(nDimension)(input);
  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
             "3D or 4D (batch mode) tensor expected");

  /* Non-positive sizes would pass the relative checks below and then reach
     the interval generator with an empty or negative window count. */
  THArgCheck(outputW >= 1 && outputH >= 1, 4,
             "outputW and outputH must be at least 1");
  THArgCheck(poolSizeW >= 1 && poolSizeH >= 1, 6,
             "poolSizeW and poolSizeH must be at least 1");

  if (numInputDims == 4) {
    numBatch = THTensor_(size)(input, 0);
    planeDim++;
    heightDim++;
    widthDim++;
  }

  /* sizes */
  long numPlanes = THTensor_(size)(input, planeDim);
  long inputH = THTensor_(size)(input, heightDim);
  long inputW = THTensor_(size)(input, widthDim);

  THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
             "poolSizeH too large relative to input height");
  THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
             "poolSizeW too large relative to input width");

  /* get contiguous input */
  input = THTensor_(newContiguous)(input);

  if (numInputDims == 3) {
    /* resize output */
    THTensor_(resize3d)(output, numPlanes, outputH, outputW);
    /* indices will contain the locations for each output point */
    THTensor_(resize3d)(indices, numPlanes, outputH, outputW);

    THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
      THTensor_(data)(input),
      THTensor_(data)(output),
      THTensor_(data)(indices),
      THTensor_(data)(randomSamples),
      numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
  } else {
    THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
    /* indices will contain the locations for each output point */
    THTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);

    /* every sample works on its own slice of each buffer */
    long batch;
 #pragma omp parallel for private(batch)
    for (batch = 0; batch < numBatch; ++batch) {
      THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
        THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
        THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
        THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
        THTensor_(data)(randomSamples) + batch * numPlanes * 2,
        numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
    }
  }

  /* cleanup */
  THTensor_(free)(input);
 }
 /*
  * Backward pass of fractional max pooling for one sample.
  *
  * Every gradOutput element is added to the gradInput position recorded in
  * `indices` for it (the stored value minus TH_INDEX_BASE, relative to the
  * start of the plane). gradInput is only accumulated into here, never
  * cleared.
  */
 static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
  real* gradInput,
  real* gradOutput,
  real* indices,
  long numPlanes,
  long inputW, long inputH,
  long outputW, long outputH) {
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < numPlanes; plane++) {
    /* per-plane base pointers */
    real* planeGradIn = gradInput + plane * inputW * inputH;
    real* planeGradOut = gradOutput + plane * outputW * outputH;
    real* planeIndices = indices + plane * outputW * outputH;

    long row;
    for (row = 0; row < outputH; ++row) {
      long col;
      for (col = 0; col < outputW; ++col) {
        long src = row * outputW + col;
        /* indices are stored as reals in TH_INDEX_BASE-based form */
        long dst = planeIndices[src] - TH_INDEX_BASE;
        THAssert(dst >= 0 && dst < inputW * inputH);

        planeGradIn[dst] += planeGradOut[src];
      }
    }
  }
 }
 /*
  * Backward pass of fractional max pooling.
  *
  * gradInput is resized like input and zeroed; the gradients in gradOutput
  * are then routed to the input positions stored in `indices`, sample by
  * sample when the input has a batch dimension.
  *
  * gradOutput must have spatial size outputH x outputW (checked below).
  * poolSizeW and poolSizeH are not read by this function.
  */
 void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    int outputW, int outputH,
    int poolSizeW, int poolSizeH,
    THTensor *indices) {

  long numInputDims = THTensor_(nDimension)(input);

  /* a leading batch dimension shifts every other dimension index by one */
  const int batched = (numInputDims == 4);
  const int planeDim = batched ? 1 : 0;
  const int heightDim = batched ? 2 : 1;
  const int widthDim = batched ? 3 : 2;
  const long numBatch = batched ? THTensor_(size)(input, 0) : 1;

  /* sizes */
  long numPlanes = THTensor_(size)(input, planeDim);
  long inputH = THTensor_(size)(input, heightDim);
  long inputW = THTensor_(size)(input, widthDim);

  THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
             "gradOutput width unexpected");
  THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
             "gradOutput height unexpected");

  /* gradOutput is read through a raw pointer below */
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* the frame routine accumulates, so start from a zeroed gradInput */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);

  if (numInputDims == 3) {
    THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
      THTensor_(data)(gradInput),
      THTensor_(data)(gradOutput),
      THTensor_(data)(indices),
      numPlanes, inputW, inputH, outputW, outputH);
  } else {
    /* every sample works on its own slice of each buffer */
    long sample;
 #pragma omp parallel for private(sample)
    for (sample = 0; sample < numBatch; ++sample) {
      THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
        THTensor_(data)(gradInput) + sample * numPlanes * inputH * inputW,
        THTensor_(data)(gradOutput) + sample * numPlanes * outputH * outputW,
        THTensor_(data)(indices) + sample * numPlanes * outputH * outputW,
        numPlanes, inputW, inputH, outputW, outputH);
    }
  }

  /* release the contiguous gradOutput */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialFullConvolution.c
+++ b/torch/lib/THNN/generic/SpatialFullConvolution.c
@ -0,0 +1,385 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
 #else
 /*
  * Unfolds an image (channels x height x width) into a column buffer with
  * channels*kernel_h*kernel_w rows and height_col*width_col columns.
  * Positions that fall outside the image (padding) are written as 0.
  */
 static void THNN_(im2col)(const real* data_im, const int channels,
      const int height, const int width, const int kernel_h, const int kernel_w,
      const int pad_h, const int pad_w,
      const int stride_h, const int stride_w,
      const int dilation_h, const int dilation_w,
      real* data_col) {
  /* effective kernel extent once dilation is applied */
  const int span_h = dilation_h * (kernel_h - 1) + 1;
  const int span_w = dilation_w * (kernel_w - 1) + 1;
  const int rows_out = (height + 2 * pad_h - span_h) / stride_h + 1;
  const int cols_out = (width + 2 * pad_w - span_w) / stride_w + 1;
  const int n_rows = channels * kernel_h * kernel_w;
  int r, y, x;
  for (r = 0; r < n_rows; ++r) {
    /* decompose the column-buffer row into (channel, kernel row, kernel col) */
    const int kx = r % kernel_w;
    const int ky = (r / kernel_w) % kernel_h;
    const int ch = r / kernel_h / kernel_w;
    for (y = 0; y < rows_out; ++y) {
      const int src_y = y * stride_h - pad_h + ky * dilation_h;
      const int row_ok = (src_y >= 0 && src_y < height);
      for (x = 0; x < cols_out; ++x) {
        const int src_x = x * stride_w - pad_w + kx * dilation_w;
        real value = 0;
        if (row_ok && src_x >= 0 && src_x < width) {
          value = data_im[(ch * height + src_y) * width + src_x];
        }
        data_col[(r * rows_out + y) * cols_out + x] = value;
      }
    }
  }
 }
 /*
  * Inverse of im2col: folds a column buffer back into an image
  * (channels x height x width). The image is cleared first and every
  * column entry that maps to a valid pixel is accumulated into it;
  * entries that map into the padding are dropped.
  */
 static void THNN_(col2im)(const real* data_col, const int channels,
      const int height, const int width, const int kernel_h, const int kernel_w,
      const int pad_h, const int pad_w,
      const int stride_h, const int stride_w,
      const int dilation_h, const int dilation_w,
      real* data_im) {
  /* start from an all-zero image since the loop below only accumulates */
  memset(data_im, 0, sizeof(real) * height * width * channels);
  /* effective kernel extent once dilation is applied */
  const int span_h = dilation_h * (kernel_h - 1) + 1;
  const int span_w = dilation_w * (kernel_w - 1) + 1;
  const int rows_out = (height + 2 * pad_h - span_h) / stride_h + 1;
  const int cols_out = (width + 2 * pad_w - span_w) / stride_w + 1;
  const int n_rows = channels * kernel_h * kernel_w;
  int r, y, x;
  for (r = 0; r < n_rows; ++r) {
    /* decompose the column-buffer row into (channel, kernel row, kernel col) */
    const int kx = r % kernel_w;
    const int ky = (r / kernel_w) % kernel_h;
    const int ch = r / kernel_h / kernel_w;
    for (y = 0; y < rows_out; ++y) {
      const int dst_y = y * stride_h - pad_h + ky * dilation_h;
      if (dst_y < 0 || dst_y >= height) {
        continue;
      }
      for (x = 0; x < cols_out; ++x) {
        const int dst_x = x * stride_w - pad_w + kx * dilation_w;
        if (dst_x < 0 || dst_x >= width) {
          continue;
        }
        data_im[(ch * height + dst_y) * width + dst_x] +=
          data_col[(r * rows_out + y) * cols_out + x];
      }
    }
  }
 }
 // Forward pass of the spatial full convolution.
 //
 // input   : 3D (planes x height x width) or 4D (batch x planes x height x width)
 // output  : resized here to batch x nOutputPlane x outputHeight x outputWidth
 //           (and back to 3D when the input was 3D)
 // weight  : dimension 0 is read as nInputPlane, dimension 1 as nOutputPlane;
 //           dimensions 2 and 3 are also read when sizing the GEMM
 // bias    : optional; when non-NULL it is added to every output plane
 // columns : scratch buffer, resized to (nOutputPlane*kW*kH) x (inputHeight*inputWidth)
 // ones    : scratch buffer of ones used to broadcast the bias
 // kW,kH / dW,dH / padW,padH / adjW,adjH : kernel size, stride, padding and
 //           extra output size, all of which enter the output size formula below
 //
 // NOTE(review): the raw data pointers of input, weight and bias are used
 // directly, so this presumably expects contiguous tensors - confirm with callers.
 void THNN_(SpatialFullConvolution_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int adjW, int adjH)
 {
  int nInputPlane = THTensor_(size)(weight,0);
  int nOutputPlane = THTensor_(size)(weight,1);
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  // batch == 0 records that the input arrived as 3D and must be restored at the end
  int batch = 1;
  if (input->nDimension == 3) {
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
    // Force batch mode: resize the 3D input in place to 4D with a batch size of 1
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
  } else {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
  }
  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  // Output extent: (in - 1) * stride - 2 * pad + kernel + adj
  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
  // Batch size (input is 4D at this point)
  long batchSize = input->size[0];
  // Resize output
  THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
  // Resize temporary columns
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
  THTensor_(zero)(columns);
  // Define a buffer of ones, for bias accumulation
  // Note: this buffer can be shared with other modules, it only ever gets increased,
  // and always contains ones.
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }
  // Helpers: views onto the current batch element of input and output
  THTensor *input_n = THTensor_(new)();
  THTensor *output_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(output_n, output, 0, elt);
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m = weight->size[1] * weight->size[2] * weight->size[3];
    long n = columns->size[1];
    long k = weight->size[0];
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // The result is written into the columns scratch buffer.
    THBlas_(gemm)(
        'n', 't',
        n, m, k,
        1,
        THTensor_(data)(input_n), n,
        THTensor_(data)(weight), m,
        0,
        THTensor_(data)(columns), n
    );
    // Fold the columns into the output plane(s) of this batch element
    // (col2im clears its destination before accumulating):
    THNN_(col2im)(
      THTensor_(data)(columns),
      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
      1, 1,
      THTensor_(data)(output_n)
    );
    // Do Bias after:
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m_ = nOutputPlane;
    long n_ = outputHeight * outputWidth;
    long k_ = 1;
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // With k_ == 1 this multiplies the ones buffer by the bias vector and
    // accumulates the product into output_n.
    if (bias) {
      THBlas_(gemm)(
          't', 'n',
          n_, m_, k_,
          1,
          THTensor_(data)(ones), k_,
          THTensor_(data)(bias), k_,
          1,
          THTensor_(data)(output_n), n_
      );
    }
  }
  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(output_n);
  // Undo the forced batch mode: restore the 3D shapes of output and input
  if (batch == 0) {
    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }
 }
 // Backward pass of the spatial full convolution with respect to the input.
 //
 // input       : 3D or 4D (batch mode); only its sizes are read here
 // gradOutput  : gradient w.r.t. the output, same dimensionality as input
 // gradInput   : resized to batch x nInputPlane x inputHeight x inputWidth
 //               (and back to 3D when the input was 3D)
 // weight      : dimension 0 is read as nInputPlane, dimension 1 as nOutputPlane
 // gradColumns : scratch buffer, resized to (nOutputPlane*kW*kH) x (inputHeight*inputWidth)
 //
 // NOTE(review): the raw data pointers of gradOutput and weight are used
 // directly, so this presumably expects contiguous tensors - confirm with callers.
 void THNN_(SpatialFullConvolution_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    THTensor *weight,
    THTensor *gradColumns,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int adjW, int adjH)
 {
  int nInputPlane = THTensor_(size)(weight,0);
  int nOutputPlane = THTensor_(size)(weight,1);
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  // batch == 0 records that the tensors arrived as 3D and must be restored at the end
  int batch = 1;
  if (input->nDimension == 3) {
    // Force batch mode: resize input and gradOutput in place to 4D with a batch size of 1
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
  }
  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  // Output extent: (in - 1) * stride - 2 * pad + kernel + adj
  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
  // Batch size (input is 4D at this point)
  long batchSize = input->size[0];
  // Resize and clear the gradient w.r.t. the input
  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
  THTensor_(zero)(gradInput);
  // Resize temporary columns
  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
  // Helpers: views onto the current batch element of gradInput and gradOutput
  THTensor *gradInput_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per sample:
    THTensor_(select)(gradInput_n, gradInput, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
    // Extract columns from this batch element of gradOutput:
    THNN_(im2col)(
      THTensor_(data)(gradOutput_n),
      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
      1, 1,
      THTensor_(data)(gradColumns)
    );
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m = weight->size[0];
    long n = gradColumns->size[1];
    long k = weight->size[1] * weight->size[2] * weight->size[3];
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // The result is written into this batch element of gradInput.
    THBlas_(gemm)(
        'n', 'n',
        n, m, k,
        1,
        THTensor_(data)(gradColumns), n,
        THTensor_(data)(weight), k,
        0,
        THTensor_(data)(gradInput_n), n
    );
  }
  // Free
  THTensor_(free)(gradInput_n);
  THTensor_(free)(gradOutput_n);
  // Undo the forced batch mode: restore the 3D shapes
  if (batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
    THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
  }
 }
 // Accumulates the gradients of the spatial full convolution with respect to
 // its parameters (weight and, optionally, bias).
 //
 // input      : 3D or 4D (batch mode)
 // gradOutput : gradient w.r.t. the output, same dimensionality as input
 // gradWeight : accumulated into; dimension 0 is read as nInputPlane,
 //              dimension 1 as nOutputPlane
 // gradBias   : optional; accumulated into when non-NULL
 // columns    : scratch buffer, resized to (nOutputPlane*kW*kH) x (inputHeight*inputWidth)
 // ones       : scratch buffer of ones used to sum gradOutput over each plane
 // scale      : factor applied to the contribution added to gradWeight/gradBias
 //
 // NOTE(review): the raw data pointers of input, gradOutput, gradWeight and
 // gradBias are used directly, so this presumably expects contiguous tensors -
 // confirm with callers.
 void THNN_(SpatialFullConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int adjW, int adjH,
    real scale)
 {
  int nInputPlane = THTensor_(size)(gradWeight,0);
  int nOutputPlane = THTensor_(size)(gradWeight,1);
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  // batch == 0 records that the tensors arrived as 3D and must be restored at the end
  int batch = 1;
  if (input->nDimension == 3) {
    // Force batch mode: resize input and gradOutput in place to 4D with a batch size of 1
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
  }
  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  // Output extent: (in - 1) * stride - 2 * pad + kernel + adj
  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
  // Batch size (input is 4D at this point)
  long batchSize = input->size[0];
  // Define a buffer of ones, for bias accumulation
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }
  // Resize temporary columns
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
  // Helpers: views onto the current batch element of input and gradOutput
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
    // Extract columns from this batch element of gradOutput:
    THNN_(im2col)(
      THTensor_(data)(gradOutput_n),
      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
      1, 1,
      THTensor_(data)(columns)
    );
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long n = columns->size[0];   // nOutputPlane * kh * kw
    long m = input_n->size[0];   // nInputPlane
    long k = columns->size[1];   // inputHeight * inputWidth
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // The scaled product is accumulated into gradWeight across batch elements.
    THBlas_(gemm)(
        't', 'n',
        n, m, k,
        scale,
        THTensor_(data)(columns), k,
        THTensor_(data)(input_n), k,
        1,
        THTensor_(data)(gradWeight), n
    );
    // Do Bias:
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m_ = nOutputPlane;
    long k_ = outputHeight * outputWidth;
    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
    // Multiplying gradOutput_n by the ones vector accumulates the scaled
    // per-plane sums into gradBias.
    if (gradBias) {
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          THTensor_(data)(gradOutput_n), k_,
          THTensor_(data)(ones), 1,
          1,
          THTensor_(data)(gradBias), 1
      );
    }
  }
  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(gradOutput_n);
  // Undo the forced batch mode: restore the 3D shapes
  if (batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialFullConvolutionMap.c
+++ b/torch/lib/THNN/generic/SpatialFullConvolutionMap.c
@ -0,0 +1,212 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
 #else
 /*
  * Forward pass of the full convolution driven by a connection table.
  *
  * input     : 3D tensor with at least nInputPlane planes
  * output_   : resized to nOutputPlane x ((inH-1)*dH + kH) x ((inW-1)*dW + kW)
  * weight    : 3D tensor, one kH x kW kernel per connection-table row
  * bias      : one value per output plane, used to initialise that plane
  * connTable : row k holds (input plane, output plane) of kernel k; both are
  *             stored with an offset of TH_INDEX_BASE
  */
 void THNN_(SpatialFullConvolutionMap_updateOutput)(
  THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH)
 {
  THArgCheck(
    weight != NULL && weight->nDimension == 3
    && connTable != NULL && connTable->size[0] == weight->size[0], 4,
    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );
  THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
  THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");

  const int kernelRows = (int)weight->size[1];
  const int kernelCols = (int)weight->size[2];

  /* the caller's tensor is resized before the contiguous handle is taken */
  THTensor_(resize3d)(
    output_, nOutputPlane,
    (input->size[1] - 1) * dH + kernelRows,
    (input->size[2] - 1) * dW + kernelCols
  );

  /* contiguous handles; released / copied back at the end */
  THTensor *inputC = THTensor_(newContiguous)(input);
  THTensor *outputC = THTensor_(newContiguous)(output_);

  /* raw pointers */
  real *inBase = THTensor_(data)(inputC);
  real *outBase = THTensor_(data)(outputC);
  real *wBase = THTensor_(data)(weight);
  real *biasBase = THTensor_(data)(bias);
  real *connData = THTensor_(data)(connTable);

  /* plane geometry */
  const long inRows = inputC->size[1];
  const long inCols = inputC->size[2];
  const long outRows = outputC->size[1];
  const long outCols = outputC->size[2];
  const long wRows = weight->size[1];
  const long wCols = weight->size[2];
  const long inPlane = inCols*inRows;
  const long outPlane = outCols*outRows;
  const long wPlane = wCols*wRows;
  const int nConn = connTable->size[0];

  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nOutputPlane; plane++)
  {
    real *dst = outBase + plane*outPlane;
    long pos;
    long conn;

    /* every output plane starts out filled with its bias */
    for (pos = 0; pos < outPlane; pos++)
      dst[pos] = biasBase[plane];

    /* add the contribution of every kernel wired to this output plane */
    for (conn = 0; conn < nConn; conn++)
    {
      const int dstPlane = (int)connData[conn*2+1] - TH_INDEX_BASE;
      const int srcPlane = (int)connData[conn*2+0] - TH_INDEX_BASE;
      if (dstPlane != plane)
        continue;
      THTensor_(fullConv2Dptr)(
        dst,
        1.0,
        inBase + srcPlane*inPlane, inRows, inCols,
        wBase + conn*wPlane, wRows, wCols,
        dH, dW
      );
    }
  }

  /* clean up */
  THTensor_(free)(inputC);
  THTensor_(freeCopyTo)(outputC, output_);
 }
 /*
  * Backward pass (gradient w.r.t. the input) of the full convolution driven
  * by a connection table.
  *
  * input      : forward input; only its sizes are read here
  * gradOutput : gradient w.r.t. the forward output
  * gradInput_ : resized like input and overwritten with the result
  * weight     : 3D tensor, one kH x kW kernel per connection-table row
  * bias       : not used by this function
  * connTable  : row k holds (input plane, output plane) of kernel k; both are
  *              stored with an offset of TH_INDEX_BASE
  */
 void THNN_(SpatialFullConvolutionMap_updateGradInput)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH)
 {
  THArgCheck(
    weight != NULL && weight->nDimension == 3
    && connTable != NULL && connTable->size[0] == weight->size[0], 5,
    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );
  /* Resize the caller's tensor BEFORE taking the contiguous handle (as
   * updateOutput does for output_). If gradInput_ is non-contiguous the
   * handle is a separate copy; resizing only that copy would leave
   * gradInput_ with its old size and make the final copy-back mismatch. */
  THTensor_(resizeAs)(gradInput_, input);
  /* contiguous */
  THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* the loop below only accumulates, so start from zero */
  THTensor_(zero)(gradInput);
  /* get raw pointers */
  real *gradInput_data = THTensor_(data)(gradInput);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *weight_data = THTensor_(data)(weight);
  real *connTable_data = THTensor_(data)(connTable);
  /* and dims */
  const long input_h = input->size[1];
  const long input_w = input->size[2];
  const long output_h = gradOutput->size[1];
  const long output_w = gradOutput->size[2];
  const long kH = weight->size[1];
  const long kW = weight->size[2];
  long p;
  /* one input plane per iteration, so no two iterations write the same plane */
 #pragma omp parallel for private(p)
  for (p = 0; p < nInputPlane; p++)
  {
    long k;
    /* backward all */
    int nkernel = connTable->size[0];
    for (k = 0; k < nkernel; k++)
    {
      int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
      int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
      if (i == p)
      {
        /* gradient to input */
        THTensor_(validXCorr2Dptr)(
          gradInput_data + i*input_w*input_h,
          1.0,
          gradOutput_data + o*output_w*output_h,  output_h,  output_w,
          weight_data + k*kW*kH, kH, kW,
          dH, dW
        );
      }
    }
  }
  /* clean up: copy the result back into the caller's tensor if needed */
  THTensor_(freeCopyTo)(gradInput, gradInput_);
  THTensor_(free)(gradOutput);
 }
 /*
  * Accumulates the parameter gradients of the connection-table full
  * convolution.
  *
  * gradBias   : for each output plane, scale * (sum of that gradOutput plane)
  *              is added
  * gradWeight : 3D tensor; for each connection-table row the scaled
  *              correlation of the linked gradOutput and input planes is
  *              accumulated into the matching kernel
  * connTable  : row k holds (input plane, output plane) of kernel k; both are
  *              stored with an offset of TH_INDEX_BASE
  */
 void THNN_(SpatialFullConvolutionMap_accGradParameters)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
  THTensor *connTable, int nInputPlane, int nOutputPlane,
  int dW, int dH, real scale)
 {
  THArgCheck(
    gradWeight != NULL && gradWeight->nDimension == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  /* contiguous handles; released at the end */
  THTensor *inputC = THTensor_(newContiguous)(input);
  THTensor *gradOutputC = THTensor_(newContiguous)(gradOutput);

  /* raw pointers */
  real *inBase = THTensor_(data)(inputC);
  real *gradOutBase = THTensor_(data)(gradOutputC);
  real *gradWBase = THTensor_(data)(gradWeight);
  real *gradBiasBase = THTensor_(data)(gradBias);

  /* plane geometry */
  const long inRows  = inputC->size[1];
  const long inCols  = inputC->size[2];
  const long outRows = gradOutputC->size[1];
  const long outCols = gradOutputC->size[2];
  const long wRows = gradWeight->size[1];
  const long wCols = gradWeight->size[2];

  long idx;

  /* bias gradient: each iteration touches only its own gradBias entry */
 #pragma omp parallel for private(idx)
  for (idx = 0; idx < nOutputPlane; idx++)
  {
    const real *planeGrad = gradOutBase + idx*outCols*outRows;
    long pos;
    for (pos = 0; pos < outRows*outCols; pos++)
      gradBiasBase[idx] += scale*planeGrad[pos];
  }

  /* weight gradient: one kernel per connection-table row */
  int nConn = connTable->size[0];
 #pragma omp parallel for private(idx)
  for (idx = 0; idx < nConn; idx++)
  {
    const int dstPlane = (int)THTensor_(get2d)(connTable,idx,1) - TH_INDEX_BASE;
    const int srcPlane = (int)THTensor_(get2d)(connTable,idx,0) - TH_INDEX_BASE;
    THTensor_(validXCorr2DRevptr)(
      gradWBase + idx*wCols*wRows,
      scale,
      gradOutBase + dstPlane*outCols*outRows, outRows, outCols,
      inBase + srcPlane*inCols*inRows, inRows, inCols,
      dH, dW
    );
  }

  /* clean up */
  THTensor_(free)(inputC);
  THTensor_(free)(gradOutputC);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialMaxPooling.c
+++ b/torch/lib/THNN/generic/SpatialMaxPooling.c
@ -0,0 +1,300 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
 #else
 /*
  * Max-pools one frame (nslices planes of iheight x iwidth) into
  * nslices planes of oheight x owidth.
  *
  * For every output position the pooling window starts at
  * (i*dH - padH, j*dW - padW), spans kH x kW and is clipped to the input
  * plane. output_p receives the window maximum; ind_p receives the flat
  * offset (y*iwidth + x) of that maximum within its input plane, shifted by
  * TH_INDEX_BASE.
  *
  * If no element of the window compares greater than -THInf (for example an
  * empty window), the stored value stays -THInf and the stored index is
  * -1 + TH_INDEX_BASE.
  */
 static void THNN_(SpatialMaxPooling_updateOutput_frame)(
          real *input_p,
          real *output_p,
          real *ind_p,
          long nslices,
          long iwidth,
          long iheight,
          long owidth,
          long oheight,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH)
 {
  long k;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    /* loop over output */
    long i, j;
    real *ip = input_p   + k*iwidth*iheight;
    for(i = 0; i < oheight; i++)
    {
      for(j = 0; j < owidth; j++)
      {
        /* Window bounds, clipped with integer comparisons: going through
         * fminf/fmaxf would round the long values to float, which is not
         * exact above 2^24. */
        long hstart = i * dH - padH;
        long wstart = j * dW - padW;
        long hend = hstart + kH;
        long wend = wstart + kW;
        if (hend > iheight)
          hend = iheight;
        if (wend > iwidth)
          wend = iwidth;
        if (hstart < 0)
          hstart = 0;
        if (wstart < 0)
          wstart = 0;
        /* local pointers */
        real *op = output_p  + k*owidth*oheight + i*owidth + j;
        real *indp = ind_p   + k*owidth*oheight + i*owidth + j;
        /* compute local max: */
        long maxindex = -1;
        real maxval = -THInf;
        long tcntr = 0;
        long x,y;
        for(y = hstart; y < hend; y++)
        {
          for(x = wstart; x < wend; x++)
          {
            tcntr = y*iwidth + x;
            real val = *(ip + tcntr);
            /* strict comparison: the first maximum in scan order wins */
            if (val > maxval)
            {
              maxval = val;
              maxindex = tcntr;
            }
          }
        }
        /* set output to local max */
        *op = maxval;
        /* store location of max */
        *indp = maxindex + TH_INDEX_BASE;
      }
    }
  }
 }
 /*
  * Forward pass of spatial max pooling.
  *
  * input   : 3D (planes x height x width) or 4D (batch x planes x height x width)
  * output  : resized to the pooled size and filled with the window maxima
  * indices : resized like output; receives the location of each maximum
  * kW,kH   : pooling window size
  * dW,dH   : stride
  * padW,padH : implicit padding on each side (at most half the window size)
  * ceil_mode : round the output size up instead of down
  */
 void THNN_(SpatialMaxPooling_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *indices,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          bool ceil_mode)
 {
  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");

  /* a 4D input has a leading batch dimension that shifts height/width by one */
  const int batched = (input->nDimension == 4);
  const int dimh = batched ? 2 : 1;
  const int dimw = dimh + 1;
  const long nbatch = batched ? input->size[0] : 1;

  THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

  /* sizes */
  const long nslices = input->size[dimh-1];
  const long iheight = input->size[dimh];
  const long iwidth = input->size[dimw];

  /* pooled size: rounded up in ceil mode, down otherwise */
  long oheight = (long)(ceil_mode
                        ? ceil((float)(iheight - kH + 2*padH) / dH)
                        : floor((float)(iheight - kH + 2*padH) / dH)) + 1;
  long owidth  = (long)(ceil_mode
                        ? ceil((float)(iwidth  - kW + 2*padW) / dW)
                        : floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;

  if (padW || padH)
  {
    // ensure that the last pooling starts inside the image
    if ((oheight - 1)*dH >= iheight + padH)
      --oheight;
    if ((owidth  - 1)*dW >= iwidth  + padW)
      --owidth;
  }

  /* contiguous handle on the input; released at the end */
  THTensor *inputC = THTensor_(newContiguous)(input);
  const int singleFrame = (inputC->nDimension == 3);

  /* output and indices (locations of each output point) share one shape */
  if (singleFrame)
  {
    THTensor_(resize3d)(output, nslices, oheight, owidth);
    THTensor_(resize3d)(indices,  nslices, oheight, owidth);
  }
  else
  {
    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
    THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth);
  }

  /* raw pointers, fetched after the resizes above */
  real *input_data = THTensor_(data)(inputC);
  real *output_data = THTensor_(data)(output);
  real *indices_data = THTensor_(data)(indices);

  if (singleFrame)
  {
    THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data,
                                              indices_data,
                                              nslices,
                                              iwidth, iheight,
                                              owidth, oheight,
                                              kW, kH, dW, dH,
                                              padW, padH);
  }
  else
  {
    /* elements per batch entry on the input and output side */
    const long inStride = nslices*iwidth*iheight;
    const long outStride = nslices*owidth*oheight;
    long b;
 #pragma omp parallel for private(b)
    for (b = 0; b < nbatch; b++)
    {
      THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+b*inStride, output_data+b*outStride,
                                                indices_data+b*outStride,
                                                nslices,
                                                iwidth, iheight,
                                                owidth, oheight,
                                                kW, kH, dW, dH,
                                                padW, padH);
    }
  }

  /* cleanup */
  THTensor_(free)(inputC);
 }
 /*
  * Backward pass of max pooling for one frame: every gradOutput value is
  * added to the gradInput element whose flat in-plane offset was recorded
  * in ind_p (stored with an offset of TH_INDEX_BASE).
  * dW and dH are accepted but not used.
  */
 static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
          real *gradInput_p,
          real *gradOutput_p,
          real *ind_p,
          long nslices,
          long iwidth,
          long iheight,
          long owidth,
          long oheight,
          int dW,
          int dH)
 {
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nslices; plane++)
  {
    real *gradIn_k = gradInput_p + plane*iwidth*iheight;
    const real *gradOut_k = gradOutput_p + plane*owidth*oheight;
    const real *idx_k = ind_p + plane*owidth*oheight;
    long row, col;
    for (row = 0; row < oheight; row++)
    {
      const real *gradRow = gradOut_k + row*owidth;
      const real *idxRow = idx_k + row*owidth;
      for (col = 0; col < owidth; col++)
      {
        /* position of the max inside this input plane */
        const long target = idxRow[col] - TH_INDEX_BASE;
        /* accumulate: several windows may share the same max */
        gradIn_k[target] += gradRow[col];
      }
    }
  }
 }
 /*
  * Backward pass of spatial max pooling.
  *
  * input      : forward input (3D, or 4D in batch mode); only its sizes are read
  * gradOutput : gradient w.r.t. the pooled output
  * gradInput  : resized like input, zeroed, then filled by routing each
  *              gradOutput value to the position recorded in indices
  * indices    : locations of the maxima written by the forward pass
  * kW,kH,padW,padH,ceil_mode : not used by this function
  *
  * NOTE(review): the raw data pointer of indices is used directly, so it is
  * presumably expected to be contiguous - confirm with callers.
  */
 void THNN_(SpatialMaxPooling_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *indices,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          bool ceil_mode)
 {
  int dimw = 2;
  int dimh = 1;
  long nbatch = 1;
  /* sizes are kept as long, like in updateOutput and the frame helper,
   * so that tensor sizes are never truncated to int */
  long nslices;
  long iheight;
  long iwidth;
  long oheight;
  long owidth;
  real *gradInput_data;
  real *gradOutput_data;
  real *indices_data;
  /* get contiguous gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* resize */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  /* a 4D input has a leading batch dimension that shifts height/width by one */
  if (input->nDimension == 4) {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }
  /* sizes */
  nslices = input->size[dimh-1];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  oheight = gradOutput->size[dimh];
  owidth = gradOutput->size[dimw];
  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  indices_data = THTensor_(data)(indices);
  /* backprop */
  if (input->nDimension == 3)
  {
    THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
                                                 indices_data,
                                                 nslices,
                                                 iwidth, iheight,
                                                 owidth, oheight,
                                                 dW, dH);
  }
  else
  {
    long p;
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
                                                   indices_data+p*nslices*owidth*oheight,
                                                   nslices,
                                                   iwidth, iheight,
                                                   owidth, oheight,
                                                   dW, dH);
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialMaxUnpooling.c
+++ b/torch/lib/THNN/generic/SpatialMaxUnpooling.c
@ -0,0 +1,223 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
 #else
 /*
  * Max-unpools one frame: every input value is written to the output
  * position recorded for it in ind_p (a flat offset inside the
  * oheight x owidth output plane, stored with an offset of TH_INDEX_BASE).
  * The output is not cleared here.
  *
  * An index outside [0, owidth*oheight) is reported through THError after
  * the loop has finished; the offending element is skipped.
  */
 static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
                                                      real *ind_p,
                                                      long nslices,
                                                      long iwidth, long iheight,
                                                      long owidth, long oheight)
 {
  long k;
  /* Errors are only recorded inside the parallel loop and raised once it
   * has ended, so that THError is never called from within the OpenMP
   * region. */
  int has_error = 0;
  long error_index = 0;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *output_p_k = output_p + k*owidth*oheight;
    real *input_p_k = input_p + k*iwidth*iheight;
    real *ind_p_k = ind_p + k*iwidth*iheight;
    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE;  /* retrieve position of max */
        if(maxp<0 || maxp>=owidth*oheight){
          /* several threads may hit an invalid index at the same time */
 #pragma omp critical
          {
            has_error = 1;
            error_index = maxp;
          }
        } else {
          output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
        }
      }
    }
  }
  if (has_error) {
    /* all three arguments are long, hence %ld */
    THError("invalid max index %ld, owidth= %ld, oheight= %ld",error_index,owidth,oheight);
  }
 }
 /*
  * Forward pass of spatial max unpooling.
  *
  * input   : 3D (planes x height x width) or 4D (batch x planes x height x width)
  * indices : same size as input; target position of every input element
  * output  : resized to (batch x) planes x oheight x owidth, zeroed, then
  *           filled with the input values at the recorded positions
  */
 void THNN_(SpatialMaxUnpooling_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *indices,
    int owidth, int oheight)
 {
  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
  if (!THTensor_(isSameSizeAs)(input, indices)){
    THError("Invalid input size w.r.t current indices size");
  }

  /* a 4D input has a leading batch dimension that shifts height/width by one */
  const int batched = (input->nDimension == 4);
  const int dimh = batched ? 2 : 1;
  const int dimw = dimh + 1;
  const int nbatch = batched ? input->size[0] : 1;

  /* sizes */
  const int nslices = input->size[dimh-1];
  const int iheight = input->size[dimh];
  const int iwidth = input->size[dimw];

  /* contiguous handles on input and indices; released at the end */
  THTensor *inputC = THTensor_(newContiguous)(input);
  THTensor *indicesC = THTensor_(newContiguous)(indices);
  const int singleFrame = (inputC->nDimension == 3);

  /* size the output and clear it: positions no index points at stay zero */
  if (singleFrame)
  {
    THTensor_(resize3d)(output, nslices, oheight, owidth);
  }
  else
  {
    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
  }
  THTensor_(zero)(output);

  /* raw pointers, fetched after the resize above */
  real *input_data = THTensor_(data)(inputC);
  real *output_data = THTensor_(data)(output);
  real *indices_data = THTensor_(data)(indicesC);

  if (singleFrame)
  {
    THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
                                              indices_data,
                                              nslices,
                                              iwidth, iheight,
                                              owidth, oheight);
  }
  else
  {
    long b;
 #pragma omp parallel for private(b)
    for (b = 0; b < nbatch; b++)
    {
      THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data+b*nslices*iwidth*iheight, output_data+b*nslices*owidth*oheight,
                                                indices_data+b*nslices*iwidth*iheight,
                                                nslices,
                                                iwidth, iheight,
                                                owidth, oheight);
    }
  }

  /* cleanup */
  THTensor_(free)(inputC);
  THTensor_(free)(indicesC);
 }
 /*
  * Backward pass of max unpooling for one frame: every gradInput element
  * receives the gradOutput value found at the position recorded for it in
  * ind_p (a flat offset inside the oheight x owidth plane, stored with an
  * offset of TH_INDEX_BASE).
  *
  * An index outside [0, owidth*oheight) is reported through THError after
  * the loop has finished; the offending element is skipped.
  */
 static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
                                                         real *ind_p,
                                                         long nslices,
                                                         long iwidth, long iheight,
                                                         long owidth, long oheight)
 {
  long k;
  /* Errors are only recorded inside the parallel loop and raised once it
   * has ended, so that THError is never called from within the OpenMP
   * region. */
  int has_error = 0;
  long error_index = 0;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
    real *ind_p_k = ind_p + k*iwidth*iheight;
    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
        if(maxp<0 || maxp>=owidth*oheight){
          /* several threads may hit an invalid index at the same time */
 #pragma omp critical
          {
            has_error = 1;
            error_index = maxp;
          }
        } else {
          gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
        }
      }
    }
  }
  if (has_error) {
    /* all three arguments are long, hence %ld */
    THError("invalid max index %ld, owidth= %ld, oheight= %ld",error_index,owidth,oheight);
  }
 }
 /*
  * Backward pass of spatial max unpooling.
  *
  * input      : forward input (3D, or 4D in batch mode); only its sizes are read
  * gradOutput : gradient w.r.t. the unpooled output; its spatial size must
  *              equal oheight x owidth
  * gradInput  : resized like input and filled with the gradOutput values
  *              found at the positions recorded in indices
  * indices    : must have the same size as input
  */
 void THNN_(SpatialMaxUnpooling_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    THTensor *indices,
    int owidth, int oheight)
 {
  int dimw = 2;
  int dimh = 1;
  int nbatch = 1;
  int nslices;
  int iheight;
  int iwidth;
  real *gradInput_data;
  real *gradOutput_data;
  real *indices_data;
  if (!THTensor_(isSameSizeAs)(input, indices)){
    THError("Invalid input size w.r.t current indices size");
  }
  /* get contiguous gradOutput and indices */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  indices = THTensor_(newContiguous)(indices);
  /* resize */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  /* a 4D input has a leading batch dimension that shifts height/width by one */
  if (input->nDimension == 4) {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }
  /* sizes */
  nslices = input->size[dimh-1];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
    /* oheight/owidth are int (%d); the tensor sizes are long (%ld) */
    THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %ldx%ld", oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
  }
  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  indices_data = THTensor_(data)(indices);
  /* backprop */
  if (input->nDimension == 3)
  {
    THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
                                                 indices_data,
                                                 nslices,
                                                 iwidth, iheight,
                                                 owidth, oheight);
  }
  else
  {
    long p;
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
                                                   indices_data+p*nslices*iwidth*iheight,
                                                   nslices,
                                                   iwidth, iheight,
                                                   owidth, oheight);
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
  THTensor_(free)(indices);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialReflectionPadding.c
+++ b/torch/lib/THNN/generic/SpatialReflectionPadding.c
@ -0,0 +1,255 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
 #else
 /*
  * Fills one padded frame (nslices planes of oheight x owidth) from one input
  * frame (nslices planes of iheight x iwidth), both stored densely.
  *
  * Output positions inside the padding read the input position mirrored about
  * the first or last input row/column (the edge element is not duplicated).
  * A negative pad_l / pad_t shifts the read window into the input instead.
  * pad_r and pad_b are not read here; this file's caller passes owidth and
  * oheight that already include them.
  */
 static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
  real *input_p, real *output_p,
  long nslices,
  long iwidth, long iheight,
  long owidth, long oheight,
  int pad_l, int pad_r,
  int pad_t, int pad_b)
 {
  /* offsets that translate a padded coordinate into an input coordinate */
  const int in_x0 = (pad_l < 0) ? -pad_l : 0;
  const int in_y0 = (pad_t < 0) ? -pad_t : 0;
  const int out_x0 = (pad_l > 0) ? pad_l : 0;
  const int out_y0 = (pad_t > 0) ? pad_t : 0;
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nslices; plane++)
  {
    real *src_plane = input_p + plane*iwidth*iheight;
    real *dst_plane = output_p + plane*owidth*oheight;
    long row, col;
    for (row = 0; row < oheight; row++) {
      /* the source row depends only on the output row */
      long src_row;
      if (row < pad_t)
        src_row = pad_t * 2 - row;
      else if (row < iheight + pad_t)
        src_row = row;
      else
        src_row = (iheight + pad_t - 1) * 2 - row;
      src_row = src_row - out_y0 + in_y0;
      for (col = 0; col < owidth; col++) {
        long src_col;
        if (col < pad_l)
          src_col = pad_l * 2 - col;
        else if (col < iwidth + pad_l)
          src_col = col;
        else
          src_col = (iwidth + pad_l - 1) * 2 - col;
        src_col = src_col - out_x0 + in_x0;
        dst_plane[row * owidth + col] = src_plane[src_row * iwidth + src_col];
      }
    }
  }
 }
 /*
  * Forward pass of spatial reflection padding.
  *
  * input  : 3D tensor, or 4D tensor whose first dimension is the batch.
  * output : resized here to the padded shape and filled.
  * pad_l, pad_r, pad_t, pad_b : columns / rows added on the left, right, top
  *   and bottom; owidth = iwidth + pad_l + pad_r, oheight = iheight + pad_t + pad_b.
  *
  * Raises an argument error when the input is not 3D/4D, when a pad is not
  * smaller than the matching input extent (the mirrored index would fall
  * outside the input), or when the padded width or height is below 1.
  */
 void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
                                                  THTensor *input,
                                                  THTensor *output,
                                                  int pad_l, int pad_r,
                                                  int pad_t, int pad_b)
 {
  int dimw = 2;
  int dimh = 1;
  int dimslices = 0;
  long nbatch = 1;
  long nslices;
  long iheight;
  long iwidth;
  long oheight;
  long owidth;
  real *input_data;
  real *output_data;
  THArgCheck(input->nDimension == 3 ||
    input->nDimension == 4 , 2, "input must be 3 or 4-dimensional");
  if (input->nDimension == 4)
  {
    /* batch mode: every dimension index moves up by one */
    nbatch = input->size[0];
    dimw++;
    dimh++;
    dimslices++;
  }
  /* sizes */
  nslices = input->size[dimslices];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  oheight = iheight + pad_t + pad_b;
  owidth  = iwidth + pad_l + pad_r;
  /* a pad of iwidth (or iheight) or more mirrors past the opposite edge */
  THArgCheck(pad_l < iwidth && pad_r < iwidth, 4,
             "padding width must be less than the input width");
  THArgCheck(pad_t < iheight && pad_b < iheight, 6,
             "padding height must be less than the input height");
  /* both extents must be positive, not just one of them */
  THArgCheck(owidth >= 1 && oheight >= 1 , 2, "input is too small");
  /* get contiguous input */
  input = THTensor_(newContiguous)(input);
  /* resize output */
  if (input->nDimension == 3)
  {
    THTensor_(resize3d)(output, nslices, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
                                                    nslices,
                                                    iwidth, iheight,
                                                    owidth, oheight,
                                                    pad_l, pad_r,
                                                    pad_t, pad_b);
  }
  else
  {
    long p;
    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      /* each batch element is an independent dense frame */
      THNN_(SpatialReflectionPadding_updateOutput_frame)(
        input_data+p*nslices*iwidth*iheight,
        output_data+p*nslices*owidth*oheight,
        nslices,
        iwidth, iheight,
        owidth, oheight,
        pad_l, pad_r,
        pad_t, pad_b);
    }
  }
  /* cleanup: drop the reference taken by newContiguous */
  THTensor_(free)(input);
 }
 /*
  * Accumulates one frame of padded gradients (nslices planes of
  * oheight x owidth) into one frame of input gradients (nslices planes of
  * iheight x iwidth), both stored densely.
  *
  * Each output position adds its gradient to the input position it was
  * mirrored from in the forward pass, so ginput_p must already hold the
  * values to accumulate onto. pad_r and pad_b are not read here.
  */
 static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
  real *ginput_p, real *goutput_p,
  long nslices,
  long iwidth, long iheight,
  long owidth, long oheight,
  int pad_l, int pad_r,
  int pad_t, int pad_b)
 {
  /* offsets that translate a padded coordinate into an input coordinate */
  const int in_x0 = (pad_l < 0) ? -pad_l : 0;
  const int in_y0 = (pad_t < 0) ? -pad_t : 0;
  const int out_x0 = (pad_l > 0) ? pad_l : 0;
  const int out_y0 = (pad_t > 0) ? pad_t : 0;
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nslices; plane++)
  {
    real *gin_plane = ginput_p + plane*iwidth*iheight;
    real *gout_plane = goutput_p + plane*owidth*oheight;
    long row, col;
    for (row = 0; row < oheight; row++) {
      /* the target row depends only on the output row */
      long dst_row;
      if (row < pad_t)
        dst_row = pad_t * 2 - row;
      else if (row < iheight + pad_t)
        dst_row = row;
      else
        dst_row = (iheight + pad_t - 1) * 2 - row;
      dst_row = dst_row - out_y0 + in_y0;
      for (col = 0; col < owidth; col++) {
        long dst_col;
        if (col < pad_l)
          dst_col = pad_l * 2 - col;
        else if (col < iwidth + pad_l)
          dst_col = col;
        else
          dst_col = (iwidth + pad_l - 1) * 2 - col;
        dst_col = dst_col - out_x0 + in_x0;
        gin_plane[dst_row * iwidth + dst_col] += gout_plane[row * owidth + col];
      }
    }
  }
 }
 /*
  * Backward pass of spatial reflection padding.
  *
  * gradInput is resized like input, zeroed, and then receives the gradients
  * of gradOutput accumulated frame by frame. gradOutput must have the padded
  * width and height, otherwise an argument error is raised.
  */
 void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
                                                      THTensor *input,
                                                      THTensor *gradOutput,
                                                      THTensor *gradInput,
                                                      int pad_l, int pad_r,
                                                      int pad_t, int pad_b)
 {
  /* a 4D input carries a leading batch dimension that shifts the others */
  const int batch_shift = (input->nDimension == 4) ? 1 : 0;
  const int dimslices = batch_shift;
  const int dimh = 1 + batch_shift;
  const int dimw = 2 + batch_shift;
  const long nbatch = batch_shift ? input->size[0] : 1;
  const long nslices = input->size[dimslices];
  const long iheight = input->size[dimh];
  const long iwidth = input->size[dimw];
  const long oheight = iheight + pad_t + pad_b;
  const long owidth = iwidth + pad_l + pad_r;
  real *gradInput_data;
  real *gradOutput_data;
  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
                "gradOutput width unexpected");
  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
                "gradOutput height unexpected");
  /* work on a dense view of gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* the frame routine accumulates, so start from zero */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  if (input->nDimension == 3) {
    THNN_(SpatialReflectionPadding_updateGradInput_frame)(
      gradInput_data,
      gradOutput_data,
      nslices,
      iwidth, iheight,
      owidth, oheight,
      pad_l, pad_r,
      pad_t, pad_b);
  } else {
    long b;
 #pragma omp parallel for private(b)
    for (b = 0; b < nbatch; b++) {
      THNN_(SpatialReflectionPadding_updateGradInput_frame)(
        gradInput_data + b * nslices * iheight * iwidth,
        gradOutput_data + b * nslices * oheight * owidth,
        nslices,
        iwidth, iheight,
        owidth, oheight,
        pad_l, pad_r,
        pad_t, pad_b);
    }
  }
  /* release the reference taken by newContiguous */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialReplicationPadding.c
+++ b/torch/lib/THNN/generic/SpatialReplicationPadding.c
@ -0,0 +1,254 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
 #else
 /*
  * Fills one padded frame (nslices planes of oheight x owidth) from one input
  * frame (nslices planes of iheight x iwidth), both stored densely.
  *
  * Output positions inside the padding read the nearest input row/column,
  * i.e. coordinates are clamped to the input extent. A negative pad_l / pad_t
  * shifts the read window into the input instead. pad_r and pad_b are not
  * read here; this file's caller passes owidth and oheight that already
  * include them.
  */
 static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
  real *input_p, real *output_p,
  long nslices,
  long iwidth, long iheight,
  long owidth, long oheight,
  int pad_l, int pad_r,
  int pad_t, int pad_b)
 {
  /* offsets that translate a padded coordinate into an input coordinate */
  const int in_x0 = (pad_l < 0) ? -pad_l : 0;
  const int in_y0 = (pad_t < 0) ? -pad_t : 0;
  const int out_x0 = (pad_l > 0) ? pad_l : 0;
  const int out_y0 = (pad_t > 0) ? pad_t : 0;
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nslices; plane++)
  {
    real *src_plane = input_p + plane*iwidth*iheight;
    real *dst_plane = output_p + plane*owidth*oheight;
    long row, col;
    for (row = 0; row < oheight; row++) {
      /* the source row depends only on the output row */
      long src_row;
      if (row < pad_t)
        src_row = pad_t;
      else if (row < iheight + pad_t)
        src_row = row;
      else
        src_row = iheight + pad_t - 1;
      src_row = src_row - out_y0 + in_y0;
      for (col = 0; col < owidth; col++) {
        long src_col;
        if (col < pad_l)
          src_col = pad_l;
        else if (col < iwidth + pad_l)
          src_col = col;
        else
          src_col = iwidth + pad_l - 1;
        src_col = src_col - out_x0 + in_x0;
        dst_plane[row * owidth + col] = src_plane[src_row * iwidth + src_col];
      }
    }
  }
 }
 /*
  * Forward pass of spatial replication padding.
  *
  * input  : 3D tensor, or 4D tensor whose first dimension is the batch.
  * output : resized here to the padded shape and filled.
  * pad_l, pad_r, pad_t, pad_b : columns / rows added on the left, right, top
  *   and bottom; owidth = iwidth + pad_l + pad_r, oheight = iheight + pad_t + pad_b.
  *
  * Raises an argument error when the input is not 3D/4D or when the padded
  * width or height is below 1.
  */
 void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
                                                         THTensor *input,
                                                         THTensor *output,
                                                         int pad_l, int pad_r,
                                                         int pad_t, int pad_b)
 {
  int dimw = 2;
  int dimh = 1;
  int dimslices = 0;
  long nbatch = 1;
  long nslices;
  long iheight;
  long iwidth;
  long oheight;
  long owidth;
  real *input_data;
  real *output_data;
  THArgCheck(input->nDimension == 3 || input->nDimension == 4,
             2, "input must be 3 or 4-dimensional");
  if (input->nDimension == 4)
  {
    /* batch mode: every dimension index moves up by one */
    nbatch = input->size[0];
    dimw++;
    dimh++;
    dimslices++;
  }
  /* sizes */
  nslices = input->size[dimslices];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  oheight = iheight + pad_t + pad_b;
  owidth  = iwidth + pad_l + pad_r;
  /* both extents must be positive, not just one of them */
  THArgCheck(owidth >= 1 && oheight >= 1 , 2, "input is too small");
  /* get contiguous input */
  input = THTensor_(newContiguous)(input);
  /* resize output */
  if (input->nDimension == 3)
  {
    THTensor_(resize3d)(output, nslices, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
                                                    nslices,
                                                    iwidth, iheight,
                                                    owidth, oheight,
                                                    pad_l, pad_r,
                                                    pad_t, pad_b);
  }
  else
  {
    long p;
    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      /* each batch element is an independent dense frame */
      THNN_(SpatialReplicationPadding_updateOutput_frame)(
        input_data+p*nslices*iwidth*iheight,
        output_data+p*nslices*owidth*oheight,
        nslices,
        iwidth, iheight,
        owidth, oheight,
        pad_l, pad_r,
        pad_t, pad_b);
    }
  }
  /* cleanup: drop the reference taken by newContiguous */
  THTensor_(free)(input);
 }
 /*
  * Accumulates one frame of padded gradients (nslices planes of
  * oheight x owidth) into one frame of input gradients (nslices planes of
  * iheight x iwidth), both stored densely.
  *
  * Each output position adds its gradient to the clamped input position it
  * was copied from in the forward pass, so ginput_p must already hold the
  * values to accumulate onto. pad_r and pad_b are not read here.
  */
 static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
  real *ginput_p, real *goutput_p,
  long nslices,
  long iwidth, long iheight,
  long owidth, long oheight,
  int pad_l, int pad_r,
  int pad_t, int pad_b)
 {
  /* offsets that translate a padded coordinate into an input coordinate */
  const int in_x0 = (pad_l < 0) ? -pad_l : 0;
  const int in_y0 = (pad_t < 0) ? -pad_t : 0;
  const int out_x0 = (pad_l > 0) ? pad_l : 0;
  const int out_y0 = (pad_t > 0) ? pad_t : 0;
  long plane;
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nslices; plane++)
  {
    real *gin_plane = ginput_p + plane*iwidth*iheight;
    real *gout_plane = goutput_p + plane*owidth*oheight;
    long row, col;
    for (row = 0; row < oheight; row++) {
      /* the target row depends only on the output row */
      long dst_row;
      if (row < pad_t)
        dst_row = pad_t;
      else if (row < iheight + pad_t)
        dst_row = row;
      else
        dst_row = iheight + pad_t - 1;
      dst_row = dst_row - out_y0 + in_y0;
      for (col = 0; col < owidth; col++) {
        long dst_col;
        if (col < pad_l)
          dst_col = pad_l;
        else if (col < iwidth + pad_l)
          dst_col = col;
        else
          dst_col = iwidth + pad_l - 1;
        dst_col = dst_col - out_x0 + in_x0;
        gin_plane[dst_row * iwidth + dst_col] += gout_plane[row * owidth + col];
      }
    }
  }
 }
 /*
  * Backward pass of spatial replication padding.
  *
  * gradInput is resized like input, zeroed, and then receives the gradients
  * of gradOutput accumulated frame by frame. gradOutput must have the padded
  * width and height, otherwise an argument error is raised.
  */
 void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
                                                      THTensor *input,
                                                      THTensor *gradOutput,
                                                      THTensor *gradInput,
                                                      int pad_l, int pad_r,
                                                      int pad_t, int pad_b)
 {
  /* a 4D input carries a leading batch dimension that shifts the others */
  const int batch_shift = (input->nDimension == 4) ? 1 : 0;
  const int dimslices = batch_shift;
  const int dimh = 1 + batch_shift;
  const int dimw = 2 + batch_shift;
  const long nbatch = batch_shift ? input->size[0] : 1;
  const long nslices = input->size[dimslices];
  const long iheight = input->size[dimh];
  const long iwidth = input->size[dimw];
  const long oheight = iheight + pad_t + pad_b;
  const long owidth = iwidth + pad_l + pad_r;
  real *gradInput_data;
  real *gradOutput_data;
  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
                "gradOutput width unexpected");
  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
                "gradOutput height unexpected");
  /* work on a dense view of gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* the frame routine accumulates, so start from zero */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  if (input->nDimension == 3) {
    THNN_(SpatialReplicationPadding_updateGradInput_frame)(
      gradInput_data,
      gradOutput_data,
      nslices,
      iwidth, iheight,
      owidth, oheight,
      pad_l, pad_r,
      pad_t, pad_b);
  } else {
    long b;
 #pragma omp parallel for private(b)
    for (b = 0; b < nbatch; b++) {
      THNN_(SpatialReplicationPadding_updateGradInput_frame)(
        gradInput_data + b * nslices * iheight * iwidth,
        gradOutput_data + b * nslices * oheight * owidth,
        nslices,
        iwidth, iheight,
        owidth, oheight,
        pad_l, pad_r,
        pad_t, pad_b);
    }
  }
  /* release the reference taken by newContiguous */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialSubSampling.c
+++ b/torch/lib/THNN/generic/SpatialSubSampling.c
@ -0,0 +1,267 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
 #else
 /*
  * Forward pass of spatial sub-sampling.
  *
  * For every plane k and every kW x kH window (stepped by dW, dH) the output
  * element is bias[k] + weight[k] * (sum of the window). The number of planes
  * is taken from weight's first dimension. input is a 3D tensor or a 4D
  * tensor with a leading batch dimension; output is resized here.
  *
  * Raises an argument error when input is not 3D/4D, when its plane count
  * differs from weight's, or when it is smaller than the kernel.
  */
 void THNN_(SpatialSubSampling_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias,
    int kW, int kH,
    int dW, int dH)
 {
  real *weight_data = THTensor_(data)(weight);
  real *bias_data = THTensor_(data)(bias);
  real *output_data;
  real *input_data;
  int wdim = 2;
  int hdim = 1;
  long nbatch = 1;
  long inputWidth;
  long inputHeight;
  long outputWidth;
  long outputHeight;
  int nInputPlane = THTensor_(size)(weight,0);
  long plane;
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
  if (input->nDimension == 4) {
    /* batch mode: spatial dimensions move up by one */
    nbatch = input->size[0];
    wdim++;
    hdim++;
  }
  inputWidth = input->size[wdim];
  inputHeight = input->size[hdim];
  outputWidth = (inputWidth - kW) / dW + 1;
  outputHeight = (inputHeight - kH) / dH + 1;
  THArgCheck(input->size[hdim-1] == nInputPlane, 2, "invalid number of input planes");
  THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
  if (input->nDimension == 3)
    THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
  else
    THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
  /* the loops below index input as a dense buffer */
  input = THTensor_(newContiguous)(input);
  input_data = THTensor_(data)(input);
  output_data = THTensor_(data)(output);
 #pragma omp parallel for private(plane)
  for (plane = 0; plane < nInputPlane; plane++)
  {
    long b;
    for (b = 0; b < nbatch; b++)
    {
      /* per-plane scale and offset */
      const real scale_k = weight_data[plane];
      const real bias_k = bias_data[plane];
      real *out_plane = output_data + b*nInputPlane*outputWidth*outputHeight + plane*outputWidth*outputHeight;
      real *in_plane = input_data + b*nInputPlane*inputWidth*inputHeight + plane*inputWidth*inputHeight;
      long ox, oy;
      for (oy = 0; oy < outputHeight; oy++)
      {
        for (ox = 0; ox < outputWidth; ox++)
        {
          /* top-left corner of the window feeding this output element */
          real *window = in_plane + oy*dH*inputWidth + ox*dW;
          real acc = 0;
          long kx, ky;
          for (ky = 0; ky < kH; ky++)
          {
            for (kx = 0; kx < kW; kx++)
              acc += window[ky*inputWidth + kx];
          }
          out_plane[oy*outputWidth + ox] = bias_k + scale_k*acc;
        }
      }
    }
  }
  /* release the reference taken by newContiguous */
  THTensor_(free)(input);
 }
 /*
  * Backward pass of spatial sub-sampling with respect to the input.
  *
  * gradInput is resized like input. Each plane is zeroed and then every
  * gradOutput element, scaled by weight[k], is added to all positions of the
  * kW x kH window (stepped by dW, dH) it was computed from. The number of
  * planes is taken from weight's first dimension.
  *
  * gradOutput is read through a contiguous view, so strided gradients are
  * handled correctly.
  */
 void THNN_(SpatialSubSampling_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    THTensor *weight,
    int kW, int kH,
    int dW, int dH)
 {
  int dimw = 2;
  int dimh = 1;
  long nbatch = 1;
  long inputWidth;
  long inputHeight;
  long outputWidth;
  long outputHeight;
  int nInputPlane = THTensor_(size)(weight,0);
  real *weight_data;
  real *gradOutput_data;
  real *gradInput_data;
  long k;
  if (input->nDimension == 4) {
    /* batch mode: spatial dimensions move up by one */
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }
  inputWidth = input->size[dimw];
  inputHeight = input->size[dimh];
  outputWidth = (inputWidth - kW) / dW + 1;
  outputHeight = (inputHeight - kH) / dH + 1;
  weight_data = THTensor_(data)(weight);
  /* the pointer arithmetic below assumes a dense gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutput_data = THTensor_(data)(gradOutput);
  THTensor_(resizeAs)(gradInput, input);
  gradInput_data = THTensor_(data)(gradInput);
 #pragma omp parallel for private(k)
  for(k = 0; k < nInputPlane; k++)
  {
    long p;
    for(p = 0; p < nbatch; p++)
    {
      real the_weight = weight_data[k];
      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
      long xx, yy;
      real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
      long i;
      /* windows may overlap, so clear the plane before accumulating */
      for(i=0; i<inputWidth*inputHeight; i++)
        ptr_gi[i] = 0.0;
      for(yy = 0; yy < outputHeight; yy++)
      {
        for(xx = 0; xx < outputWidth; xx++)
        {
          real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
          real z = *ptr_gradOutput++ * the_weight;
          long kx, ky;
          for(ky = 0; ky < kH; ky++)
          {
            for(kx = 0; kx < kW; kx++)
              ptr_gradInput[kx] += z;
            ptr_gradInput += inputWidth; /* next input line */
          }
        }
      }
    }
  }
  /* release the reference taken by newContiguous */
  THTensor_(free)(gradOutput);
 }
 /*
  * Accumulates the parameter gradients of spatial sub-sampling.
  *
  * For every plane k and batch element:
  *   gradBias[k]   += scale * sum(gradOutput plane)
  *   gradWeight[k] += scale * sum over windows of (gradOutput element *
  *                    sum of the kW x kH input window it was computed from)
  * The number of planes is taken from gradWeight's first dimension.
  *
  * Both input and gradOutput are read through contiguous views, so strided
  * tensors are handled correctly.
  */
 void THNN_(SpatialSubSampling_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    int kW, int kH,
    int dW, int dH,
    real scale)
 {
  long nbatch = 1;
  long dimw = 2;
  long dimh = 1;
  long inputWidth;
  long inputHeight;
  long outputWidth;
  long outputHeight;
  int nInputPlane = THTensor_(size)(gradWeight,0);
  real *gradWeight_data;
  real *gradBias_data;
  real *gradOutput_data;
  real *input_data;
  long k;
  if (input->nDimension == 4) {
    /* batch mode: spatial dimensions move up by one */
    dimw++;
    dimh++;
    nbatch = input->size[0];
  }
  inputWidth = input->size[dimw];
  inputHeight = input->size[dimh];
  outputWidth = (inputWidth - kW) / dW + 1;
  outputHeight = (inputHeight - kH) / dH + 1;
  gradWeight_data = THTensor_(data)(gradWeight);
  gradBias_data = THTensor_(data)(gradBias);
  /* the pointer arithmetic below assumes dense gradOutput and input */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutput_data = THTensor_(data)(gradOutput);
  input = THTensor_(newContiguous)(input);
  input_data = THTensor_(data)(input);
 #pragma omp parallel for private(k)
  for(k = 0; k < nInputPlane; k++)
  {
    long p;
    for(p = 0; p < nbatch; p++)
    {
      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
      real sum;
      long xx, yy;
      long i;
      /* bias gradient: plain sum of the output gradients of this plane */
      sum = 0;
      for(i = 0; i < outputWidth*outputHeight; i++)
        sum += ptr_gradOutput[i];
      gradBias_data[k] += scale*sum;
      /* weight gradient: output gradients weighted by their input windows */
      sum = 0;
      for(yy = 0; yy < outputHeight; yy++)
      {
        for(xx = 0; xx < outputWidth; xx++)
        {
          real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
          real z = *ptr_gradOutput++;
          long kx, ky;
          for(ky = 0; ky < kH; ky++)
          {
            for(kx = 0; kx < kW; kx++)
              sum += z * ptr_input[kx];
            ptr_input += inputWidth; /* next input line */
          }
        }
      }
      gradWeight_data[k] += scale*sum;
    }
  }
  /* release the references taken by newContiguous */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialUpSamplingBilinear.c
+++ b/torch/lib/THNN/generic/SpatialUpSamplingBilinear.c
@ -0,0 +1,127 @@
 // Adapted from interp.cpp from Caffe util by Pauline Luc
 // Originally developed by George Papandreou
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
 #else
 // Forward pass of bilinear up-sampling.
 //
 // Both tensors are indexed as 4D; dimensions 0 and 1 are folded into a single
 // channel count and dimensions 2 and 3 are height and width. `output` is not
 // resized here, so the caller must pass it already shaped to the target size.
 // When input and output have the same height and width the data is copied;
 // otherwise each output pixel blends the (up to) four surrounding input
 // pixels, with the grid corners of input and output aligned.
 //
 // The computation runs on contiguous views. If `output` is not contiguous the
 // result is copied back into it, and every temporary reference is released.
 void THNN_(SpatialUpSamplingBilinear_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output){
  // `output` is rebound to a contiguous tensor below; keep the caller's handle
  THTensor *output_orig = output;
  int channels = THTensor_(size)(input, 0) * THTensor_(size)(input, 1);
  int height1 = THTensor_(size)(input, 2);
  int width1 = THTensor_(size)(input, 3);
  int height2 = THTensor_(size)(output, 2);
  int width2 = THTensor_(size)(output, 3);
  // validate before taking any reference that would need releasing
  THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
  input = THTensor_(newContiguous)(input);
  output = THTensor_(newContiguous)(output);
  THTensor_(zero)(output);
  real *idata = THTensor_(data)(input);
  real *odata = THTensor_(data)(output);
  if (height1 == height2 && width1 == width2) {
    // special case: just copy
    for (int h2 = 0; h2 < height2; ++h2) {
      const int h1 = h2;
      for (int w2 = 0; w2 < width2; ++w2) {
        const int w1 = w2;
        const real* pos1 = &idata[h1 * width1 + w1];
        real* pos2 = &odata[h2 * width2 + w2];
        for (int c = 0; c < channels; ++c) {
          pos2[0] = pos1[0];
          pos1 += width1 * height1;
          pos2 += width2 * height2;
        }
      }
    }
  } else {
    // ratio between input and output grid steps (0 when the output has one row/column)
    const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
    const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
    for (int h2 = 0; h2 < height2; ++h2) {
      const float h1r = rheight * h2;
      const int h1 = h1r;
      // step to the next row, or 0 on the last row so reads stay in bounds
      const int h1p = (h1 < height1 - 1) ? 1 : 0;
      const real h1lambda = h1r - h1;
      const real h0lambda = (real)1. - h1lambda;
      for (int w2 = 0; w2 < width2; ++w2) {
        const float w1r = rwidth * w2;
        const int w1 = w1r;
        const int w1p = (w1 < width1 - 1) ? 1 : 0;
        const real w1lambda = w1r - w1;
        const real w0lambda = (real)1. - w1lambda;
        const real* pos1 = &idata[h1 * width1 + w1];
        real* pos2 = &odata[h2 * width2 + w2];
        for (int c = 0; c < channels; ++c) {
          pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
                    + h1lambda * (w0lambda * pos1[h1p * width1]
                    + w1lambda * pos1[h1p * width1 + w1p]);
          pos1 += width1 * height1;
          pos2 += width2 * height2;
        }
      }
    }
  }
  // newContiguous hands back a different tensor only when it had to clone
  if (output != output_orig) {
    THTensor_(copy)(output_orig, output);
  }
  THTensor_(free)(output);
  THTensor_(free)(input);
 }
 // Backward pass of bilinear up-sampling.
 //
 // Both tensors are indexed as 4D; dimensions 0 and 1 are folded into a single
 // channel count and dimensions 2 and 3 are height and width. `gradInput` is
 // not resized here, so the caller must pass it already shaped like the
 // forward input. It is zeroed, then every gradOutput pixel is distributed to
 // the (up to) four input pixels it was blended from, using the same weights
 // as the forward pass.
 //
 // The computation runs on contiguous views. If `gradInput` is not contiguous
 // the result is copied back into it, and every temporary reference is
 // released.
 void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
    THNNState *state,
    THTensor *gradOutput,
    THTensor *gradInput){
  // `gradInput` is rebound to a contiguous tensor below; keep the caller's handle
  THTensor *gradInput_orig = gradInput;
  int channels = THTensor_(size)(gradInput, 0) * THTensor_(size)(gradInput, 1);
  int height1 = THTensor_(size)(gradInput, 2);
  int width1 = THTensor_(size)(gradInput, 3);
  int height2 = THTensor_(size)(gradOutput, 2);
  int width2 = THTensor_(size)(gradOutput, 3);
  // validate before taking any reference that would need releasing
  THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
  gradInput = THTensor_(newContiguous)(gradInput);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THTensor_(zero)(gradInput);
  real *data1 = THTensor_(data)(gradInput);
  real *data2 = THTensor_(data)(gradOutput);
  if (height1 == height2 && width1 == width2) {
    // special case: same-size matching grids
    for (int h2 = 0; h2 < height2; ++h2) {
      const int h1 = h2;
      for (int w2 = 0; w2 < width2; ++w2) {
        const int w1 = w2;
        real* pos1 = &data1[h1 * width1 + w1];
        const real* pos2 = &data2[h2 * width2 + w2];
        for (int c = 0; c < channels; ++c) {
          pos1[0] += pos2[0];
          pos1 += width1 * height1;
          pos2 += width2 * height2;
        }
      }
    }
  } else {
    // ratio between input and output grid steps (0 when the output has one row/column)
    const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
    const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f;
    for (int h2 = 0; h2 < height2; ++h2) {
      const float h1r = rheight * h2;
      const int h1 = h1r;
      // step to the next row, or 0 on the last row so writes stay in bounds
      const int h1p = (h1 < height1 - 1) ? 1 : 0;
      const real h1lambda = h1r - h1;
      const real h0lambda = (real)1. - h1lambda;
      for (int w2 = 0; w2 < width2; ++w2) {
        const float w1r = rwidth * w2;
        const int w1 = w1r;
        const int w1p = (w1 < width1 - 1) ? 1 : 0;
        const real w1lambda = w1r - w1;
        const real w0lambda = (real)1. - w1lambda;
        real* pos1 = &data1[h1 * width1 + w1];
        const real* pos2 = &data2[h2 * width2 + w2];
        for (int c = 0; c < channels; ++c) {
          pos1[0] += h0lambda * w0lambda * pos2[0];
          pos1[w1p] += h0lambda * w1lambda * pos2[0];
          pos1[h1p * width1] += h1lambda * w0lambda * pos2[0];
          pos1[h1p * width1 + w1p] += h1lambda * w1lambda * pos2[0];
          pos1 += width1 * height1;
          pos2 += width2 * height2;
        }
      }
    }
  }
  // newContiguous hands back a different tensor only when it had to clone
  if (gradInput != gradInput_orig) {
    THTensor_(copy)(gradInput_orig, gradInput);
  }
  THTensor_(free)(gradInput);
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/SpatialUpSamplingNearest.c
+++ b/torch/lib/THNN/generic/SpatialUpSamplingNearest.c
@ -0,0 +1,143 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
 #else
 // Forward pass of nearest-neighbour up-sampling.
 //
 // Every output element copies the input element whose last two indices are
 // the output indices divided by scale_factor; the leading indices are kept.
 // `output` is not resized here, so the caller must pass it already shaped.
 // Both tensors are addressed through their strides, so neither needs to be
 // contiguous.
 //
 // Raises an argument error unless input is 3D or 4D (the index arrays below
 // hold four entries).
 void THNN_(SpatialUpSamplingNearest_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    int scale_factor)
 {
  int dW = scale_factor;
  int dH = scale_factor;
  int xDim = input->nDimension-2;
  int yDim = input->nDimension-1;
  // dims
  int idim = input->nDimension;
  THArgCheck(idim == 3 || idim == 4, 2, "3D or 4D(batch mode) tensor expected");
  int osz0 = output->size[0];
  int osz1 = output->size[1];
  int osz2 = output->size[2];
  int osz3 = 1;
  if (idim > 3) {
    osz3 = output->size[3];
  }
  // get strides
  long *is = input->stride;
  long *os = output->stride;
  // get raw pointers
  real *pin = THTensor_(data)(input);
  real *pout = THTensor_(data)(output);
  // perform the upsampling
  int i0, i1, i2, i3;
  // offsets are sums of index * long stride; keep them long to avoid truncation
  long isrc, idst;
  int iout[4];  // Output indices
  int iin[4];  // Input indices
  for (i0 = 0; i0 < osz0; i0++) {
    iout[0] = i0;
    iin[0] = i0;
    for (i1 = 0; i1 < osz1; i1++) {
      iout[1] = i1;
      iin[1] = i1;
      for (i2 = 0; i2 < osz2; i2++) {
        iout[2] = i2;
        iin[2] = i2;
        for (i3 = 0; i3 < osz3; i3++) {
          iout[3] = i3;
          iin[3] = i3;
          // set the indices for the upsampled dimensions
          iin[xDim] = iout[xDim] / dW;
          iin[yDim] = iout[yDim] / dH;
          idst = i0*os[0] + i1*os[1] + i2*os[2];
          isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
          if (idim > 3) {
            idst += i3*os[3];
            isrc += iin[3]*is[3];
          }
          pout[idst] = pin[isrc];
        }
      }
    }
  }
 }
 // Backward pass of nearest-neighbour up-sampling.
 //
 // gradInput is zeroed, then every gradInput element receives the sum of the
 // scale_factor x scale_factor block of gradOutput elements that copied it in
 // the forward pass. `gradInput` is not resized here, so the caller must pass
 // it already shaped; `input` is not read. Both tensors are addressed through
 // their strides, so neither needs to be contiguous.
 //
 // Raises an argument error unless gradInput is 3D or 4D (the index arrays
 // below hold four entries).
 void THNN_(SpatialUpSamplingNearest_updateGradInput)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradInput,
    int scale_factor)
 {
  int dW = scale_factor;
  int dH = scale_factor;
  int xDim = gradInput->nDimension-2;
  int yDim = gradInput->nDimension-1;
  // dims
  int idim = gradInput->nDimension;
  THArgCheck(idim == 3 || idim == 4, 4, "3D or 4D(batch mode) tensor expected");
  int isz0 = gradInput->size[0];
  int isz1 = gradInput->size[1];
  int isz2 = gradInput->size[2];
  int isz3 = 1;
  if (idim > 3) {
    isz3 = gradInput->size[3];
  }
  // get strides
  long *is = gradInput->stride;
  long *os = gradOutput->stride;
  // get raw pointers
  real *pin = THTensor_(data)(gradInput);
  real *pout = THTensor_(data)(gradOutput);
  // perform the upsampling
  int i0, i1, i2, i3, x, y;
  // offsets are sums of index * long stride; keep them long to avoid truncation
  long isrc, idst;
  int iin[4];  // Input indices
  int iout[4];  // Output indices
  THTensor_(zero)(gradInput);
  for (i0 = 0; i0 < isz0; i0++) {
    iin[0] = i0;
    iout[0] = i0;
    for (i1 = 0; i1 < isz1; i1++) {
      iin[1] = i1;
      iout[1] = i1;
      for (i2 = 0; i2 < isz2; i2++) {
        iin[2] = i2;
        iout[2] = i2;
        for (i3 = 0; i3 < isz3; i3++) {
          iin[3] = i3;
          iout[3] = i3;
          idst = i0*is[0] + i1*is[1] + i2*is[2];
          if (idim > 3) {
            idst += i3*is[3];
          }
          // Now accumulate the gradients from gradOutput
          for (y = 0; y < dH; y++) {
            for (x = 0; x < dW; x++) {
              iout[xDim] = dW * iin[xDim] + x;
              iout[yDim] = dH * iin[yDim] + y;
              isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
              if (idim > 3) {
                isrc += iout[3]*os[3];
              }
              pin[idst] += pout[isrc];
            }
          }
        }
      }
    }
  }
 }
 #endif
--- a/torch/lib/THNN/generic/Sqrt.c
+++ b/torch/lib/THNN/generic/Sqrt.c
@ -0,0 +1,50 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Sqrt.c"
 #else
 /*
  * Forward pass of the element-wise square root.
  * Resizes output to the shape of input and stores sqrt(input) in it.
  * Neither state nor eps is read in this function.
  */
 void THNN_(Sqrt_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          real eps)
 {
  THTensor_(resizeAs)(output, input);
  THTensor_(sqrt)(output, input);
 }
 /*
  * Backward pass of the element-wise square root.
  *
  * gradInput is resized like input and set, element by element, to
  * 0.5 * gradOutput / output, or to 0 where output is exactly 0 (which avoids
  * dividing by zero). `output` is the result of the forward pass.
  */
 void THNN_(Sqrt_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *output)
 {
  THTensor_(resizeAs)(gradInput, input);
  if (output->nDimension != 1 &&
      THTensor_(isContiguous)(output) &&
      THTensor_(isContiguous)(gradOutput) &&
      THTensor_(isContiguous)(gradInput))
  {
    /* dense tensors: walk the raw buffers in parallel */
    real *go = THTensor_(data)(gradOutput);
    real *gi = THTensor_(data)(gradInput);
    real *out = THTensor_(data)(output);
    const long count = THTensor_(nElement)(output);
    long idx;
 #pragma omp parallel for private(idx)
    for (idx = 0; idx < count; idx++)
    {
      gi[idx] = (out[idx] == 0.0) ? 0.0 : 0.5 * (go[idx] / out[idx]);
    }
  }
  else
  {
    /* 1D or strided tensors: let the apply macro handle the iteration */
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
      if (*output_data == 0.0)
        *gradInput_data = 0.0;
      else
        *gradInput_data = 0.5 * (*gradOutput_data / *output_data);
    );
  }
 }
 #endif
--- a/torch/lib/THNN/generic/Square.c
+++ b/torch/lib/THNN/generic/Square.c
@ -0,0 +1,58 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Square.c"
 #else
 /*
  * Forward pass of the element-wise square.
  * Resizes output to the shape of input and stores input * input in it.
  */
 void THNN_(Square_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output)
 {
  THTensor_(resizeAs)(output, input);
  if (input->nDimension != 1 && THTensor_(isContiguous)(input) && THTensor_(isContiguous)(output))
  {
    /* dense tensors: walk the raw buffers in parallel */
    real *dst = THTensor_(data)(output);
    real *src = THTensor_(data)(input);
    const long count = THTensor_(nElement)(input);
    long idx;
 #pragma omp parallel for private(idx)
    for (idx = 0; idx < count; idx++)
    {
      dst[idx] = src[idx]*src[idx];
    }
  }
  else
  {
    /* 1D or strided tensors: let the apply macro handle the iteration */
    TH_TENSOR_APPLY2(real, output, real, input,
      *output_data = (*input_data) * (*input_data);
    );
  }
 }
 /*
  * Backward pass of the element-wise square.
  * Resizes gradInput to the shape of input and sets it, element by element,
  * to 2 * gradOutput * input.
  */
 void THNN_(Square_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput)
 {
  THTensor_(resizeAs)(gradInput, input);
  if (input->nDimension != 1 &&
      THTensor_(isContiguous)(input) &&
      THTensor_(isContiguous)(gradOutput) &&
      THTensor_(isContiguous)(gradInput))
  {
    /* dense tensors: walk the raw buffers in parallel */
    real *go = THTensor_(data)(gradOutput);
    real *gi = THTensor_(data)(gradInput);
    real *in = THTensor_(data)(input);
    const long count = THTensor_(nElement)(gradInput);
    long idx;
 #pragma omp parallel for private(idx)
    for (idx = 0; idx < count; idx++)
    {
      gi[idx] = 2.0 * go[idx] * in[idx];
    }
  }
  else
  {
    /* 1D or strided tensors: let the apply macro handle the iteration */
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
      *gradInput_data  = 2.0 * (*gradOutput_data) * (*input_data);
    );
  }
 }
 #endif
--- a/torch/lib/THNN/generic/THNN.h
+++ b/torch/lib/THNN/generic/THNN.h
--- a/torch/lib/THNN/generic/Tanh.c
+++ b/torch/lib/THNN/generic/Tanh.c
@ -0,0 +1,49 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Tanh.c"
 #else
 /*
  * Tanh forward pass: gives `output` the shape of `input`, then fills it
  * through THTensor_(tanh). `state` is not used.
  */
 void THNN_(Tanh_updateOutput)(THNNState *state, THTensor *input, THTensor *output)
 {
  THTensor_(resizeAs)(output, input);
  THTensor_(tanh)(output, input);
 }
 /*
  * Tanh backward pass: resizes `gradInput` to the shape of `output` and
  * stores gradOutput * (1 - output^2) element-wise. The gradient is computed
  * from the forward `output`; `input` and `state` are not read.
  */
 void THNN_(Tanh_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output)
 {
  THTensor_(resizeAs)(gradInput, output);
  /* Flat loop only for non-1D output with all three tensors contiguous. */
  if (output->nDimension != 1 &&
      THTensor_(isContiguous)(output) &&
      THTensor_(isContiguous)(gradOutput) &&
      THTensor_(isContiguous)(gradInput))
  {
    real *go = THTensor_(data)(gradOutput);
    real *gi = THTensor_(data)(gradInput);
    real *out = THTensor_(data)(output);
    long count = THTensor_(nElement)(gradInput);
    long idx;
 #pragma omp parallel for private(idx)
    for (idx = 0; idx < count; idx++)
    {
      real y = out[idx];
      gi[idx] = go[idx] * (1. - y*y);
    }
  }
  else
  {
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
      real y = *output_data;
      *gradInput_data = *gradOutput_data * (1. - y*y);
    );
  }
 }
 #endif
--- a/torch/lib/THNN/generic/TemporalConvolution.c
+++ b/torch/lib/THNN/generic/TemporalConvolution.c
@ -0,0 +1,349 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/TemporalConvolution.c"
 #else
 /*
  * Temporal (1D) convolution forward pass.
  *
  * input  : 2D (frames x inputFrameSize) or 3D (batch x frames x inputFrameSize)
  * output : resized here to (nOutputFrame x outputFrameSize), with a leading
  *          batch dimension in 3D mode
  * weight : multiplied as a matrix against windows of kW consecutive input
  *          frames; it is transposed around each addmm and transposed back
  * bias   : copied into every output frame before the products are added
  * kW, dW : kernel width and step, both in frames
  *
  * nOutputFrame = (nInputFrame - kW) / dW + 1.
  * NOTE(review): dW is used as a divisor and is not validated here.
  */
 void THNN_(TemporalConvolution_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias,
           int kW,
           int dW,
           int inputFrameSize,
           int outputFrameSize)
 {
  THTensor *outputWindow, *inputWindow;
  int nInputFrame, nOutputFrame;
  long k, i;
  int dimS = 0; // sequence dimension
  int dimF = 1; // feature dimension
  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
  if (input->nDimension == 3) 
  {
    dimS = 1;
    dimF = 2;
  }
  THArgCheck(input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
  THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
  /* the windows below index input->storage with hand-computed strides, so a
     contiguous copy/reference is taken here and released at the end */
  input = THTensor_(newContiguous)(input);
  outputWindow = THTensor_(new)();
  inputWindow = THTensor_(new)();
  nInputFrame = input->size[dimS];
  nOutputFrame = (nInputFrame - kW) / dW + 1;
  if (input->nDimension == 2)
  {
    THTensor_(resize2d)(output,
                        nOutputFrame,
                        outputFrameSize);
    /* bias first */
    for(k = 0; k < nOutputFrame; k++)
    {
      THTensor_(select)(outputWindow, output, 0, k);
      THTensor_(copy)(outputWindow, bias);
    }
    /* ouch */
    /* Each pass handles output frames k, k+outputFrameStride, ... in a single
       addmm; nOutputFrame counts the frames still to do and ends the loop. */
    for(k = 0; nOutputFrame > 0; k++)
    {
      long outputFrameStride = (kW-1)/dW+1;
      long inputFrameStride = outputFrameStride*dW;
      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
      nOutputFrame -= nFrame;
      /* inputWindow: nFrame rows of kW flattened frames, starting at input
         frame k*dW, consecutive rows inputFrameStride frames apart */
      THTensor_(setStorage2d)(inputWindow, input->storage,
                              input->storageOffset+k*dW*input->size[1],
                              nFrame, inputFrameStride*input->size[1],
                              kW*input->size[1], 1);
      /* outputWindow: the matching nFrame output frames, starting at frame k */
      THTensor_(setStorage2d)(outputWindow, output->storage, 
                              output->storageOffset + k*output->size[1],
                              nFrame, outputFrameStride*output->size[1],
                              output->size[1], 1);
      /* outputWindow += inputWindow * weight^T; the second transpose undoes
         the first so the caller's weight is left as it was */
      THTensor_(transpose)(weight, NULL, 0, 1);
      THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
      THTensor_(transpose)(weight, NULL, 0, 1);
    }
  }
  else
  {
    /* batch mode: the 2D procedure above is repeated for each sample */
    THTensor *outputSample = THTensor_(new)();
    THTensor *inputSample = THTensor_(new)();
    int nBatchFrame = input->size[0];
    THTensor_(resize3d)(output,
                        nBatchFrame,
                        nOutputFrame,
                        outputFrameSize);
    for(i = 0; i < nBatchFrame; i++)
    {
      THTensor_(select)(outputSample, output, 0, i);
      THTensor_(select)(inputSample, input, 0, i);
      /* per-sample countdown, so nOutputFrame stays intact for the next sample */
      long nOutputSampleFrame = nOutputFrame;
      /* bias first */
      for(k = 0; k < nOutputFrame; k++)
      {
        THTensor_(select)(outputWindow, outputSample, 0, k);
        THTensor_(copy)(outputWindow, bias);
      }
      /* ouch */
      for(k = 0; nOutputSampleFrame > 0; k++)
      {
        long outputFrameStride = (kW-1)/dW+1;
        long inputFrameStride = outputFrameStride*dW;
        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
        nOutputSampleFrame -= nFrame;
        THTensor_(setStorage2d)(inputWindow, inputSample->storage,
                                inputSample->storageOffset+k*dW*inputSample->size[1],
                                nFrame, inputFrameStride*inputSample->size[1],
                                kW*inputSample->size[1], 1);
        THTensor_(setStorage2d)(outputWindow, outputSample->storage, 
                                outputSample->storageOffset + k*outputSample->size[1],
                                nFrame, outputFrameStride*outputSample->size[1],
                                outputSample->size[1], 1);
        THTensor_(transpose)(weight, NULL, 0, 1);
        THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
        THTensor_(transpose)(weight, NULL, 0, 1);
      }
    }
    THTensor_(free)(outputSample);
    THTensor_(free)(inputSample);
  }
  THTensor_(free)(outputWindow);
  THTensor_(free)(inputWindow);
  /* releases the reference taken by newContiguous above */
  THTensor_(free)(input);
 }
 /*
  * Temporal (1D) convolution backward pass with respect to the input.
  *
  * input      : forward input, read only for its sizes
  * gradOutput : 2D (frames x outputFrameSize) or 3D with a leading batch
  *              dimension; its dimensionality selects 2D or batch mode
  * gradInput  : resized like `input`, zeroed, then accumulated into
  * weight     : multiplied as gradOutputWindow * weight
  * kW, dW     : kernel width and step, both in frames
  *
  * NOTE(review): dW is used as a divisor and is not validated here.
  */
 void THNN_(TemporalConvolution_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *weight,
           int kW,
           int dW)
 {
  long nInputFrame;
  long nOutputFrame;
  THTensor *gradOutputWindow;
  THTensor *gradInputWindow;
  long k, i;
  int dimS = 0; // sequence dimension
  if (gradOutput->nDimension == 3)
  {
    dimS = 1;
  }
  nInputFrame = input->size[dimS];
  nOutputFrame = gradOutput->size[dimS];
  /* The windows below are laid directly over gradOutput's storage using
     strides computed from its sizes, which is only valid for a contiguous
     tensor. Take a contiguous copy/reference; it is released at the end. */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutputWindow = THTensor_(new)();
  gradInputWindow = THTensor_(new)();
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  if (gradOutput->nDimension == 2)
  {
    /* Each pass handles output frames k, k+outputFrameStride, ... in a single
       addmm; nOutputFrame counts the frames still to do and ends the loop. */
    for(k = 0; nOutputFrame > 0; k++)
    {
      long outputFrameStride = (kW-1)/dW+1;
      long inputFrameStride = outputFrameStride*dW;
      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
      nOutputFrame -= nFrame;
      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
                              gradOutput->storageOffset + k*gradOutput->size[1],
                              nFrame, outputFrameStride*gradOutput->size[1],
                              gradOutput->size[1], 1);
      /* rows are inputFrameStride frames apart and kW frames long, starting
         at input frame k*dW */
      THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
                              gradInput->storageOffset+k*dW*gradInput->size[1],
                              nFrame, inputFrameStride*gradInput->size[1],
                              kW*gradInput->size[1], 1);
      /* gradInputWindow += gradOutputWindow * weight */
      THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
    }
  }
  else
  {
    /* batch mode: the 2D procedure is repeated for each sample */
    THTensor *gradOutputSample = THTensor_(new)();
    THTensor *gradInputSample = THTensor_(new)();
    int nBatchFrame = input->size[0];
    for(i = 0; i < nBatchFrame; i++)
    {
      /* per-sample countdown; kept as long to match nOutputFrame */
      long nOutputSampleFrame = nOutputFrame;
      THTensor_(select)(gradOutputSample, gradOutput, 0, i);
      THTensor_(select)(gradInputSample, gradInput, 0, i);
      for(k = 0; nOutputSampleFrame > 0; k++)
      {
        long outputFrameStride = (kW-1)/dW+1;
        long inputFrameStride = outputFrameStride*dW;
        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
        nOutputSampleFrame -= nFrame;
        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
                                nFrame, outputFrameStride*gradOutputSample->size[1],
                                gradOutputSample->size[1], 1);
        THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
                                gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
                                nFrame, inputFrameStride*gradInputSample->size[1],
                                kW*gradInputSample->size[1], 1);
        THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
      }
    }
    THTensor_(free)(gradOutputSample);
    THTensor_(free)(gradInputSample);
  }
  THTensor_(free)(gradOutputWindow);
  THTensor_(free)(gradInputWindow);
  /* releases the reference taken by newContiguous above */
  THTensor_(free)(gradOutput);
 }
 /*
  * Temporal (1D) convolution: accumulates parameter gradients.
  *
  * input      : forward input, 2D or 3D (batch mode)
  * gradOutput : gradient w.r.t. the forward output
  * gradWeight : += scale * gradOutputWindow^T * inputWindow
  * gradBias   : += scale * (every gradOutput frame)
  * kW, dW     : kernel width and step, both in frames
  * scale      : factor applied to every accumulated contribution
  *
  * NOTE(review): dW is used as a divisor and is not validated here.
  */
 void THNN_(TemporalConvolution_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           int kW,
           int dW,
           real scale)
 {
  long nInputFrame;
  long nOutputFrame;
  THTensor *gradOutputWindow;
  THTensor *inputWindow;
  long k, i;
  int dimS = 0; // sequence dimension
  if (gradOutput->nDimension == 3)
  {
    dimS = 1;
  }
  nInputFrame = input->size[dimS];
  nOutputFrame = gradOutput->size[dimS];
  /* Both tensors are read through windows laid directly over their storage
     with strides computed from their sizes, which is only valid when they
     are contiguous. Both references are released at the end. */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutputWindow = THTensor_(new)();
  inputWindow = THTensor_(new)();
  if (input->nDimension == 2)
  {
    /* bias first */
    for(k = 0; k < nOutputFrame; k++)
    {
      THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
      THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
    }
    /* Each pass handles output frames k, k+outputFrameStride, ... in a single
       addmm; nOutputFrame counts the frames still to do and ends the loop. */
    for(k = 0; nOutputFrame > 0; k++)
    {
      long outputFrameStride = (kW-1)/dW+1;
      long inputFrameStride = outputFrameStride*dW;
      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
      nOutputFrame -= nFrame;
      THTensor_(setStorage2d)(inputWindow, input->storage,
                              input->storageOffset+k*dW*input->size[1],
                              nFrame, inputFrameStride*input->size[1],
                              kW*input->size[1], 1);
      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
                              gradOutput->storageOffset + k*gradOutput->size[1],
                              nFrame, outputFrameStride*gradOutput->size[1],
                              gradOutput->size[1], 1);
      /* gradWeight += scale * gradOutputWindow^T * inputWindow; the window
         is a temporary view, the second transpose just restores it */
      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
      THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
    }
  }
  else
  {
    /* batch mode: the 2D procedure is repeated for each sample */
    THTensor *gradOutputSample = THTensor_(new)();
    THTensor *inputSample = THTensor_(new)();
    int nBatchFrame = input->size[0];
    for(i = 0; i < nBatchFrame; i++)
    {
      /* per-sample countdown; kept as long to match nOutputFrame */
      long nOutputSampleFrame = nOutputFrame;
      THTensor_(select)(gradOutputSample, gradOutput, 0, i);
      THTensor_(select)(inputSample, input, 0, i);
      /* bias first */
      for(k = 0; k < nOutputFrame; k++)
      {
        THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
        THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
      }
      for(k = 0; nOutputSampleFrame > 0; k++)
      {
        long outputFrameStride = (kW-1)/dW+1;
        long inputFrameStride = outputFrameStride*dW;
        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
        nOutputSampleFrame -= nFrame;
        THTensor_(setStorage2d)(inputWindow, inputSample->storage,
                                inputSample->storageOffset+k*dW*inputSample->size[1],
                                nFrame, inputFrameStride*inputSample->size[1],
                                kW*inputSample->size[1], 1);
        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
                                nFrame, outputFrameStride*gradOutputSample->size[1],
                                gradOutputSample->size[1], 1);
        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
        THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
      }
    }
    THTensor_(free)(gradOutputSample);
    THTensor_(free)(inputSample);
  }
  THTensor_(free)(gradOutputWindow);
  THTensor_(free)(inputWindow);
  /* release the references taken by newContiguous above */
  THTensor_(free)(gradOutput);
  THTensor_(free)(input);
 }
 #endif
--- a/torch/lib/THNN/generic/TemporalMaxPooling.c
+++ b/torch/lib/THNN/generic/TemporalMaxPooling.c
@ -0,0 +1,235 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
 #else
 /*
  * Temporal max pooling forward pass.
  *
  * input   : 2D (frames x features) or 3D (batch x frames x features)
  * output  : resized to (noframe x features), plus a batch dimension in 3D
  *           mode; receives the maximum over each window of kW frames
  * indices : resized like `output`; receives, as a real, the offset of the
  *           maximum inside its window (0 .. kW-1)
  * kW, dW  : window width and step, both in frames
  *
  * noframe = (niframe - kW) / dW + 1.
  */
 void THNN_(TemporalMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *indices,
           int kW,
           int dW)
 {
  long nInFrames;
  long frameLen;
  long nOutFrames;
  long nSamples;
  real *in_base;
  real *out_base;
  real *idx_base;
  long b, t, y;
  int seqDim;  /* sequence dimension */
  int featDim; /* feature dimension */
  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
  seqDim = (input->nDimension == 3) ? 1 : 0;
  featDim = seqDim + 1;
  THArgCheck(input->size[seqDim] >= kW, 2, "input sequence smaller than kernel size");
  nInFrames = input->size[seqDim];
  frameLen = input->size[featDim];
  nOutFrames = (nInFrames - kW) / dW + 1;
  /* raw pointer arithmetic below requires a contiguous input */
  input = THTensor_(newContiguous)(input);
  if (input->nDimension == 2)
  {
    /* a 2D input is processed as a single sample */
    nSamples = 1;
    THTensor_(resize2d)(output, nOutFrames, frameLen);
    THTensor_(resize2d)(indices, nOutFrames, frameLen);
  }
  else
  {
    nSamples = input->size[0];
    THTensor_(resize3d)(output, nSamples, nOutFrames, frameLen);
    THTensor_(resize3d)(indices, nSamples, nOutFrames, frameLen);
  }
  in_base = THTensor_(data)(input);
  out_base = THTensor_(data)(output);
  idx_base = THTensor_(data)(indices);
  for(b = 0; b < nSamples; b++)
  {
    real *in_sample = in_base + b*nInFrames*frameLen;
    real *out_sample = out_base + b*nOutFrames*frameLen;
    real *idx_sample = idx_base + b*nOutFrames*frameLen;
    for(t = 0; t < nOutFrames; t++)
    {
      real *win = in_sample + t*frameLen*dW;
      real *out_row = out_sample + t*frameLen;
      real *idx_row = idx_sample + t*frameLen;
 #pragma omp parallel for private(y)
      for(y = 0; y < frameLen; y++)
      {
        /* scan the kW frames of the window for feature y */
        long best = -1;
        real bestVal = -THInf;
        long x;
        for(x = 0; x < kW; x++)
        {
          real cur = win[x*frameLen+y];
          if (cur > bestVal)
          {
            bestVal = cur;
            best = x;
          }
        }
        out_row[y] = bestVal;
        idx_row[y] = (real)best;
      }
    }
  }
  /* cleanup */
  THTensor_(free)(input);
 }
 /*
  * Temporal max pooling backward pass.
  *
  * gradInput is resized like `input` and zeroed; each gradOutput element is
  * then added at the position recorded in `indices` (the offset inside the
  * pooling window written by the forward pass). `indices` is read through
  * its raw data pointer.
  * NOTE(review): this presumably requires `indices` to be contiguous and
  * laid out like gradOutput - confirm against callers.
  */
 void THNN_(TemporalMaxPooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *indices,
           int kW,
           int dW)
 {
  long nInFrames;
  int nOutFrames;
  long frameLen;
  long nSamples;
  real *gi_base;
  real *go_base;
  real *idx_base;
  long b, t, y;
  int seqDim;  /* sequence dimension */
  int featDim; /* feature dimension */
  /* raw pointer arithmetic below requires a contiguous gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  seqDim = (input->nDimension == 3) ? 1 : 0;
  featDim = seqDim + 1;
  nInFrames = input->size[seqDim];
  nOutFrames = gradOutput->size[seqDim];
  frameLen = gradOutput->size[featDim];
  gi_base = THTensor_(data)(gradInput);
  go_base = THTensor_(data)(gradOutput);
  idx_base = THTensor_(data)(indices);
  /* a 2D input is processed as a single sample */
  nSamples = (input->nDimension == 2) ? 1 : input->size[0];
  for(b = 0; b < nSamples; b++)
  {
    real *gi_sample = gi_base + b*nInFrames*frameLen;
    real *go_sample = go_base + b*nOutFrames*frameLen;
    real *idx_sample = idx_base + b*nOutFrames*frameLen;
    for(t = 0; t < nOutFrames; t++)
    {
      real *gi_win = gi_sample + t*frameLen*dW;
      real *go_row = go_sample + t*frameLen;
      real *idx_row = idx_sample + t*frameLen;
 #pragma omp parallel for private(y)
      for(y = 0; y < frameLen; y++)
      {
        /* route the gradient to the frame that held the maximum */
        long best = (long)idx_row[y];
        gi_win[best*frameLen+y] += go_row[y];
      }
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/TemporalSubSampling.c
+++ b/torch/lib/THNN/generic/TemporalSubSampling.c
@ -0,0 +1,116 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
 #else
 /*
  * Temporal sub-sampling forward pass (2D input only).
  *
  * For every output frame k, the kW input frames starting at k*dW are summed,
  * the sum is multiplied element-wise by `weight`, and `bias` is added.
  * `output` is resized to (nOutputFrame x inputFrameSize) with
  * nOutputFrame = (nInputFrame - kW) / dW + 1.
  */
 void THNN_(TemporalSubSampling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
           THTensor *bias,
           int kW,
           int dW,
           int inputFrameSize)
 {
  THTensor *frame;
  THTensor *window;
  int nIn;
  int nOut;
  long t;
  THArgCheck( input->nDimension == 2, 2, "2D tensor expected");
  THArgCheck( input->size[1] == inputFrameSize, 2, "invalid input frame size");
  THArgCheck( input->size[0] >= kW, 2, "input sequence smaller than kernel size");
  nIn = input->size[0];
  nOut = (nIn - kW) / dW + 1;
  THTensor_(resize2d)(output, nOut, inputFrameSize);
  frame = THTensor_(new)();
  window = THTensor_(new)();
  for(t = 0; t < nOut; t++)
  {
    /* window = input frames [t*dW, t*dW + kW); frame = output frame t */
    THTensor_(narrow)(window, input, 0, t*dW, kW);
    THTensor_(select)(frame, output, 0, t);
    THTensor_(sum)(frame, window, 0);
    THTensor_(cmul)(frame, frame, weight);
    THTensor_(cadd)(frame, frame, 1, bias);
  }
  THTensor_(free)(frame);
  THTensor_(free)(window);
 }
 /*
  * Temporal sub-sampling backward pass with respect to the input.
  *
  * gradInput is resized like `input` and zeroed. For every gradOutput frame
  * k, weight * gradOutput[k] (element-wise) is added to each of the kW
  * gradInput frames starting at k*dW, via an outer product with a vector of
  * kW ones.
  */
 void THNN_(TemporalSubSampling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *weight,
           int kW,
           int dW)
 {
  THTensor *goFrame = THTensor_(new)();
  THTensor *giWindow = THTensor_(new)();
  THTensor *scaled = THTensor_(new)();
  /* vector of kW ones, used to replicate `scaled` across the window */
  THTensor *ones = THTensor_(newWithSize1d)(kW);
  long t;
  THTensor_(fill)(ones, 1);
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  for(t = 0; t < gradOutput->size[0]; t++)
  {
    THTensor_(narrow)(giWindow, gradInput, 0, t*dW, kW);
    THTensor_(select)(goFrame, gradOutput, 0, t);
    THTensor_(cmul)(scaled, weight, goFrame);
    THTensor_(addr)(giWindow, 1, giWindow, 1, ones, scaled);
  }
  THTensor_(free)(goFrame);
  THTensor_(free)(giWindow);
  THTensor_(free)(scaled);
  THTensor_(free)(ones);
 }
 /*
  * Temporal sub-sampling: accumulates parameter gradients.
  *
  * For every gradOutput frame k:
  *   gradWeight += scale * (sum of the kW input frames at k*dW) * gradOutput[k]
  *   gradBias   += scale * gradOutput[k]
  */
 void THNN_(TemporalSubSampling_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
           THTensor *gradBias,
           int kW,
           int dW,
           real scale)
 {
  THTensor *goFrame = THTensor_(new)();
  THTensor *window = THTensor_(new)();
  THTensor *windowSum = THTensor_(new)();
  long t;
  for(t = 0; t < gradOutput->size[0]; t++)
  {
    THTensor_(narrow)(window, input, 0, t*dW, kW);
    THTensor_(select)(goFrame, gradOutput, 0, t);
    THTensor_(sum)(windowSum, window, 0);
    THTensor_(addcmul)(gradWeight, gradWeight, scale, windowSum, goFrame);
    THTensor_(cadd)(gradBias, gradBias, scale, goFrame);
  }
  THTensor_(free)(goFrame);
  THTensor_(free)(window);
  THTensor_(free)(windowSum);
 }
 #endif
--- a/torch/lib/THNN/generic/Threshold.c
+++ b/torch/lib/THNN/generic/Threshold.c
@ -0,0 +1,58 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/Threshold.c"
 #else
 /*
  * Threshold forward pass: elements greater than `threshold` are kept, all
  * others are replaced by `val`.
  *
  * inplace == true  : `input` is overwritten and `output` is set to it.
  * inplace == false : `output` is resized like `input` and filled.
  */
 void THNN_(Threshold_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           real threshold,
           real val,
           bool inplace)
 {
  if (!inplace)
  {
    THTensor_(resizeAs)(output, input);
    TH_TENSOR_APPLY2(real, output, real, input,
      if (*input_data > threshold)
        *output_data = *input_data;
      else
        *output_data = val;
    );
    return;
  }
  TH_TENSOR_APPLY(real, input,
    if (*input_data <= threshold)
      *input_data = val;
  );
  THTensor_(set)(output, input);
 }
 /*
  * Threshold backward pass: the gradient passes through where the input is
  * greater than `threshold` and is zero elsewhere.
  *
  * inplace == true  : `gradOutput` is modified and `gradInput` is set to it.
  * inplace == false : `gradInput` is resized like `input` and filled.
  */
 void THNN_(Threshold_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           real threshold,
           bool inplace)
 {
  if (!inplace)
  {
    THTensor_(resizeAs)(gradInput, input);
    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
      if ((*input_data) > threshold)
        *gradInput_data = *gradOutput_data;
      else
        *gradInput_data = 0;
    );
    return;
  }
  TH_TENSOR_APPLY2(real, gradOutput, real, input,
    if ((*input_data) <= threshold)
      *gradOutput_data = 0;
  );
  THTensor_(set)(gradInput, gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricAveragePooling.c
+++ b/torch/lib/THNN/generic/VolumetricAveragePooling.c
@ -0,0 +1,309 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
 #else
 /*
  * Average-pools one sample. `input_p` and `output_p` are indexed as dense
  * (nslices x time x height x width) arrays. Each output element is the mean
  * of a kT x kH x kW input window whose origin advances by dT/dH/dW.
  * Slices are processed in parallel.
  */
 static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           long nslices,
           long itime,
           long iwidth,
           long iheight,
           long otime,
           long owidth,
           long oheight,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH)
 {
  long k;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *in_slice = input_p + k * itime * iwidth * iheight;
    /* output elements are visited in storage order, so one running pointer
       is enough */
    real *op = output_p + k * otime * owidth * oheight;
    long ti, i, j;
    for (ti = 0; ti < otime; ti++)
    {
      for (i = 0; i < oheight; i++)
      {
        for (j = 0; j < owidth; j++)
        {
          /* origin of the pooling window for this output element */
          real *ip = in_slice
            + ti * iwidth * iheight * dT +  i * iwidth * dH + j * dW;
          real sum = 0.0;
          int x, y, z;
          for (z = 0; z < kT; z++)
          {
            for (y = 0; y < kH; y++)
            {
              real *row = ip + z * iwidth * iheight + y * iwidth;
              for (x = 0; x < kW; x++)
              {
                sum += row[x];
              }
            }
          }
          /* set output to the window average */
          *op = sum / (kT * kW * kH);
          op++;
        }
      }
    }
  }
 }
 /*
  * Volumetric average pooling forward pass.
  *
  * input  : 4D (slices x time x height x width) or 5D with a leading batch
  *          dimension
  * output : resized to the pooled shape, each pooled extent being
  *          (in - k) / d + 1
  * kT/kW/kH, dT/dW/dH : window extents and steps
  *
  * In batch mode the samples are processed in parallel.
  */
 void THNN_(VolumetricAveragePooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH)
 {
  long nslices, itime, iheight, iwidth;
  long otime, oheight, owidth;
  real *src;
  real *dst;
  int shift; /* 1 when a leading batch dimension is present, else 0 */
  int dimN, dimt, dimh, dimw;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch-mode) tensor expected"
  );
  shift = (input->nDimension == 5) ? 1 : 0;
  dimN = shift;
  dimt = shift + 1;
  dimh = shift + 2;
  dimw = shift + 3;
  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
    "input image smaller than kernel size"
  );
  nslices = input->size[dimN];
  itime   = input->size[dimt];
  iheight = input->size[dimh];
  iwidth  = input->size[dimw];
  otime   = (itime   - kT) / dT + 1;
  oheight = (iheight - kH) / dH + 1;
  owidth  = (iwidth  - kW) / dW + 1;
  /* the frame routine indexes raw memory, so the input must be contiguous */
  input = THTensor_(newContiguous)(input);
  if (input->nDimension != 4) /* batch mode */
  {
    long p;
    long nBatch = input->size[0];
    /* element counts of one input / output sample */
    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;
    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
    src = THTensor_(data)(input);
    dst = THTensor_(data)(output);
 #pragma omp parallel for private(p)
    for (p = 0; p < nBatch; p++)
    {
      THNN_(VolumetricAveragePooling_updateOutput_frame)(
        src + p * istride, dst + p * ostride, nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        kT, kW, kH,
        dT, dW, dH
      );
    }
  }
  else /* non-batch mode */
  {
    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
    src = THTensor_(data)(input);
    dst = THTensor_(data)(output);
    THNN_(VolumetricAveragePooling_updateOutput_frame)(
      src, dst, nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      kT, kW, kH,
      dT, dW, dH
    );
  }
  /* cleanup */
  THTensor_(free)(input);
 }
 /*
  * Backward pass of average pooling for one sample. Every gradOutput element,
  * divided by the window volume kT*kW*kH, is added to each gradInput element
  * of its kT x kH x kW window. The routine only accumulates (+=); it does not
  * clear `gradInput_p` itself. Slices are processed in parallel.
  */
 static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           long nslices,
           long itime,
           long iwidth,
           long iheight,
           long otime,
           long owidth,
           long oheight,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH)
 {
  long k;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *gi_slice = gradInput_p + k * itime * iwidth * iheight;
    /* gradOutput elements are visited in storage order */
    real *op = gradOutput_p + k * otime * owidth * oheight;
    long ti, i, j;
    for (ti = 0; ti < otime; ti++)
    {
      for (i = 0; i < oheight; i++)
      {
        for (j = 0; j < owidth; j++)
        {
          /* origin of the window that produced this output element */
          real *ip = gi_slice
            + ti * iwidth * iheight * dT +  i * iwidth * dH + j * dW;
          /* share of the gradient received by each window element */
          real val  = *op / (kT * kW * kH);
          int x, y, z;
          for (z = 0; z < kT; z++)
          {
            for (y = 0; y < kH; y++)
            {
              real *row = ip + z * iwidth * iheight + y * iwidth;
              for (x = 0; x < kW; x++)
              {
                row[x] += val;
              }
            }
          }
          op++;
        }
      }
    }
  }
 }
 /*
  * Volumetric average pooling backward pass.
  *
  * input      : forward input (4D, or 5D in batch mode); read only for sizes
  * gradOutput : gradient w.r.t. the forward output
  * gradInput  : resized like `input`, zeroed, then accumulated into
  * kT/kW/kH, dT/dW/dH : window extents and steps used by the forward pass
  *
  * In batch mode the samples are processed in parallel.
  */
 void THNN_(VolumetricAveragePooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH)
 {
  /* Sizes are held as long, matching the forward pass and the frame routine,
     so the per-sample stride products below are not evaluated in int (which
     could overflow for large volumes). */
  long nslices;
  long itime;
  long iheight;
  long iwidth;
  long otime;
  long oheight;
  long owidth;
  real *gradInput_data;
  real *gradOutput_data;
  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;
  /* get contiguous gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* resize and zero: the frame routine only accumulates */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  if (input->nDimension == 5)
  {
    /* batch mode: every dimension index shifts by one */
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }
  /* sizes */
  nslices = input->size[dimN];
  itime = input->size[dimt];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  otime = gradOutput->size[dimt];
  oheight = gradOutput->size[dimh];
  owidth = gradOutput->size[dimw];
  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  /* backprop */
  if (input->nDimension == 4) /* non-batch mode*/
  {
    THNN_(VolumetricAveragePooling_updateGradInput_frame)(
      gradInput_data, gradOutput_data, nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      kT, kW, kH,
      dT, dW, dH
    );
  }
  else /* batch mode */
  {
    long p;
    long nBatch = input->size[0];
    /* element counts of one gradInput / gradOutput sample */
    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;
 #pragma omp parallel for private(p)
    for (p = 0; p < nBatch; p++)
    {
      THNN_(VolumetricAveragePooling_updateGradInput_frame)(
        gradInput_data  + p * istride, gradOutput_data + p * ostride, nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        kT, kW, kH,
        dT, dW, dH
      );
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricConvolution.c
+++ b/torch/lib/THNN/generic/VolumetricConvolution.c
@ -0,0 +1,247 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
 #else
 /*
  * Forward pass of the direct (non-unfolded) volumetric convolution on the CPU.
  *
  * input    : 4D tensor, or 5D tensor in batch mode (dimension 0 is the batch).
  * output   : resized here to nOutputPlane x oT x oH x oW (with a leading batch
  *            dimension in batch mode); each plane is first filled with its
  *            bias value and conv3Dmv then accumulates into it.
  * weight   : 5D tensor; size[0] is nOutputPlane, size[2..4] are kT, kH, kW.
  * bias     : 1D tensor; bias[i] is the fill value of output plane i.
  * finput, fgradInput : not used by this implementation.
  * dT/dW/dH : strides along depth, width and height.
  * pT/pW/pH : paddings; present only so that the signature matches the CUDA
  *            version. They must all be zero.
  *
  * Raises through THArgCheck when padding is requested, when input is neither
  * 4D nor 5D, or when weight is not 5D.
  */
 void THNN_(VolumetricConvolution_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,     // only used by cuda impl
          THTensor *fgradInput, // only used by cuda impl
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  /* THArgCheck raises when its condition is false, so the condition has to
     describe the accepted case: no padding on any axis. */
  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch-mode) tensor expected"
  );
  /* weight->size[2..4] are read below, so weight has to be 5D */
  THArgCheck(weight->nDimension == 5, 4,
    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
  );
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;
  if (input->nDimension == 5)
  {
    /* batch mode: every spatial dimension moves one slot to the right */
    dimt++;
    dimh++;
    dimw++;
  }
  long nOutputPlane = weight->size[0];
  long kT           = weight->size[2];
  long kH           = weight->size[3];
  long kW           = weight->size[4];
  long inputDepth   = input->size[dimt];
  long inputHeight  = input->size[dimh];
  long inputWidth   = input->size[dimw];
  /* output extent without padding */
  long outputDepth  = (inputDepth - kT) / dT + 1;
  long outputWidth  = (inputWidth - kW) / dW + 1;
  long outputHeight = (inputHeight - kH) / dH + 1;
  /* scratch view used to address one output plane at a time */
  THTensor *outn = THTensor_(new)();
  long i, j;
  if (input->nDimension == 4) /* non-batch mode */
  {
    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
    /* add bias */
    for (i = 0; i < bias->size[0]; i++)
    {
      THTensor_(select)(outn, output, 0, i);
      THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
    }
    /* do convolutions */
    THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
  }
  else /* batch mode */
  {
    long nBatch = input->size[0];
    THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor *inb = THTensor_(new)();
    THTensor *outb = THTensor_(new)();
    /* loop over batches */
    for (j = 0; j < nBatch; j++)
    {
      THTensor_(select)(inb, input, 0, j);
      THTensor_(select)(outb, output, 0, j);
      /* add bias */
      for (i = 0; i < bias->size[0]; i++)
      {
        THTensor_(select)(outn, outb, 0, i);
        THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
      }
      /* do convolutions */
      THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
    }
    THTensor_(free)(inb);
    THTensor_(free)(outb);
  }
  THTensor_(free)(outn);
 }
 /*
  * Backward pass of the direct volumetric convolution with respect to the
  * input.
  *
  * input      : forward input; in batch mode its five sizes are used to resize
  *              gradInput.
  * gradOutput : 4D tensor, or 5D tensor in batch mode.
  * gradInput  : written by conv3Dmv (called with beta = 0).
  * weight     : 5D tensor (nOutputPlane x nInputPlane x kT x kH x kW); a view
  *              with dimensions 0 and 1 swapped is used for the computation.
  * finput     : not used by this implementation.
  * dT/dW/dH   : strides along depth, width and height.
  * pT/pW/pH   : paddings; must all be zero (signature shared with CUDA).
  *
  * Raises through THArgCheck when padding is requested, when weight is not 5D,
  * when gradOutput is neither 4D nor 5D, or when the plane dimension of
  * gradOutput differs from weight->size[0].
  */
 void THNN_(VolumetricConvolution_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *weight,
          THTensor *finput, // only used by cuda impl
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  /* THArgCheck raises when its condition is false, so the condition has to
     describe the accepted case: no padding on any axis. */
  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
  THArgCheck(weight->nDimension == 5, 4,
    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
  );
  int nOutputPlane = (int)weight->size[0];
  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
    "4D or 5D (batch-mode) tensor expected"
  );
  /* index of the feature-plane dimension inside gradOutput */
  int dimPlane = 0;
  if (gradOutput->nDimension == 5)
  {
    dimPlane++;
  }
  THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
    "Number of output features is not equal to nOutputPlane"
  );
  /* gradient to input */
  THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
  if (gradOutput->nDimension == 4) /* non-batch mode */
  {
    THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
  }
  else /* batch mode */
  {
    long nBatch = gradOutput->size[0];
    THTensor *ginpb = THTensor_(new)();
    THTensor *goutb = THTensor_(new)();
    long j;
    THTensor_(resize5d)(gradInput,
      input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
    );
    /* loop over batches */
    for (j = 0; j < nBatch; j++)
    {
      THTensor_(select)(ginpb, gradInput, 0, j);
      THTensor_(select)(goutb, gradOutput, 0, j);
      THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
    }
    THTensor_(free)(ginpb);
    THTensor_(free)(goutb);
  }
  THTensor_(free)(tweight);
 }
 /*
  * Accumulates the weight and bias gradients of the direct volumetric
  * convolution.
  *
  * input      : forward input (4D, or 5D in batch mode).
  * gradOutput : 4D tensor, or 5D tensor in batch mode.
  * gradWeight : 5D tensor (nOutputPlane x nInputPlane x kT x kH x kW),
  *              accumulated into by conv3DRevger.
  * gradBias   : 1D tensor of nOutputPlane elements; element k receives
  *              scale * (sum of gradOutput plane k), once per sample.
  * finput, fgradInput : not used by this implementation.
  * dT/dW/dH   : strides along depth, width and height.
  * pT/pW/pH   : paddings; must all be zero (signature shared with CUDA).
  * scale      : factor applied to both gradients before accumulation.
  *
  * Raises through THArgCheck when padding is requested or when the tensor
  * ranks/sizes listed above do not hold.
  */
 void THNN_(VolumetricConvolution_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,     // only used by cuda impl
          THTensor *fgradInput, // only used by cuda impl
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          real scale)
 {
  /* THArgCheck raises when its condition is false, so the condition has to
     describe the accepted case: no padding on any axis. */
  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
  THArgCheck(gradWeight->nDimension == 5, 4,
    "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
  );
  int nOutputPlane = (int)gradWeight->size[0];
  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
    "gradBias tensor has wrong size"
  );
  /* same rank requirement as in updateGradInput; without it any rank other
     than 4 would silently be treated as batch mode */
  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
    "4D or 5D (batch-mode) tensor expected"
  );
  long k;
  real *gradBias_data;
  THTensor *gradOutSlice;
  /* index of the feature-plane dimension inside gradOutput */
  int dimPlane = 0;
  if (gradOutput->nDimension == 5)
  {
    dimPlane++;
  }
  THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
    "Number of output features is not equal to nOutputPlane"
  );
  /* both are loop invariant: fetch the bias pointer and allocate the
     plane view once instead of once per sample */
  gradBias_data = THTensor_(data)(gradBias);
  gradOutSlice = THTensor_(new)();
  if (gradOutput->nDimension == 4) /* non-batch mode */
  {
    /* gradient to bias */
    for (k = 0; k < nOutputPlane; k++)
    {
      THTensor_(select)(gradOutSlice, gradOutput, 0, k);
      gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
    }
    /* gradient to kernels */
    THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
  }
  else /* batch mode */
  {
    long nBatch = gradOutput->size[0];
    THTensor *inpb = THTensor_(new)();
    THTensor *goutb = THTensor_(new)();
    long j;
    /* loop over batches */
    for (j = 0; j < nBatch; j++)
    {
      THTensor_(select)(inpb, input, 0, j);
      THTensor_(select)(goutb, gradOutput, 0, j);
      /* gradient to bias */
      for (k = 0; k < nOutputPlane; k++)
      {
        THTensor_(select)(gradOutSlice, goutb, 0, k);
        gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
      }
      /* gradient to kernels */
      THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
    }
    THTensor_(free)(inpb);
    THTensor_(free)(goutb);
  }
  THTensor_(free)(gradOutSlice);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/torch/lib/THNN/generic/VolumetricConvolutionMM.c
@ -0,0 +1,518 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
 #else
 /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
 /*
  * Adds every element of the unfolded buffer `finput` onto the element of
  * `input` addressed by the same (plane, kernel offset, output position)
  * index arithmetic that unfolded_copy_vol uses for gathering.
  *
  * finput  : read through its raw data pointer; indexed as
  *           [nInputPlane][kT][kH][kW][outputDepth][outputHeight][outputWidth].
  * input   : accumulated into through its raw data pointer; indexed as
  *           [nInputPlane][inputDepth][inputHeight][inputWidth].
  * kT/kW/kH, dT/dW/dH, pT/pW/pH : kernel sizes, strides and paddings.
  *
  * The plane loop is deliberately serial (the OpenMP pragma that used to sit
  * on it is disabled).
  */
 static void THNN_(unfolded_acc_vol)(
          THTensor *finput,
          THTensor *input,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          int nInputPlane,
          int inputDepth,
          int inputWidth,
          int inputHeight,
          int outputDepth,
          int outputWidth,
          int outputHeight)
 {
  real *vol_base = THTensor_(data)(input);
  real *col_base = THTensor_(data)(finput);
  /* Padding is applied only if at least one pad is positive; otherwise all
     three pads are treated as zero and no bounds test is performed. */
  const int padded = (pT > 0 || pH > 0 || pW > 0);
  const int offT = padded ? pT : 0;
  const int offH = padded ? pH : 0;
  const int offW = padded ? pW : 0;
  int plane;

  for (plane = 0; plane < nInputPlane; plane++)
  {
    int kt, kh, kw;
    for (kt = 0; kt < kT; kt++)
    {
      for (kh = 0; kh < kH; kh++)
      {
        for (kw = 0; kw < kW; kw++)
        {
          int ot, oy, ox;
          /* start of the column block belonging to (plane, kt, kh, kw) */
          real *col = col_base
            + plane * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
            + kt    * (kH*kW*outputDepth*outputHeight*outputWidth)
            + kh    * (kW*outputDepth*outputHeight*outputWidth)
            + kw    * (outputDepth*outputHeight*outputWidth);
          /* start of the volume belonging to this plane */
          real *vol = vol_base + plane*(inputDepth*inputHeight*inputWidth);

          for (ot = 0; ot < outputDepth; ot++)
          {
            const int it = ot*dT - offT + kt;
            for (oy = 0; oy < outputHeight; oy++)
            {
              const int iy = oy*dH - offH + kh;
              for (ox = 0; ox < outputWidth; ox++)
              {
                const int ix = ox*dW - offW + kw;
                /* positions that fall into the padding contribute nothing */
                if (padded &&
                    (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth))
                {
                  continue;
                }
                THVector_(add)(vol+it*inputHeight*inputWidth+iy*inputWidth+ix, col+ot*outputHeight*outputWidth+oy*outputWidth+ox, 1, 1);
              }
            }
          }
        }
      }
    }
  }
 }
 /*
  * Gathers ("unfolds") the volume `input` into the column buffer `finput`:
  * for every (plane, kt, kh, kw) combination one block of
  * outputDepth*outputHeight*outputWidth elements is written, each element
  * being copied from the input position it maps to, or zeroed when that
  * position lies in the padding.
  *
  * finput  : written through its raw data pointer; indexed as
  *           [nInputPlane][kT][kH][kW][outputDepth][outputHeight][outputWidth].
  * input   : read through its raw data pointer; indexed as
  *           [nInputPlane][inputDepth][inputHeight][inputWidth].
  * kT/kW/kH, dT/dW/dH, pT/pW/pH : kernel sizes, strides and paddings.
  *
  * The outer loop is serial (its OpenMP pragma is disabled).
  */
 static void THNN_(unfolded_copy_vol)(
          THTensor *finput,
          THTensor *input,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          int nInputPlane,
          int inputDepth,
          int inputWidth,
          int inputHeight,
          int outputDepth,
          int outputWidth,
          int outputHeight)
 {
  real *vol_base = THTensor_(data)(input);
  real *col_base = THTensor_(data)(finput);
  /* Padding is applied only if at least one pad is positive; otherwise all
     three pads are treated as zero and no bounds test is performed. */
  const int padded = (pT > 0 || pH > 0 || pW > 0);
  const int offT = padded ? pT : 0;
  const int offH = padded ? pH : 0;
  const int offW = padded ? pW : 0;
  long block;

  /* one iteration per (plane, kt, kh, kw) combination */
  for (block = 0; block < nInputPlane*kT*kH*kW; block++)
  {
    /* decompose the flat block index, fastest-varying component last */
    int plane = block / (kT*kH*kW);
    int rem = block % (kT*kH*kW);
    int kt = rem / (kH*kW);
    int kh, kw;
    int ot, oy, ox;
    real *col;
    real *vol;

    rem = rem % (kH*kW);
    kh = rem / kW;
    kw = rem % kW;

    col = col_base
      + plane * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
      + kt    * (kH*kW*outputDepth*outputHeight*outputWidth)
      + kh    * (kW*outputDepth*outputHeight*outputWidth)
      + kw    * (outputDepth*outputHeight*outputWidth);
    vol = vol_base + plane*(inputDepth*inputHeight*inputWidth);

    for (ot = 0; ot < outputDepth; ot++)
    {
      const int it = ot*dT - offT + kt;
      for (oy = 0; oy < outputHeight; oy++)
      {
        const int iy = oy*dH - offH + kh;
        for (ox = 0; ox < outputWidth; ox++)
        {
          const int ix = ox*dW - offW + kw;
          if (padded &&
              (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth))
          {
            /* source position lies in the padding: emit a zero */
            memset(col+ot*outputHeight*outputWidth+oy*outputWidth+ox, 0, sizeof(real)*(1));
          }
          else
          {
            memcpy(col+ot*outputHeight*outputWidth+oy*outputWidth+ox, vol+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
          }
        }
      }
    }
  }
 }
 /*
  * Forward pass for a single sample of the matrix-multiply based volumetric
  * convolution.
  *
  * Steps: unfold `input` into `finput`, fill every output plane with its bias
  * value, then accumulate weight * finput into the output viewed as a
  * (nOutputPlane x outputDepth*outputHeight*outputWidth) matrix.
  *
  * The 2D view and the bias fill address `output` through its storage pointer,
  * storage offset and stride[0] only.
  * NOTE(review): this looks like it relies on each output plane being
  * contiguous in memory -- confirm against the callers.
  */
 static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          long nInputPlane,
          long inputDepth,
          long inputWidth,
          long inputHeight,
          long nOutputPlane,
          long outputDepth,
          long outputWidth,
          long outputHeight)
 {
  THTensor *outAsMatrix;
  real *outBase;
  long plane;

  /* 1. gather the input patches into the column buffer */
  THNN_(unfolded_copy_vol)(
    finput, input,
    kT, kW, kH,
    dT, dW, dH,
    pT, pW, pH,
    nInputPlane,
    inputDepth, inputWidth, inputHeight,
    outputDepth, outputWidth, outputHeight
  );

  /* 2. initialise each output plane with its bias */
  outBase = output->storage->data+output->storageOffset;
  for (plane = 0; plane < nOutputPlane; plane++)
  {
    THVector_(fill)(
      outBase+output->stride[0]*plane,
      THTensor_(get1d)(bias, plane),
      outputDepth*outputHeight*outputWidth
    );
  }

  /* 3. output += weight * finput, on a 2D view over the output storage */
  outAsMatrix = THTensor_(newWithStorage2d)(
    output->storage, output->storageOffset, nOutputPlane, -1,
    outputDepth*outputHeight*outputWidth, -1
  );
  THTensor_(addmm)(outAsMatrix, 1, outAsMatrix, 1, weight, finput);
  THTensor_(free)(outAsMatrix);
 }
 /*
  * Forward pass of the matrix-multiply based volumetric convolution.
  *
  * input    : 4D tensor, or 5D tensor in batch mode (dimension 0 is the batch).
  * output   : resized to nOutputPlane x oT x oH x oW (with a leading batch
  *            dimension in batch mode) and filled by the per-frame helper.
  * weight   : its size[0] is taken as nOutputPlane; passed to addmm by the
  *            per-frame helper.
  * bias     : one value per output plane.
  * finput   : column buffer, resized here to
  *            (kT*kW*kH*nInputPlane) x (oT*oH*oW), with a leading batch
  *            dimension in batch mode.
  * kT/kW/kH : kernel sizes; dT/dW/dH : strides; pT/pW/pH : paddings.
  *
  * Raises when input is neither 4D nor 5D, or when any computed output extent
  * is smaller than 1.
  */
 void THNN_(VolumetricConvolutionMM_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  int dimf = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;
  long nInputPlane;
  long inputDepth;
  long inputHeight;
  long inputWidth;
  long nOutputPlane;
  long outputDepth;
  long outputHeight;
  long outputWidth;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D(batch mode) tensor expected"
  );
  if (input->nDimension == 5)
  {
    /* batch mode: every dimension index moves one slot to the right */
    dimf++;
    dimt++;
    dimh++;
    dimw++;
  }
  nInputPlane = input->size[dimf];
  inputDepth = input->size[dimt];
  inputHeight  = input->size[dimh];
  inputWidth   = input->size[dimw];
  nOutputPlane = weight->size[0];
  outputDepth  = (inputDepth + 2*pT - kT) / dT + 1;
  outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
  outputWidth  = (inputWidth + 2*pW - kW) / dW + 1;
  /* all three extents must be positive; the depth axis used to be left out
     of this test */
  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
  {
    /* every argument is a long, hence %ld rather than %d */
    THError(
      "Given input size: (%ldx%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ldx%ld). Output size is too small",
      nInputPlane, inputDepth, inputHeight, inputWidth,
      nOutputPlane, outputDepth, outputHeight, outputWidth
    );
  }
  if (input->nDimension == 4)
  {
    THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THNN_(VolumetricConvolutionMM_updateOutput_frame)(
      input, output, weight, bias, finput,
      kT, kW, kH,
      dT, dW, dH,
      pT, pW, pH,
      nInputPlane, inputDepth, inputWidth, inputHeight,
      nOutputPlane, outputDepth, outputWidth, outputHeight
    );
  }
  else
  {
    long T = input->size[0];
    long t;
    THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
    THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
 // #pragma omp parallel for private(t)
    for (t = 0; t < T; t++)
    {
      /* per-sample views; each one is released at the end of the iteration */
      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
      THNN_(VolumetricConvolutionMM_updateOutput_frame)(
        input_t, output_t, weight, bias, finput_t,
        kT, kW, kH,
        dT, dW, dH,
        pT, pW, pH,
        nInputPlane, inputDepth, inputWidth, inputHeight,
        nOutputPlane, outputDepth, outputWidth, outputHeight
      );
      THTensor_(free)(input_t);
      THTensor_(free)(output_t);
      THTensor_(free)(finput_t);
    }
  }
 }
 /*
  * Input-gradient computation for a single sample of the matrix-multiply
  * based volumetric convolution.
  *
  * Steps: fgradInput = weight * gradOutput (gradOutput viewed as a 2D matrix
  * over its own storage), then gradInput is zeroed and fgradInput is folded
  * back into it by unfolded_acc_vol.
  *
  * `weight` is used as passed in; the public entry point in this file hands
  * over an already transposed weight.
  */
 static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
          THTensor *gradInput,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *fgradInput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  /* gradOutput as (planes) x (depth*height*width), sharing its storage */
  THTensor *gradOutAsMatrix = THTensor_(newWithStorage2d)(
    gradOutput->storage, gradOutput->storageOffset,
    gradOutput->size[0], -1,
    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
  );

  /* beta is 0, so the previous content of fgradInput is not accumulated */
  THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutAsMatrix);
  THTensor_(free)(gradOutAsMatrix);

  /* unfolded_acc_vol only adds, so the destination has to start at zero */
  THTensor_(zero)(gradInput);

  THNN_(unfolded_acc_vol)(
    fgradInput, gradInput,
    kT, kW, kH,
    dT, dW, dH,
    pT, pW, pH,
    /* nInputPlane, inputDepth, inputWidth, inputHeight */
    gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
    /* outputDepth, outputWidth, outputHeight */
    gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
  );
 }
 /*
  * Backward pass of the matrix-multiply based volumetric convolution with
  * respect to the input.
  *
  * input      : forward input (4D, or 5D in batch mode); gradInput is resized
  *              to match it.
  * gradOutput : gradient w.r.t. the output; its plane dimension must equal
  *              weight->size[0].
  * weight     : 2D tensor (nOutputPlane x (nInputPlane * kT * kH * kW)). It is
  *              transposed in place for the duration of the call and
  *              transposed back before returning.
  * finput     : only its shape is used, to size fgradInput.
  * fgradInput : scratch buffer, resized and zeroed here.
  * kT/kW/kH   : kernel sizes; dT/dW/dH : strides; pT/pW/pH : paddings.
  */
 void THNN_(VolumetricConvolutionMM_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *weight,
          THTensor *finput,
          THTensor *fgradInput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  // number of input/output planes and kernel size is indirectly defined by the weight tensor
  THArgCheck(weight->nDimension == 2, 4,
    "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
  );
  int nOutputPlane = (int)weight->size[0];
  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
    "Number of output features is not equal to nOutputPlane"
  );

  THTensor_(resizeAs)(gradInput, input);
  THTensor_(resizeAs)(fgradInput, finput);
  // Some BLAS libraries may leave the result of a zero-alpha product
  // untouched; clearing the buffer first avoids reading stale values.
  THTensor_(zero)(fgradInput);  

  // work with the transposed weight; undone at the end of the function
  THTensor_(transpose)(weight, weight, 0, 1);

  if (input->nDimension != 4)
  {
    // batch mode: run the per-frame helper on one sample at a time
    long nBatch = input->size[0];
    long sample;
    for (sample = 0; sample < nBatch; sample++)
    {
      THTensor *gradInputSlice = THTensor_(newSelect)(gradInput, 0, sample);
      THTensor *gradOutputSlice = THTensor_(newSelect)(gradOutput, 0, sample);
      THTensor *fgradInputSlice = THTensor_(newSelect)(fgradInput, 0, sample);
      THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
        gradInputSlice, gradOutputSlice, weight, fgradInputSlice,
        kT, kW, kH,
        dT, dW, dH,
        pT, pW, pH
      );
      THTensor_(free)(gradInputSlice);
      THTensor_(free)(gradOutputSlice);
      THTensor_(free)(fgradInputSlice);
    }
  }
  else
  {
    // single sample
    THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
      gradInput, gradOutput, weight, fgradInput,
      kT, kW, kH,
      dT, dW, dH,
      pT, pW, pH
    );
  }

  // restore the caller's weight layout
  THTensor_(transpose)(weight, weight, 0, 1);
 }
 /*
  * Parameter-gradient accumulation for a single sample of the
  * matrix-multiply based volumetric convolution.
  *
  * gradWeight += scale * gradOutput2d * finput^T, where gradOutput2d is
  * gradOutput viewed as (planes) x (depth*height*width) over its own storage.
  * gradBias[i] += scale * (sum of row i of gradOutput2d).
  *
  * finput is transposed in place around the addmm call and restored
  * immediately afterwards.
  */
 static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          real scale)
 {
  long plane;
  real *biasGrad;
  THTensor *gradOutAsMatrix = THTensor_(newWithStorage2d)(
    gradOutput->storage, gradOutput->storageOffset,
    gradOutput->size[0], -1,
    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
  );

  /* weight gradient */
  THTensor_(transpose)(finput, finput, 0, 1);
  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutAsMatrix, finput);
  THTensor_(transpose)(finput, finput, 0, 1);

  /* bias gradient: one row sum per plane */
  biasGrad = gradBias->storage->data + gradBias->storageOffset;
  for (plane = 0; plane < gradBias->size[0]; plane++)
  {
    real *row = gradOutAsMatrix->storage->data + gradOutAsMatrix->storageOffset + plane*gradOutAsMatrix->stride[0];
    real rowSum = 0;
    long col;
    for (col = 0; col < gradOutAsMatrix->size[1]; col++)
    {
      rowSum += row[col];
    }
    biasGrad[plane] += scale * rowSum;
  }

  THTensor_(free)(gradOutAsMatrix);
 }
 /*
  * Accumulates the weight and bias gradients of the matrix-multiply based
  * volumetric convolution.
  *
  * input      : forward input; only its rank (4D vs 5D batch mode) and, in
  *              batch mode, its size[0] are used here.
  * gradOutput : gradient w.r.t. the output; its plane dimension must equal
  *              gradWeight->size[0].
  * gradWeight : 2D tensor (nOutputPlane x (nInputPlane * kT * kH * kW)).
  * gradBias   : 1D tensor of nOutputPlane elements.
  * finput     : column buffer passed on to the per-frame helper.
  * scale      : factor applied to both gradients before accumulation.
  */
 void THNN_(VolumetricConvolutionMM_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          real scale)
 {
  THArgCheck(gradWeight->nDimension == 2, 4,
    "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
  );
  int nOutputPlane = (int)gradWeight->size[0];
  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
    "gradBias tensor has wrong size"
  );
  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
    "Number of output features is not equal to nOutputPlane"
  );

  if (input->nDimension != 4)
  {
    // batch mode: accumulate the contribution of every sample in turn
    long nBatch = input->size[0];
    long sample;
    for (sample = 0; sample < nBatch; sample++)
    {
      THTensor *gradOutputSlice = THTensor_(newSelect)(gradOutput, 0, sample);
      THTensor *finputSlice = THTensor_(newSelect)(finput, 0, sample);
      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutputSlice, gradWeight, gradBias, finputSlice, scale);
      THTensor_(free)(gradOutputSlice);
      THTensor_(free)(finputSlice);
    }
    return;
  }

  // non-batch mode
  THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricDilatedConvolution.c
+++ b/torch/lib/THNN/generic/VolumetricDilatedConvolution.c
@ -0,0 +1,356 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
 #else
 /*
  * Forward pass of the dilated volumetric convolution.
  *
  * input    : 4D tensor, or 5D tensor in batch mode. A 4D input is temporarily
  *            resized in place to a 5D tensor with batch size 1 and resized
  *            back before returning (also before the output-size error).
  * output   : resized to batchSize x nOutputPlane x oT x oH x oW; resized to
  *            4D again at the end when the input was 4D.
  * weight   : 5D tensor (nOutputPlane,nInputPlane,kT,kH,kW).
  * bias     : may be NULL; otherwise its size[0] must equal weight->size[0].
  * columns  : scratch buffer, resized to (nInputPlane*kT*kW*kH) x (oT*oH*oW).
  * ones     : buffer of ones used to broadcast the bias; it is only resized
  *            (and refilled) when it is not 3D or holds too few elements.
  * kT/kW/kH : kernel sizes; dT/dW/dH : strides; padT/padW/padH : paddings;
  *            dilationT/dilationW/dilationH : dilation factors.
  *
  * Raises when the rank/size checks below fail or when any computed output
  * extent is smaller than 1.
  */
 void THNN_(VolumetricDilatedConvolution_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *columns,
          THTensor *ones,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int padT, int padW, int padH,
          int dilationT, int dilationW, int dilationH)
 {
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
  // Params:
  int nInputPlane = weight->size[1];
  int nOutputPlane = weight->size[0];
  // batch == 0 records that the caller passed a 4D (single sample) input
  int batch = 1;
  if (input->nDimension == 4) {
    // input->size[] holds long values, hence %ld for the second argument
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %ld", nInputPlane, input->size[0]);
    // Force batch
    batch = 0;
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
  } else {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %ld", nInputPlane, input->size[1]);
  }
  long inputDepth  = input->size[2];
  long inputHeight  = input->size[3];
  long inputWidth   = input->size[4];
  long outputDepth  = (inputDepth  + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
  long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) {
    // Give the caller its 4D input back before raising
    if (batch == 0) {
      THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
    }
    // plane counts are int (%d), spatial extents are long (%ld)
    THError("Given input size: (%dx%ldx%ldx%ld). Calculated output size: (%dx%ldx%ldx%ld). Output size is too small",
            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
  }
  // Batch size + input planes
  long batchSize = input->size[0];
  // Resize output
  THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
  THTensor_(zero)(output);
  // Resize temporary columns
  THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
  // Define a buffer of ones, for bias accumulation
  // Note: this buffer can be shared with other modules, it only ever gets increased,
  // and always contains ones.
  if (ones->nDimension != 3 ||
      ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }
  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *output_n = THTensor_(new)();
  // For each elt in batch, do:
  for (int elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(output_n, output, 0, elt);
    // Do Bias first:
    // M,N,K are dims of matrix A and B
    long m_ = nOutputPlane;
    long n_ = outputDepth * outputHeight * outputWidth;
    long k_ = 1;
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    if (bias) {
      // beta is 0: the product of ones and bias overwrites output_n
      THBlas_(gemm)(
        't', 'n',
        n_, m_, k_,
        1,
        THTensor_(data)(ones), k_,
        THTensor_(data)(bias), k_,
        0,
        THTensor_(data)(output_n), n_
      );
    } else {
      THTensor_(zero)(output_n);
    }
    // Extract columns:
    THNN_(vol2col)(
      THTensor_(data)(input_n),
      nInputPlane, inputDepth, inputHeight, inputWidth,
      kT, kH, kW, padT, padH, padW, dT, dH, dW,
      dilationT, dilationH, dilationW,
      THTensor_(data)(columns)
    );
    // M,N,K are dims of matrix A and B
    long m = nOutputPlane;
    long n = columns->size[1];
    long k = nInputPlane*kT*kH*kW;
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 1: the convolution result is added on top of the bias
    THBlas_(gemm)(
      'n', 'n',
      n, m, k,
      1,
      THTensor_(data)(columns), n,
      THTensor_(data)(weight), k,
      1,
      THTensor_(data)(output_n), n
    );
  }
  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(output_n);
  // Resize output
  if (batch == 0) {
    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 /*
  * Backward pass of the dilated volumetric convolution with respect to the
  * input.
  *
  * input       : 4D tensor, or 5D tensor in batch mode. A 4D input (and the
  *               matching gradOutput) is resized in place to 5D with batch
  *               size 1 and resized back before returning.
  * gradOutput  : gradient w.r.t. the output, same rank convention as input.
  * gradInput   : resized to batchSize x nInputPlane x iT x iH x iW and filled
  *               per sample by col2vol; 4D again on return for a 4D input.
  * weight      : 5D tensor (nOutputPlane,nInputPlane,kT,kH,kW).
  * gradColumns : scratch buffer, resized to
  *               (nInputPlane*kT*kW*kH) x (oT*oH*oW) and zeroed.
  * kT/kW/kH    : kernel sizes; dT/dW/dH : strides; padT/padW/padH : paddings;
  *               dilationT/dilationW/dilationH : dilation factors.
  */
 void THNN_(VolumetricDilatedConvolution_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *weight,
          THTensor *gradColumns,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int padT, int padW, int padH,
          int dilationT, int dilationW, int dilationH)
 {
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");

  // Plane counts come from the weight tensor
  int nInputPlane = weight->size[1];
  int nOutputPlane = weight->size[0];

  // Remember whether the caller passed a single (4D) sample
  int singleSample = (input->nDimension == 4);
  if (singleSample) {
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
    // View both tensors as a batch of one
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
  } else {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
  }

  // Geometry
  long batchSize    = input->size[0];
  long inputDepth   = input->size[2];
  long inputHeight  = input->size[3];
  long inputWidth   = input->size[4];
  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;

  // Result tensor and scratch columns
  THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
  THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
  THTensor_(zero)(gradColumns);

  // GEMM dimensions; none of them changes between samples
  long m = nInputPlane*kT*kW*kH;
  long n = gradColumns->size[1];
  long k = nOutputPlane;

  // Per-sample views
  THTensor *gradInputSlice = THTensor_(new)();
  THTensor *gradOutputSlice = THTensor_(new)();

  for (int sample = 0; sample < batchSize; sample ++) {
    THTensor_(select)(gradInputSlice, gradInput, 0, sample);
    THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

    // gradColumns is overwritten (beta = 0) by the product of this sample's
    // gradOutput and the weight (gemm works on column-major matrices)
    THBlas_(gemm)(
        'n', 't',
        n, m, k,
        1,
        THTensor_(data)(gradOutputSlice), n,
        THTensor_(data)(weight), m,
        0,
        THTensor_(data)(gradColumns), n
    );

    // Scatter the columns back into this sample's gradInput
    THNN_(col2vol)(
      THTensor_(data)(gradColumns),
      nInputPlane, inputDepth, inputHeight, inputWidth,
      kT, kH, kW, padT, padH, padW, dT, dH, dW,
      dilationT, dilationH, dilationW,
      THTensor_(data)(gradInputSlice)
    );
  }

  THTensor_(free)(gradInputSlice);
  THTensor_(free)(gradOutputSlice);

  // Hand 4D tensors back to a caller that passed 4D tensors
  if (singleSample) {
    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
    THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 /*
  * Accumulates the weight and bias gradients of the dilated volumetric
  * convolution.
  *
  * input      : 4D tensor, or 5D tensor in batch mode. A 4D input (and the
  *              matching gradOutput) is resized in place to 5D with batch
  *              size 1 and resized back before returning.
  * gradOutput : gradient w.r.t. the output, same rank convention as input.
  * gradWeight : 5D tensor (nOutputPlane,nInputPlane,kT,kH,kW), accumulated
  *              into with factor `scale`.
  * gradBias   : may be NULL; otherwise its size[0] must equal
  *              gradWeight->size[0] and it is accumulated into with factor
  *              `scale`.
  * columns    : scratch buffer, resized to
  *              (nInputPlane*kT*kW*kH) x (oT*oH*oW).
  * ones       : buffer of ones; only resized (and refilled) when it is not 3D
  *              or holds too few elements.
  * kT/kW/kH   : kernel sizes; dT/dW/dH : strides; padT/padW/padH : paddings;
  *              dilationT/dilationW/dilationH : dilation factors.
  */
 void THNN_(VolumetricDilatedConvolution_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *columns,
          THTensor *ones,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int padT, int padW, int padH,
          int dilationT, int dilationW, int dilationH,
          real scale)
 {
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
  THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");

  // Plane counts come from the gradWeight tensor
  int nInputPlane = gradWeight->size[1];
  int nOutputPlane = gradWeight->size[0];

  // Remember whether the caller passed a single (4D) sample
  int singleSample = (input->nDimension == 4);
  if (singleSample) {
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
    // View both tensors as a batch of one
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
  } else {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
  }

  // Geometry
  long batchSize    = input->size[0];
  long inputDepth   = input->size[2];
  long inputHeight  = input->size[3];
  long inputWidth   = input->size[4];
  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;

  // Grow the ones buffer if it cannot cover one output volume
  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Scratch columns
  THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);

  // Dimensions for the weight GEMM and the bias GEMV; none of them changes
  // between samples
  long m = nOutputPlane;
  long n = nInputPlane*kT*kW*kH;
  long k = columns->size[1];
  long m_ = nOutputPlane;
  long k_ = outputDepth * outputHeight * outputWidth;

  // Per-sample views
  THTensor *inputSlice = THTensor_(new)();
  THTensor *gradOutputSlice = THTensor_(new)();

  for (int sample = 0; sample < batchSize; sample ++) {
    THTensor_(select)(inputSlice, input, 0, sample);
    THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

    // Unfold this sample's input into the column buffer
    THNN_(vol2col)(
      THTensor_(data)(inputSlice),
      nInputPlane, inputDepth, inputHeight, inputWidth,
      kT, kH, kW, padT, padH, padW, dT, dH, dW,
      dilationT, dilationH, dilationW,
      THTensor_(data)(columns)
    );

    // Weight gradient, accumulated (beta = 1) with factor `scale`
    // (gemm works on column-major matrices)
    THBlas_(gemm)(
        't', 'n',
        n, m, k,
        scale,
        THTensor_(data)(columns), k,
        THTensor_(data)(gradOutputSlice), k,
        1,
        THTensor_(data)(gradWeight), n
    );

    // Bias gradient, accumulated (beta = 1) with factor `scale`
    // (gemv works on column-major matrices)
    if (gradBias) {
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          THTensor_(data)(gradOutputSlice), k_,
          THTensor_(data)(ones), 1,
          1,
          THTensor_(data)(gradBias), 1
      );
    }
  }

  THTensor_(free)(inputSlice);
  THTensor_(free)(gradOutputSlice);

  // Hand 4D tensors back to a caller that passed 4D tensors
  if (singleSample) {
    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/torch/lib/THNN/generic/VolumetricFullConvolution.c
@ -0,0 +1,469 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
 #else
 /*
  * Unfold a (channels x depth x height x width) volume into a column buffer.
  *
  * data_col is written as a dense row-major array with
  * channels*kT*kH*kW rows and depth_col*height_col*width_col entries per row.
  * Each row corresponds to one (channel, kernel-t, kernel-h, kernel-w)
  * combination; positions that fall outside the volume (padding) are written
  * as 0.
  */
 static void THNN_(vol2col)(
  const real *data_vol, const int channels,
  const int depth, const int height, const int width,
  const int kT, const int kH, const int kW,
  const int pT, const int pH, const int pW,
  const int dT, const int dH, const int dW,
  const int dilationT, const int dilationH, const int dilationW,
  real *data_col)
 {
  /* extent of the column grid along each spatial axis */
  const int out_d = (depth  + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
  const int out_h = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
  const int out_w = (width  + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
  const int n_rows = channels * kT * kH * kW;
  /* the destination is visited strictly sequentially, so a cursor suffices */
  real *dst = data_col;
  int row;
  for (row = 0; row < n_rows; ++row)
  {
    /* split the row index into kernel offsets and the source channel */
    const int kw_idx = row % kW;
    const int kh_idx = (row / kW) % kH;
    const int kt_idx = (row / kW / kH) % kT;
    const int chan   = row / kT / kH / kW;
    int od, oh, ow;
    for (od = 0; od < out_d; ++od)
    {
      const int src_t = od * dT - pT + kt_idx * dilationT;
      const int t_inside = (src_t >= 0 && src_t < depth);
      for (oh = 0; oh < out_h; ++oh)
      {
        const int src_h = oh * dH - pH + kh_idx * dilationH;
        const int th_inside = (t_inside && src_h >= 0 && src_h < height);
        for (ow = 0; ow < out_w; ++ow)
        {
          const int src_w = ow * dW - pW + kw_idx * dilationW;
          if (th_inside && src_w >= 0 && src_w < width)
          {
            *dst = data_vol[((chan * depth + src_t) * height + src_h) * width + src_w];
          }
          else
          {
            /* sample lies in the padding region */
            *dst = 0;
          }
          ++dst;
        }
      }
    }
  }
 }
 /*
  * Inverse of vol2col: fold a column buffer back into a
  * (channels x depth x height x width) volume.
  *
  * data_vol is zeroed first; every column entry whose source position lies
  * inside the volume is then added to that position, so overlapping kernel
  * windows accumulate. Entries that map into the padding are dropped.
  */
 static void THNN_(col2vol)(
  const real* data_col, const int channels,
  const int depth, const int height, const int width,
  const int kT, const int kH, const int kW,
  const int pT, const int pH, const int pW,
  const int dT, const int dH, const int dW,
  const int dilationT, const int dilationH, const int dilationW,
  real* data_vol)
 {
  /* start from an all-zero volume because the loop below accumulates */
  memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
  /* extent of the column grid along each spatial axis */
  const int out_d = (depth  + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
  const int out_h = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
  const int out_w = (width  + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
  const int n_rows = channels * kT * kH * kW;
  /* the column buffer is consumed strictly sequentially */
  const real *src = data_col;
  int row;
  for (row = 0; row < n_rows; ++row)
  {
    /* split the row index into kernel offsets and the target channel */
    const int kw_idx = row % kW;
    const int kh_idx = (row / kW) % kH;
    const int kt_idx = (row / kW / kH) % kT;
    const int chan   = row / kT / kH / kW;
    int od, oh, ow;
    for (od = 0; od < out_d; ++od)
    {
      const int dst_t = od * dT - pT + kt_idx * dilationT;
      const int t_inside = (dst_t >= 0 && dst_t < depth);
      for (oh = 0; oh < out_h; ++oh)
      {
        const int dst_h = oh * dH - pH + kh_idx * dilationH;
        const int th_inside = (t_inside && dst_h >= 0 && dst_h < height);
        for (ow = 0; ow < out_w; ++ow)
        {
          const int dst_w = ow * dW - pW + kw_idx * dilationW;
          if (th_inside && dst_w >= 0 && dst_w < width)
          {
            data_vol[((chan * depth + dst_t) * height + dst_h) * width + dst_w] += *src;
          }
          ++src;
        }
      }
    }
  }
 }
 /*
  * Forward pass of the volumetric full convolution.
  *
  * For every batch element the input is multiplied with the weight (GEMM)
  * into the `finput` column buffer, the columns are folded into the output
  * volume with col2vol, and the bias is added with a second GEMM against a
  * buffer of ones.
  *
  * Output extent per axis: (in - 1) * stride - 2 * pad + kernel + adjustment.
  *
  * A 4D input is temporarily resized in place to 5D (batch of one); input and
  * output are resized back to 4D before returning.
  *
  * NOTE(review): tensor data is accessed through raw data pointers without a
  * contiguity check, and `bias` is dereferenced unconditionally - presumably
  * callers guarantee contiguous tensors and a non-NULL bias; confirm.
  */
 void THNN_(VolumetricFullConvolution_updateOutput)(
  THNNState *state,
  THTensor *input,          // 4D or 5D (batch) tensor
  THTensor *output,
  THTensor *weight,         // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
  THTensor *bias,
  THTensor *finput,         // internal columns buffer
  THTensor *fgradInput,     // internal ones buffer
  int dT, int dW, int dH,   // stride of the convolution
  int pT, int pW, int pH,   // padding
  int aT, int aW, int aH)   // extra output adjustment
 {
  THTensor *columns = finput;
  THTensor *ones    = fgradInput;
  // number of input & output planes and kernel size is indirectly defined by the weight tensor
  THArgCheck(weight->nDimension == 5, 4,
    "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
  );
  const int nInputPlane  = (int)weight->size[0];
  const int nOutputPlane = (int)weight->size[1];
  const int kT           = (int)weight->size[2];
  const int kH           = (int)weight->size[3];
  const int kW           = (int)weight->size[4];
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch mode) tensor is expected"
  );
  // batch == 0 records that the caller passed a 4D tensor, so the shapes can be restored at the end
  int batch = 1;
  if (input->nDimension == 4)
  {
    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
    // Force batch
    batch = 0;
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
  }
  else
  {
    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
  }
  const long inputWidth   = input->size[4];
  const long inputHeight  = input->size[3];
  const long inputDepth   = input->size[2];
  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
  // Batch size + input planes
  const long batchSize = input->size[0];
  // Resize output
  THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
  // Resize temporary columns: one row per (output plane, kernel position), one column per input voxel
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
  THTensor_(zero)(columns);
  // Define a buffer of ones, for bias accumulation
  // Note: this buffer can be shared with other modules, it only ever gets increased,
  // and always contains ones.
  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
  {
    // Resize plane and fill with ones...
    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }
  // Helpers: views onto the current batch element of input and output
  THTensor *input_n = THTensor_(new)();
  THTensor *output_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; ++elt)
  {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(output_n, output, 0, elt);
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
    const long n = columns->size[1];
    const long k = weight->size[0];
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 0, so `columns` is fully overwritten with the product of weight and input_n
    THBlas_(gemm)(
      'n', 't',
      n, m, k,
      1,
      THTensor_(data)(input_n), n,
      THTensor_(data)(weight), m,
      0,
      THTensor_(data)(columns), n
    );
    // Fold the columns into the output volume (col2vol zeroes output_n first, then accumulates):
    THNN_(col2vol)(
      THTensor_(data)(columns),
      nOutputPlane, outputDepth, outputHeight, outputWidth,
      kT, kH, kW,
      pT, pH, pW,
      dT, dH, dW,
       1,  1,  1,
      THTensor_(data)(output_n)
    );
    // Do Bias after:
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    const long m_ = nOutputPlane;
    const long n_ = outputDepth * outputHeight * outputWidth;
    const long k_ = 1;
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 1, so the ones/bias product is added on top of the folded output
    THBlas_(gemm)(
      't', 'n',
      n_, m_, k_,
      1,
      THTensor_(data)(ones), k_,
      THTensor_(data)(bias), k_,
      1,
      THTensor_(data)(output_n), n_
    );
  }
  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(output_n);
  // Undo the forced batch dimension for 4D callers
  if (batch == 0)
  {
    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 /*
  * Backward pass of the volumetric full convolution w.r.t. the input.
  *
  * For every batch element gradOutput is unfolded into the `finput` column
  * buffer with vol2col and multiplied with the weight (GEMM) to produce the
  * matching slice of gradInput.
  *
  * 4D input/gradOutput are temporarily resized in place to 5D (batch of one);
  * gradOutput, input and gradInput are resized back to 4D before returning.
  *
  * NOTE(review): tensor data is accessed through raw data pointers without a
  * contiguity check, and the shape of gradOutput is not validated here -
  * presumably callers guarantee both; confirm.
  */
 void THNN_(VolumetricFullConvolution_updateGradInput)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradInput,
  THTensor *weight,
  THTensor *finput,
  THTensor *fgradInput,     // only used by cuda impl
  int dT, int dW, int dH,   // stride
  int pT, int pW, int pH,   // padding
  int aT, int aW, int aH)   // extra output adjustment
 {
  THTensor *gradColumns = finput;
  // number of input & output planes and kernel size is indirectly defined by the weight tensor
  THArgCheck(weight->nDimension == 5, 4,
    "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
  );
  const int nInputPlane  = (int)weight->size[0];
  const int nOutputPlane = (int)weight->size[1];
  const int kT           = (int)weight->size[2];
  const int kH           = (int)weight->size[3];
  const int kW           = (int)weight->size[4];
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch mode) tensor is expected"
  );
  // batch == 0 records that the caller passed 4D tensors, so the shapes can be restored at the end
  int batch = 1;
  if (input->nDimension == 4)
  {
    // Force batch
    batch = 0;
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
  }
  const long inputWidth   = input->size[4];
  const long inputHeight  = input->size[3];
  const long inputDepth   = input->size[2];
  // Expected gradOutput extent, derived from the input extent exactly as in the forward pass
  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
  // Batch size + input planes
  const long batchSize = input->size[0];
  // Resize output
  THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
  THTensor_(zero)(gradInput);
  // Resize temporary columns
  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
  // Helpers: views onto the current batch element of gradInput and gradOutput
  THTensor *gradInput_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; ++elt)
  {
    // Matrix multiply per sample:
    THTensor_(select)(gradInput_n, gradInput, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
    // Extract columns from the output-sized gradient volume:
    THNN_(vol2col)(
      THTensor_(data)(gradOutput_n),
      nOutputPlane, outputDepth, outputHeight, outputWidth,
      kT, kH, kW,
      pT, pH, pW,
      dT, dH, dW,
       1,  1,  1,
      THTensor_(data)(gradColumns)
    );
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    const long m = weight->size[0];
    const long n = gradColumns->size[1];
    const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 0, so gradInput_n is overwritten rather than accumulated into
    THBlas_(gemm)(
      'n', 'n',
      n, m, k,
      1,
      THTensor_(data)(gradColumns), n,
      THTensor_(data)(weight), k,
      0,
      THTensor_(data)(gradInput_n), n
    );
  }
  // Free
  THTensor_(free)(gradInput_n);
  THTensor_(free)(gradOutput_n);
  // Undo the forced batch dimension for 4D callers
  if (batch == 0)
  {
    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
    THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 /*
  * Accumulate the parameter gradients of the volumetric full convolution.
  *
  * For every batch element gradOutput is unfolded into the `finput` column
  * buffer with vol2col; `scale` times the product of the columns and the
  * input is added to gradWeight (GEMM, beta = 1). When gradBias is given,
  * `scale` times the per-plane sum of gradOutput is added to it (GEMV against
  * a buffer of ones kept in `fgradInput`).
  *
  * gradBias may be NULL, in which case the bias gradient is skipped.
  *
  * 4D input/gradOutput are temporarily resized in place to 5D (batch of one)
  * and restored before returning.
  *
  * NOTE(review): tensor data is accessed through raw data pointers without a
  * contiguity check - presumably callers guarantee contiguous tensors.
  */
 void THNN_(VolumetricFullConvolution_accGradParameters)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradWeight,
  THTensor *gradBias,
  THTensor *finput,
  THTensor *fgradInput,
  int dT, int dW, int dH,   // stride
  int pT, int pW, int pH,   // padding
  int aT, int aW, int aH,   // extra output adjustment
  real scale)
 {
  // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor
  THArgCheck(gradWeight->nDimension == 5, 4,
    "5D gradWeight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
  );
  int nInputPlane  = (int)gradWeight->size[0];
  int nOutputPlane = (int)gradWeight->size[1];
  int kT           = (int)gradWeight->size[2];
  int kH           = (int)gradWeight->size[3];
  int kW           = (int)gradWeight->size[4];
  THTensor *columns = finput;
  THTensor *ones = fgradInput;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch mode) tensor is expected"
  );
  // batch == 0 records that the caller passed 4D tensors, so the shapes can be restored at the end
  int batch = 1;
  if (input->nDimension == 4)
  {
    // Force batch
    batch = 0;
    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
  }
  const long inputWidth   = input->size[4];
  const long inputHeight  = input->size[3];
  const long inputDepth   = input->size[2];
  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
  // Batch size + input planes
  const long batchSize = input->size[0];
  // Define a buffer of ones, for bias accumulation
  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
  {
    // Resize plane and fill with ones...
    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }
  // Resize temporary columns
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
  // Helpers: views onto the current batch element of input and gradOutput
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();
  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; ++elt)
  {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
    // Extract columns:
    THNN_(vol2col)(
      THTensor_(data)(gradOutput_n), nOutputPlane,
      outputDepth, outputHeight, outputWidth,
      kT, kH, kW,
      pT, pH, pW,
      dT, dH, dW,
       1,  1,  1,
      THTensor_(data)(columns)
    );
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    const long n = columns->size[0];   // nOutputPlane * kT * kH * kW
    const long m = input_n->size[0];   // nInputPlane
    const long k = columns->size[1];   // inputDepth * inputHeight * inputWidth
    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 1, so the scaled product is accumulated into gradWeight
    THBlas_(gemm)(
      't', 'n',
      n, m, k,
      scale,
      THTensor_(data)(columns), k,
      THTensor_(data)(input_n), k,
      1,
      THTensor_(data)(gradWeight), n
    );
    // Do Bias (skipped when the caller passes no gradBias):
    if (gradBias)
    {
      // M,N,K are dims of matrix A and B
      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
      const long m_ = nOutputPlane;
      const long k_ = outputDepth * outputHeight * outputWidth;
      // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
      THBlas_(gemv)(
        't',
        k_, m_,
        scale,
        THTensor_(data)(gradOutput_n), k_,
        THTensor_(data)(ones), 1,
        1,
        THTensor_(data)(gradBias), 1
      );
    }
  }
  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(gradOutput_n);
  // Undo the forced batch dimension for 4D callers
  if (batch == 0)
  {
    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
  }
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/torch/lib/THNN/generic/VolumetricMaxPooling.c
@ -0,0 +1,392 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
 #else
 /*
  * Max-pool one frame of nslices independent (itime x iheight x iwidth)
  * volumes into (otime x oheight x owidth) volumes.
  *
  * For each output element the maximum over the kernel window is written to
  * output_p. The location of that maximum, expressed as an offset inside the
  * full kT x kH x kW kernel, is packed as three unsigned chars (t, h, w)
  * followed by a zero byte into the storage of the matching element of
  * indz_p. Offsets above 255 therefore cannot be represented.
  *
  * If no element of the window compares greater than -THInf (for example an
  * all-NaN window), the output is -THInf and the recorded location is the
  * first element of the clipped window, instead of an uninitialized value.
  */
 static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
          real *input_p,
          real *output_p,
          real *indz_p,
          long nslices,
          long itime,
          long iwidth,
          long iheight,
          long otime,
          long owidth,
          long oheight,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  long k;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    /* loop over output */
    long i, j, ti;
    for (ti = 0; ti < otime; ti++)
    {
      for (i = 0; i < oheight; i++)
      {
        for (j = 0; j < owidth; j++)
        {
          /* window origin in input coordinates; negative inside the padding */
          long start_t = ti * dT - pT;
          long start_h = i * dH - pH;
          long start_w = j * dW - pW;
          /* clip the part of the kernel that hangs over the low edge */
          long kernel_t = (start_t < 0) ? kT + start_t : kT;
          long kernel_h = (start_h < 0) ? kH + start_h : kH;
          long kernel_w = (start_w < 0) ? kW + start_w : kW;
          if (start_t < 0) start_t = 0;
          if (start_h < 0) start_h = 0;
          if (start_w < 0) start_w = 0;
          /* local pointers */
          real *ip = input_p + k * itime * iwidth * iheight
            + start_t * iwidth * iheight + start_h * iwidth + start_w;
          real *op = output_p + k * otime * owidth * oheight
            + ti * owidth * oheight + i * owidth + j;
          real *indzp = indz_p + k * otime * owidth * oheight
            + ti * owidth * oheight + i * owidth + j;
          /* compute local max: */
          real maxval = -THInf;
          int x,y,z;
          /* default to the first element of the clipped window so the stored
             index is always defined, even when no comparison succeeds */
          int mz = (int)(kT - kernel_t);
          int my = (int)(kH - kernel_h);
          int mx = (int)(kW - kernel_w);
          for (z = 0; z < kernel_t; z++)
          {
            for (y = 0; y < kernel_h; y++)
            {
              for (x = 0; x < kernel_w; x++)
              {
                /* skip the part of the kernel that hangs over the high edge */
                if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
                {
                  real val = *(ip + z * iwidth * iheight + y * iwidth + x);
                  if (val > maxval)
                  {
                    maxval = val;
                    // Store indices w.r.t the kernel dimension
                    mz = z + (kT - kernel_t);
                    my = y + (kH - kernel_h);
                    mx = x + (kW - kernel_w);
                  }
                }
              }
            }
          }
          // pack the kernel-relative location of the max into the index element
          ((unsigned char*)(indzp))[0] = mz;
          ((unsigned char*)(indzp))[1] = my;
          ((unsigned char*)(indzp))[2] = mx;
          ((unsigned char*)(indzp))[3] = 0;
          /* set output to local max */
          *op = maxval;
        }
      }
    }
  }
 }
 /*
  * Forward pass of volumetric max pooling for a 4D tensor or a 5D batch.
  *
  * Computes the output extent from the input extent, kernel (kT, kH, kW),
  * stride (dT, dH, dW) and padding (pT, pH, pW), using ceil or floor rounding
  * according to ceilMode, resizes `output` and `indices` to that extent and
  * runs the per-frame kernel (once for 4D input, once per batch element for
  * 5D input).
  *
  * `indices` has the same shape and element type as `output`; each element
  * stores the packed location of the corresponding maximum (see the frame
  * function).
  *
  * NOTE(review): output and indices are accessed through raw data pointers
  * right after being resized - presumably that leaves them contiguous.
  */
 void THNN_(VolumetricMaxPooling_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *indices,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          bool ceilMode)
 {
  long nslices;
  long itime;
  long iheight;
  long iwidth;
  long otime;
  long oheight;
  long owidth;
  real *input_data;
  real *output_data;
  real *indices_data;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
    "4D or 5D (batch-mode) tensor expected"
  );
  /* dimension indices for the 4D layout; shifted by one for a 5D batch */
  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;
  if (input->nDimension == 5)
  {
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }
  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
    "input image smaller than kernel size"
  );
  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
    "pad should be smaller than half of kernel size"
  );
  /* sizes */
  nslices = input->size[dimN];
  itime   = input->size[dimt];
  iheight = input->size[dimh];
  iwidth  = input->size[dimw];
  /* output extent: round((in - kernel + 2 * pad) / stride) + 1, rounding up or down */
  if (ceilMode)
  {
    otime   = (int)(ceil((float)(itime   - kT + 2 * pT) / dT) + 1);
    oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
    owidth  = (int)(ceil((float)(iwidth  - kW + 2 * pW) / dW) + 1);
  }
  else
  {
    otime   = (int)(floor((float)(itime   - kT + 2 * pT) / dT) + 1);
    oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
    owidth  = (int)(floor((float)(iwidth  - kW + 2 * pW) / dW) + 1);
  }
  /* this correction is applied only when some padding is non-zero */
  if (pT || pW || pH)
  {
    // ensure that the last pooling starts inside the image
    if ((otime - 1)*dT >= itime + pT)
      --otime;
    if ((oheight - 1)*dH >= iheight + pH)
      --oheight;
    if ((owidth  - 1)*dW >= iwidth  + pW)
      --owidth;
  }
  /* get contiguous input (new reference, released at the end) */
  input = THTensor_(newContiguous)(input);
  if (input->nDimension == 4) /* non-batch mode */
  {
    /* resize output */
    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
    /* indices will contain ti,i,j uchar locations packed into float/double */
    THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    indices_data = THTensor_(data)(indices);
    THNN_(VolumetricMaxPooling_updateOutput_frame)(
      input_data, output_data,
      indices_data,
      nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      kT, kW, kH,
      dT, dW, dH,
      pT, pW, pH
    );
  }
  else /* batch mode */
  {
    long p;
    long nBatch = input->size[0];
    /* element counts of one batch entry in the input and in the output/indices */
    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;
    /* resize output */
    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
    /* indices will contain ti,i,j locations for each output point */
    THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    indices_data = THTensor_(data)(indices);
 #pragma omp parallel for private(p)
    for (p=0; p < nBatch; p++)
    {
      THNN_(VolumetricMaxPooling_updateOutput_frame)(
        input_data   + p * istride,
        output_data  + p * ostride,
        indices_data + p * ostride,
        nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        kT, kW, kH,
        dT, dW, dH,
        pT, pW, pH
      );
    }
  }
  /* cleanup */
  THTensor_(free)(input);
 }
 /*
  * Back-propagate one frame of max pooling.
  *
  * For every output element the packed kernel-relative location stored in
  * indz_p (three unsigned chars: t, h, w) is converted to an input coordinate
  * using the stride and padding, and the matching gradOutput value is added
  * to that position of gradInput. gradInput is only accumulated into here;
  * it is not cleared by this function.
  */
 static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
          real *gradInput_p,
          real *gradOutput_p,
          real *indz_p,
          long nslices,
          long itime,
          long iwidth,
          long iheight,
          long otime,
          long owidth,
          long oheight,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  long k;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    /* base pointers of the k-th slice */
    real *gi_slice  = gradInput_p  + k * itime * iwidth * iheight;
    real *go_slice  = gradOutput_p + k * otime * owidth * oheight;
    real *idx_slice = indz_p + k * otime * owidth * oheight;
    /* flat offset of the current output element inside the slice */
    long flat = 0;
    long ot, oh, ow;
    for (ot = 0; ot < otime; ot++)
    {
      for (oh = 0; oh < oheight; oh++)
      {
        for (ow = 0; ow < owidth; ow++, flat++)
        {
          /* unpack the kernel-relative location of the max */
          const unsigned char *packed = (const unsigned char*)(idx_slice + flat);
          /* translate to absolute input coordinates */
          const long src_t = packed[0] + ot * dT - pT;
          const long src_h = packed[1] + oh * dH - pH;
          const long src_w = packed[2] + ow * dW - pW;
          /* route the gradient to the winning input element */
          gi_slice[src_t * iheight * iwidth + src_h * iwidth + src_w] += go_slice[flat];
        }
      }
    }
  }
 }
 /*
  * Backward pass of volumetric max pooling for a 4D tensor or a 5D batch.
  *
  * gradInput is resized like `input` and zeroed, then each gradOutput value
  * is added to the input position recorded in `indices` (see the frame
  * function), once for 4D input or once per batch element for 5D input.
  *
  * NOTE(review): `indices` and `gradInput` are accessed through raw data
  * pointers without a contiguity check, and input->nDimension is not
  * validated here - presumably the tensors come straight from updateOutput;
  * confirm.
  */
 void THNN_(VolumetricMaxPooling_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *indices,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  int nslices;
  int itime;
  int iheight;
  int iwidth;
  int otime;
  int oheight;
  int owidth;
  real *gradInput_data;
  real *gradOutput_data;
  real *indices_data;
  /* dimension indices for the 4D layout; shifted by one for a 5D batch */
  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;
  /* get contiguous gradOutput (new reference, released at the end) */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* resize and clear, because the frame function accumulates with += */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  if (input->nDimension == 5)
  {
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }
  /* sizes */
  nslices = input->size[dimN];
  itime = input->size[dimt];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  otime = gradOutput->size[dimt];
  oheight = gradOutput->size[dimh];
  owidth = gradOutput->size[dimw];
  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  indices_data = THTensor_(data)(indices);
  /* backprop */
  if (input->nDimension == 4) /* non-batch mode*/
  {
    THNN_(VolumetricMaxPooling_updateGradInput_frame)(
      gradInput_data, gradOutput_data,
      indices_data,
      nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      dT, dW, dH,
      pT, pW, pH
    );
  }
  else /* batch mode */
  {
    long p;
    long nBatch = input->size[0];
    /* element counts of one batch entry in gradInput and in gradOutput/indices */
    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;
 #pragma omp parallel for private(p)
    for (p = 0; p < nBatch; p++)
    {
      THNN_(VolumetricMaxPooling_updateGradInput_frame)(
        gradInput_data + p * istride,
        gradOutput_data + p * ostride,
        indices_data + p * ostride,
        nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        dT, dW, dH,
        pT, pW, pH
      );
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricMaxUnpooling.c
+++ b/torch/lib/THNN/generic/VolumetricMaxUnpooling.c
@ -0,0 +1,325 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
 #else
 /*
  * Unpool one frame: scatter every input element to the output position
  * recorded for it in ind_p.
  *
  * Each element of ind_p holds a kernel-relative location packed as three
  * unsigned chars (t, h, w). The absolute output coordinate is that location
  * plus the window origin (index * stride - pad). Output elements that are
  * not targeted by any input element are left untouched.
  *
  * An out-of-range coordinate is reported with THError. The error is raised
  * only after the parallel loop has finished, so that no thread leaves the
  * OpenMP region abnormally; elements with an invalid coordinate are skipped.
  */
 static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
          real *input_p,
          real *output_p,
          real *ind_p,
          long nslices,
          long iT,
          long iW,
          long iH,
          long oT,
          long oW,
          long oH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  long k;
  /* first invalid coordinate seen by any thread, reported after the loop */
  int has_error = 0;
  long error_z = 0, error_y = 0, error_x = 0;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long ti, i, j, maxz, maxy, maxx;
    for (ti = 0; ti < iT; ti++)
    {
      for (i = 0; i < iH; i++)
      {
        for (j = 0; j < iW; j++)
        {
          /* origin of the pooling window this input element came from */
          long start_t = ti * dT - pT;
          long start_h = i * dH - pH;
          long start_w = j * dW - pW;
          real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
          real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
          maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
          maxy = ((unsigned char*)(ind_p_k))[1];
          maxx = ((unsigned char*)(ind_p_k))[2];
          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
          {
 #pragma omp critical
            {
              if (!has_error)
              {
                has_error = 1;
                error_z = start_t+maxz;
                error_y = start_h+maxy;
                error_x = start_w+maxx;
              }
            }
            /* never write through an invalid coordinate */
            continue;
          }
          output_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */
        }
      }
    }
  }
  if (has_error)
  {
    /* all arguments are long, hence %ld */
    THError(
      "invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
      error_z, error_y, error_x, oT, oW, oH
    );
  }
 }
 /*
  * Forward pass of volumetric max unpooling for a 4D tensor or a 5D batch.
  *
  * `output` is resized to (oT x oH x oW) per slice and zeroed; every input
  * element is then copied to the output position recorded for it in
  * `indices` (see the frame function). `indices` must have the same size as
  * `input`.
  *
  * NOTE(review): output is accessed through its raw data pointer right after
  * being resized - presumably that leaves it contiguous.
  */
 void THNN_(VolumetricMaxUnpooling_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *indices,
          int oT,
          int oW,
          int oH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  /* dimension indices for the 4D layout; shifted by one for a 5D batch */
  int dimw = 3;
  int dimh = 2;
  int dimt = 1;
  int nbatch = 1;
  int nslices;
  int iT;
  int iH;
  int iW;
  real *input_data;
  real *output_data;
  real *indices_data;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2,
    "4D or 5D (batch mode) tensor expected"
  );
  if (!THTensor_(isSameSizeAs)(input, indices))
  {
    THError("Invalid input size w.r.t current indices size");
  }
  if (input->nDimension == 5)
  {
    nbatch = input->size[0];
    dimt++;
    dimw++;
    dimh++;
  }
  /* sizes (the slice dimension is the one just before the time dimension) */
  nslices = input->size[dimt-1];
  iT = input->size[dimt];
  iH = input->size[dimh];
  iW = input->size[dimw];
  /* get contiguous input and indices (new references, released at the end) */
  input = THTensor_(newContiguous)(input);
  indices = THTensor_(newContiguous)(indices);
  /* resize output */
  if (input->nDimension == 4)
  {
    THTensor_(resize4d)(output, nslices, oT, oH, oW);
    /* zero first: the frame function only writes the targeted positions */
    THTensor_(zero)(output);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    indices_data = THTensor_(data)(indices);
    THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
      input_data, output_data,
      indices_data,
      nslices,
      iT, iW, iH,
      oT, oW, oH,
      dT, dW, dH, pT, pW, pH
    );
  }
  else
  {
    long p;
    THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
    /* zero first: the frame function only writes the targeted positions */
    THTensor_(zero)(output);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    indices_data = THTensor_(data)(indices);
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
        input_data+p*nslices*iT*iW*iH,
        output_data+p*nslices*oT*oW*oH,
        indices_data+p*nslices*iT*iW*iH,
        nslices,
        iT, iW, iH,
        oT, oW, oH,
        dT, dW, dH,
        pT, pW, pH
      );
    }
  }
  /* cleanup */
  THTensor_(free)(input);
  THTensor_(free)(indices);
 }
 /*
  * Back-propagate one frame of max unpooling: every gradInput element
  * receives the gradOutput value found at the output position recorded for
  * it in ind_p.
  *
  * Each element of ind_p holds a kernel-relative location packed as three
  * unsigned chars (t, h, w). The absolute output coordinate is that location
  * plus the window origin (index * stride - pad).
  *
  * An out-of-range coordinate is reported with THError. The error is raised
  * only after the parallel loop has finished, so that no thread leaves the
  * OpenMP region abnormally; elements with an invalid coordinate are skipped.
  */
 static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
          real *gradInput_p,
          real *gradOutput_p,
          real *ind_p,
          long nslices,
          long iT,
          long iW,
          long iH,
          long oT,
          long oW,
          long oH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  long k;
  /* first invalid coordinate seen by any thread, reported after the loop */
  int has_error = 0;
  long error_z = 0, error_y = 0, error_x = 0;
 #pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long ti, i, j, maxz, maxy, maxx;
    for (ti = 0; ti < iT; ti++)
    {
      for (i = 0; i < iH; i++)
      {
        for (j = 0; j < iW; j++)
        {
          /* origin of the pooling window this input element came from */
          long start_t = ti * dT - pT;
          long start_h = i * dH - pH;
          long start_w = j * dW - pW;
          real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
          real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
          maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
          maxy = ((unsigned char*)(ind_p_k))[1];
          maxx = ((unsigned char*)(ind_p_k))[2];
          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
          {
 #pragma omp critical
            {
              if (!has_error)
              {
                has_error = 1;
                error_z = start_t+maxz;
                error_y = start_h+maxy;
                error_x = start_w+maxx;
              }
            }
            /* never read through an invalid coordinate */
            continue;
          }
          *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
        }
      }
    }
  }
  if (has_error)
  {
    /* all arguments are long, hence %ld */
    THError(
      "invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
      error_z, error_y, error_x, oT, oW, oH
    );
  }
 }
 /*
  * Backward pass of volumetric max unpooling for a 4D tensor or a 5D batch.
  *
  * gradInput is resized like `input` and zeroed; every element then receives
  * the gradOutput value found at the output position recorded for it in
  * `indices` (see the frame function).
  *
  * Raises an error when input is not 4D/5D, when `indices` does not have the
  * size of `input`, or when the spatial extent of gradOutput differs from
  * (oT, oH, oW). All validation happens before any temporary tensor is
  * acquired, so an error does not leak the contiguous copies.
  *
  * NOTE(review): gradInput is accessed through its raw data pointer right
  * after being resized - presumably that leaves it contiguous.
  */
 void THNN_(VolumetricMaxUnpooling_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THTensor *indices,
          int oT,
          int oW,
          int oH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH)
 {
  /* dimension indices for the 4D layout; shifted by one for a 5D batch */
  int dimw = 3;
  int dimh = 2;
  int dimt = 1;
  int nbatch = 1;
  int nslices;
  int iT;
  int iH;
  int iW;
  real *gradInput_data;
  real *gradOutput_data;
  real *indices_data;
  /* same precondition as updateOutput; the size[] reads below rely on it */
  THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2,
    "4D or 5D (batch mode) tensor expected"
  );
  if (!THTensor_(isSameSizeAs)(input, indices))
  {
    THError("Invalid input size w.r.t current indices size");
  }
  if (input->nDimension == 5)
  {
    nbatch = input->size[0];
    dimt++;
    dimw++;
    dimh++;
  }
  /* sizes (the slice dimension is the one just before the time dimension) */
  nslices = input->size[dimt-1];
  iT = input->size[dimt];
  iH = input->size[dimh];
  iW = input->size[dimw];
  /* validate gradOutput before taking any new reference */
  if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
  {
    /* tensor sizes are long, hence %ld */
    THError(
      "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %ldx%ldx%ld",
      oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
    );
  }
  /* get contiguous gradOutput and indices (new references, released at the end) */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  indices = THTensor_(newContiguous)(indices);
  /* resize */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  indices_data = THTensor_(data)(indices);
  /* backprop */
  if (input->nDimension == 4)
  {
    THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
      gradInput_data, gradOutput_data,
      indices_data,
      nslices,
      iT, iW, iH,
      oT, oW, oH,
      dT, dW, dH,
      pT, pW, pH
    );
  }
  else
  {
    long p;
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
        gradInput_data+p*nslices*iT*iW*iH,
        gradOutput_data+p*nslices*oT*oW*oH,
        indices_data+p*nslices*iT*iW*iH,
        nslices,
        iT, iW, iH,
        oT, oW, oH,
        dT, dW, dH,
        pT, pW, pH
      );
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
  THTensor_(free)(indices);
 }
 #endif
--- a/torch/lib/THNN/generic/VolumetricReplicationPadding.c
+++ b/torch/lib/THNN/generic/VolumetricReplicationPadding.c
@ -0,0 +1,301 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
 #else
 /*
  * Replication-pads one sample.
  *
  * input_p  : nslices x idepth x iheight x iwidth, densely packed
  * output_p : nslices x odepth x oheight x owidth, densely packed
  *
  * Every output voxel copies the input voxel found by clamping each output
  * coordinate to the span covered by the input, so border voxels are repeated
  * into the padded region. Only the leading pads (pleft, ptop, pfront) enter
  * the coordinate mapping; pright, pbottom and pback are not read here because
  * the output extents are passed in explicitly.
  *
  * Slices are distributed over OpenMP threads.
  */
 static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
  real *input_p, real *output_p,
  long nslices,
  long iwidth, long iheight, long idepth,
  long owidth, long oheight, long odepth,
  int pleft, int pright,
  int ptop, int pbottom,
  int pfront, int pback)
 {
  /* A negative leading pad crops: reading starts inside the input (iStart > 0).
     A positive leading pad grows: the input lands inside the output (oStart > 0). */
  const int iStartX = (pleft  < 0) ? -pleft  : 0;
  const int iStartY = (ptop   < 0) ? -ptop   : 0;
  const int iStartZ = (pfront < 0) ? -pfront : 0;
  const int oStartX = (pleft  > 0) ? pleft   : 0;
  const int oStartY = (ptop   > 0) ? ptop    : 0;
  const int oStartZ = (pfront > 0) ? pfront  : 0;
  long slice;
 #pragma omp parallel for private(slice)
  for (slice = 0; slice < nslices; slice++) {
    real *srcSlice = input_p + slice * iwidth * iheight * idepth;
    real *dstSlice = output_p + slice * owidth * oheight * odepth;
    long oz, oy, ox;
    for (oz = 0; oz < odepth; oz++) {
      /* clamp the depth coordinate, then shift it into input space */
      const long sz = ((oz < pfront) ? pfront :
                       ((oz < idepth + pfront) ? oz : idepth + pfront - 1))
                      - oStartZ + iStartZ;
      for (oy = 0; oy < oheight; oy++) {
        /* same for the row coordinate */
        const long sy = ((oy < ptop) ? ptop :
                         ((oy < iheight + ptop) ? oy : iheight + ptop - 1))
                        - oStartY + iStartY;
        for (ox = 0; ox < owidth; ox++) {
          /* and for the column coordinate */
          const long sx = ((ox < pleft) ? pleft :
                           ((ox < iwidth + pleft) ? ox : iwidth + pleft - 1))
                          - oStartX + iStartX;
          dstSlice[oz * owidth * oheight + oy * owidth + ox] =
              srcSlice[sz * iwidth * iheight + sy * iwidth + sx];
        }
      }
    }
  }
 }
 /*
  * Forward pass of volumetric replication padding.
  *
  * state  : not referenced by this function.
  * input  : 4D (nslices x depth x height x width) or 5D (batch x nslices x
  *          depth x height x width) tensor; a contiguous copy is used internally.
  * output : resized here to the padded extents and filled by the frame kernel.
  * pleft/pright, ptop/pbottom, pfront/pback : amounts added to the width,
  *          height and depth extents respectively.
  *
  * Raises (through THArgCheck) when input is not 4D/5D or when any padded
  * extent would be smaller than 1.
  */
 void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
                                                       THTensor *input,
                                                       THTensor *output,
                                                       int pleft, int pright,
                                                       int ptop, int pbottom,
                                                       int pfront, int pback)
 {
  int dimw = 3;
  int dimh = 2;
  int dimd = 1;
  int dimslices = 0;
  long nbatch = 1;
  long nslices;
  long idepth;
  long iheight;
  long iwidth;
  long odepth;
  long oheight;
  long owidth;
  real *input_data;
  real *output_data;
  THArgCheck(input->nDimension == 4 || input->nDimension == 5,
             2, "input must be 4 or 5-dimensional");
  /* batched input: every spatial axis moves one position to the right */
  if (input->nDimension == 5)
  {
    nbatch = input->size[0];
    dimw++;
    dimh++;
    dimd++;
    dimslices++;
  }
  /* sizes */
  nslices = input->size[dimslices];
  idepth = input->size[dimd];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  odepth = idepth + pfront + pback;
  oheight = iheight + ptop + pbottom;
  owidth  = iwidth + pleft + pright;
  /* every padded extent must be non-empty; one valid axis is not enough */
  THArgCheck(owidth >= 1 && oheight >= 1 && odepth >= 1 , 2,
             "input is too small");
  /* get contiguous input (new reference, released at the end) */
  input = THTensor_(newContiguous)(input);
  /* resize output */
  if (input->nDimension == 4)
  {
    THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    THNN_(VolumetricReplicationPadding_updateOutput_frame)(
         input_data, output_data, nslices, iwidth, iheight, idepth,
         owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
         pback);
  }
  else
  {
    long p;
    THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);
    /* one frame call per batch element, each on its own input/output block */
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++)
    {
      THNN_(VolumetricReplicationPadding_updateOutput_frame)(
        input_data + p * nslices * iwidth * iheight * idepth,
        output_data + p * nslices * owidth * oheight * odepth,
        nslices,
        iwidth, iheight, idepth,
        owidth, oheight, odepth,
        pleft, pright,
        ptop, pbottom,
        pfront, pback);
    }
  }
  /* cleanup */
  THTensor_(free)(input);
 }
 /*
  * Backward counterpart of the replication-padding frame kernel for one sample.
  *
  * goutput_p : nslices x odepth x oheight x owidth, densely packed (read)
  * ginput_p  : nslices x idepth x iheight x iwidth, densely packed (accumulated)
  *
  * Each gradOutput voxel is added to the gradInput voxel obtained by clamping
  * its coordinates to the span covered by the input, so a border voxel collects
  * the gradients of every padded position that replicated it. ginput_p is only
  * accumulated into here, never cleared. pright, pbottom and pback are not read.
  *
  * Slices are distributed over OpenMP threads; each slice touches only its own
  * region of ginput_p.
  */
 static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
  real *ginput_p, real *goutput_p,
  long nslices,
  long iwidth, long iheight, long idepth,
  long owidth, long oheight, long odepth,
  int pleft, int pright,
  int ptop, int pbottom,
  int pfront, int pback)
 {
  /* iStart is non-zero for a negative (cropping) leading pad,
     oStart is non-zero for a positive (growing) leading pad. */
  const int iStartX = (pleft  < 0) ? -pleft  : 0;
  const int iStartY = (ptop   < 0) ? -ptop   : 0;
  const int iStartZ = (pfront < 0) ? -pfront : 0;
  const int oStartX = (pleft  > 0) ? pleft   : 0;
  const int oStartY = (ptop   > 0) ? ptop    : 0;
  const int oStartZ = (pfront > 0) ? pfront  : 0;
  long slice;
 #pragma omp parallel for private(slice)
  for (slice = 0; slice < nslices; slice++) {
    real *gradOutSlice = goutput_p + slice * owidth * oheight * odepth;
    real *gradInSlice = ginput_p + slice * iwidth * iheight * idepth;
    long oz, oy, ox;
    for (oz = 0; oz < odepth; oz++) {
      /* clamped depth coordinate expressed in input space */
      const long tz = ((oz < pfront) ? pfront :
                       ((oz < idepth + pfront) ? oz : idepth + pfront - 1))
                      - oStartZ + iStartZ;
      for (oy = 0; oy < oheight; oy++) {
        /* clamped row coordinate expressed in input space */
        const long ty = ((oy < ptop) ? ptop :
                         ((oy < iheight + ptop) ? oy : iheight + ptop - 1))
                        - oStartY + iStartY;
        for (ox = 0; ox < owidth; ox++) {
          /* clamped column coordinate expressed in input space */
          const long tx = ((ox < pleft) ? pleft :
                           ((ox < iwidth + pleft) ? ox : iwidth + pleft - 1))
                          - oStartX + iStartX;
          gradInSlice[tz * iwidth * iheight + ty * iwidth + tx] +=
              gradOutSlice[oz * owidth * oheight + oy * owidth + ox];
        }
      }
    }
  }
 }
 /*
  * Backward pass of volumetric replication padding.
  *
  * state      : not referenced by this function.
  * input      : the forward input, 4D (nslices x depth x height x width) or
  *              5D (batch first); only its sizes are read.
  * gradOutput : gradient w.r.t. the padded output; its width/height/depth must
  *              equal the input extents plus the corresponding pads. A
  *              contiguous copy is used internally.
  * gradInput  : resized like input, zeroed, then filled by accumulating
  *              gradOutput through the frame kernel.
  *
  * Raises (through THArgCheck) when input is not 4D/5D or when a gradOutput
  * extent does not match the expected padded size.
  */
 void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
                                                          THTensor *input,
                                                          THTensor *gradOutput,
                                                          THTensor *gradInput,
                                                          int pleft, int pright,
                                                          int ptop, int pbottom,
                                                          int pfront, int pback)
 {
  int dimw = 3;
  int dimh = 2;
  int dimd = 1;
  int dimslices = 0;
  long nbatch = 1;
  long nslices;
  long idepth;
  long iheight;
  long iwidth;
  long odepth;
  long oheight;
  long owidth;
  real *gradInput_data;
  real *gradOutput_data;
  /* same rank requirement as the forward pass; without it input->size[]
     would be indexed past nDimension for lower-rank tensors */
  THArgCheck(input->nDimension == 4 || input->nDimension == 5,
             2, "input must be 4 or 5-dimensional");
  /* batched input: every spatial axis moves one position to the right */
  if (input->nDimension == 5)
  {
    nbatch = input->size[0];
    dimw++;
    dimh++;
    dimd++;
    dimslices++;
  }
  /* sizes */
  nslices = input->size[dimslices];
  idepth = input->size[dimd];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  odepth = idepth + pfront + pback;
  oheight = iheight + ptop + pbottom;
  owidth  = iwidth + pleft + pright;
  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
                "gradOutput width unexpected");
  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
                "gradOutput height unexpected");
  THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
                "gradOutput depth unexpected");
  /* get contiguous gradOutput (new reference, released at the end) */
  gradOutput = THTensor_(newContiguous)(gradOutput);
  /* resize; the frame kernel accumulates, so start from zero */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);
  /* fetch raw pointers once instead of once per batch element */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  /* backprop */
  if (input->nDimension == 4) {
    THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
      gradInput_data,
      gradOutput_data,
      nslices,
      iwidth, iheight, idepth,
      owidth, oheight, odepth,
      pleft, pright,
      ptop, pbottom,
      pfront, pback);
  } else {
    long p;
    /* one frame call per batch element, each on its own block */
 #pragma omp parallel for private(p)
    for (p = 0; p < nbatch; p++) {
      THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
        gradInput_data + p * nslices * idepth * iheight * iwidth,
        gradOutput_data + p * nslices * odepth * oheight * owidth,
        nslices,
        iwidth, iheight, idepth,
        owidth, oheight, odepth,
        pleft, pright,
        ptop, pbottom,
        pfront, pback);
    }
  }
  /* cleanup */
  THTensor_(free)(gradOutput);
 }
 #endif
--- a/torch/lib/THNN/generic/unfold.c
+++ b/torch/lib/THNN/generic/unfold.c
@ -0,0 +1,158 @@
 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/unfold.c"
 #else
 #ifdef _WIN32
 # include <windows.h>
 #endif
 /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
 /*
  * Accumulates the unfolded buffer back into the image (the reverse of
  * unfolded_copy): every finput element is added to the input location it
  * would have been copied from; elements that fall in the padding are skipped.
  *
  * finput : read as [nInputPlane][kH][kW][outputHeight][outputWidth], dense.
  * input  : accumulated into as [nInputPlane][inputHeight][inputWidth], dense;
  *          it is not cleared here.
  * kW,kH / dW,dH / padW,padH : kernel size, stride and zero padding.
  *
  * Parallelised over input planes only, since different (kh, kw) offsets of the
  * same plane write to overlapping input locations.
  *
  * All coordinates are kept in signed types so that positions inside the
  * padding are genuinely negative, and plane offsets are computed in size_t so
  * that large buffers do not overflow int.
  */
 void THNN_(unfolded_acc)(
          THTensor *finput,
          THTensor *input,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int nInputPlane,
          int inputWidth,
          int inputHeight,
          int outputWidth,
          int outputHeight)
 {
 #ifdef _WIN32
  LONG_PTR nip;
 #else
  size_t nip;
 #endif
  const size_t outputArea = (size_t)outputHeight * outputWidth;
  const size_t inputArea = (size_t)inputHeight * inputWidth;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);
 #pragma omp parallel for private(nip)
  for(nip = 0; nip < nInputPlane; nip++)
  {
    int kw, kh, y, x;
    int lpad, rpad, span;
    long long ix, iy;
    for(kh = 0; kh < kH; kh++)
    {
      for(kw = 0; kw < kW; kw++)
      {
        real *src = finput_data + (((size_t)nip*kH + kh)*kW + kw)*outputArea;
        real *dst = input_data + (size_t)nip*inputArea;
        if (padW > 0 || padH > 0) {
          for(y = 0; y < outputHeight; y++) {
            iy = (long long)y*dH - padH + kh;
            if (iy < 0 || iy >= inputHeight)
              continue; /* whole row lies in the vertical padding */
            if (dW==1){
              /* contiguous row: skip the lpad/rpad columns that fall into
                 the horizontal padding and add the rest in one call */
              lpad = (padW > kw) ? padW - kw : 0;
              rpad = (padW > kW-kw-1) ? padW - (kW-kw-1) : 0;
              span = outputWidth - lpad - rpad;
              ix = (long long)kw - padW;
              if (span > 0)
                THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+((size_t)y*outputWidth+lpad), 1, span); /* note: THVector_add could handle 1 value better */
            }
            else{
              for (x=0; x<outputWidth; x++){
                ix = (long long)x*dW - padW + kw;
                if (ix >= 0 && ix < inputWidth)
                  THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+((size_t)y*outputWidth+x), 1, 1);
              }
            }
          }
        } else {
          for(y = 0; y < outputHeight; y++) {
            iy = (long long)y*dH + kh;
            ix = (long long)kw;
            if (dW == 1 )
              THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
            else{
              for(x = 0; x < outputWidth; x++)
                THVector_(add)(dst+(size_t)(iy*inputWidth+ix+(long long)x*dW), src+((size_t)y*outputWidth+x), 1, 1);
            }
          }
        }
      }
    }
  }
 }
 /*
  * Unfolds the image into a buffer of shifted copies: for every input plane
  * and kernel offset (kh, kw), writes the outputHeight x outputWidth grid of
  * input values seen at that offset, with zeros where the position falls into
  * the padding.
  *
  * finput : written as [nInputPlane][kH][kW][outputHeight][outputWidth], dense.
  * input  : read as [nInputPlane][inputHeight][inputWidth], dense.
  * kW,kH / dW,dH / padW,padH : kernel size, stride and zero padding.
  *
  * Parallelised over all (plane, kh, kw) combinations; each one writes its own
  * disjoint outputHeight x outputWidth block of finput.
  *
  * Coordinates and pad widths are signed on purpose: with unsigned types the
  * "nothing left to copy" test (outputWidth - rpad - lpad <= 0) could never
  * detect a negative span and memcpy would receive a wrapped, huge length.
  */
 void THNN_(unfolded_copy)(
          THTensor *finput,
          THTensor *input,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int nInputPlane,
          int inputWidth,
          int inputHeight,
          int outputWidth,
          int outputHeight)
 {
  long k;
  const size_t outputArea = (size_t)outputHeight * outputWidth;
  const size_t inputArea = (size_t)inputHeight * inputWidth;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);
 #pragma omp parallel for private(k)
  for(k = 0; k < nInputPlane*kH*kW; k++) {
    /* decompose the flat index: k == (nip*kH + kh)*kW + kw */
    int nip = (int)(k / (kH*kW));
    int rest = (int)(k % (kH*kW));
    int kh = rest / kW;
    int kw = rest % kW;
    int x,y;
    int lpad,rpad,ncopy;
    long long ix,iy;
    real *row;
    real *dst = finput_data + (size_t)k*outputArea;
    real *src = input_data + (size_t)nip*inputArea;
    if (padW > 0 || padH > 0) {
      for(y = 0; y < outputHeight; y++) {
        row = dst + (size_t)y*outputWidth;
        iy = (long long)y*dH - padH + kh;
        if (iy < 0 || iy >= inputHeight) {
          /* whole row lies in the vertical padding */
          memset(row, 0, sizeof(real)*outputWidth);
        } else {
          if (dW==1){
            /* contiguous row: lpad/rpad leading/trailing columns fall into
               the horizontal padding, the middle is one straight copy */
            ix = (long long)kw - padW;
            lpad = (padW > kw) ? padW - kw : 0;
            rpad = (padW > kW-kw-1) ? padW - (kW-kw-1) : 0;
            ncopy = outputWidth - rpad - lpad;
            if (ncopy <= 0) {
              memset(row, 0, sizeof(real)*outputWidth);
            } else {
              if (lpad > 0) memset(row, 0, sizeof(real)*lpad);
              memcpy(row+lpad, src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*ncopy);
              if (rpad > 0) memset(row + outputWidth - rpad, 0, sizeof(real)*rpad);
            }
          }
          else{
            for (x=0; x<outputWidth; x++){
              ix = (long long)x*dW - padW + kw;
              if (ix < 0 || ix >= inputWidth)
                memset(row+x, 0, sizeof(real)*1);
              else
                memcpy(row+x, src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1));
            }
          }
        }
      }
    } else {
      for(y = 0; y < outputHeight; y++) {
        row = dst + (size_t)y*outputWidth;
        iy = (long long)y*dH + kh;
        ix = (long long)kw;
        if (dW == 1)
          memcpy(row, src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth);
        else{
          for (x=0; x<outputWidth; x++)
            memcpy(row+x, src+(size_t)(iy*inputWidth+ix+(long long)x*dW), sizeof(real)*(1));
        }
      }
    }
  }
 }
 #endif
--- a/torch/lib/THNN/init.c
+++ b/torch/lib/THNN/init.c
@ -0,0 +1,182 @@
 #include "TH.h"
 #include "THNN.h"
 #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
 #define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
 #include "generic/Abs.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/AbsCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/ClassNLLCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialClassNLLCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/DistKLDivCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/ELU.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/HardShrink.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/HardTanh.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/L1Cost.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/LeakyReLU.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/LogSigmoid.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/LogSoftMax.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/LookupTable.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/MSECriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/MarginCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SoftMarginCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/MultiLabelMarginCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/MultiMarginCriterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/PReLU.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/RReLU.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/Sigmoid.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SmoothL1Criterion.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SoftMax.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SoftPlus.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SoftShrink.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SparseLinear.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/Sqrt.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/Square.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/Tanh.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/Threshold.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/TemporalConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/TemporalSubSampling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/TemporalMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/BatchNormalization.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/unfold.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialConvolutionMap.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialConvolutionMM.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialConvolutionLocal.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialFullConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialFullConvolutionMap.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialDilatedConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialAdaptiveMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialAveragePooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialFractionalMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialSubSampling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialUpSamplingNearest.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialUpSamplingBilinear.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricAveragePooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricConvolutionMM.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricFullConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricDilatedConvolution.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialReflectionPadding.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/SpatialReplicationPadding.c"
 #include "THGenerateFloatTypes.h"
 #include "generic/VolumetricReplicationPadding.c"
 #include "THGenerateFloatTypes.h"