remove old docs
|
|
@ -50,7 +50,7 @@ Elaborating further:
|
|||
|
||||
If you use numpy, then you have used Tensors (a.k.a ndarray).
|
||||
|
||||

|
||||

|
||||
|
||||
PyTorch provides Tensors that can live either on the CPU or the GPU, and accelerate
|
||||
compute by a huge amount.
|
||||
|
|
|
|||
|
|
@ -1,534 +0,0 @@
|
|||
#! /usr/bin/env python
|
||||
# encoding: utf-8
|
||||
"""
|
||||
Very lightweight docstring to Markdown converter. Modified for use in pytorch
|
||||
|
||||
|
||||
### License
|
||||
|
||||
Copyright © 2013 Thomas Gläßle <t_glaessle@gmx.de>
|
||||
|
||||
This work is free. You can redistribute it and/or modify it under the
|
||||
terms of the Do What The Fuck You Want To Public License, Version 2, as
|
||||
published by Sam Hocevar. See the COPYING file for more details.
|
||||
|
||||
This program is free software. It comes without any warranty, to the
|
||||
extent permitted by applicable law.
|
||||
|
||||
|
||||
### Description
|
||||
|
||||
Little convenience tool to extract docstrings from a module or class and
|
||||
convert them to GitHub Flavoured Markdown:
|
||||
|
||||
https://help.github.com/articles/github-flavored-markdown
|
||||
|
||||
Its purpose is to quickly generate `README.md` files for small projects.
|
||||
|
||||
|
||||
### API
|
||||
|
||||
The interface consists of the following functions:
|
||||
|
||||
- `doctrim(docstring)`
|
||||
- `doc2md(docstring, title)`
|
||||
|
||||
You can run this script from the command line like:
|
||||
|
||||
$ doc2md.py [-a] [--no-toc] [-t title] module-name [class-name] > README.md
|
||||
|
||||
|
||||
### Limitations
|
||||
|
||||
At the moment this is suited only for a very specific use case. It is
|
||||
hardly forseeable, if I will decide to improve on it in the near future.
|
||||
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import inspect
|
||||
|
||||
__all__ = ['doctrim', 'doc2md']
|
||||
|
||||
doctrim = inspect.cleandoc
|
||||
|
||||
def unindent(lines):
    """
    Strip the common leading whitespace from every line in *lines*.

    Unlike ``doctrim`` the first line receives no special treatment.
    The input is returned unchanged when it has no non-empty lines.
    """
    margins = [len(text) - len(text.lstrip()) for text in lines if text]
    if not margins:
        return lines
    margin = min(margins)
    return [text[margin:] for text in lines]
|
||||
|
||||
def escape_markdown(line):
    """
    Backslash-escape Markdown control characters in *line*.

    The backslash itself is escaped FIRST: the previous implementation ran
    the ``'\\' -> '\\\\'`` replacement last, which doubled the backslash of
    every escape it had just inserted (``[`` became ``\\\\[`` instead of
    ``\\[``).  The remaining characters are order-independent.
    """
    line = line.replace('\\', '\\\\')
    for ch in '[](){}`*_#+-.!':
        line = line.replace(ch, '\\' + ch)
    return line
|
||||
|
||||
def code_block(lines, language=''):
    """
    Wrap *lines* in a fenced code block tagged with *language*
    so it is syntax highlighted.
    """
    fence = '```'
    return [fence + language, *lines, fence]
|
||||
|
||||
def doctest2md(lines):
    """
    Convert a doctest fragment into plain code for a highlighted block.

    When every (unindented) line is a prompt or continuation line, the
    four character ``>>> `` / ``... `` prefixes are stripped; otherwise
    the unindented lines are returned as they are.
    """
    stripped = unindent(lines)
    prompts = ('>>> ', '... ')
    pure_doctest = all(
        text.startswith(prompts) or text in ('>>>', '...')
        for text in stripped
    )
    if pure_doctest:
        # Bare ">>>"/"..." lines become empty strings, as before.
        return [text[4:] for text in stripped]
    return stripped
|
||||
|
||||
def doc_code_block(lines, language):
    """
    Render *lines* as a fenced block, stripping doctest prompts first
    when the block is Python.
    """
    body = doctest2md(lines) if language == 'python' else lines
    return code_block(body, language)
|
||||
|
||||
# Matches a docstring section header of the form "Args:".
# Raw string: the previous '^\s*...' literal relied on Python passing the
# invalid \s escape through, which is a DeprecationWarning/SyntaxWarning
# on modern interpreters.
_args_section = re.compile(r'^\s*Args:\s*')

def is_args_check(line):
    """Return a match object when *line* opens an ``Args:`` section."""
    return _args_section.match(line)
|
||||
|
||||
def args_block(lines):
    """
    Render the body of an ``Args:`` section as a Markdown table.

    Each line must look like ``name: description`` with an optional
    trailing ``Default: value``.  Returns the table as a list of lines.
    """
    out = [
        '',
        'Parameter | Default | Description',
        '--------- | ------- | -----------',
    ]
    for line in lines:
        matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
        # findall never returns None, so the old `assert matches != None`
        # could not fire; assert non-emptiness instead so a malformed
        # line fails with a clear message rather than an IndexError.
        assert matches, line
        name, description, _, default = matches[0]
        out.append(name + ' | ' + default + ' | ' + description)
    return out
|
||||
|
||||
# Inputs
# Matches a docstring section header of the form "Inputs: ...".
_inputs_section = re.compile(r'^\s*Inputs:\s*(.*)\s*')

def is_inputs_check(line):
    """Return a match object when *line* opens an ``Inputs:`` section."""
    return _inputs_section.match(line)
|
||||
|
||||
def inputs_block(lines):
    """
    Render the body of an ``Inputs:`` section as a Markdown table.

    NOTE(review): this duplicates ``args_block`` line for line; it is kept
    as a separate function to preserve the module's public surface.
    """
    out = [
        '',
        'Parameter | Default | Description',
        '--------- | ------- | -----------',
    ]
    for line in lines:
        matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
        # findall never returns None; assert the line actually parsed.
        assert matches, line
        name, description, _, default = matches[0]
        out.append(name + ' | ' + default + ' | ' + description)
    return out
|
||||
|
||||
# Outputs
# Matches a docstring section header of the form "Outputs: ...".
_outputs_section = re.compile(r'^\s*Outputs:\s*(.*)\s*')

def is_outputs_check(line):
    """Return a match object when *line* opens an ``Outputs:`` section."""
    return _outputs_section.match(line)
|
||||
|
||||
def outputs_block(lines):
    """
    Render the body of an ``Outputs:`` section as a two column Markdown
    table (any ``Default:`` clause in the line is parsed but not shown).
    """
    out = [
        '',
        'Parameter | Description',
        '--------- | -----------',
    ]
    for line in lines:
        matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
        # findall never returns None; assert the line actually parsed.
        assert matches, line
        name, description = matches[0][0], matches[0][1]
        out.append(name + ' | ' + description)
    return out
|
||||
|
||||
# Members
# Matches a docstring section header of the form "Members: ...".
_members_section = re.compile(r'^\s*Members:\s*(.*)\s*')

def is_members_check(line):
    """Return a match object when *line* opens a ``Members:`` section."""
    return _members_section.match(line)
|
||||
|
||||
def members_block(lines):
    """
    Render the body of a ``Members:`` section as a two column Markdown
    table.

    NOTE(review): this duplicates ``outputs_block``; kept separate to
    preserve the module's public surface.
    """
    out = [
        '',
        'Parameter | Description',
        '--------- | -----------',
    ]
    for line in lines:
        matches = re.findall(r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
        # findall never returns None; assert the line actually parsed.
        assert matches, line
        name, description = matches[0][0], matches[0][1]
        out.append(name + ' | ' + description)
    return out
|
||||
|
||||
# Section-header predicates: each regex recognizes the line that opens
# the corresponding docstring section.  All converted to raw strings
# (the old non-raw '\s' escapes are deprecated).
_returns_section = re.compile(r'^\s*Returns:\s*')
def is_returns_check(line):
    """Return a match when *line* opens a ``Returns:`` section."""
    return _returns_section.match(line)

_image_section = re.compile(r'^\s*Image:\s*')
def is_image_check(line):
    """Return a match when *line* is an ``Image:`` directive."""
    return _image_section.match(line)

# NOTE(review): the three patterns below also match "Returns:" -- this
# looks like a copy/paste left-over.  It is harmless today only because
# is_returns_check() is consulted before these in _doc2md(); the
# alternatives are preserved verbatim to keep behavior identical.
_example_section = re.compile(r'^\s*Returns:\s*|^\s*Examples:\s*')
def is_example_check(line):
    """Return a match when *line* opens an ``Examples:`` section."""
    return _example_section.match(line)

_inputshape_section = re.compile(r'^\s*Returns:\s*|^\s*Input Shape:\s*')
def is_inputshape_check(line):
    """Return a match when *line* is an ``Input Shape:`` directive."""
    return _inputshape_section.match(line)

_outputshape_section = re.compile(r'^\s*Returns:\s*|^\s*Output Shape:\s*')
def is_outputshape_check(line):
    """Return a match when *line* is an ``Output Shape:`` directive."""
    return _outputshape_section.match(line)
|
||||
###############################################
# Markdown heading helpers.

_reg_section = re.compile(r'^#+ ')


def is_heading(line):
    """Return a match object when *line* is a Markdown ATX heading."""
    return _reg_section.match(line)


def get_heading(line):
    """Split a heading line into a ``(level, title)`` pair."""
    assert is_heading(line)
    hashes, _, title = line.partition(' ')
    return len(hashes), title


def make_heading(level, title):
    """Build a heading of at least level 1 for *title*."""
    return '{} {}'.format('#' * max(level, 1), title)
|
||||
|
||||
def find_sections(lines):
    """
    Collect the ``(level, title)`` pair of every heading in *lines*,
    in document order.
    """
    return [get_heading(text) for text in lines if is_heading(text)]
|
||||
|
||||
def make_toc(sections):
    """
    Build a Markdown table of contents for ``(level, title)`` pairs.

    Indentation is relative to the shallowest level present; anchors are
    lower-cased with spaces replaced by dashes and '?' removed.
    """
    if not sections:
        return []
    top = min(level for level, _ in sections)
    entries = []
    for level, title in sections:
        anchor = title.lower().replace(' ', '-').replace('?', '')
        entries.append('{}- [{}](#{})'.format(' ' * (level - top), title, anchor))
    return entries
|
||||
|
||||
def _doc2md(lines, shiftlevel=0):
    """
    Core docstring-to-Markdown state machine.

    Scans *lines* once, recognizing the PyTorch docstring sections
    (Args / Inputs / Outputs / Members / Returns / Input Shape /
    Output Shape / Image, doctests and fenced code blocks) and emits
    Markdown.  State lives in attributes on the function object itself;
    section bodies accumulate in local lists that the nested reset()
    closure flushes into the output.  Headings are shifted down by
    *shiftlevel* levels.  Returns the rendered lines, code output first.

    NOTE(review): not re-entrant (function-attribute state) -- presumably
    only called sequentially from doc2md()/mod2md().
    """
    _doc2md.md = []
    _doc2md.is_code = False
    _doc2md.is_code_block = False
    _doc2md.is_args = False
    _doc2md.is_inputs = False
    _doc2md.is_outputs = False
    _doc2md.is_members = False
    _doc2md.is_returns = False
    _doc2md.is_inputshape = False
    _doc2md.is_outputshape = False
    _doc2md.code = []
    def reset():
        # Flush whichever section is currently open into the output and
        # clear all state flags.  The body lists (code, args, ...) are
        # read from the enclosing loop's scope via closure.
        if _doc2md.is_code:
            _doc2md.is_code = False
            _doc2md.code += doc_code_block(code, 'python')
            _doc2md.code += ['']
        if _doc2md.is_code_block:
            _doc2md.is_code_block = False
            # NOTE(review): the local `code_block` list shadows the
            # module-level code_block() function inside this scope.
            _doc2md.code += doc_code_block(code_block, 'python')
            _doc2md.code += ['']

        if _doc2md.is_args:
            _doc2md.is_args = False
            _doc2md.md += args_block(args)

        if _doc2md.is_inputs:
            _doc2md.is_inputs = False
            _doc2md.md += inputs_block(inputs)

        if _doc2md.is_outputs:
            _doc2md.is_outputs = False
            _doc2md.md += outputs_block(outputs)

        if _doc2md.is_members:
            _doc2md.is_members = False
            _doc2md.md += members_block(members)

        if _doc2md.is_returns:
            _doc2md.is_returns = False
            _doc2md.md += returns

        _doc2md.is_inputshape = False
        _doc2md.is_outputshape = False

    for line in lines:
        trimmed = line.lstrip()
        if is_args_check(line):
            reset()
            _doc2md.is_args = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Constructor Arguments']
            args = []
        elif is_inputs_check(line):
            reset()
            _doc2md.is_inputs = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Inputs']
            inputs = []
        elif is_outputs_check(line):
            reset()
            _doc2md.is_outputs = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Outputs']
            outputs = []
        elif is_members_check(line):
            reset()
            _doc2md.is_members = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Members']
            members = []
        elif is_returns_check(line):
            reset()
            _doc2md.is_returns = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Returns']
            returns = []
        elif is_example_check(line):
            # Examples: header itself is dropped; the doctest that follows
            # is picked up by the '>>> ' branch below.
            reset()
        elif is_inputshape_check(line):
            reset()
            # Remember the input shape; the table is emitted only when the
            # matching "Output Shape:" line arrives.
            inputshape = re.findall(r'\s*Input\sShape:\s*(.*)\s*:\s*(.*)\s*$', line)[0]
        elif is_outputshape_check(line):
            reset()
            outputshape = re.findall(r'\s*Output\sShape:\s*(.*)\s*:\s*(.*)\s*$', line)[0]
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Expected Shape']
            _doc2md.md += [' | Shape | Description ']
            _doc2md.md += ['------ | ----- | ------------']
            _doc2md.md += [' input | ' + inputshape[0] + ' | ' + inputshape[1]]
            _doc2md.md += ['output | ' + outputshape[0] + ' | ' + outputshape[1]]
        elif is_image_check(line):
            reset()
            _doc2md.md += ['']
            filename = re.findall(r'\s*Image:\s*(.*?)\s*$', line)
            _doc2md.md += ['<img src="image/' + filename[0] + '" >']
        elif _doc2md.is_code == False and trimmed.startswith('>>> '):
            # First doctest prompt opens a code section.
            reset()
            _doc2md.is_code = True
            code = [line]
        elif _doc2md.is_code_block == False and trimmed.startswith('```'):
            # Opening fence of an explicit code block.
            reset()
            _doc2md.is_code_block = True
            code_block = []
        elif _doc2md.is_code_block == True and trimmed.startswith('```'):
            # end of code block
            reset()
        elif _doc2md.is_code_block:
            if line:
                code_block.append(line)
            else:
                reset()
        elif shiftlevel != 0 and is_heading(line):
            reset()
            level, title = get_heading(line)
            _doc2md.md += [make_heading(level + shiftlevel, title)]
        elif _doc2md.is_args:
            if line:
                args.append(line)
            else:
                reset()
        elif _doc2md.is_inputs:
            if line:
                inputs.append(line)
            else:
                reset()
        elif _doc2md.is_outputs:
            if line:
                outputs.append(line)
            else:
                reset()
        elif _doc2md.is_members:
            if line:
                members.append(line)
            else:
                reset()
        elif _doc2md.is_returns:
            if line:
                returns.append(line)
            else:
                reset()
        elif _doc2md.is_code:
            if line:
                code.append(line)
            else:
                reset()
        else:
            # Plain prose line: flush any open section, pass it through.
            reset()
            _doc2md.md += [line]
    reset()
    _doc2md.code += _doc2md.md
    return _doc2md.code
|
||||
|
||||
def doc2md(docstr, title, min_level=3, more_info=False, toc=True):
    """
    Convert a docstring to a markdown text.

    :param docstr: raw docstring to convert
    :param title: heading used for the generated document
    :param min_level: minimum markdown heading level for the title
    :param more_info: when True return ``(lines, sections)`` instead of
                      the joined text
    :param toc: include a table of contents built from the headings
    """
    text = doctrim(docstr)
    lines = text.split('\n')

    sections = find_sections(lines)
    if sections:
        level = min(n for n,t in sections) - 1
    else:
        level = 1

    # Shift all headings down so the document title lands at min_level.
    shiftlevel = 0
    if level < min_level:
        shiftlevel = min_level - level
        level = min_level
        sections = [(lev+shiftlevel, tit) for lev,tit in sections]

    # Headline: title, blank line, first docstring line (short summary).
    md = [
        make_heading(level, title),
        "",
        lines.pop(0),
        ""
    ]
    if toc:
        md += make_toc(sections)
    md += _doc2md(lines, shiftlevel)
    if more_info:
        return (md, sections)
    else:
        return "\n".join(md)
|
||||
|
||||
def mod2md(module, title, title_api_section, toc=True):
    """
    Generate markdown document from module, including API section.

    :param module: imported module object whose docstring is rendered
    :param title: document title
    :param title_api_section: truthy to also render the docstring of every
                              public, documented member of the module
    :param toc: include tables of contents
    """
    docstr = module.__doc__ or " "

    text = doctrim(docstr)
    lines = text.split('\n')

    sections = find_sections(lines)
    if sections:
        level = min(n for n,t in sections) - 1
    else:
        level = 1

    api_md = []
    api_sec = []
    if title_api_section :
        # sections.append((level+1, title_api_section))
        # Render every public (no leading underscore), documented member.
        for name, entry in iter(module.__dict__.items()):
            if name[0] != '_' and entry.__doc__:
                #api_sec.append((level+1, name))
                #api_md += ['', '']
                if entry.__doc__:
                    md, sec = doc2md(entry.__doc__, name,
                            min_level=level+1, more_info=True, toc=False)
                    api_sec += sec
                    api_md += md

    sections += api_sec

    # headline
    md = [
        make_heading(level, title),
        "",
        lines.pop(0),
        ""
    ]

    # main sections
    if toc:
        md += make_toc(sections)
    md += _doc2md(lines)

    # API section TOC and bodies.
    if toc:
        md += ['']
        md += make_toc(api_sec)
    md += api_md

    return "\n".join(md)
|
||||
|
||||
def main(args=None):
    """
    Command line entry point.

    Parses the arguments, imports the requested module and prints its
    Markdown documentation (whole module with API section, or a single
    entry) on stdout.

    :param args: argument list for argparse; None means sys.argv
    """
    # parse the program arguments
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert docstrings to markdown.')

    parser.add_argument(
        'module', help='The module containing the docstring.')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        'entry', nargs='?',
        help='Convert only docstring of this entry in module.')
    group.add_argument(
        '-a', '--all', dest='all', action='store_true',
        help='Create an API section with the contents of module.__all__.')
    parser.add_argument(
        '-t', '--title', dest='title',
        help='Document title (default is module name)')
    parser.add_argument(
        '--no-toc', dest='toc', action='store_false', default=True,
        help='Do not automatically generate the TOC')
    args = parser.parse_args(args)

    import importlib
    import inspect
    import os

    def add_path(*pathes):
        # Prepend the given directories to sys.path, newest first,
        # skipping entries that are already present.
        for path in reversed(pathes):
            if path not in sys.path:
                sys.path.insert(0, path)

    # Make this script's directory and the CWD importable so the target
    # module can be found.
    file = inspect.getfile(inspect.currentframe())
    add_path(os.path.realpath(os.path.abspath(os.path.dirname(file))))
    add_path(os.getcwd())

    mod_name = args.module
    if mod_name.endswith('.py'):
        mod_name = mod_name.rsplit('.py', 1)[0]
    title = args.title or mod_name.replace('_', '-')

    module = importlib.import_module(mod_name)

    if args.all:
        print(mod2md(module, title, 'API', toc=args.toc))

    else:
        if args.entry:
            docstr = module.__dict__[args.entry].__doc__ or ''
        else:
            docstr = module.__doc__ or ''

        print(doc2md(docstr, title, toc=args.toc))
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
# Build the torch.nn Markdown reference pages by extracting docstrings
# with doc2md.py.  Output files are written to the parent directory.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quoted: the old unquoted `pushd $SCRIPT_DIR` broke on paths with spaces.
pushd "$SCRIPT_DIR"

# gen_docs <outfile> <heading> <ClassName>...
# Writes "<heading>" to <outfile>, then appends the generated docs of every
# listed torch.nn class.  Replaces the repeated
#   echo X | xargs -I {} python doc2md.py torch.nn {} --title {} --no-toc
# pipelines with a single loop.
gen_docs() {
    local outfile=$1 heading=$2 cls
    shift 2
    echo "$heading" > "$outfile"
    for cls in "$@"; do
        python doc2md.py torch.nn "$cls" --title "$cls" --no-toc >> "$outfile"
    done
}

# module
#python doc2md.py torch.nn Module --title Module --no-toc >../nn_module.md

# containers
gen_docs ../nn_container.md "## Containers" Container Sequential

# convolution
gen_docs ../nn_convolution.md "## Convolution Layers" \
    Conv1d Conv2d ConvTranspose2d Conv3d ConvTranspose3d

# pooling
gen_docs ../nn_pooling.md "## Pooling Layers" \
    MaxPool1d MaxPool2d MaxPool3d MaxUnpool2d MaxUnpool3d \
    AvgPool2d AvgPool3d FractionalMaxPool2d LPPool2d

# activations
gen_docs ../nn_activation.md "## Non-linearities" \
    ReLU ReLU6 Threshold Hardtanh Sigmoid Tanh ELU LeakyReLU LogSigmoid \
    Softplus Softshrink PReLU Softsign Tanhshrink Softmin Softmax \
    Softmax2d LogSoftmax

# normalization
gen_docs ../nn_normalization.md "## Normalization layers" \
    BatchNorm1d BatchNorm2d BatchNorm3d

# recurrentnet
gen_docs ../nn_recurrent.md "## Recurrent layers" \
    RNN LSTM GRU RNNCell LSTMCell GRUCell

# linear
gen_docs ../nn_linear.md "## Linear layers" Linear

# dropout
gen_docs ../nn_dropout.md "## Dropout layers" Dropout Dropout2d Dropout3d

# Sparse
gen_docs ../nn_sparse.md "## Sparse layers" Embedding

# loss_functions
gen_docs ../nn_loss.md "## Loss functions" \
    L1Loss MSELoss CrossEntropyLoss NLLLoss NLLLoss2d KLDivLoss BCELoss \
    MarginRankingLoss HingeEmbeddingLoss MultiLabelMarginLoss SmoothL1Loss \
    SoftMarginLoss MultiLabelSoftMarginLoss CosineEmbeddingLoss \
    MultiMarginLoss

popd
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
import sys
|
||||
from tools.cwrap import cwrap
|
||||
from tools.cwrap.plugins import CWrapPlugin
|
||||
from string import Template
|
||||
import sys
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
|
||||
def transform_defined_if(defined_if):
    """
    Translate a cwrap ``defined_if`` preprocessor expression into the
    human-readable type list shown in the docs table.

    ``None`` means the declaration is unconditional and maps to
    "All Types (CPU and CUDA)".  ``||`` separators are rendered as ``//``.
    """
    if defined_if is None:
        # The original also ran the '||' replacement on this constant;
        # it contains none, so returning directly is equivalent.
        return "All Types (CPU and CUDA)"
    # Token rewrites, applied in sequence ('||' last so the separators
    # inserted by NUMPY_TYPE_ENUM are untouched -- they are already '//').
    replacements = (
        ('defined(TH_REAL_IS_FLOAT)', 'Float'),
        ('defined(TH_REAL_IS_DOUBLE)', 'Double'),
        ('defined(TH_REAL_IS_BYTE)', 'Byte'),
        ('defined(TH_REAL_IS_CHAR)', 'Char'),
        ('defined(TH_REAL_IS_INT)', 'Int'),
        ('defined(TH_REAL_IS_LONG)', 'Long'),
        ('defined(NUMPY_TYPE_ENUM)',
         'Byte // Short // Int // Long // Float // Double'),
        ('CUDA_INT', 'Cuda_Int'),
        ('CUDA_LONG', 'Cuda_Long'),
        ('CUDA_FLOAT', 'Cuda_Float'),
        ('CUDA_DOUBLE', 'Cuda_Double'),
        ('CUDA_HALF', 'Cuda_Half'),
        ('!IS_CUDA', 'All CPU Types'),
        ('||', '//'),
    )
    for old, new in replacements:
        defined_if = defined_if.replace(old, new)
    return defined_if
|
||||
|
||||
class DocGen(CWrapPlugin):
    """
    cwrap plugin that collects the TensorMethods declarations and prints a
    Markdown reference for torch.Tensor instead of generating wrapper code.

    All get_* hooks return empty templates so no C code is emitted; the
    plugin is only used for its view of the parsed declarations.
    """

    def __init__(self):
        # name -> declaration dict, filled by process_declarations().
        self.declarations = {}

    def process_declarations(self, declarations):
        # Index declarations by name; later duplicates overwrite earlier
        # ones, which is fine for documentation purposes.
        self.declarations.update({declaration['name']: declaration for declaration in declarations})
        # self.declarations += declarations
        return declarations

    def get_wrapper_template(self, declaration):
        # Documentation only: no wrapper code is generated.
        return Template("")

    def get_type_check(self, arg, option):
        return Template("")

    def get_type_unpack(self, arg, option):
        return Template("")

    def get_return_wrapper(self, option):
        return Template("")

    def print_declarations(self):
        """Print the collected declarations as Markdown on stdout."""
        print("# torch.Tensor")
        for name, declarations in sorted(self.declarations.items()):
            # Skip the in-place variant when its out-of-place twin exists;
            # both are folded into a single "name // name_" table row.
            if name.endswith('_') and name[:-1] in self.declarations:
                continue
            if not name.endswith('_') and name + '_' in self.declarations:
                inplace = True
            else:
                inplace = False

            # Prefer the python-facing name; private names are skipped.
            pname = declarations['options'][0].get('python_name', None)
            if pname != None:
                name = pname
            if name.startswith('_'):
                continue

            # START PRINTING MARKDOWN
            print("## " + name + " \n")
            print("| %-25s | %-8s | %-25s |" % ("Name", "Autograd", "defined if"))
            print("| " + ('-' * 28) + " | " + ('-' * 11) + " | "+ ('-' * 28) + " |")
            if inplace:
                sys.stdout.write("| %-25s" % (name + ' // ' + name + "_"))
            else:
                sys.stdout.write("| %-25s" % name)
            sys.stdout.write(' | ')
            # "Autograd: yes" iff a Variable exposes a method of this name.
            if hasattr(Variable(torch.randn(10)), name):
                sys.stdout.write(' %9s ' % 'yes') # + ' ' + name)
            else:
                sys.stdout.write(' %9s ' % 'no') # + ' ' + name)
            defined_if = declarations.get('defined_if', None)
            defined_if = transform_defined_if(defined_if)
            sys.stdout.write(' | ')
            sys.stdout.write(defined_if)
            sys.stdout.write(' |')
            sys.stdout.write('\n\n')
            #if inplace:
            #    print('Inplace Exists : True')
            #sys.stdout.write('Arguments : ')

            args = declarations['options'][0]['arguments']
            if len(args) == 0:
                print( '**No Arguments**\n' )
            else:
                print( '**Arguments**\n' )
                print("| %-15s | %-12s | %-15s |" % ("Name", "Type", "Default"))
                print("| " + ('-' * 18) + " | " + ('-' * 15) + " | "+ ('-' * 18) + " |")

                for arg in args:
                    # Map internal TH types to their Python-level names;
                    # generator arguments are not user-facing, skip them.
                    type_ = arg['type']
                    if type_ == 'THGenerator*':
                        continue
                    if type_ == 'THTensor*':
                        type_ = 'Tensor'
                    if type_ == 'THIndexTensor*':
                        type_ = 'LongTensor'
                    if type_ == 'THBoolTensor*':
                        type_ = 'ByteTensor'
                    if type_ == 'THLongTensor*':
                        type_ = 'LongTensor'
                    if type_ == 'THLongStorage*':
                        type_ = 'LongStorage'
                    default = arg.get('default', None)
                    allocated = arg.get('allocate', None)
                    if default == None and allocated == None:
                        default = " [required]"
                    elif allocated != None:
                        default = " [optional]"
                    else:
                        default = str(default)
                        # Unwrap AS_REAL(x) wrappers around numeric defaults.
                        import re
                        m = re.search('\s*AS_REAL\((.+)\)\s*', default)
                        if m:
                            default = m.group(1)
                            default = default
                    print('| %15s | %12s | %10s |' % (arg['name'], type_, default))
            # print( 'Options : ' )
            # print(declarations['options'][0])
            print('')
            if declarations['return']:
                # Map the C return type to a doc-friendly name.
                return_ = declarations['return']
                if return_ == 'THTensor*':
                    return_ = 'Tensor'
                if return_ == 'void':
                    return_ = 'nothing'
                print( '**Returns : ' + return_ + '**')
                print('')
|
||||
|
||||
# Run the cwrap pass over TensorMethods with the doc plugin (no code is
# generated), then dump the collected declarations as Markdown on stdout.
docs = DocGen()
cwrap('../../torch/csrc/generic/TensorMethods.cwrap', plugins=[docs])

docs.print_declarations()
|
||||
|
Before Width: | Height: | Size: 5.8 KiB |
|
Before Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 6.0 KiB |
|
Before Width: | Height: | Size: 5.4 KiB |
|
Before Width: | Height: | Size: 5.8 KiB |
|
Before Width: | Height: | Size: 8.9 KiB |
|
Before Width: | Height: | Size: 8.5 KiB |
|
Before Width: | Height: | Size: 6.4 KiB |
|
Before Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 20 KiB |
|
Before Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 6.4 KiB |
|
Before Width: | Height: | Size: 6.4 KiB |
|
Before Width: | Height: | Size: 6.1 KiB |
|
Before Width: | Height: | Size: 6.3 KiB |
|
Before Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 6.7 KiB |
|
Before Width: | Height: | Size: 5.9 KiB |
|
Before Width: | Height: | Size: 6.8 KiB |
|
Before Width: | Height: | Size: 5.4 KiB |
|
Before Width: | Height: | Size: 7.2 KiB |
|
|
@ -1,3 +0,0 @@
|
|||
# torch.nn
|
||||
|
||||
Neural Networks in PyTorch
|
||||
|
|
@ -1,496 +0,0 @@
|
|||
## Non-linearities
|
||||
### ReLU
|
||||
|
||||
Applies the rectified linear unit function element-wise ReLU(x)= max(0,x)
|
||||
|
||||
```python
|
||||
m = nn.ReLU()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/relu.png" >
|
||||
### ReLU6
|
||||
|
||||
Applies the element-wise function ReLU6(x) = min( max(0,x), 6)
|
||||
|
||||
```python
|
||||
m = nn.ReLU6()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/relu6.png" >
|
||||
### Threshold
|
||||
|
||||
Thresholds each element of the input Tensor
|
||||
|
||||
```python
|
||||
m = nn.Threshold(0.1, 20)
|
||||
input = Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
Threshold is defined as:
|
||||
y = x if x >= threshold
|
||||
value if x < threshold
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
threshold | | The value to threshold at
|
||||
value | | The value to replace with
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
Tensor of same dimension and shape as the input
|
||||
### Hardtanh
|
||||
|
||||
Applies the HardTanh function element-wise
|
||||
|
||||
```python
|
||||
m = nn.HardTanh(-2, 2)
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
HardTanh is defined as:
|
||||
f(x) = +1, if x > 1
|
||||
f(x) = -1, if x < -1
|
||||
f(x) = x, otherwise
|
||||
The range of the linear region [-1, 1] can be adjusted
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
min_value | | minimum value of the linear region range
|
||||
max_value | | maximum value of the linear region range
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/htanh.png" >
|
||||
### Sigmoid
|
||||
|
||||
Applies the element-wise function sigmoid(x) = 1 / ( 1 + exp(-x))
|
||||
|
||||
```python
|
||||
m = nn.Sigmoid()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/sigmoid.png" >
|
||||
### Tanh
|
||||
|
||||
Applies element-wise, Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
|
||||
|
||||
```python
|
||||
m = nn.Tanh()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/tanh.png" >
|
||||
### ELU
|
||||
|
||||
Applies element-wise, ELU(x) = max(0,x) + min(0, alpha * (exp(x) - 1))
|
||||
|
||||
```python
|
||||
m = nn.ELU()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
alpha | 1.0 | the alpha value for the ELU formulation.
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/elu.png" >
|
||||
### LeakyReLU
|
||||
|
||||
Applies element-wise, f(x) = max(0, x) + negative_slope * min(0, x)
|
||||
|
||||
```python
|
||||
m = nn.LeakyReLU(0.1)
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
negative_slope | 1e-2 | Controls the angle of the negative slope.
|
||||
inplace | | can optionally do the operation in-place
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
### LogSigmoid
|
||||
|
||||
Applies element-wise LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))
|
||||
|
||||
```python
|
||||
m = nn.LogSigmoid()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/logsigmoid.png" >
|
||||
### Softplus
|
||||
|
||||
Applies element-wise SoftPlus(x) = 1/beta * log(1 + exp(beta * x_i))
|
||||
|
||||
```python
|
||||
m = nn.Softplus()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
SoftPlus is a smooth approximation to the ReLU function and can be used
|
||||
to constrain the output of a machine to always be positive.
|
||||
For numerical stability the implementation reverts to the linear function
|
||||
for inputs above a certain value.
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
beta | 1 | the beta value for the Softplus formulation.
|
||||
threshold | 20 | values above this revert to a linear function.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/softplus.png" >
|
||||
### Softshrink
|
||||
|
||||
Applies the soft shrinkage function elementwise
|
||||
|
||||
```python
|
||||
m = nn.Softshrink()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
SoftShrinkage operator is defined as:
|
||||
f(x) = x-lambda, if x > lambda > f(x) = x+lambda, if x < -lambda
|
||||
f(x) = 0, otherwise
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
lambd | 0.5 | the lambda value for the Softshrink formulation.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/sshrink.png" >
|
||||
### PReLU
|
||||
|
||||
Applies element-wise the function PReLU(x) = max(0,x) + a * min(0,x)
|
||||
|
||||
```python
|
||||
m = nn.PReLU()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
Here "a" is a learnable parameter.
|
||||
When called without arguments, nn.PReLU() uses a single parameter "a"
|
||||
across all input channels. If called with nn.PReLU(nChannels), a separate
|
||||
"a" is used for each input channel.
|
||||
Note that weight decay should not be used when learning "a" for good
|
||||
performance.
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
num_parameters | 1 | number of "a" to learn.
|
||||
init | 0.25 | the initial value of "a".
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/prelu.png" >
|
||||
### Softsign
|
||||
|
||||
Applies element-wise, the function Softsign(x) = x / (1 + |x|)
|
||||
|
||||
```python
|
||||
m = nn.Softsign()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
|
||||
<img src="image/softsign.png" >
|
||||
### Tanhshrink
|
||||
|
||||
Applies element-wise, Tanhshrink(x) = x - Tanh(x)
|
||||
|
||||
```python
|
||||
m = nn.Tanhshrink()
|
||||
input = autograd.Variable(torch.randn(2))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Tensor of any size and dimension
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input
|
||||
### Softmin
|
||||
|
||||
Applies the Softmin function to an n-dimensional input Tensor
|
||||
|
||||
```python
|
||||
m = nn.Softmin()
|
||||
input = autograd.Variable(torch.randn(2, 3))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
rescaling them so that the elements of the n-dimensional output Tensor
|
||||
lie in the range (0,1) and sum to 1
|
||||
Softmin(x) = exp(-x_i - shift) / sum_j exp(-x_j - shift)
|
||||
where shift = max_i - x_i
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * ] | 2D Tensor of any size
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input, with
|
||||
values in the range [0, 1]
|
||||
|
||||
<img src="image/softmin.png" >
|
||||
### Softmax
|
||||
|
||||
Applies the Softmax function to an n-dimensional input Tensor
|
||||
|
||||
```python
|
||||
m = nn.Softmax()
|
||||
input = autograd.Variable(torch.randn(2, 3))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
rescaling them so that the elements of the n-dimensional output Tensor
|
||||
lie in the range (0,1) and sum to 1
|
||||
|
||||
Softmax is defined as f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)
|
||||
where shift = max_i x_i
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * ] | 2D Tensor of any size
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input with
|
||||
values in the range [0, 1]
|
||||
|
||||
<img src="image/softmax.png" >
|
||||
Notes:
|
||||
Note that this module doesn't work directly with NLLLoss,
|
||||
which expects the Log to be computed between the Softmax and itself.
|
||||
Use Logsoftmax instead (it's faster).
|
||||
### Softmax2d
|
||||
|
||||
Applies SoftMax over features to each spatial location
|
||||
|
||||
```python
|
||||
m = nn.Softmax2d()
|
||||
# you softmax over the 2nd dimension
|
||||
input = autograd.Variable(torch.randn(2, 3, 12, 13))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
When given an image of Channels x Height x Width, it will
|
||||
apply Softmax to each location [Channels, h_i, w_j]
|
||||
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , * , * ] | 4D Tensor of any size
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input with
|
||||
values in the range [0, 1]
|
||||
### LogSoftmax
|
||||
|
||||
Applies the Log(Softmax(x)) function to an n-dimensional input Tensor.
|
||||
|
||||
```python
|
||||
m = nn.LogSoftmax()
|
||||
input = autograd.Variable(torch.randn(2, 3))
|
||||
print(input)
|
||||
print(m(input))
|
||||
```
|
||||
|
||||
The LogSoftmax formulation can be simplified as
|
||||
f_i(x) = log(1 / a * exp(x_i)) where a = sum_j exp(x_j) .
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * ] | 2D Tensor of any size
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a Tensor of the same dimension and shape as the input with
|
||||
values in the range [-inf, 0)
|
||||
|
||||
<img src="image/logsoftmax.png" >
|
||||
|
|
@ -1,136 +0,0 @@
|
|||
## Containers
|
||||
### Container
|
||||
|
||||
This is the base container class for all neural networks you would define.
|
||||
|
||||
```python
|
||||
# Example of using Container
|
||||
class Net(nn.Container):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__(
|
||||
conv1 = nn.Conv2d(1, 20, 5),
|
||||
relu = nn.ReLU()
|
||||
)
|
||||
def forward(self, input):
|
||||
output = self.relu(self.conv1(x))
|
||||
return output
|
||||
model = Net()
|
||||
```
|
||||
|
||||
```python
|
||||
# one can add modules to the container after construction
|
||||
model.add_module('pool1', nn.MaxPool2d(2, 2))
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
```python
|
||||
# .parameters()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> for param in model.parameters():
|
||||
>>> print(type(param.data), param.size())
|
||||
<class 'torch.FloatTensor'> (20L,)
|
||||
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
```python
|
||||
# .state_dict()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> pdict = model.state_dict()
|
||||
>>> print(sdict.keys())
|
||||
['conv1.bias', 'conv1.weight']
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
You will subclass your container from this class.
|
||||
In the constructor you define the modules that you would want to use,
|
||||
and in the "forward" function you use the constructed modules in
|
||||
your operations.
|
||||
|
||||
To make it easier to understand, given is a small example.
|
||||
|
||||
One can also add new modules to a container after construction.
|
||||
You can do this with the add_module function
|
||||
or by assigning them as Container attributes.
|
||||
|
||||
#### one can also set modules as attributes of the container
|
||||
model.conv1 = nn.Conv2d(12, 24, 3)
|
||||
The container has some important additional methods:
|
||||
|
||||
**`[generator] parameters()`**
|
||||
|
||||
returns a generator over all learnable parameters in the container instance.
|
||||
This can typically be passed to the optimizer API
|
||||
|
||||
**`[dict] state_dict()`**
|
||||
|
||||
returns a dictionary of learnable parameters of the Container.
|
||||
For example: ['conv1.weight' : Parameter(torch.FloatTensor(20x1x5x5)),
|
||||
'conv1.bias' : Parameter(torch.FloatTensor(20)),
|
||||
]
|
||||
|
||||
|
||||
**`load_state_dict(dict)`**
|
||||
|
||||
Given a parameter dict, sets the parameters of self to be the given dict.
|
||||
It loads loads the parameters recursively.
|
||||
Excessive or non-matching parameter names are ignored.
|
||||
For example, the input dict has an entry 'conv44.weight', but
|
||||
if the container does not have a module named 'conv44', then this entry is ignored.
|
||||
|
||||
**`children()`**
|
||||
|
||||
Returns a generator over all the children modules of self
|
||||
|
||||
**`train()`**
|
||||
|
||||
Sets the Container (and all it's child modules) to training mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`eval()`**
|
||||
|
||||
Sets the Container (and all it's child modules) to evaluate mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`apply(closure)`**
|
||||
|
||||
Applies the given closure to each parameter of the container.
|
||||
|
||||
|
||||
**__Note: Apart from these, the container will define the base functions that it has derived from nn.Module __**
|
||||
### Sequential
|
||||
|
||||
A sequential Container. It is derived from the base nn.Container class
|
||||
|
||||
```python
|
||||
# Example of using Sequential
|
||||
model = nn.Sequential(
|
||||
nn.Conv2d(1,20,5),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(20,64,5),
|
||||
nn.ReLU()
|
||||
)
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
Modules will be added to it in the order they are passed in the constructor.
|
||||
Alternatively, an ordered dict of modules can also be passed in.
|
||||
|
||||
To make it easier to understand, given is a small example.
|
||||
#### Example of using Sequential with OrderedDict
|
||||
model = nn.Sequential(OrderedDict([
|
||||
('conv1', nn.Conv2d(1,20,5)),
|
||||
('relu1', nn.ReLU()),
|
||||
('conv2', nn.Conv2d(20,64,5)),
|
||||
('relu2', nn.ReLU())
|
||||
]))
|
||||
|
|
@ -1,236 +0,0 @@
|
|||
## Convolution Layers
|
||||
### Conv1d
|
||||
|
||||
Applies a 1D convolution over an input signal composed of several input
|
||||
|
||||
```python
|
||||
The output value of the layer with input (b x iC x W) and output (b x oC x oW)
|
||||
can be precisely described as:
|
||||
output[b_i][oc_i][w_i] = bias[oc_i]
|
||||
+ sum_iC sum_{ow = 0, oW-1} sum_{kw = 0 to kW-1}
|
||||
weight[oc_i][ic_i][kw] * input[b_i][ic_i][stride_w * ow + kw)]
|
||||
```
|
||||
|
||||
```python
|
||||
m = nn.Conv1d(16, 33, 3, stride=2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 50))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
Note that depending of the size of your kernel, several (of the last)
|
||||
columns of the input might be lost. It is up to the user
|
||||
to add proper padding.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_channels | | The number of expected input channels in the image given as input
|
||||
out_channels | | The number of output channels the convolution layer will produce
|
||||
kernel_size | | the size of the convolving kernel.
|
||||
stride | | the stride of the convolving kernel.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , in_channels , * ] | Input is minibatch x in_channels x iW
|
||||
output | [ * , out_channels , * ] | Output shape is precisely minibatch x out_channels x floor((iW + 2*padW - kW) / dW + 1)
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (out_channels x in_channels x kW)
|
||||
bias | the learnable bias of the module of shape (out_channels)
|
||||
### Conv2d
|
||||
|
||||
Applies a 2D convolution over an input image composed of several input
|
||||
|
||||
```python
|
||||
The output value of the layer with input (b x iC x H x W) and output (b x oC x oH x oW)
|
||||
can be precisely described as:
|
||||
output[b_i][oc_i][h_i][w_i] = bias[oc_i]
|
||||
+ sum_iC sum_{oh = 0, oH-1} sum_{ow = 0, oW-1} sum_{kh = 0 to kH-1} sum_{kw = 0 to kW-1}
|
||||
weight[oc_i][ic_i][kh][kw] * input[b_i][ic_i][stride_h * oh + kh)][stride_w * ow + kw)]
|
||||
```
|
||||
|
||||
```python
|
||||
# With square kernels and equal stride
|
||||
m = nn.Conv2d(16, 33, 3, stride=2)
|
||||
# non-square kernels and unequal stride and with padding
|
||||
m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
|
||||
# non-square kernels and unequal stride and with padding and dilation
|
||||
m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 100))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
Note that depending of the size of your kernel, several (of the last)
|
||||
columns or rows of the input image might be lost. It is up to the user
|
||||
to add proper padding in images.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_channels | | The number of expected input channels in the image given as input
|
||||
out_channels | | The number of output channels the convolution layer will produce
|
||||
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | 1 | the stride of the convolving kernel. Can be a single number s or a tuple (sh x sw).
|
||||
padding | 0 | implicit zero padding on the input. Can be a single number s or a tuple.
|
||||
dilation | None | If given, will do dilated (or atrous) convolutions. Can be a single number s or a tuple.
|
||||
bias | True | If set to False, the layer will not learn an additive bias.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , in_channels , * , * ] | Input is minibatch x in_channels x iH x iW
|
||||
output | [ * , out_channels , * , * ] | Output shape is precisely minibatch x out_channels x floor((iH + 2*padH - kH) / dH + 1) x floor((iW + 2*padW - kW) / dW + 1)
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (out_channels x in_channels x kH x kW)
|
||||
bias | the learnable bias of the module of shape (out_channels)
|
||||
### ConvTranspose2d
|
||||
|
||||
Applies a 2D deconvolution operator over an input image composed of several input
|
||||
|
||||
```python
|
||||
# With square kernels and equal stride
|
||||
m = nn.ConvTranspose2d(16, 33, 3, stride=2)
|
||||
# non-square kernels and unequal stride and with padding
|
||||
m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 100))
|
||||
output = m(input)
|
||||
# exact output size can be also specified as an argument
|
||||
input = autograd.Variable(torch.randn(1, 16, 12, 12))
|
||||
downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)
|
||||
upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
|
||||
h = downsample(input)
|
||||
output = upsample(h, output_size=input.size())
|
||||
```
|
||||
|
||||
planes.
|
||||
The deconvolution operator multiplies each input value element-wise by a learnable kernel,
|
||||
and sums over the outputs from all input feature planes.
|
||||
This module can be seen as the exact reverse of the Conv2d module.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_channels | | The number of expected input channels in the image given as input
|
||||
out_channels | | The number of output channels the convolution layer will produce
|
||||
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | 1 | the stride of the convolving kernel. Can be a single number or a tuple (sh x sw).
|
||||
padding | 0 | implicit zero padding on the input. Can be a single number or a tuple.
|
||||
output_padding | 0 | A zero-padding of 0 <= padding < stride that should be added to the output. Can be a single number or a tuple.
|
||||
bias | True | If set to False, the layer will not learn an additive bias.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , in_channels , * , * ] | Input is minibatch x in_channels x iH x iW
|
||||
output | [ * , out_channels , * , * ] | Output shape is minibatch x out_channels x (iH - 1) * sH - 2*padH + kH + output_paddingH x (iW - 1) * sW - 2*padW + kW, or as specified in a second argument to the call.
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (in_channels x out_channels x kH x kW)
|
||||
bias | the learnable bias of the module of shape (out_channels)
|
||||
### Conv3d
|
||||
|
||||
Applies a 3D convolution over an input image composed of several input
|
||||
|
||||
```python
|
||||
# With square kernels and equal stride
|
||||
m = nn.Conv3d(16, 33, 3, stride=2)
|
||||
# non-square kernels and unequal stride and with padding
|
||||
m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0))
|
||||
input = autograd.Variable(torch.randn(20, 16, 10, 50, 100))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
Note that depending of the size of your kernel, several (of the last)
|
||||
columns or rows of the input image might be lost. It is up to the user
|
||||
to add proper padding in images.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_channels | | The number of expected input channels in the image given as input
|
||||
out_channels | | The number of output channels the convolution layer will produce
|
||||
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
|
||||
stride | 1 | the stride of the convolving kernel. Can be a single number s or a tuple (kt x sh x sw).
|
||||
padding | 0 | implicit zero padding on the input. Can be a single number s or a tuple.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , in_channels , * , * , * ] | Input is minibatch x in_channels x iT x iH x iW
|
||||
output | [ * , out_channels , * , * , * ] | Output shape is precisely minibatch x out_channels x floor((iT + 2*padT - kT) / dT + 1) x floor((iH + 2*padH - kH) / dH + 1) x floor((iW + 2*padW - kW) / dW + 1)
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (out_channels x in_channels x kT x kH x kW)
|
||||
bias | the learnable bias of the module of shape (out_channels)
|
||||
### ConvTranspose3d
|
||||
|
||||
Applies a 3D deconvolution operator over an input image composed of several input
|
||||
|
||||
```python
|
||||
# With square kernels and equal stride
|
||||
m = nn.ConvTranspose3d(16, 33, 3, stride=2)
|
||||
# non-square kernels and unequal stride and with padding
|
||||
m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(0, 4, 2))
|
||||
input = autograd.Variable(torch.randn(20, 16, 10, 50, 100))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
The deconvolution operator multiplies each input value element-wise by a learnable kernel,
|
||||
and sums over the outputs from all input feature planes.
|
||||
This module can be seen as the exact reverse of the Conv3d module.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_channels | | The number of expected input channels in the image given as input
|
||||
out_channels | | The number of output channels the convolution layer will produce
|
||||
kernel_size | | the size of the convolving kernel. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
|
||||
stride | 1 | the stride of the convolving kernel. Can be a single number or a tuple (st x sh x sw).
|
||||
padding | 0 | implicit zero padding on the input. Can be a single number or a tuple.
|
||||
output_padding | 0 | A zero-padding of 0 <= padding < stride that should be added to the output. Can be a single number or a tuple.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , in_channels , * , * , * ] | Input is minibatch x in_channels x iH x iW
|
||||
output | [ * , out_channels , * , * , * ] | Output shape is precisely minibatch x out_channels x (iT - 1) * sT - 2*padT + kT + output_paddingT x (iH - 1) * sH - 2*padH + kH + output_paddingH x (iW - 1) * sW - 2*padW + kW
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (in_channels x out_channels x kT x kH x kW)
|
||||
bias | the learnable bias of the module of shape (out_channels)
|
||||
|
|
@ -1,233 +0,0 @@
|
|||
# Module
|
||||
|
||||
This is the base class for all Modules defined in the nn package.
|
||||
|
||||
```python
|
||||
# .parameters()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> for param in model.parameters():
|
||||
>>> print(type(param.data), param.size())
|
||||
<class 'torch.FloatTensor'> (20L,)
|
||||
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
```python
|
||||
# .state_dict()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> pdict = model.state_dict()
|
||||
>>> print(pdict.keys())
|
||||
['bias', 'weight']
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
Even the Container class derives from this class.
|
||||
|
||||
An nn.Module has the following interface:
|
||||
|
||||
**Constructor:**
|
||||
nn.Module(**parameters)
|
||||
|
||||
All arguments passed in to the constructor need to be of type
|
||||
nn.Parameter or a Tensor.
|
||||
|
||||
|
||||
**forward(...)**
|
||||
|
||||
This is the function that one defines when subclassing to create
|
||||
their own modules.
|
||||
It takes in inputs and returns outputs.
|
||||
|
||||
**__call__(...)**
|
||||
|
||||
This calls the forward function, as well as the hooks
|
||||
|
||||
**register_buffer(name, tensor)**
|
||||
|
||||
This is typically used to register a buffer that is not a Parameter.
|
||||
For example, in BatchNorm, the running_mean is a buffer, so one would
|
||||
register it in the constructor of BatchNorm with:
|
||||
|
||||
`self.register_buffer('running_mean', torch.zeros(num_features))`
|
||||
|
||||
The registered buffers can simply be accessed as class members
|
||||
when needed.
|
||||
|
||||
**cpu()**
|
||||
|
||||
Recursively moves all its parameters and buffers to the CPU
|
||||
|
||||
**cuda(device_id=None)**
|
||||
Recursively moves all its parameters and buffers to the CUDA memory.
|
||||
If device_id is given, moves it to GPU number device_id
|
||||
|
||||
**float()**
|
||||
Typecasts the parameters and buffers to float
|
||||
|
||||
**double()**
|
||||
Typecasts the parameters and buffers to double
|
||||
|
||||
**register_forward_hook(name, hook)**
|
||||
|
||||
This will register a user-defined closure on the module.
|
||||
Whenever the module finishes its forward operation,
|
||||
the user closure is called.
|
||||
The signature of the closure is `def closure(input, output)`
|
||||
|
||||
**register_backward_hook(name, hook)**
|
||||
|
||||
This will register a user-defined closure on the module.
|
||||
Whenever the module finishes its backward operation,
|
||||
the user closure is called.
|
||||
The signature of the closure is `def closure(gradOutput, gradInput)`
|
||||
|
||||
**remove_forward_hook(name)**
|
||||
|
||||
Removes a registered forward hook with the given name
|
||||
|
||||
**remove_backward_hook(name)**
|
||||
|
||||
Removes a registered backward hook with the given name
|
||||
|
||||
**`[generator] parameters()`**
|
||||
|
||||
returns a generator over all learnable parameters in the container instance.
|
||||
This can typically be passed to the optimizer API
|
||||
|
||||
**`[dict] state_dict()`**
|
||||
|
||||
returns a dictionary of learnable parameters of the Module.
|
||||
For example: ['weight' : Parameter(torch.FloatTensor(20x1x5x5)),
|
||||
'bias' : Parameter(torch.FloatTensor(20)),
|
||||
]
|
||||
|
||||
**`load_state_dict(dict)`**
|
||||
|
||||
Given a parameter dict, sets the parameters of self to be the given dict.
|
||||
|
||||
**`train()`**
|
||||
|
||||
Sets the Container to training mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`eval()`**
|
||||
|
||||
Sets the Container to evaluate mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`zero_grad()`**
|
||||
|
||||
Zeroes the gradients of each Parameter of the module
|
||||
# Container
|
||||
|
||||
This is the base container class for all neural networks you would define.
|
||||
|
||||
```python
|
||||
# Example of using Container
|
||||
class Net(nn.Container):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__(
|
||||
conv1 = nn.Conv2d(1, 20, 5),
|
||||
relu = nn.ReLU()
|
||||
)
|
||||
def forward(self, input):
|
||||
output = self.relu(self.conv1(x))
|
||||
return output
|
||||
model = Net()
|
||||
```
|
||||
|
||||
```python
|
||||
# one can add modules to the container after construction
|
||||
model.add_module('pool1', nn.MaxPool2d(2, 2))
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
```python
|
||||
# .parameters()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> for param in model.parameters():
|
||||
>>> print(type(param.data), param.size())
|
||||
<class 'torch.FloatTensor'> (20L,)
|
||||
<class 'torch.FloatTensor'> (20L, 1L, 5L, 5L)
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
```python
|
||||
# .state_dict()
|
||||
```
|
||||
|
||||
```python
|
||||
>>> pdict = model.state_dict()
|
||||
>>> print(pdict.keys())
|
||||
['conv1.bias', 'conv1.weight']
|
||||
```
|
||||
|
||||
```python
|
||||
```
|
||||
|
||||
You will subclass your container from this class.
|
||||
In the constructor you define the modules that you would want to use,
|
||||
and in the "forward" function you use the constructed modules in
|
||||
your operations.
|
||||
|
||||
To make it easier to understand, given is a small example.
|
||||
|
||||
One can also add new modules to a container after construction.
|
||||
You can do this with the add_module function
|
||||
or by assigning them as Container attributes.
|
||||
|
||||
## one can also set modules as attributes of the container
|
||||
model.conv1 = nn.Conv2d(12, 24, 3)
|
||||
The container has some important additional methods:
|
||||
|
||||
**`[generator] parameters()`**
|
||||
|
||||
returns a generator over all learnable parameters in the container instance.
|
||||
This can typically be passed to the optimizer API
|
||||
|
||||
**`[dict] state_dict()`**
|
||||
|
||||
returns a dictionary of learnable parameters of the Container.
|
||||
For example: ['conv1.weight' : Parameter(torch.FloatTensor(20x1x5x5)),
|
||||
'conv1.bias' : Parameter(torch.FloatTensor(20)),
|
||||
]
|
||||
|
||||
|
||||
**`load_state_dict(dict)`**
|
||||
|
||||
Given a parameter dict, sets the parameters of self to be the given dict.
|
||||
It loads the parameters recursively.
|
||||
Excessive or non-matching parameter names are ignored.
|
||||
For example, the input dict has an entry 'conv44.weight', but
|
||||
if the container does not have a module named 'conv44', then this entry is ignored.
|
||||
|
||||
**`children()`**
|
||||
|
||||
Returns a generator over all the children modules of self
|
||||
|
||||
**`train()`**
|
||||
|
||||
Sets the Container (and all its child modules) to training mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`eval()`**
|
||||
|
||||
Sets the Container (and all its child modules) to evaluate mode (for modules such as batchnorm, dropout etc.)
|
||||
|
||||
**`apply(closure)`**
|
||||
|
||||
Applies the given closure to each parameter of the container.
|
||||
|
||||
|
||||
**__Note: Apart from these, the container will define the base functions that it has derived from nn.Module __**
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
## Dropout layers
|
||||
### Dropout
|
||||
|
||||
Randomly zeroes some of the elements of the input tensor.
|
||||
|
||||
```python
|
||||
m = nn.Dropout(p=0.2)
|
||||
input = autograd.Variable(torch.randn(20, 16))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
The elements to zero are randomized on every forward call.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
p | 0.5 | probability of an element to be zeroed.
|
||||
inplace | false | If set to True, will do this operation in-place.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | Any | Input can be of any shape
|
||||
output | Same | Output is of the same shape as input
|
||||
### Dropout2d
|
||||
|
||||
Randomly zeroes whole channels of the input tensor.
|
||||
|
||||
```python
|
||||
m = nn.Dropout2d(p=0.2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 32, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
The input is 4D (batch x channels, height, width) and each channel
|
||||
is of size (1, height, width).
|
||||
The channels to zero are randomized on every forward call.
|
||||
Usually the input comes from Conv2d modules.
|
||||
|
||||
As described in the paper "Efficient Object Localization Using Convolutional
|
||||
Networks" (http://arxiv.org/abs/1411.4280), if adjacent pixels within
|
||||
feature maps are strongly correlated (as is normally the case in early
|
||||
convolution layers) then iid dropout will not regularize the activations
|
||||
and will otherwise just result in an effective learning rate decrease.
|
||||
In this case, nn.Dropout2d will help promote independence between
|
||||
feature maps and should be used instead.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
p | 0.5 | probability of an element to be zeroed.
|
||||
inplace | false | If set to True, will do this operation in-place.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [*, *, *, *] | Input can be of any sizes of 4D shape
|
||||
output | Same | Output is of the same shape as input
|
||||
### Dropout3d
|
||||
|
||||
Randomly zeroes whole channels of the input tensor.
|
||||
|
||||
```python
|
||||
m = nn.Dropout3d(p=0.2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 4, 32, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
The input is 5D (batch x channels, depth, height, width) and each channel
|
||||
is of size (1, depth, height, width).
|
||||
The channels to zero are randomized on every forward call.
|
||||
Usually the input comes from Conv3d modules.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
p | 0.5 | probability of an element to be zeroed.
|
||||
inplace | false | If set to True, will do this operation in-place.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [*, *, *, *, *] | Input can be of any sizes of 5D shape
|
||||
output | Same | Output is of the same shape as input
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
## Linear layers
|
||||
### Linear
|
||||
|
||||
Applies a linear transformation to the incoming data, y = Ax + b
|
||||
|
||||
```python
|
||||
m = nn.Linear(20, 30)
|
||||
input = autograd.Variable(torch.randn(128, 20))
|
||||
output = m(input)
|
||||
print(output.size())
|
||||
```
|
||||
|
||||
The input is a 2D mini-batch of samples, each of size in_features
|
||||
The output will be a 2D Tensor of size mini-batch x out_features
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
in_features | | size of each input sample
|
||||
out_features | | size of each output sample
|
||||
bias | True | If set to False, the layer will not learn an additive bias.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [*, in_features] | Input can be of shape minibatch x in_features
|
||||
output | [*, out_features] | Output is of shape minibatch x out_features
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the learnable weights of the module of shape (out_features x in_features)
|
||||
bias | the learnable bias of the module of shape (out_features)
|
||||
|
|
@ -1,295 +0,0 @@
|
|||
## Loss functions
|
||||
### L1Loss
|
||||
|
||||
Creates a criterion that measures the mean absolute value of the
|
||||
|
||||
element-wise difference between input `x` and target `y`:
|
||||
|
||||
loss(x, y) = 1/n \sum |x_i - y_i|
|
||||
|
||||
`x` and `y` arbitrary shapes with a total of `n` elements each
|
||||
the sum operation still operates over all the elements, and divides by `n`.
|
||||
|
||||
The division by `n` can be avoided if one sets the internal
|
||||
variable `sizeAverage` to `False`
|
||||
### MSELoss
|
||||
|
||||
Creates a criterion that measures the mean squared error between
|
||||
|
||||
`n` elements in the input `x` and target `y`:
|
||||
loss(x, y) = 1/n \sum |x_i - y_i|^2
|
||||
`x` and `y` arbitrary shapes with a total of `n` elements each
|
||||
the sum operation still operates over all the elements, and divides by `n`.
|
||||
|
||||
The division by `n` can be avoided if one sets the internal variable
|
||||
`sizeAverage` to `False`
|
||||
By default, the losses are averaged over observations for each minibatch.
|
||||
However, if the field `sizeAverage = False`, the losses are instead summed.
|
||||
### CrossEntropyLoss
|
||||
|
||||
This criterion combines `LogSoftMax` and `ClassNLLLoss` in one single class.
|
||||
|
||||
|
||||
It is useful when training a classification problem with `n` classes.
|
||||
If provided, the optional argument `weights` should be a 1D `Tensor`
|
||||
assigning weight to each of the classes.
|
||||
This is particularly useful when you have an unbalanced training set.
|
||||
|
||||
The `input` is expected to contain scores for each class:
|
||||
`input` has to be a 2D `Tensor` of size `batch x n`.
|
||||
This criterion expects a class index (0 to nClasses-1) as the
|
||||
`target` for each value of a 1D tensor of size `n`
|
||||
|
||||
The loss can be described as:
|
||||
|
||||
loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j])))
|
||||
= -x[class] + log(\sum_j exp(x[j]))
|
||||
|
||||
or in the case of the `weights` argument being specified:
|
||||
|
||||
loss(x, class) = weights[class] * (-x[class] + log(\sum_j exp(x[j])))
|
||||
|
||||
The losses are averaged across observations for each minibatch.
|
||||
### NLLLoss
|
||||
|
||||
The negative log likelihood loss. It is useful to train a classification problem with n classes
|
||||
|
||||
```python
|
||||
m = nn.LogSoftmax()
|
||||
loss = nn.NLLLoss()
|
||||
# input is of size nBatch x nClasses = 3 x 5
|
||||
input = autograd.Variable(torch.randn(3, 5))
|
||||
# each element in target has to have 0 <= value < nclasses
|
||||
target = autograd.Variable(torch.LongTensor([1, 0, 4]))
|
||||
output = loss(m(input), target)
|
||||
output.backward()
|
||||
```
|
||||
|
||||
|
||||
If provided, the optional argument `weights` should be a 1D Tensor assigning
|
||||
weight to each of the classes.
|
||||
This is particularly useful when you have an unbalanced training set.
|
||||
|
||||
The input given through a forward call is expected to contain log-probabilities
|
||||
of each class: input has to be a 2D Tensor of size minibatch x n
|
||||
Obtaining log-probabilities in a neural network is easily achieved by
|
||||
adding a `LogSoftmax` layer in the last layer.
|
||||
You may use `CrossEntropyLoss` instead, if you prefer not to
|
||||
add an extra layer.
|
||||
|
||||
The target that this loss expects is a class index (0 to the number of classes - 1, consistent with the example above)
|
||||
|
||||
The loss can be described as:
|
||||
loss(x, class) = -x[class]
|
||||
|
||||
or in the case of the weights argument it is specified as follows:
|
||||
loss(x, class) = -weights[class] * x[class]
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
weight | None | a manual rescaling weight given to each class. If given, has to be a Tensor of size "nclasses".
|
||||
size_average | True | By default, the losses are averaged over observations for each minibatch. However, if the field sizeAverage is set to False, the losses are instead summed for each minibatch.
|
||||
Target Shape: [ * ] : Targets of size [minibatch], each value has to be 0 <= targets[i] <= nClasses-1
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight | the class-weights given as input to the constructor
|
||||
### NLLLoss2d
|
||||
|
||||
This is negative log likelihood loss, but for image inputs. It computes NLL loss per-pixel.
|
||||
|
||||
```python
|
||||
m = nn.Conv2d(16, 32, (3, 3)).float()
|
||||
loss = nn.NLLLoss2d()
|
||||
# input is of size nBatch x nClasses x height x width
|
||||
input = autograd.Variable(torch.randn(3, 16, 10, 10))
|
||||
# each element in target has to have 0 <= value < nclasses
|
||||
target = autograd.Variable(torch.LongTensor(3, 8, 8).random_(0, 4))
|
||||
output = loss(m(input), target)
|
||||
output.backward()
|
||||
```
|
||||
|
||||
This loss does not support per-class weights
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
size_average | True | By default, the losses are averaged over observations for each minibatch. However, if the field sizeAverage is set to False, the losses are instead summed for each minibatch.
|
||||
Target Shape: [ * , *, *] : Targets of size minibatch x height x width, each value has to be 0 <= targets[i] <= nClasses-1
|
||||
### KLDivLoss
|
||||
|
||||
The [Kullback-Leibler divergence](http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) Loss
|
||||
|
||||
KL divergence is a useful distance measure for continuous distributions
|
||||
and is often useful when performing direct regression over the space of
|
||||
(discretely sampled) continuous output distributions.
|
||||
As with ClassNLLLoss, the `input` given is expected to contain
|
||||
_log-probabilities_, however unlike ClassNLLLoss, `input` is not
|
||||
restricted to a 2D Tensor, because the criterion is applied element-wise.
|
||||
|
||||
This criterion expects a `target` `Tensor` of the same size as the
|
||||
`input` `Tensor`.
|
||||
|
||||
The loss can be described as:
|
||||
loss(x, target) = 1/n \sum(target_i * (log(target_i) - x_i))
|
||||
|
||||
By default, the losses are averaged for each minibatch over observations
|
||||
*as well as* over dimensions. However, if the field
|
||||
`sizeAverage` is set to `False`, the losses are instead summed.
|
||||
### BCELoss
|
||||
|
||||
Creates a criterion that measures the Binary Cross Entropy
|
||||
|
||||
between the target and the output:
|
||||
loss(o, t) = - 1/n sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
|
||||
|
||||
or in the case of the weights argument being specified:
|
||||
loss(o, t) = - 1/n sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
|
||||
|
||||
This is used for measuring the error of a reconstruction in for example
|
||||
an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1,
|
||||
for instance, the output of an `nn.Sigmoid` layer.
|
||||
|
||||
By default, the losses are averaged for each minibatch over observations
|
||||
*as well as* over dimensions. However, if the field `sizeAverage` is set
|
||||
to `False`, the losses are instead summed.
|
||||
### MarginRankingLoss
|
||||
|
||||
Creates a criterion that measures the loss given
|
||||
|
||||
inputs `x1`, `x2`, two 1D mini-batch `Tensor`s,
|
||||
and a label 1D mini-batch tensor `y` with values (`1` or `-1`).
|
||||
|
||||
If `y == 1` then it assumed the first input should be ranked higher
|
||||
(have a larger value) than the second input, and vice-versa for `y == -1`.
|
||||
|
||||
The loss function for each sample in the mini-batch is:
|
||||
|
||||
loss(x, y) = max(0, -y * (x1 - x2) + margin)
|
||||
|
||||
if the internal variable `sizeAverage = True`,
|
||||
the loss function averages the loss over the batch samples;
|
||||
if `sizeAverage = False`, then the loss function sums over the batch samples.
|
||||
By default, `sizeAverage` equals to `True`.
|
||||
### HingeEmbeddingLoss
|
||||
|
||||
Measures the loss given an input `x` which is a 2D mini-batch tensor
|
||||
|
||||
and labels `y`, a 1D tensor containing values (`1` or `-1`).
|
||||
This is usually used for measuring whether two inputs are similar or dissimilar,
|
||||
e.g. using the L1 pairwise distance, and is typically used for learning
|
||||
nonlinear embeddings or semi-supervised learning.
|
||||
|
||||
{ x_i, if y_i == 1
|
||||
loss(x, y) = 1/n {
|
||||
{ max(0, margin - x_i), if y_i == -1
|
||||
|
||||
`x` and `y` arbitrary shapes with a total of `n` elements each
|
||||
the sum operation still operates over all the elements, and divides by `n`.
|
||||
(the division by `n` can be avoided if one sets the internal variable `sizeAverage=False`).
|
||||
The `margin` has a default value of `1`, or can be set in the constructor.
|
||||
### MultiLabelMarginLoss
|
||||
|
||||
Creates a criterion that optimizes a multi-class multi-classification
|
||||
|
||||
hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
|
||||
output `y` (which is a 2D `Tensor` of target class indices).
|
||||
For each sample in the mini-batch:
|
||||
|
||||
loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x:size(1)
|
||||
|
||||
where `i == 0` to `x.size(0)`, `j == 0` to `y.size(0)`,
|
||||
`y[j] != 0`, and `i != y[j]` for all `i` and `j`.
|
||||
|
||||
`y` and `x` must have the same size.
|
||||
The criterion only considers the first non zero `y[j]` targets.
|
||||
This allows for different samples to have variable amounts of target classes
|
||||
### SmoothL1Loss
|
||||
|
||||
Creates a criterion that uses a squared term if the absolute
|
||||
|
||||
element-wise error falls below 1 and an L1 term otherwise.
|
||||
It is less sensitive to outliers than the `MSELoss` and in some cases
|
||||
prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
|
||||
Also known as the Huber loss.
|
||||
|
||||
{ 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1
|
||||
loss(x, y) = 1/n \sum {
|
||||
{ |x_i - y_i| - 0.5, otherwise
|
||||
|
||||
`x` and `y` arbitrary shapes with a total of `n` elements each
|
||||
the sum operation still operates over all the elements, and divides by `n`.
|
||||
|
||||
The division by `n` can be avoided if one sets the internal variable
|
||||
`sizeAverage` to `False`
|
||||
### SoftMarginLoss
|
||||
|
||||
Creates a criterion that optimizes a two-class classification
|
||||
|
||||
logistic loss between input `x` (a 2D mini-batch `Tensor`) and
|
||||
target `y` (which is a tensor containing either `1`s or `-1`s).
|
||||
|
||||
loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x:nElement()
|
||||
|
||||
The normalization by the number of elements in the input can be disabled by
|
||||
setting `self.sizeAverage` to `False`.
|
||||
### MultiLabelSoftMarginLoss
|
||||
|
||||
Creates a criterion that optimizes a multi-label one-versus-all
|
||||
|
||||
loss based on max-entropy, between input `x` (a 2D mini-batch `Tensor`) and
|
||||
target `y` (a binary 2D `Tensor`). For each sample in the minibatch:
|
||||
|
||||
loss(x, y) = - sum_i (y[i] log( exp(x[i]) / (1 + exp(x[i])))
|
||||
+ (1-y[i]) log(1/(1+exp(x[i])))) / x:nElement()
|
||||
|
||||
where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`.
|
||||
`y` and `x` must have the same size.
|
||||
### CosineEmbeddingLoss
|
||||
|
||||
Creates a criterion that measures the loss given an input tensors x1, x2
|
||||
|
||||
and a `Tensor` label `y` with values 1 or -1.
|
||||
This is used for measuring whether two inputs are similar or dissimilar,
|
||||
using the cosine distance, and is typically used for learning nonlinear
|
||||
embeddings or semi-supervised learning.
|
||||
|
||||
`margin` should be a number from `-1` to `1`, `0` to `0.5` is suggested.
|
||||
If `margin` is missing, the default value is `0`.
|
||||
|
||||
The loss function for each sample is:
|
||||
|
||||
{ 1 - cos(x1, x2), if y == 1
|
||||
loss(x, y) = {
|
||||
{ max(0, cos(x1, x2) - margin), if y == -1
|
||||
|
||||
If the internal variable `sizeAverage` is equal to `True`,
|
||||
the loss function averages the loss over the batch samples;
|
||||
if `sizeAverage` is `False`, then the loss function sums over the
|
||||
batch samples. By default, `sizeAverage = True`.
|
||||
### MultiMarginLoss
|
||||
|
||||
Creates a criterion that optimizes a multi-class classification hinge loss
|
||||
|
||||
(margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
|
||||
output `y` (which is a 1D tensor of target class indices, `0` <= `y` <= `x.size(1)`):
|
||||
|
||||
For each mini-batch sample:
|
||||
loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x.size(0)
|
||||
where `i == 0` to `x.size(0)` and `i != y`.
|
||||
|
||||
Optionally, you can give non-equal weighting on the classes by passing
|
||||
a 1D `weights` tensor into the constructor.
|
||||
|
||||
The loss function then becomes:
|
||||
loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] - x[i]))^p) / x.size(0)
|
||||
|
||||
By default, the losses are averaged over observations for each minibatch.
|
||||
However, if the field `sizeAverage` is set to `False`,
|
||||
the losses are instead summed.
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
## Normalization layers
|
||||
### BatchNorm1d
|
||||
|
||||
Applies Batch Normalization over a 2d input that is seen as a mini-batch of 1d inputs
|
||||
|
||||
```python
|
||||
x - mean(x)
|
||||
y = ----------------------------- * gamma + beta
|
||||
standard_deviation(x) + eps
|
||||
```
|
||||
|
||||
```python
|
||||
# With Learnable Parameters
|
||||
m = nn.BatchNorm1d(100)
|
||||
# Without Learnable Parameters
|
||||
m = nn.BatchNorm1d(100, affine=False)
|
||||
input = autograd.Variable(torch.randn(20, 100))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
|
||||
|
||||
The mean and standard-deviation are calculated per-dimension over
|
||||
the mini-batches and gamma and beta are learnable parameter vectors
|
||||
of size N (where N is the input size).
|
||||
|
||||
During training, this layer keeps a running estimate of its computed mean
|
||||
and variance. The running sum is kept with a default momentum of 0.1
|
||||
During evaluation, this running mean/variance is used for normalization.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
num_features | | the size of each 1D input in the mini-batch
|
||||
eps | 1e-5 | a value added to the denominator for numerical stability.
|
||||
momentum | 0.1 | the value used for the running_mean and running_var computation.
|
||||
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , num_features ] | 2D Tensor of nBatches x num_features
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a normalized tensor in the batch dimension
|
||||
### BatchNorm2d
|
||||
|
||||
Applies Batch Normalization over a 4d input that is seen as a mini-batch of 3d inputs
|
||||
|
||||
```python
|
||||
x - mean(x)
|
||||
y = ----------------------------- * gamma + beta
|
||||
standard_deviation(x) + eps
|
||||
```
|
||||
|
||||
```python
|
||||
# With Learnable Parameters
|
||||
m = nn.BatchNorm2d(100)
|
||||
# Without Learnable Parameters
|
||||
m = nn.BatchNorm2d(100, affine=False)
|
||||
input = autograd.Variable(torch.randn(20, 100, 35, 45))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
|
||||
|
||||
The mean and standard-deviation are calculated per-dimension over
|
||||
the mini-batches and gamma and beta are learnable parameter vectors
|
||||
of size N (where N is the input size).
|
||||
|
||||
During training, this layer keeps a running estimate of its computed mean
|
||||
and variance. The running sum is kept with a default momentum of 0.1
|
||||
During evaluation, this running mean/variance is used for normalization.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
num_features | | num_features from an expected input of size batch_size x num_features x height x width
|
||||
eps | 1e-5 | a value added to the denominator for numerical stability.
|
||||
momentum | 0.1 | the value used for the running_mean and running_var computation.
|
||||
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , num_features , *, * ] | 4D Tensor of batch_size x num_features x height x width
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a normalized tensor in the batch dimension
|
||||
### BatchNorm3d
|
||||
|
||||
Applies Batch Normalization over a 5d input that is seen as a mini-batch of 4d inputs
|
||||
|
||||
```python
|
||||
x - mean(x)
|
||||
y = ----------------------------- * gamma + beta
|
||||
standard_deviation(x) + eps
|
||||
```
|
||||
|
||||
```python
|
||||
# With Learnable Parameters
|
||||
m = nn.BatchNorm3d(100)
|
||||
# Without Learnable Parameters
|
||||
m = nn.BatchNorm3d(100, affine=False)
|
||||
input = autograd.Variable(torch.randn(20, 100, 35, 45, 10))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
|
||||
|
||||
The mean and standard-deviation are calculated per-dimension over
|
||||
the mini-batches and gamma and beta are learnable parameter vectors
|
||||
of size N (where N is the input size).
|
||||
|
||||
During training, this layer keeps a running estimate of its computed mean
|
||||
and variance. The running sum is kept with a default momentum of 0.1
|
||||
During evaluation, this running mean/variance is used for normalization.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
num_features | | num_features from an expected input of size batch_size x num_features x height x width
|
||||
eps | 1e-5 | a value added to the denominator for numerical stability.
|
||||
momentum | 0.1 | the value used for the running_mean and running_var computation.
|
||||
affine | | a boolean value that when set to true, gives the layer learnable affine parameters.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , num_features , * , * , * ] | 5D Tensor of batch_size x num_features x depth x height x width
|
||||
output | Same | Output has the same shape as input
|
||||
|
||||
#### Returns
|
||||
a normalized tensor in the batch dimension
|
||||
|
|
@ -1,308 +0,0 @@
|
|||
## Pooling Layers
|
||||
### MaxPool1d
|
||||
|
||||
Applies a 1D max pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
The output value of the layer with input (b x C x W) and output (b x C x oW)
|
||||
can be precisely described as:
|
||||
output[b_i][c_i][w_i] = max_{k=1, K} input[b_i][c_i][stride_w * w_i + k]
|
||||
```
|
||||
|
||||
```python
|
||||
# pool of size=3, stride=2
|
||||
m = nn.MaxPool1d(3, stride=2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 50))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window to take a max over
|
||||
stride | | the stride of the window
|
||||
padding | 0 | implicit padding to be added.
|
||||
dilation | 1 | a parameter that controls the stride of elements in the window.
|
||||
return_indices | False | if True, will return the indices along with the outputs. Useful when Unpooling later.
|
||||
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , * ] | Input is minibatch x channels x iW
|
||||
output | [ * , * , * ] | Output shape = minibatch x channels x floor((iW + 2*padW - kernel_size) / stride + 1)
|
||||
### MaxPool2d
|
||||
|
||||
Applies a 2D max pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
The output value of the layer with input (b x C x H x W) and output (b x C x oH x oW)
|
||||
can be precisely described as:
|
||||
output[b_i][c_i][h_i][w_i] = max_{{kh=1, KH}, {kw=1, kW}} input[b_i][c_i][stride_h * h_i + kh][stride_w * w_i + kw]
|
||||
```
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.MaxPool2d(3, stride=2)
|
||||
# pool of non-square window
|
||||
m = nn.MaxPool2d((3, 2), stride=(2, 1))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
|
||||
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
|
||||
dilation | 1 | a parameter that controls the stride of elements in the window. Can be a single number or a tuple.
|
||||
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool2d .
|
||||
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
|
||||
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
### MaxPool3d
|
||||
|
||||
Applies a 3D max pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.MaxPool3d(3, stride=2)
|
||||
# pool of non-square window
|
||||
m = nn.MaxPool3d((3, 2, 2), stride=(2, 1, 2))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50,44, 31))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
|
||||
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
|
||||
dilation | 1 | a parameter that controls the stride of elements in the window. Can be a single number or a tuple.
|
||||
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool3d .
|
||||
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
|
||||
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x floor((iT + 2*padT - kT) / sT + 1) x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
### MaxUnpool2d
|
||||
|
||||
Computes the inverse operation of MaxPool2d
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.MaxPool2d(2, stride=2, return_indices = True)
|
||||
mu = nn.MaxUnpool2d(2, stride=2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32))
|
||||
output, indices = m(input)
|
||||
unpooled_output = mu.forward(output, indices)
|
||||
# exact output size can be also specified as an argument
|
||||
input = autograd.Variable(torch.randn(1, 16, 11, 11))
|
||||
downsample = nn.MaxPool2d(3, 3, return_indices=True)
|
||||
upsample = nn.MaxUnpool2d(3, 3)
|
||||
h, indices = downsample(input)
|
||||
output = upsample(h, indices, output_size=input.size())
|
||||
```
|
||||
|
||||
MaxPool2d is not invertible, as the locations of the max locations are lost.
|
||||
MaxUnpool2d takes in as input the output of MaxPool2d and the indices of the Max locations
|
||||
and computes the inverse.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the max window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
|
||||
padding | 0 | implicit padding that was added to the input. Can be a single number or a tuple.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
|
||||
output | [ * , * , *, * ] | Output shape is minibatch x channels x padH x (iH - 1) * sH + kH x padW x (iW - 1) * sW + kW, or as specified to the call.
|
||||
### MaxUnpool3d
|
||||
|
||||
Computes the inverse operation of MaxPool3d
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.MaxPool3d(3, stride=2, return_indices = True)
|
||||
mu = nn.MaxUnpool3d(3, stride=2)
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32, 15))
|
||||
output, indices = m(input)
|
||||
unpooled_output = mu.forward(output, indices)
|
||||
```
|
||||
|
||||
MaxPool3d is not invertible, as the locations of the max locations are lost.
|
||||
MaxUnpool3d takes in as input the output of MaxPool3d and the indices of the Max locations
|
||||
and computes the inverse.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the max window. Can be a single number k (for a square kernel of k x k) or a tuple (kt x kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
|
||||
padding | 0 | implicit padding that was added to the input. Can be a single number or a tuple.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
|
||||
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x padT x (iT - 1) * sT + kT x padH x (iH - 1) * sH + kH x padW x (iW - 1) * sW + kW
|
||||
### AvgPool2d
|
||||
|
||||
Applies a 2D average pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
The output value of the layer with input (b x C x H x W) and output (b x C x oH x oW)
|
||||
can be precisely described as:
|
||||
output[b_i][c_i][h_i][w_i] = (1 / K) * sum_{kh=1, KH} sum_{kw=1, kW} input[b_i][c_i][stride_h * h_i + kh][stride_w * w_i + kw]
|
||||
```
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.AvgPool2d(3, stride=2)
|
||||
# pool of non-square window
|
||||
m = nn.AvgPool2d((3, 2), stride=(2, 1))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
|
||||
padding | 0 | implicit padding to be added. Can be a single number or a tuple.
|
||||
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
|
||||
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
### AvgPool3d
|
||||
|
||||
Applies a 3D average pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, stride=2
|
||||
m = nn.AvgPool3d(3, stride=2)
|
||||
# pool of non-square window
|
||||
m = nn.AvgPool3d((3, 2, 2), stride=(2, 1, 2))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50,44, 31))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window to take a average over. Can be a single number k (for a square kernel of k x k x k) or a tuple (kt x kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (st x sh x sw).
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, *, * ] | Input is minibatch x channels x iT x iH x iW
|
||||
output | [ * , * , *, *, * ] | Output shape = minibatch x channels x floor((iT + 2*padT - kT) / sT + 1) x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
### FractionalMaxPool2d
|
||||
|
||||
Applies a 2D fractional max pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
# pool of square window of size=3, and target output size 13x12
|
||||
m = nn.FractionalMaxPool2d(3, output_size=(13, 12))
|
||||
# pool of square window and target output size being half of input image size
|
||||
m = nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
|
||||
Fractional MaxPooling is described in detail in the paper ["Fractional Max-Pooling" by Ben Graham](http://arxiv.org/abs/1412.6071)
|
||||
The max-pooling operation is applied in kHxkW regions by a stochastic
|
||||
step size determined by the target output size.
|
||||
The number of output features is equal to the number of input planes.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window to take a max over. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
output_size | | the target output size of the image of the form oH x oW. Can be a tuple (oH, oW) or a single number oH for a square image oH x oH
|
||||
output_ratio | | If one wants to have an output size as a ratio of the input size, this option can be given. This has to be a number or tuple in the range (0, 1)
|
||||
return_indices | False | if True, will return the indices along with the outputs. Useful to pass to nn.MaxUnpool2d .
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
|
||||
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
### LPPool2d
|
||||
|
||||
Applies a 2D power-average pooling over an input signal composed of several input
|
||||
|
||||
```python
|
||||
# power-2 pool of square window of size=3, stride=2
|
||||
m = nn.LPPool2d(2, 3, stride=2)
|
||||
# pool of non-square window of power 1.2
|
||||
m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1))
|
||||
input = autograd.Variable(torch.randn(20, 16, 50, 32))
|
||||
output = m(input)
|
||||
```
|
||||
|
||||
planes.
|
||||
On each window, the function computed is: f(X) = pow(sum(pow(X, p)), 1/p)
|
||||
At p = infinity, one gets Max Pooling
|
||||
At p = 1, one gets Average Pooling
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
kernel_size | | the size of the window. Can be a single number k (for a square kernel of k x k) or a tuple (kh x kw)
|
||||
stride | kernel_size | the stride of the window. Can be a single number s or a tuple (sh x sw).
|
||||
ceil_mode | | when True, will use "ceil" instead of "floor" to compute the output shape
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ * , * , *, * ] | Input is minibatch x channels x iH x iW
|
||||
output | [ * , * , *, * ] | Output shape = minibatch x channels x floor((iH + 2*padH - kH) / sH + 1) x floor((iW + 2*padW - kW) / sW + 1)
|
||||
|
|
@ -1,346 +0,0 @@
|
|||
## Recurrent layers
|
||||
### RNN
|
||||
|
||||
Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence.
|
||||
|
||||
```python
|
||||
h_t = tanh(w_ih * x_t + b_ih + w_hh * h_(t-1) + b_hh)
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.RNN(10, 20, 2)
|
||||
input = Variable(torch.randn(5, 3, 10))
|
||||
h0 = Variable(torch.randn(2, 3, 20))
|
||||
output, hn = rnn(input, h0)
|
||||
```
|
||||
|
||||
|
||||
|
||||
For each element in the input sequence, each layer computes the following
|
||||
function:
|
||||
where `h_t` is the hidden state at time t, and `x_t` is the hidden
|
||||
state of the previous layer at time t or `input_t` for the first layer.
|
||||
If nonlinearity='relu', then ReLU is used instead of tanh.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
num_layers | | the number of recurrent layers.
|
||||
nonlinearity | 'tanh' | The non-linearity to use ['tanh'|'relu'].
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
|
||||
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
|
||||
bidirectional | False | If True, becomes a bidirectional RNN.
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
|
||||
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_k) from the last layer of the RNN, for each k
|
||||
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for k=seq_len
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer, of shape (input_size x hidden_size)
|
||||
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer, of shape (hidden_size x hidden_size)
|
||||
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer, of shape (hidden_size)
|
||||
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer, of shape (hidden_size)
|
||||
### LSTM
|
||||
|
||||
Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
|
||||
|
||||
```python
|
||||
i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
|
||||
f_t = sigmoid(W_if x_t + b_if + W_hf h_(t-1) + b_hf)
|
||||
g_t = tanh(W_ig x_t + b_ig + W_hc h_(t-1) + b_hg)
|
||||
o_t = sigmoid(W_io x_t + b_io + W_ho h_(t-1) + b_ho)
|
||||
c_t = f_t * c_(t-1) + i_t * g_t
|
||||
h_t = o_t * tanh(c_t)
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.LSTM(10, 20, 2)
|
||||
input = Variable(torch.randn(5, 3, 10))
|
||||
h0 = Variable(torch.randn(2, 3, 20))
|
||||
c0 = Variable(torch.randn(2, 3, 20))
|
||||
output, hn = rnn(input, (h0, c0))
|
||||
```
|
||||
|
||||
|
||||
|
||||
For each element in the input sequence, each layer computes the following
|
||||
function:
|
||||
where `h_t` is the hidden state at time t, `c_t` is the cell state at time t,
|
||||
`x_t` is the hidden state of the previous layer at time t or input_t for the first layer,
|
||||
and `i_t`, `f_t`, `g_t`, `o_t` are the input, forget, cell, and out gates, respectively.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
num_layers | | the number of recurrent layers.
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
|
||||
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
|
||||
bidirectional | False | If True, becomes a bidirectional RNN.
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
|
||||
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
c_0 | | A (num_layers x batch x hidden_size) tensor containing the initial cell state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t
|
||||
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len
|
||||
c_n | A (num_layers x batch x hidden_size) tensor containing the cell state for t=seq_len
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer (W_ii|W_if|W_ig|W_io), of shape (input_size x 4*hidden_size)
|
||||
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer (W_hi|W_hf|W_hg|W_ho), of shape (hidden_size x 4*hidden_size)
|
||||
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer (b_ii|b_if|b_ig|b_io), of shape (4*hidden_size)
|
||||
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer (b_hi|b_hf|b_hg|b_ho), of shape (4*hidden_size)
|
||||
### GRU
|
||||
|
||||
Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
|
||||
|
||||
```python
|
||||
r_t = sigmoid(W_ir x_t + b_ir + W_hr h_(t-1) + b_hr)
|
||||
i_t = sigmoid(W_ii x_t + b_ii + W_hi h_(t-1) + b_hi)
|
||||
n_t = tanh(W_in x_t + resetgate * W_hn h_(t-1))
|
||||
h_t = (1 - i_t) * n_t + i_t * h_(t-1)
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.GRU(10, 20, 2)
|
||||
input = Variable(torch.randn(5, 3, 10))
|
||||
h0 = Variable(torch.randn(2, 3, 20))
|
||||
output, hn = rnn(input, h0)
|
||||
```
|
||||
|
||||
|
||||
|
||||
For each element in the input sequence, each layer computes the following
|
||||
function:
|
||||
where `h_t` is the hidden state at time t, `x_t` is the hidden
|
||||
state of the previous layer at time t or input_t for the first layer,
|
||||
and `r_t`, `i_t`, `n_t` are the reset, input, and new gates, respectively.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
num_layers | | the number of recurrent layers.
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
batch_first | | If True, then the input tensor is provided as (batch, seq, feature)
|
||||
dropout | | If non-zero, introduces a dropout layer on the outputs of each RNN layer
|
||||
bidirectional | False | If True, becomes a bidirectional RNN.
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (seq_len x batch x input_size) tensor containing the features of the input sequence.
|
||||
h_0 | | A (num_layers x batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
output | A (seq_len x batch x hidden_size) tensor containing the output features (h_t) from the last layer of the RNN, for each t
|
||||
h_n | A (num_layers x batch x hidden_size) tensor containing the hidden state for t=seq_len
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih_l[k] | the learnable input-hidden weights of the k-th layer (W_ir|W_ii|W_in), of shape (input_size x 3*hidden_size)
|
||||
weight_hh_l[k] | the learnable hidden-hidden weights of the k-th layer (W_hr|W_hi|W_hn), of shape (hidden_size x 3*hidden_size)
|
||||
bias_ih_l[k] | the learnable input-hidden bias of the k-th layer (b_ir|b_ii|b_in), of shape (3*hidden_size)
|
||||
bias_hh_l[k] | the learnable hidden-hidden bias of the k-th layer (b_hr|b_hi|b_hn), of shape (3*hidden_size)
|
||||
### RNNCell
|
||||
|
||||
An Elman RNN cell with tanh or ReLU non-linearity.
|
||||
|
||||
```python
|
||||
h' = tanh(w_ih * x + b_ih + w_hh * h + b_hh)
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.RNNCell(10, 20)
|
||||
input = Variable(torch.randn(6, 3, 10))
|
||||
hx = Variable(torch.randn(3, 20))
|
||||
output = []
|
||||
for i in range(6):
|
||||
hx = rnn(input, hx)
|
||||
output[i] = hx
|
||||
```
|
||||
|
||||
If nonlinearity='relu', then ReLU is used in place of tanh.
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
nonlinearity | 'tanh' | The non-linearity to use ['tanh'|'relu'].
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (batch x input_size) tensor containing input features
|
||||
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
|
||||
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
|
||||
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
|
||||
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)
|
||||
### LSTMCell
|
||||
|
||||
A long short-term memory (LSTM) cell.
|
||||
|
||||
```python
|
||||
i = sigmoid(W_ii x + b_ii + W_hi h + b_hi)
|
||||
f = sigmoid(W_if x + b_if + W_hf h + b_hf)
|
||||
g = tanh(W_ig x + b_ig + W_hc h + b_hg)
|
||||
o = sigmoid(W_io x + b_io + W_ho h + b_ho)
|
||||
c' = f * c + i * g
|
||||
h' = o * tanh(c')
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.LSTMCell(10, 20)
|
||||
input = Variable(torch.randn(6, 3, 10))
|
||||
hx = Variable(torch.randn(3, 20))
|
||||
cx = Variable(torch.randn(3, 20))
|
||||
output = []
|
||||
for i in range(6):
|
||||
hx, cx = rnn(input, (hx, cx))
|
||||
output[i] = hx
|
||||
```
|
||||
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (batch x input_size) tensor containing input features
|
||||
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
|
||||
c' | A (batch x hidden_size) tensor containing the next cell state for each element in the batch
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
|
||||
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
|
||||
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
|
||||
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)
|
||||
### GRUCell
|
||||
|
||||
A gated recurrent unit (GRU) cell
|
||||
|
||||
```python
|
||||
r = sigmoid(W_ir x + b_ir + W_hr h + b_hr)
|
||||
i = sigmoid(W_ii x + b_ii + W_hi h + b_hi)
|
||||
n = tanh(W_in x + resetgate * W_hn h)
|
||||
h' = (1 - i) * n + i * h
|
||||
```
|
||||
|
||||
```python
|
||||
rnn = nn.GRUCell(10, 20)
|
||||
input = Variable(torch.randn(6, 3, 10))
|
||||
hx = Variable(torch.randn(3, 20))
|
||||
output = []
|
||||
for i in range(6):
|
||||
hx = rnn(input, hx)
|
||||
output[i] = hx
|
||||
```
|
||||
|
||||
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input_size | | The number of expected features in the input x
|
||||
hidden_size | | The number of features in the hidden state h
|
||||
bias | True | If False, then the layer does not use bias weights b_ih and b_hh.
|
||||
|
||||
#### Inputs
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
input | | A (batch x input_size) tensor containing input features
|
||||
hidden | | A (batch x hidden_size) tensor containing the initial hidden state for each element in the batch.
|
||||
|
||||
#### Outputs
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
h' | A (batch x hidden_size) tensor containing the next hidden state for each element in the batch
|
||||
|
||||
#### Members
|
||||
|
||||
Parameter | Description
|
||||
--------- | -----------
|
||||
weight_ih | the learnable input-hidden weights, of shape (input_size x hidden_size)
|
||||
weight_hh | the learnable hidden-hidden weights, of shape (hidden_size x hidden_size)
|
||||
bias_ih | the learnable input-hidden bias, of shape (hidden_size)
|
||||
bias_hh | the learnable hidden-hidden bias, of shape (hidden_size)
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
## Sparse layers
|
||||
### Embedding
|
||||
|
||||
A simple lookup table that stores embeddings of a fixed dictionary and size
|
||||
|
||||
```python
|
||||
# an Embedding module containing 10 tensors of size 3
|
||||
embedding = nn.Embedding(10, 3)
|
||||
# a batch of 2 samples of 4 indices each
|
||||
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
|
||||
print(embedding(input))
|
||||
# example with padding_idx
|
||||
embedding = nn.Embedding(10, 3, padding_idx=0)
|
||||
input = torch.LongTensor([[0,2,0,5]])
|
||||
print(embedding(input))
|
||||
```
|
||||
|
||||
This module is often used to store word embeddings and retrieve them using indices.
|
||||
The input to the module is a list of indices, and the output is the corresponding
|
||||
word embeddings.
|
||||
|
||||
#### Constructor Arguments
|
||||
|
||||
Parameter | Default | Description
|
||||
--------- | ------- | -----------
|
||||
num_embeddings | | size of the dictionary of embeddings
|
||||
embedding_dim | | the size of each embedding vector
|
||||
padding_idx | None | If given, pads the output with zeros whenever it encounters the index.
|
||||
max_norm | None | If given, will renormalize the embeddings to always have a norm lesser than this
|
||||
norm_type | | The p of the p-norm to compute for the max_norm option
|
||||
scale_grad_by_freq | | if given, this will scale gradients by the frequency of the words in the dictionary.
|
||||
|
||||
#### Expected Shape
|
||||
| Shape | Description
|
||||
------ | ----- | ------------
|
||||
input | [ *, * ] | Input is a 2D mini_batch LongTensor of m x n indices to extract from the Embedding dictionary
|
||||
output | [ * , *, * ] | Output shape = m x n x embedding_dim
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
# torch.optim
|
||||
|
||||
The optim package in Torch provides tools for optimizing neural networks
|
||||
using a wide variety of optimization methods such as SGD, Adam, etc.
|
||||
|
||||
Currently, the following optimization methods are supported, typically with
|
||||
options such as weight decay and other bells and whistles.
|
||||
|
||||
- SGD `(params, lr=required, momentum=0, dampening=0)`
|
||||
- AdaDelta `(params, rho=0.9, eps=1e-6, weight_decay=0)`
|
||||
- Adagrad `(params, lr=1e-2, lr_decay=0, weight_decay=0)`
|
||||
- Adam `(params, lr=1e-2, betas=(0.9, 0.999), epsilon=1e-8, weight_decay=0)`
|
||||
- AdaMax `(params, lr=1e-2, betas=(0.9, 0.999), eps=1e-38, weight_decay=0)`
|
||||
- Averaged SGD `(params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0)`
|
||||
- RProp `(params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50))`
|
||||
- RMSProp `(params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0)`
|
||||
|
||||
|
||||
The usage of the Optim package itself is as follows.
|
||||
|
||||
1. Construct an optimizer
|
||||
2. Use `optimizer.step(...)` to optimize.
|
||||
- Call `optimizer.zero_grad()` to zero out the gradient buffers when appropriate
|
||||
|
||||
## 1. Constructing the optimizer
|
||||
|
||||
One first constructs an `Optimizer` object by giving it a list of parameters
|
||||
to optimize, as well as the optimizer options,such as learning rate, weight decay, etc.
|
||||
|
||||
Examples:
|
||||
|
||||
`optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)`
|
||||
|
||||
`optimizer = optim.Adam([var1, var2], lr = 0.0001)`
|
||||
|
||||
### Per-parameter options
|
||||
|
||||
In a more advanced usage, one can specify per-layer options by passing each parameter group along with its custom options.
|
||||
|
||||
**__Any parameter group that does not have an attribute defined will use the default attributes.__**
|
||||
|
||||
This is very useful when one wants to specify per-layer learning rates for example.
|
||||
|
||||
Example:
|
||||
|
||||
`optim.SGD([{'params': model1.parameters()}, {'params': model2.parameters(), 'lr': 1e-3}], lr=1e-2, momentum=0.9)`
|
||||
|
||||
`model1`'s parameters will use the default learning rate of `1e-2` and momentum of `0.9`
|
||||
`model2`'s parameters will use a learning rate of `1e-3`, and the default momentum of `0.9`
|
||||
|
||||
Then, you can use the optimizer by calling `optimizer.zero_grad()` and `optimizer.step(...)`. Read the next sections.
|
||||
|
||||
## 2. Taking an optimization step using `Optimizer.step(...)`
|
||||
|
||||
The step function has the following two signatures:
|
||||
|
||||
### a. `Optimizer.step(closure)`
|
||||
|
||||
The `step` function takes a user-defined closure that computes f(x) and returns the loss.
|
||||
|
||||
The closure needs to do the following:
|
||||
- Optimizer.zero_grad()
|
||||
- Compute the loss
|
||||
- Call loss.backward()
|
||||
- return the loss
|
||||
|
||||
Example 1: training a neural network
|
||||
|
||||
```python
|
||||
# Example 1: training a neural network with optimizer.step(closure)
|
||||
net = MNISTNet()
|
||||
criterion = ClassNLLLoss()
|
||||
optimizer = optim.SGD(net.parameters(), lr=0.001)
|
||||
|
||||
for data in data_batches:
|
||||
input, target = data
|
||||
def closure():
|
||||
optimizer.zero_grad()
|
||||
output = net(input)
|
||||
loss = criterion(output, target)
|
||||
loss.backward()
|
||||
return loss
|
||||
optimizer.step(closure)
|
||||
```
|
||||
|
||||
Notes: Why is this required? Why can't we simply have the optimizer take the parameters and grads?
|
||||
Some optimization algorithms such as Conjugate Gradient and LBFGS need to evaluate their function
|
||||
multiple times. For such optimization methods, the function (i.e. the closure) has to be defined.
|
||||
|
||||
|
||||
### b. `Optimizer.step()`
|
||||
|
||||
This is a simplified usage that supports most, but not all optimization algorithms. For example, it does not support LBFGS or Conjugate Gradient.
|
||||
|
||||
The usage for this is to simply call the function after the backward() is called on your model.
|
||||
|
||||
Example 2: training a neural network
|
||||
|
||||
```python
|
||||
# Example 2: training a neural network with optimizer.step()
|
||||
net = MNISTNet()
|
||||
criterion = ClassNLLLoss()
|
||||
optimizer = optim.SGD(net.parameters(), lr=0.001)
|
||||
|
||||
for data in data_batches:
|
||||
input, target = data
|
||||
optimizer.zero_grad()
|
||||
output = net(input)
|
||||
loss = criterion(output, target)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
```
|
||||
|
||||
|
||||
|
|
@ -1,417 +0,0 @@
|
|||
# Tensors
|
||||
|
||||
A `Tensor` is a potentially multi-dimensional matrix.
|
||||
The number of dimensions is unlimited.
|
||||
|
||||
The `Tensor` set of classes are probably the most important class in
|
||||
`torch`. Almost every package depends on these classes. They are *__the__*
|
||||
class for handling numeric data. As with pretty much anything in
|
||||
[torch], tensors are serializable with `torch.save` and `torch.load`
|
||||
|
||||
There are 7 Tensor classes in torch:
|
||||
|
||||
- `torch.FloatTensor` : Signed 32-bit floating point tensor
|
||||
- `torch.DoubleTensor` : Signed 64-bit floating point tensor
|
||||
- `torch.ByteTensor` : Unsigned 8-bit integer tensor
|
||||
- `torch.CharTensor` : Signed 8-bit integer tensor
|
||||
- `torch.ShortTensor` : Signed 16-bit integer tensor
|
||||
- `torch.IntTensor` : Signed 32-bit integer tensor
|
||||
- `torch.LongTensor` : Signed 64-bit integer tensor
|
||||
|
||||
The data in these tensors lives on the system memory connected to your CPU.
|
||||
|
||||
Most numeric operations are implemented _only_ for `FloatTensor` and `DoubleTensor`.
|
||||
Other Tensor types are useful if you want to save memory space or specifically
|
||||
do integer operations.
|
||||
|
||||
The number of dimensions of a `Tensor` can be queried by
|
||||
`ndimension()` or `dim()`. Size of the `i-th` dimension is
|
||||
returned by `size(i)`. A tuple containing the size of all the dimensions
|
||||
can be returned by `size()`.
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
# allocate a matrix of shape 3x4
|
||||
a = torch.FloatTensor(3, 4)
|
||||
print(a)
|
||||
|
||||
# convert this into a LongTensor
|
||||
b = a.long()
|
||||
print(b)
|
||||
|
||||
# print the size of the tensor
|
||||
print(a.size())
|
||||
|
||||
# print the number of dimensions
|
||||
print(a.dim())
|
||||
```
|
||||
|
||||
These tensors can be converted to numpy arrays very efficiently
|
||||
with zero memory copies.
|
||||
For this, the two provided functions are `.numpy()` and `torch.from_numpy()`
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
# convert to numpy
|
||||
c = a.numpy()
|
||||
print(type(c))
|
||||
```
|
||||
|
||||
When using GPUs, each of the classes above has an equivalent
|
||||
class such as: `torch.cuda.FloatTensor`, `torch.cuda.LongTensor`, etc.
|
||||
When one allocates a CUDA tensor, the data in these tensors lives in the
|
||||
GPU memory.
|
||||
|
||||
One can seamlessly transfer a tensor from the CPU to the GPU, as well as
|
||||
between different GPUs on your machine.
|
||||
|
||||
Apart from the above 7 tensor types, there is one additional tensor type on the GPU
|
||||
|
||||
- `torch.cuda.HalfTensor` : Signed 16-bit floating point tensor
|
||||
|
||||
```python
|
||||
import torch.cuda
|
||||
|
||||
# allocate a matrix of shape 3x4
|
||||
a = torch.cuda.FloatTensor(3, 4)
|
||||
print(a)
|
||||
|
||||
# transfer this to the CPU
|
||||
b = a.cpu()
|
||||
print(b)
|
||||
|
||||
# transfer this back to the GPU-1
|
||||
a = b.cuda()
|
||||
print(a)
|
||||
|
||||
# transfer this to GPU-2
|
||||
b = a.cuda(1)
|
||||
```
|
||||
|
||||
## Internal data representation
|
||||
|
||||
The actual data of a `Tensor` is contained into a
|
||||
`Storage`. It can be accessed using
|
||||
`storage()`. While the memory of a
|
||||
`Tensor` has to be contained in this unique `Storage`, it might
|
||||
not be contiguous: the first position used in the `Storage` is given
|
||||
by `storage_offset()` (starting at `0`).
|
||||
And the _jump_ needed to go from one element to another
|
||||
element in the `i-th` dimension is given by
|
||||
`stride(i-1)`. See the code example for an illustration.
|
||||
|
||||
```python
|
||||
# given a 3d tensor
|
||||
x = torch.FloatTensor(7,7,7)
|
||||
|
||||
# accessing the element `(3,4,5)` can be done by
|
||||
x[3 - 1][4 - 1][5 - 1]
|
||||
# or equivalently (but slowly!)
|
||||
x.storage()[x.storageOffset()
|
||||
+ (3 - 1) * x.stride(0)
|
||||
+ (4 - 1) * x.stride(1)
|
||||
+ (5 - 1) * x.stride(2)]
|
||||
```
|
||||
|
||||
One could say that a `Tensor` is a particular way of _viewing_ a
|
||||
`Storage`: a `Storage` only represents a chunk of memory, while the
|
||||
`Tensor` interprets this chunk of memory as having dimensions:
|
||||
|
||||
```python
|
||||
# a tensor interprets a chunk of memory as having dimensions
|
||||
>>> x = torch.Tensor(4,5)
|
||||
>>> s = x.storage()
|
||||
>>> for i in range(s.size()): # fill up the Storage
|
||||
>>> s[i] = i
|
||||
|
||||
# s is interpreted by x as a 2D matrix
|
||||
>>> print(x)
|
||||
|
||||
1 2 3 4 5
|
||||
6 7 8 9 10
|
||||
11 12 13 14 15
|
||||
16 17 18 19 20
|
||||
[torch.FloatTensor of dimension 4x5]
|
||||
```
|
||||
|
||||
Note also that in Torch7 ___elements in the same row___ [elements along the __last__ dimension]
|
||||
are contiguous in memory for a matrix [tensor]:
|
||||
|
||||
This is exactly like in `C` and `numpy` (and not `Fortran`).
|
||||
|
||||
## Default Tensor type
|
||||
|
||||
For convenience, _an alias_ `torch.Tensor` is provided, which allows the user to write
|
||||
type-independent scripts, which can then be run after choosing the desired Tensor type with
|
||||
a call like
|
||||
|
||||
`torch.set_default_tensor_type('torch.DoubleTensor')`
|
||||
|
||||
|
||||
By default, the alias points to `torch.FloatTensor`.
|
||||
|
||||
## Efficient memory management
|
||||
|
||||
_All_ tensor operations post-fixed with an underscore (for example `.fill_`)
|
||||
do _not_ make any memory copy. All these methods transform the existing tensor.
|
||||
Tensor methods such as `narrow` and `select` return a new tensor referencing _the same storage_.
|
||||
This magical behavior is internally obtained by good usage of the `stride()` and
|
||||
`storage_offset()`. See the code example illustrating this.
|
||||
|
||||
```python
|
||||
>>> x = torch.Tensor(5).zero_()
|
||||
>>> print(x)
|
||||
0
|
||||
0
|
||||
0
|
||||
0
|
||||
0
|
||||
[torch.FloatTensor of dimension 5]
|
||||
>>> x.narrow(0, 1, 2).fill_(1)
|
||||
>>> # narrow() returns a Tensor referencing the same Storage as x
|
||||
>>> print(x)
|
||||
0
|
||||
1
|
||||
1
|
||||
0
|
||||
0
|
||||
[torch.FloatTensor of dimension 5]
|
||||
>>> # same thing can be achieved with slice indexing
|
||||
>>> x[1:3] = 2
|
||||
>>> print(x)
|
||||
0
|
||||
2
|
||||
2
|
||||
0
|
||||
0
|
||||
[torch.FloatTensor of dimension 5]
|
||||
```
|
||||
|
||||
If you really need to copy a `Tensor`, you can use the `copy_()` method:
|
||||
|
||||
```python
|
||||
# making a copy of a tensor
|
||||
y = x.new(x.size()).copy_(x)
|
||||
y = x.clone()
|
||||
```
|
||||
Or the convenience method `clone()`
|
||||
|
||||
We now describe all the methods for `Tensor`. If you want to specify the Tensor type,
|
||||
just replace `Tensor` by the name of the Tensor variant (like `CharTensor`).
|
||||
|
||||
## Constructors ##
|
||||
|
||||
Tensor constructors create new Tensor objects, optionally allocating
|
||||
new memory. By default the elements of a newly allocated memory are
|
||||
not initialized, therefore, might contain arbitrary numbers. Here are
|
||||
several ways to construct a new `Tensor`.
|
||||
|
||||
### torch.Tensor() ###
|
||||
|
||||
Returns an empty tensor.
|
||||
|
||||
### torch.Tensor(tensor) ###
|
||||
|
||||
Returns a new tensor which references the same `Storage` as the given `tensor`.
|
||||
The `size`, `stride`, and `storage_offset` are the same as those of the given tensor.
|
||||
|
||||
The new `Tensor` is now going to "view" the same `storage`
|
||||
as the given `tensor`. As a result, any modification in the elements
|
||||
of the `Tensor` will have an impact on the elements of the given
|
||||
`tensor`, and vice-versa. No memory copy!
|
||||
|
||||
```python
|
||||
>>> x = torch.Tensor(2,5).fill_(3.14)
|
||||
>>> x
|
||||
3.1400 3.1400 3.1400 3.1400 3.1400
|
||||
3.1400 3.1400 3.1400 3.1400 3.1400
|
||||
[torch.FloatTensor of dimension 2x5]
|
||||
|
||||
>>> y = torch.Tensor(x)
|
||||
>>> y
|
||||
3.1400 3.1400 3.1400 3.1400 3.1400
|
||||
3.1400 3.1400 3.1400 3.1400 3.1400
|
||||
[torch.FloatTensor of dimension 2x5]
|
||||
|
||||
>>> y.zero_()
|
||||
>>> x # elements of x are the same as y!
|
||||
0 0 0 0 0
|
||||
0 0 0 0 0
|
||||
[torch.FloatTensor of dimension 2x5]
|
||||
```
|
||||
|
||||
### torch.Tensor(sz1 [,sz2 [,sz3 [,sz4 [,sz5 ...]]]]]) ###
|
||||
|
||||
Create a tensor of the given sizes.
|
||||
The tensor size will be `sz1 x sz2 x sz3 x sz4 x sz5 x ...`.
|
||||
|
||||
### torch.Tensor(sizes) ###
|
||||
|
||||
Create a tensor of any number of dimensions. `sizes` gives the size in each dimension of
|
||||
the tensor and is of type `torch.Size`.
|
||||
|
||||
```python
|
||||
Example, create a 4D 4x4x3x2 tensor:
|
||||
x = torch.Tensor(torch.Size([4,4,3,2]))
|
||||
```
|
||||
|
||||
### torch.Tensor(storage) ###
|
||||
|
||||
Returns a tensor which uses the existing `Storage` starting at a storage offset of 0.
|
||||
|
||||
### torch.Tensor(sequence) ###
|
||||
|
||||
One can create a tensor from a python sequence.
|
||||
|
||||
For example, you can create a `Tensor` from a `list` or a `tuple`
|
||||
|
||||
```python
|
||||
# create a 2d tensor from a list of lists
|
||||
>>> torch.Tensor([[1,2,3,4], [5,6,7,8]])
|
||||
1 2 3 4
|
||||
5 6 7 8
|
||||
[torch.FloatTensor of dimension 2x4]
|
||||
```
|
||||
|
||||
### torch.Tensor(ndarray) ###
|
||||
|
||||
Creates a `Tensor` from a NumPy `ndarray`.
|
||||
If the `dtype` of the `ndarray` is the same as the type of the `Tensor` being created,
|
||||
The underlying memory of both are shared, i.e. if the value of an element
|
||||
in the `ndarray` is changed, the corresponding value in the `Tensor` changes,
|
||||
and vice versa.
|
||||
|
||||
```python
|
||||
# create a ndarray of dtype=int64
|
||||
>>> a = np.random.randint(2, size=10)
|
||||
>>> a
|
||||
array([0, 0, 1, 1, 0, 1, 1, 0, 0, 0])
|
||||
# create a LongTensor. Since they are the same type (int64), the memory is shared
|
||||
>>> b = torch.LongTensor(a)
|
||||
0
|
||||
0
|
||||
1
|
||||
1
|
||||
0
|
||||
1
|
||||
1
|
||||
0
|
||||
0
|
||||
0
|
||||
[torch.LongTensor of size 10]
|
||||
>>> b[3] = 100
|
||||
>>> print(a[3])
|
||||
100
|
||||
|
||||
# now create an IntTensor from the same ndarray.
|
||||
# The memory is not shared in this case as the dtype=int64 != IntTensor (int32)
|
||||
>>> b = torch.IntTensor(a)
|
||||
>>> b[3] = 30000
|
||||
>>> print(a[3])
|
||||
100
|
||||
# a did not change to the value 30000
|
||||
```
|
||||
|
||||
## NumPy Conversion ##
|
||||
### torch.from_numpy(ndarray)
|
||||
|
||||
This is a convenience function similar to the constructor above.
|
||||
Given a numpy `ndarray`, it constructs a torch `Tensor` of the same `dtype`
|
||||
as the numpy array.
|
||||
|
||||
For example, passing in an ndarray of dtype=float64 will create a torch.DoubleTensor
|
||||
|
||||
### Tensor.numpy()
|
||||
|
||||
This is a member function on a tensor that converts a torch `Tensor` to a
|
||||
numpy `ndarray`. The memory of the data of both objects is shared.
|
||||
Hence, changing a value in the `Tensor` will change the corresponding value in
|
||||
the `ndarray` and vice versa.
|
||||
|
||||
```python
|
||||
>>> a = torch.randn(3,4)
|
||||
>>> b = a.numpy() # creates a numpy array with dtype=float32 in this case
|
||||
>>> print(a)
|
||||
-1.0453 1.4730 -1.8990 -0.7763
|
||||
1.8155 1.4004 -1.5286 1.0420
|
||||
0.6551 1.0258 0.1152 -0.3239
|
||||
[torch.FloatTensor of size 3x4]
|
||||
>>> print(b)
|
||||
[[-1.04525673 1.4730444 -1.89899576 -0.77626842]
|
||||
[ 1.81549406 1.40035892 -1.5286355 1.04199517]
|
||||
[ 0.6551016 1.02575183 0.11520521 -0.32391372]]
|
||||
>>> a[2][2] = 1000
|
||||
>>> print(b)
|
||||
[[ -1.04525673e+00 1.47304440e+00 -1.89899576e+00 -7.76268423e-01]
|
||||
[ 1.81549406e+00 1.40035892e+00 -1.52863550e+00 1.04199517e+00]
|
||||
[ 6.55101597e-01 1.02575183e+00 1.00000000e+03 -3.23913723e-01]]
|
||||
# notice that b[2][2] has changed to the value 1000 too.
|
||||
```
|
||||
|
||||
### torch.is_tensor(obj)
|
||||
|
||||
Returns True if the passed-in object is a `Tensor` (of any type). Returns `False` otherwise.
|
||||
|
||||
### torch.is_storage(obj)
|
||||
|
||||
Returns True if the passed-in object is a `Storage` (of any type). Returns `False` otherwise.
|
||||
|
||||
### torch.expand_as
|
||||
### torch.expand
|
||||
### torch.view
|
||||
### torch.view_as
|
||||
### torch.permute
|
||||
### torch.pin_memory
|
||||
### copy
|
||||
### split
|
||||
### chunk
|
||||
### tolist
|
||||
### repeat
|
||||
### unsqueeze
|
||||
### unsqueeze_
|
||||
### add, iadd, sub, isub, mul, imul, matmul, div, rdiv, idiv, mod, neg
|
||||
|
||||
## GPU Semantics ##
|
||||
|
||||
When you create a `torch.cuda.*Tensor`, it is allocated on the current GPU.
|
||||
However, you could allocate it on another GPU as well, using the `with torch.cuda.device(id)` context.
|
||||
All allocations within this context will be placed on the GPU `id`.
|
||||
|
||||
Once `Tensor`s are allocated, you can do operations on them from any GPU context, and the results
|
||||
will be placed on the same device as where the source `Tensor` is located.
|
||||
|
||||
For example if Tensor `a` and `b` are on GPU-2, but the GPU-1 is the current device.
|
||||
If one does `c = a + b`, then `c` will be on GPU-2, regardless of what the current device is.
|
||||
|
||||
Cross-GPU operations are not allowed. The only Cross-GPU operation allowed is `copy`.
|
||||
|
||||
If `a` is on GPU-1 and `b` is on GPU-2, then `c = a + b` will result in an error.
|
||||
|
||||
See the example for more clarity on these semantics.
|
||||
|
||||
```python
|
||||
# Tensors are allocated on GPU 1 by default
|
||||
x = torch.cuda.FloatTensor(1)
|
||||
# x.get_device() == 0
|
||||
y = torch.FloatTensor(1).cuda()
|
||||
# y.get_device() == 0
|
||||
|
||||
with torch.cuda.device(1):
|
||||
# allocates a tensor on GPU 2
|
||||
a = torch.cuda.FloatTensor(1)
|
||||
|
||||
# transfers a tensor from CPU to GPU-2
|
||||
b = torch.FloatTensor(1).cuda()
|
||||
# a.get_device() == b.get_device() == 1
|
||||
|
||||
z = x + y
|
||||
# z.get_device() == 0 - x and y live on GPU 1 (device index 0), so the result does too
|
||||
|
||||
# even within a context, you can give a GPU id to the .cuda call
|
||||
c = torch.randn(2).cuda(2)
|
||||
# c.get_device() == 2
|
||||
|
||||
```
|
||||
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
# torch
|
||||
|
||||
```python
|
||||
# load torch with
|
||||
import torch
|
||||
```
|
||||
|
||||
```python
|
||||
# load the CUDA features of torch with
|
||||
import torch.cuda
|
||||
```
|
||||
|
||||
__torch__ is the main package where data structures for multi-dimensional
|
||||
tensors and mathematical operations over these are defined.
|
||||
Additionally, it provides many utilities for efficient serializing of
|
||||
Tensors and arbitrary types, and other useful utilities.
|
||||
|
||||
It has a CUDA counterpart, that enables you to run your tensor computations
|
||||
on an NVIDIA GPU with compute capability >= 2.0.
|
||||
|
||||
## Multi-core
|
||||
### torch.get_num_threads()
|
||||
|
||||
Gets the number of OpenMP threads that will be used for parallelizing CPU operations
|
||||
|
||||
### torch.set_num_threads(n)
|
||||
|
||||
Sets the number of OpenMP threads to use for parallelizing CPU operations
|
||||
|
||||
## Serialization
|
||||
### torch.save(object, file)
|
||||
This function pickles a Python object to the `file`. `file` is either a filename or a file handle.
|
||||
|
||||
`object` can be a picklable python object, including `torch` `Tensor`s, autograd `Variable`, nn `Module`s etc.
|
||||
|
||||
When a group of `torch` `Tensor`s are saved together, and if any of them share the same storages, then this sharing is preserved during saving and loading back.
|
||||
|
||||
|
||||
### torch.load(file)
|
||||
|
||||
This function unpickles objects that have been pickled with `torch.save`
|
||||
|
||||
## Random Numbers
|
||||
|
||||
### torch.get_rng_state()
|
||||
|
||||
Gets the current state of the torch Random Number Generator.
|
||||
|
||||
This can be passed in the future to `torch.set_rng_state` to restore the current RNG state.
|
||||
|
||||
### torch.set_rng_state(state)
|
||||
|
||||
Sets the current state of the torch Random Number Generator to the given `state`.
|
||||
|
||||
### torch.manual_seed(number)
|
||||
|
||||
Sets the initial seed of the random number generator to a given number.
|
||||
|
||||
### torch.initial_seed()
|
||||
|
||||
Returns the number that is the initial seed to the Random Number Generator
|
||||
|
||||
## CUDA
|
||||
### torch.cuda.is_available()
|
||||
|
||||
Returns `True` if CUDA is available and usable. Returns `False` otherwise.
|
||||
|
||||
### torch.cuda.device_count()
|
||||
|
||||
Returns the number of CUDA devices on the system.
|
||||
|
||||
### torch.cuda.current_device()
|
||||
|
||||
Returns the device index of the current default CUDA device.
|
||||
|
||||
### torch.cuda.synchronize()
|
||||
|
||||
This function issues a `cudaDeviceSynchronize` on the current device, and hence waits for all in-flight CUDA computation to finish.
|
||||
|
||||
### torch.cuda.current_stream()
|
||||
|
||||
Returns the handle to the current stream of the CUDA context.
|
||||
|
||||
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |