## @package tt_core
# Module caffe2.python.tt_core

"""
The following methods are various utility methods for using the Tensor-Train
decomposition, or TT-decomposition, introduced by I. V. Oseledets (2011) in his
paper (http://epubs.siam.org/doi/abs/10.1137/090752286).

Broadly speaking, these methods are used to replace fully connected layers in
neural networks with Tensor-Train layers introduced by A. Novikov et al. (2015)
in their paper (http://arxiv.org/abs/1509.06569). More details about each of
the methods are provided in each respective docstring.
"""

import numpy as np


def init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234):
    """
    Initialize randomized orthogonalized TT-cores.

    This method should be used when a TT-layer is trained from scratch. The
    sizes of each of the cores are specified by inp_sizes and out_sizes, and
    the respective tt_ranks dictate the ranks of each of the cores. Note that
    larger tt_ranks result in slower computation but more accurate
    approximations. The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    The following relationship between the lengths of the inputs is expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    Args:
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores
        seed: integer to seed the random number generator

    Returns:
        cores: one-dimensional array of the cores concatenated along an axis
    """
    np.random.seed(seed)

    # Assert that the sizes of the inputs are consistent
    assert(len(inp_sizes) == len(out_sizes)), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(len(tt_ranks) == len(inp_sizes) + 1), \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Initialize the flat cores array
    cores_len = np.sum(
        inp_sizes * out_sizes * tt_ranks[1:] * tt_ranks[:-1])
    cores = np.zeros(cores_len)
    cores_idx = 0
    rv = 1

    # Compute the full list of cores by computing each individual one
    for i in range(inp_sizes.shape[0]):
        shape = [tt_ranks[i],
                 inp_sizes[i],
                 out_sizes[i],
                 tt_ranks[i + 1]]

        # Precompute the shape of each core
        tall_shape = (np.prod(shape[:3]), shape[3])

        # Randomly initialize the current core using a normal distribution
        curr_core = np.dot(rv, np.random.normal(
            0, 1, size=(shape[0], np.prod(shape[1:]))))
        curr_core = curr_core.reshape(tall_shape)

        # Orthogonalize the initialized current core and append it to the
        # flat cores array
        if i < inp_sizes.shape[0] - 1:
            curr_core, rv = np.linalg.qr(curr_core)
        cores[cores_idx:cores_idx + curr_core.size] = curr_core.flatten()
        cores_idx += curr_core.size

    # Normalize the cores using this Glorot-style trick
    glorot_style = (np.prod(inp_sizes) *
                    np.prod(tt_ranks))**(1.0 / inp_sizes.shape[0])

    return (0.1 / glorot_style) * np.array(cores).astype(np.float32)
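# Illustrative usage sketch. The helper below is hypothetical (not part of the
# original Caffe2 module) and only shows how init_tt_cores() might be called
# for a 784 x 625 fully connected layer, assuming the factorizations
# 784 = 4 * 7 * 4 * 7 and 625 = 5 * 5 * 5 * 5 with TT-ranks [1, 3, 3, 3, 1].
def _example_init_tt_cores():
    inp_sizes = [4, 7, 4, 7]
    out_sizes = [5, 5, 5, 5]
    tt_ranks = [1, 3, 3, 3, 1]
    cores = init_tt_cores(inp_sizes, out_sizes, tt_ranks)

    # The flat result concatenates all cores, so its length is the sum over i
    # of tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]
    # (1*4*5*3 + 3*7*5*3 + 3*4*5*3 + 3*7*5*1 = 660 here).
    expected_len = sum(
        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]
        for i in range(len(inp_sizes)))
    assert cores.size == expected_len and cores.dtype == np.float32
    return cores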
def matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks):
    """
    Convert a matrix into the TT-format.

    This method will consume a 2D weight matrix such as those used in fully
    connected layers in a neural network and will compute the TT-decomposition
    of the weight matrix and return the TT-cores of the resulting computation.
    This method should be used when converting a trained fully connected layer
    into a TT-layer for increased speed and decreased parameter size. The size
    of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    The following relationship between the lengths of the inputs is expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    We also require that np.prod(inp_sizes) == W.shape[0] and that
    np.prod(out_sizes) == W.shape[1].

    Args:
        W: two-dimensional weight matrix numpy array representing a fully
           connected layer to be converted to TT-format; note that the weight
           matrix is transposed before being decomposed because we want to
           emulate the X * W^T operation that the FC layer performs.
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        new_cores: one-dimensional array of the cores concatenated along an axis
    """

    # Assert that the sizes of the inputs are consistent
    assert(len(inp_sizes) == len(out_sizes)), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(len(tt_ranks) == len(inp_sizes) + 1), \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(W.shape[0] == np.prod(inp_sizes)), \
        "The product of the input sizes (" + str(np.prod(inp_sizes)) + \
        ") must be equal to the first dimension of W (" + str(W.shape[0]) + ")."

    assert(W.shape[1] == np.prod(out_sizes)), \
        "The product of the output sizes (" + str(np.prod(out_sizes)) + \
        ") must be equal to the second dimension of W (" + str(W.shape[1]) + ")."

    # W is transposed so that the multiplication X * W^T can be computed, just
    # as it is in the FC layer.
    W = W.transpose()

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Copy the original weight matrix in order to permute and reshape it. In
    # addition, inp_sizes and out_sizes are combined into a single sizes array
    # to use the tt_svd helper method, which only consumes a single sizes
    # array.
    W_copy = W.copy()
    total_inp_size = inp_sizes.size
    W_copy = np.reshape(W_copy, np.concatenate((inp_sizes, out_sizes)))
    order = np.repeat(np.arange(0, total_inp_size), 2) + \
        np.tile([0, total_inp_size], total_inp_size)
    W_copy = np.transpose(W_copy, axes=order)
    W_copy = np.reshape(W_copy, inp_sizes * out_sizes)

    # Use the helper method to convert the W matrix copy into the preliminary
    # cores array.
    cores = tt_svd(W_copy, inp_sizes * out_sizes, tt_ranks)

    # Permute the dimensions of each of the cores to be compatible with the
    # TT-layer.
    new_cores = np.zeros(cores.shape).astype(np.float32)
    idx = 0
    for i in range(len(inp_sizes)):
        shape = (tt_ranks[i], inp_sizes[i], out_sizes[i], tt_ranks[i + 1])
        current_core = cores[idx:idx + np.prod(shape)].reshape(shape)
        current_core = current_core.transpose((1, 3, 0, 2))
        new_cores[new_cores.shape[0] - idx - np.prod(shape):
                  new_cores.shape[0] - idx] \
            = current_core.flatten()
        idx += np.prod(shape)

    return new_cores
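# Illustrative usage sketch. The helper below is hypothetical (not part of the
# original Caffe2 module) and only shows how a trained fully connected weight
# matrix might be converted with matrix_to_tt(); the small shapes (a 24 x 36
# matrix factored as [2, 3, 4] x [3, 3, 4]) are assumptions chosen to keep the
# example readable.
def _example_matrix_to_tt():
    inp_sizes = [2, 3, 4]
    out_sizes = [3, 3, 4]
    tt_ranks = [1, 4, 4, 1]

    # Stand-in for a trained FC weight matrix of shape
    # (np.prod(inp_sizes), np.prod(out_sizes)).
    W = np.random.randn(np.prod(inp_sizes), np.prod(out_sizes))

    new_cores = matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks)

    # The result is a flat float32 vector holding all of the permuted cores.
    expected_len = sum(
        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]
        for i in range(len(inp_sizes)))
    assert new_cores.size == expected_len
    return new_cores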
def tt_svd(W, sizes, tt_ranks):
    """
    Helper method for the matrix_to_tt() method performing the TT-SVD
    decomposition.

    Uses the TT-decomposition algorithm to convert a matrix to TT-format using
    multiple reduced SVD operations.

    Args:
        W: two-dimensional weight matrix representing a fully connected layer
           to be converted to TT-format, preprocessed by the matrix_to_tt()
           method.
        sizes: list of the dimensions of each of the cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        cores: one-dimensional array of the cores concatenated along an axis
    """
    assert(len(tt_ranks) == len(sizes) + 1)

    C = W.copy()
    total_size = sizes.size
    core = np.zeros(np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:]),
                    dtype='float32')

    # Compute iterative reduced SVD operations and store each resulting U
    # matrix as an individual core.
    pos = 0
    for i in range(0, total_size - 1):
        shape = tt_ranks[i] * sizes[i]
        C = np.reshape(C, [shape, -1])
        U, S, V = np.linalg.svd(C, full_matrices=False)
        U = U[:, 0:tt_ranks[i + 1]]
        S = S[0:tt_ranks[i + 1]]
        V = V[0:tt_ranks[i + 1], :]

        core[pos:pos + tt_ranks[i] * sizes[i] * tt_ranks[i + 1]] = U.ravel()
        pos += tt_ranks[i] * sizes[i] * tt_ranks[i + 1]
        C = np.dot(np.diag(S), V)

    core[pos:pos + tt_ranks[total_size - 1] *
         sizes[total_size - 1] * tt_ranks[total_size]] = C.ravel()
    return core


# TODO(Surya) Write a method to convert an entire network where all fully
# connected layers are replaced by a TT-layer.
def fc_net_to_tt_net(net):
    pass
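# Illustrative sanity check. The helper and the __main__ guard below are
# hypothetical additions (not part of the original Caffe2 module): for a
# 4 x 6 matrix treated as a two-core tensor with non-truncating ranks
# [1, 4, 1], the two cores produced by tt_svd() should multiply back to the
# original matrix up to float32 round-off.
def _example_tt_svd_roundtrip():
    sizes = np.array([4, 6])
    tt_ranks = np.array([1, 4, 1])
    T = np.random.randn(4, 6)

    core = tt_svd(T, sizes, tt_ranks)

    # Core i occupies tt_ranks[i] * sizes[i] * tt_ranks[i + 1] entries of the
    # flat output, so the two cores can be sliced back out and multiplied.
    G0 = core[:1 * 4 * 4].reshape(4, 4)   # (r0 * s0, r1) with r0 == 1
    G1 = core[1 * 4 * 4:].reshape(4, 6)   # (r1, s1 * r2) with r2 == 1
    assert np.allclose(np.dot(G0, G1), T, atol=1e-4)


if __name__ == "__main__":
    # Run the illustrative sketches as a quick smoke test.
    _example_init_tt_cores()
    _example_matrix_to_tt()
    _example_tt_svd_roundtrip()
    print("tt_core example sketches ran successfully.")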