from caffe2.python import schema
from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap
import numpy as np


class SemiRandomFeatures(ArcCosineFeatureMap):
    """
    Implementation of the semi-random kernel feature map.

    Applies H(x_rand) * x_rand^s * x_learned, where
        H is the Heaviside step function,
        x_rand is the input after applying FC with randomized parameters,
        and x_learned is the input after applying FC with learnable parameters.

    If using a multilayer model with semi-random layers, the input and output
    records should have a 'full' and a 'random' Scalar. The random Scalar will
    be passed as input to process the random features.

    For more information, see the original paper:
        https://arxiv.org/pdf/1702.08882.pdf

    Inputs:
        output_dims -- dimensions of the output vector
        s -- if s == 0, will obtain linear semi-random features;
             else if s == 1, will obtain squared semi-random features;
             else if s >= 2, will obtain higher order semi-random features
        scale_random -- amount to scale the standard deviation
            (for random parameter initialization when weight_init or
            bias_init hasn't been specified)
        scale_learned -- amount to scale the standard deviation
            (for learned parameter initialization when weight_init or
            bias_init hasn't been specified)
        weight_init_random -- initialization distribution for the random
            weight parameter (if None, will use Gaussian distribution)
        bias_init_random -- initialization distribution for the random
            bias parameter (if None, will use Uniform distribution)
        weight_init_learned -- initialization distribution for the learned
            weight parameter (if None, will use Gaussian distribution)
        bias_init_learned -- initialization distribution for the learned
            bias parameter (if None, will use Uniform distribution)
        weight_optim -- optimizer for the weight params of the learned features
        bias_optim -- optimizer for the bias param of the learned features
        set_weight_as_global_constant -- if True, initialized random parameters
            will be constant across all distributed instances of the layer
    """
    def __init__(
            self,
            model,
            input_record,
            output_dims,
            s=1,
            scale_random=1.0,
            scale_learned=1.0,
            weight_init_random=None,
            bias_init_random=None,
            weight_init_learned=None,
            bias_init_learned=None,
            weight_optim=None,
            bias_optim=None,
            set_weight_as_global_constant=False,
            name='semi_random_features',
            **kwargs):

        if isinstance(input_record, schema.Struct):
            schema.is_schema_subset(
                schema.Struct(
                    ('full', schema.Scalar()),
                    ('random', schema.Scalar()),
                ),
                input_record
            )
            self.input_record_full = input_record.full
            self.input_record_random = input_record.random
        elif isinstance(input_record, schema.Scalar):
            # A plain Scalar input feeds both the random and the learned paths.
            self.input_record_full = input_record
            self.input_record_random = input_record

        super().__init__(
            model,
            self.input_record_full,
            output_dims,
            s=s,
            scale=scale_random,  # To initialize the random parameters
            weight_init=weight_init_random,
            bias_init=bias_init_random,
            weight_optim=None,  # Random parameters stay fixed, so no optimizers
            bias_optim=None,
            set_weight_as_global_constant=set_weight_as_global_constant,
            initialize_output_schema=False,
            name=name,
            **kwargs)

        self.output_schema = schema.Struct(
            ('full', schema.Scalar(
                (np.float32, output_dims),
                model.net.NextScopedBlob(name + '_full_output')
            ),),
            ('random', schema.Scalar(
                (np.float32, output_dims),
                model.net.NextScopedBlob(name + '_random_output')
            ),),
        )

        # To initialize the learnable parameters
        assert (scale_learned > 0.0), \
            "Expected scale (learned) > 0, got %s" % scale_learned
        self.stddev = scale_learned * np.sqrt(1.0 / self.input_dims)

        # Learned Parameters
        (self.learned_w, self.learned_b) = self._initialize_params(
            'learned_w',
            'learned_b',
            w_init=weight_init_learned,
            b_init=bias_init_learned,
            w_optim=weight_optim,
            b_optim=bias_optim
        )

    def add_ops(self, net):
        # Learned features: wx + b
        learned_features = net.FC(
            self.input_record_full.field_blobs() +
            [self.learned_w, self.learned_b],
            net.NextScopedBlob('learned_features')
        )
        # Random features: wx + b
        random_features = net.FC(
            self.input_record_random.field_blobs() +
            [self.random_w, self.random_b],
            net.NextScopedBlob('random_features')
        )
        # H(x_rand) * x_rand^s, written into the 'random' output Scalar
        processed_random_features = self._heaviside_with_power(
            net,
            random_features,
            self.output_schema.random.field_blobs(),
            self.s
        )
        # Full output: H(x_rand) * x_rand^s * x_learned
        net.Mul([processed_random_features, learned_features],
                self.output_schema.full.field_blobs())
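

# Example usage (a minimal sketch, assuming a caffe2 LayerModelHelper `model`
# whose input_feature_schema already exposes a float feature Scalar; the field
# name `float_features` and the dimensions below are illustrative only):
#
#     semi_random = model.SemiRandomFeatures(
#         model.input_feature_schema.float_features,
#         output_dims=64,
#         s=1,                   # squared semi-random features
#         scale_random=1.0,
#         scale_learned=1.0,
#     )
#
#     # The output record is a Struct with 'full' and 'random' Scalars, so a
#     # second semi-random layer can consume it directly to stack layers:
#     stacked = model.SemiRandomFeatures(semi_random, output_dims=32, s=1)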