# pytorch/caffe2/python/layers/feature_sparse_to_dense.py
# @package sparse_to_dense
# Module caffe2.python.layers.sparse_to_dense
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.layers import (
    ModelLayer,
)

import numpy as np


class FeatureSparseToDense(ModelLayer):
    def __init__(self, model, input_record, input_specs,
                 name='feature_sparse_to_dense', **kwargs):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be
        more precise, it is a namedtuple that should have:
        'feature_type', 'feature_names', 'feature_ids'
        """
        super(FeatureSparseToDense, self).__init__(model, name,
                                                   input_record, **kwargs)

        self.input_specs = input_specs

        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) == \
                len(feature_specs.feature_ids)
            if feature_specs.feature_type == 'FLOAT':
                outputs.append((
                    field,
                    schema.Scalar(
                        (np.float32, (len(feature_specs.feature_ids), )),
                        self.get_next_blob_reference(field + '_output')
                    )
                ))
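            # ID_LIST features are densified into two blobs: a per-feature
            # (offset, length) `ranges` view for each example, plus a single
            # flat `values` blob holding the ids themselves.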
            elif feature_specs.feature_type == 'ID_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('values',
                         schema.Scalar(
                             np.int64,
                             self.get_next_blob_reference(field + '_values')
                         )),
                    )
                ))
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('ids',
                         schema.Scalar(
                             np.int64,
                             self.get_next_blob_reference(field + '_ids')
                         )),
                        ('scores',
                         schema.Scalar(
                             np.float32,
                             self.get_next_blob_reference(field + '_scores')
                         )),
                    )
                ))
            elif feature_specs.feature_type == 'EMBEDDING':
                # We don't know the dimensions of the embeddings in the input
                # data. Even though they should match the dimensions from the
                # feature config, we keep the ranges blob so the input data
                # can be checked later.
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('values',
                         schema.Scalar(
                             np.float32,
                             self.get_next_blob_reference(field + '_values')
                         )),
                    )
                ))
            else:
                raise TypeError(
                    "Unsupported input type: {0}".format(
                        feature_specs.feature_type))
        # TODO(amalevich): This schema is producing ranges, so anything
        # consuming it should support ranges as well. It might be confusing
        # if we don't add better support for ranges / have it as a first
        # layer.
        self.output_schema = schema.Struct(
            *outputs
        )

        # TODO(amalevich): Consider moving this data into the schema instead.
        # Structs don't support attaching metadata to them and cloning will
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would that be too much work
        # and not worth it?
        for field, feature_specs in input_specs:
            schema.attach_metadata_to_scalars(
                self.output_schema[field],
                schema.Metadata(
                    feature_specs=feature_specs)
            )

        self.zero = model.global_constants['ZERO']
        self.zero_range = model.global_constants['ZERO_RANGE']
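
    # Rough shape of the output schema built above (a sketch, for one FLOAT
    # field 'f' with two feature ids and one ID_LIST field 'g'):
    #
    #     f        -> float32, shape (batch_size, 2)
    #     g.ranges -> int32, shape (batch_size, len(feature_ids), 2)
    #     g.values -> flat int64 blob of all densified ids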

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == 'FLOAT':
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.zero,
                        record[field].lengths(),
                    ],
                    [
                        self.output_schema[field](),
                    ],
                    mask=feature_specs.feature_ids,
                )
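                # SparseToDenseMask scatters each example's (key, value)
                # pairs into a fixed-size row ordered by `mask`, filling
                # missing keys with `self.zero`. E.g. (numbers hypothetical)
                # with mask=[11, 12] and one example having keys=[12],
                # values=[0.5], the dense row comes out as [0.0, 0.5].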
            elif feature_specs.feature_type == 'ID_LIST':
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_list_ranges')
                )
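                # LengthsToRanges turns per-key lengths into (offset, length)
                # pairs indexing the flat values blob, e.g. lengths [2, 0, 3]
                # become ranges [[0, 2], [2, 0], [2, 3]].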
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                # TODO: merge this with the ID_LIST case above?
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_score_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.keys(),
                          self.output_schema[field].ids())
                net.Alias(record[field].values.values(),
                          self.output_schema[field].scores())
            elif feature_specs.feature_type == 'EMBEDDING':
                ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('embeddings_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        ranges,
                        self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())

    def get_metadata(self):
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    {
                        'type': feature_specs.feature_type,
                        'names': feature_specs.feature_names,
                        'ids': feature_specs.feature_ids,
                    },
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types()
                )
            )
            if feature_specs.feature_type == 'FLOAT':
                metadata[-1][0]['cardinality'] = 1
        return metadata
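
# Usage sketch (hypothetical; assumes the layer-model helper pattern in which
# registered layers are exposed as methods on the model):
#
#     dense = model.FeatureSparseToDense(model.input_feature_schema,
#                                        input_specs)
#
# get_metadata() returns one (spec_dict, blobs, types) triple per field,
# e.g. ({'type': 'FLOAT', 'names': [...], 'ids': [...], 'cardinality': 1},
#       [<output blobs>], [<output types>]).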