# pytorch/caffe2/python/layers/feature_sparse_to_dense.py
# @package sparse_to_dense
# Module caffe2.python.layers.sparse_to_dense
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.layers import (
    ModelLayer,
)

import numpy as np


class FeatureSparseToDense(ModelLayer):
    def __init__(self, model, input_record, input_specs,
                 name='feature_sparse_to_dense', **kwargs):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be
        more precise, it is a namedtuple that should have:
        'feature_type', 'feature_names', 'feature_ids'
        """
        super(FeatureSparseToDense, self).__init__(model, name,
                                                   input_record, **kwargs)

        self.input_specs = input_specs

        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) == \
                len(feature_specs.feature_ids)
            if feature_specs.feature_type == 'FLOAT':
                outputs.append((
                    field,
                    schema.Scalar(
                        (np.float32, (len(feature_specs.feature_ids), )),
                        self.get_next_blob_reference(field + '_output')
                    )
                ))
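            # ID_LIST features are densified into two blobs: a per-feature
            # (offset, length) `ranges` view for each example, plus a single
            # flat `values` blob holding the ids themselves.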
            elif feature_specs.feature_type == 'ID_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('values',
                         schema.Scalar(
                             np.int64,
                             self.get_next_blob_reference(field + '_values')
                         )),
                    )
                ))
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('ids',
                         schema.Scalar(
                             np.int64,
                             self.get_next_blob_reference(field + '_ids')
                         )),
                        ('scores',
                         schema.Scalar(
                             np.float32,
                             self.get_next_blob_reference(field + '_scores')
                         )),
                    )
                ))
            elif feature_specs.feature_type == 'EMBEDDING':
                # We don't know the dimensions of the embeddings in the input
                # data. Even though they should match the dimensions from the
                # feature config, we keep the ranges blob so the input data
                # can be checked later.
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                         schema.Scalar(
                             (np.int32, (len(feature_specs.feature_ids), 2)),
                             self.get_next_blob_reference(field + '_ranges')
                         )),
                        ('values',
                         schema.Scalar(
                             np.float32,
                             self.get_next_blob_reference(field + '_values')
                         )),
                    )
                ))
            else:
                raise TypeError(
                    "Unsupported input type: {0}".format(
                        feature_specs.feature_type))
        # TODO(amalevich): This schema is producing ranges, so anything
        # consuming it should support ranges as well. It might be confusing
        # if we don't add better support for ranges / have it as a first
        # layer.
        self.output_schema = schema.Struct(
            *outputs
        )

        # TODO(amalevich): Consider moving this data into the schema instead.
        # Structs don't support attaching metadata to them and cloning will
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would that be too much work
        # and not worth it?
        for field, feature_specs in input_specs:
            schema.attach_metadata_to_scalars(
                self.output_schema[field],
                schema.Metadata(
                    feature_specs=feature_specs)
            )

        self.zero = model.global_constants['ZERO']
        self.zero_range = model.global_constants['ZERO_RANGE']
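
    # Rough shape of the output schema built above (a sketch, for one FLOAT
    # field 'f' with two feature ids and one ID_LIST field 'g'):
    #
    #     f        -> float32, shape (batch_size, 2)
    #     g.ranges -> int32, shape (batch_size, len(feature_ids), 2)
    #     g.values -> flat int64 blob of all densified ids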

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == 'FLOAT':
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.zero,
                        record[field].lengths(),
                    ],
                    [
                        self.output_schema[field](),
                    ],
                    mask=feature_specs.feature_ids,
                )
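                # SparseToDenseMask scatters each example's (key, value)
                # pairs into a fixed-size row ordered by `mask`, filling
                # missing keys with `self.zero`. E.g. (numbers hypothetical)
                # with mask=[11, 12] and one example having keys=[12],
                # values=[0.5], the dense row comes out as [0.0, 0.5].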
            elif feature_specs.feature_type == 'ID_LIST':
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_list_ranges')
                )
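                # LengthsToRanges turns per-key lengths into (offset, length)
                # pairs indexing the flat values blob, e.g. lengths [2, 0, 3]
                # become ranges [[0, 2], [2, 0], [2, 3]].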
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                # TODO: merge this with the ID_LIST case above?
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_score_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.keys(),
                          self.output_schema[field].ids())
                net.Alias(record[field].values.values(),
                          self.output_schema[field].scores())
            elif feature_specs.feature_type == 'EMBEDDING':
                ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('embeddings_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        ranges,
                        self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names can have strange
                # consequences at delivery time, when the content of the
                # blobs is generated based on the input specs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())

    def get_metadata(self):
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    {
                        'type': feature_specs.feature_type,
                        'names': feature_specs.feature_names,
                        'ids': feature_specs.feature_ids,
                    },
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types()
                )
            )
            if feature_specs.feature_type == 'FLOAT':
                metadata[-1][0]['cardinality'] = 1
        return metadata
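
# Usage sketch (hypothetical; assumes the layer-model helper pattern in which
# registered layers are exposed as methods on the model):
#
#     dense = model.FeatureSparseToDense(model.input_feature_schema,
#                                        input_specs)
#
# get_metadata() returns one (spec_dict, blobs, types) triple per field,
# e.g. ({'type': 'FLOAT', 'names': [...], 'ids': [...], 'cardinality': 1},
#       [<output blobs>], [<output types>]).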