# @package sparse_to_dense
# Module caffe2.python.layers.sparse_to_dense
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.layers import (
    ModelLayer,
)

import numpy as np

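
# Converts the sparse feature fields of an input record into dense,
# fixed-shape blobs by matching each example's feature ids against the
# ids listed in `input_specs` (using the SparseToDenseMask operator).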
class FeatureSparseToDense(ModelLayer):

    def __init__(self, model, input_record, input_specs,
                 name='feature_sparse_to_dense', **kwargs):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be
        more precise, it's a namedtuple that should have:
            'feature_type', 'feature_names', 'feature_ids'
        """
        super(FeatureSparseToDense, self).__init__(model, name,
                                                   input_record, **kwargs)

        self.input_specs = input_specs
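
        # Build one output field per input field; the schema of each
        # output depends on the declared feature type.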
        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) ==\
                len(feature_specs.feature_ids)
            if feature_specs.feature_type == 'FLOAT':
                outputs.append((
                    field,
                    schema.Scalar(
                        (np.float32, (len(feature_specs.feature_ids), )),
                        self.get_next_blob_reference(field + '_output')
                    )
                ))
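            # An ID_LIST densifies into a (ranges, values) pair: `ranges`
            # holds one (offset, length) row per feature id, indexing into
            # the flat `values` blob.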
            elif feature_specs.feature_type == 'ID_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                        ),
                        ('values',
                            schema.Scalar(np.int64,
                                self.get_next_blob_reference(
                                    field + '_values')
                            ),
                        )
                    )
                ))
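            # An ID_SCORE_LIST additionally carries a float score per id,
            # so it densifies into a (ranges, ids, scores) triple.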
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                        ),
                        ('ids',
                            schema.Scalar(np.int64,
                                self.get_next_blob_reference(
                                    field + '_ids')
                            ),
                        ),
                        ('scores',
                            schema.Scalar(np.float32,
                                self.get_next_blob_reference(
                                    field + '_scores')
                            ),
                        )
                    )
                ))
            elif feature_specs.feature_type == 'EMBEDDING':
                # We don't know the dimensions of the embeddings in the
                # input data. Even though they should match the dimensions
                # from the feature config, we keep the ranges blob so the
                # input data can be checked later.
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                        ),
                        ('values',
                            schema.Scalar(np.float32,
                                self.get_next_blob_reference(
                                    field + '_values')
                            ),
                        )
                    )
                ))
            else:
                raise TypeError(
                    "Unsupported input type: {0}".format(
                        feature_specs.feature_type))

        # TODO(amalevich): This schema produces ranges, so anything consuming
        # it has to support ranges as well. That might be confusing unless we
        # add better support for ranges / have this as a first layer.
        self.output_schema = schema.Struct(
            *outputs
        )

        # TODO(amalevich): Consider moving this data into the schema instead.
        # Structs don't support attaching metadata to them, and cloning would
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would that be too much work
        # and not worth it?
        for field, feature_specs in input_specs:
            schema.attach_metadata_to_scalars(
                self.output_schema[field],
                schema.Metadata(
                    feature_specs=feature_specs)
            )
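
        # Global constants that supply the padding values SparseToDenseMask
        # uses for feature ids that are absent from an example.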
        self.zero = model.global_constants['ZERO']
        self.zero_range = model.global_constants['ZERO_RANGE']

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == 'FLOAT':
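                # Scatter each example's (key, value) pairs into a dense
                # float vector ordered by feature_ids; missing features are
                # filled in from the zero constant.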
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.zero,
                        record[field].lengths(),
                    ],
                    [
                        self.output_schema[field](),
                    ],
                    mask=feature_specs.feature_ids,
                )
            elif feature_specs.feature_type == 'ID_LIST':
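                # Convert the per-id-list lengths into (offset, length)
                # ranges and densify those per feature id; the flat values
                # blob is passed through via Alias below.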
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense
                # calls produce new blobs. Reusing blob names might have
                # weird consequences at delivery time, when the content of
                # the blobs is generated based on the inputSpecs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                # TODO: merge this with the ID_LIST case above?
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_score_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense
                # calls produce new blobs. Reusing blob names might have
                # weird consequences at delivery time, when the content of
                # the blobs is generated based on the inputSpecs.
                net.Alias(record[field].values.keys(),
                          self.output_schema[field].ids())
                net.Alias(record[field].values.values(),
                          self.output_schema[field].scores())
            elif feature_specs.feature_type == 'EMBEDDING':
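                # Same ranges scheme as ID_LIST, but here the flat values
                # blob holds the float embedding entries themselves.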
                ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('embeddings_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        ranges,
                        self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense
                # calls produce new blobs. Reusing blob names might have
                # weird consequences at delivery time, when the content of
                # the blobs is generated based on the inputSpecs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())

    def get_metadata(self):
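        # Summarize each output field as a (spec dict, blobs, types) triple;
        # FLOAT features additionally report a cardinality of 1.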
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    {
                        'type': feature_specs.feature_type,
                        'names': feature_specs.feature_names,
                        'ids': feature_specs.feature_ids,
                    },
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types()
                )
            )
            if feature_specs.feature_type == 'FLOAT':
                metadata[-1][0]['cardinality'] = 1
        return metadata
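

# A minimal usage sketch, kept as a comment. The field name, ids, and model
# setup below are hypothetical; it assumes `model` is a LayerModelHelper
# whose input record has a sparse keys/values/lengths field named
# 'float_features', and that schema.FeatureSpec accepts these keyword
# arguments:
#
#     specs = [
#         ('float_features', schema.FeatureSpec(
#             feature_type='FLOAT',
#             feature_names=['f1', 'f2'],
#             feature_ids=[1, 2],
#         )),
#     ]
#     layer = FeatureSparseToDense(
#         model, model.input_feature_schema, specs)
#     # layer.output_schema['float_features'] now describes a float32
#     # blob of shape (2,) per example.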