# @package sparse_to_dense
# Module caffe2.python.layers.sparse_to_dense

from collections import defaultdict

import numpy as np

from caffe2.python import schema
from caffe2.python.layers.layers import AccessedFeatures, ModelLayer


class FeatureSparseToDense(ModelLayer):
    def __init__(
        self,
        model,
        input_record,
        input_specs,
        name="feature_sparse_to_dense",
        default_dense_value=None,
        **kwargs
    ):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be
        more precise, it is a namedtuple that should have:
            'feature_type', 'feature_names', 'feature_ids'
        `default_dense_value` can only be 0.0 or float("NaN"); if None, it
        defaults to 0.0.
        """
        super(FeatureSparseToDense, self).__init__(model, name, input_record, **kwargs)

        if default_dense_value is None:
            default_dense_value = 0.0
        default_dense_value = float(default_dense_value)
        assert (
            np.isnan(default_dense_value) or default_dense_value == 0.0
        ), "default_dense_value can only be 0.0 or NaN"

        self.input_specs = input_specs
        self.default_float_value = (
            model.global_constants["NAN"]
            if np.isnan(default_dense_value)
            else model.global_constants["ZERO"]
        )
        self.zero_range = model.global_constants["ZERO_RANGE"]

        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) == len(feature_specs.feature_ids)
            if feature_specs.feature_type == "FLOAT":
                outputs.append(
                    (
                        field,
                        schema.Scalar(
                            (np.float32, (len(feature_specs.feature_ids),)),
                            self.get_next_blob_reference(field + "_output"),
                        ),
                    )
                )
            elif feature_specs.feature_type == "ID_LIST":
                outputs.append(
                    (
                        field,
                        schema.Struct(
                            (
                                "ranges",
                                schema.Scalar(
                                    (np.int32, (len(feature_specs.feature_ids), 2)),
                                    self.get_next_blob_reference(field + "_ranges"),
                                ),
                            ),
                            (
                                "values",
                                schema.Scalar(
                                    np.int64,
                                    self.get_next_blob_reference(field + "_values"),
                                ),
                            ),
                        ),
                    )
                )
            elif feature_specs.feature_type == "ID_SCORE_LIST":
                outputs.append(
                    (
                        field,
                        schema.Struct(
                            (
                                "ranges",
                                schema.Scalar(
                                    (np.int32, (len(feature_specs.feature_ids), 2)),
                                    self.get_next_blob_reference(field + "_ranges"),
                                ),
                            ),
                            (
                                "ids",
                                schema.Scalar(
                                    np.int64,
                                    self.get_next_blob_reference(field + "_ids"),
                                ),
                            ),
                            (
                                "scores",
                                schema.Scalar(
                                    np.float32,
                                    self.get_next_blob_reference(field + "_scores"),
                                ),
                            ),
                        ),
                    )
                )
            elif feature_specs.feature_type == "EMBEDDING":
                # We don't know the dimensions of the embeddings in the input
                # data. Even though they should match the dimensions from the
                # feature config, we keep the ranges blob so we can check the
                # input data later.
                outputs.append(
                    (
                        field,
                        schema.Struct(
                            (
                                "ranges",
                                schema.Scalar(
                                    (np.int32, (len(feature_specs.feature_ids), 2)),
                                    self.get_next_blob_reference(field + "_ranges"),
                                ),
                            ),
                            (
                                "values",
                                schema.Scalar(
                                    np.float32,
                                    self.get_next_blob_reference(field + "_values"),
                                ),
                            ),
                        ),
                    )
                )
            elif feature_specs.feature_type == "GENERIC_FEATURE":
                # We don't know the dimensions of the embeddings in the input
                # data. Even though they should match the dimensions from the
                # feature config, we keep the ranges blob so we can check the
                # input data later.
                # Currently this schema with ranges and values is only for
                # generic type enum 1. If new types are implemented, we need
                # to modify the ParseGeneric operator and this part
                # accordingly.
                outputs.append(
                    (
                        field,
                        schema.Struct(
                            (
                                "ranges",
                                schema.Scalar(
                                    (np.int32, (len(feature_specs.feature_ids), 2)),
                                    self.get_next_blob_reference(field + "_ranges"),
                                ),
                            ),
                            (
                                "values",
                                schema.Scalar(
                                    np.float32,
                                    self.get_next_blob_reference(field + "_values"),
                                ),
                            ),
                        ),
                    )
                )
            else:
                raise TypeError(
                    "Unsupported input type: {0}".format(feature_specs.feature_type)
                )

        # TODO(amalevich): This schema is producing ranges, and thus anything
        # consuming it should support ranges as well. It might be confusing
        # if we don't add better support for ranges / have it as a first
        # layer.
        self.output_schema = schema.Struct(*outputs)

        # TODO(amalevich): Consider moving this data to the schema instead.
        # Structs don't support attaching metadata to them, and cloning will
        # break things badly, but this is the most elegant way to pass this
        # info around. Should we change it, or would it be too much work and
        # not worth it?
        for field, feature_specs in input_specs:
            schema.attach_metadata_to_scalars(
                self.output_schema[field], schema.Metadata(feature_specs=feature_specs)
            )
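
    # Usage sketch (illustrative only; the field name, feature names, and ids
    # below are hypothetical). `input_specs` is a list of (field, FeatureSpec)
    # pairs, where FeatureSpec is the namedtuple from caffe2.python.schema:
    #
    #     input_specs = [
    #         (
    #             "float_features",
    #             schema.FeatureSpec(
    #                 feature_type="FLOAT",
    #                 feature_names=["f1", "f2"],
    #                 feature_ids=[1, 2],
    #             ),
    #         ),
    #     ]
    #     dense = FeatureSparseToDense(model, model.input_feature_schema, input_specs)
    #
    # For "FLOAT" the output is a dense (batch_size, num_features) blob; the
    # list-like types are densified into a `ranges` blob plus flat value blobs.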

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == "FLOAT":
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.default_float_value,
                        record[field].lengths(),
                    ],
                    [self.output_schema[field]()],
                    mask=feature_specs.feature_ids,
                )
            elif feature_specs.feature_type == "ID_LIST":
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(), net.NextScopedBlob("id_list_ranges")
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        id_list_ranges,
                        self.zero_range,
                        record[field].lengths(),
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names might have weird
                # consequences at delivery time, when the content of the blobs
                # is generated based on the input_specs.
                net.Alias(
                    record[field].values.items(), self.output_schema[field].values()
                )
            elif feature_specs.feature_type == "ID_SCORE_LIST":
                # TODO: merge this with the ID_LIST case above?
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob("id_score_list_ranges"),
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        id_list_ranges,
                        self.zero_range,
                        record[field].lengths(),
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names might have weird
                # consequences at delivery time, when the content of the blobs
                # is generated based on the input_specs.
                net.Alias(record[field].values.keys(), self.output_schema[field].ids())
                net.Alias(
                    record[field].values.values(), self.output_schema[field].scores()
                )
            elif feature_specs.feature_type == "EMBEDDING":
                ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob("embeddings_ranges"),
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        ranges,
                        self.zero_range,
                        record[field].lengths(),
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names might have weird
                # consequences at delivery time, when the content of the blobs
                # is generated based on the input_specs.
                net.Alias(
                    record[field].values.items(), self.output_schema[field].values()
                )
            elif feature_specs.feature_type == "GENERIC_FEATURE":
                (
                    feature_lengths_blob,
                    feature_ids_blob,
                    value_lengths_blob,
                    value_values_blob,
                ) = net.ParseGeneric(
                    [record[field]()],
                    ["feature_lengths", "feature_ids", "value_lengths", "value_values"],
                    feature_type_enum=1,
                )
                # Currently our implementation only supports generic type
                # enum 1. If new types are implemented, we need to modify the
                # ParseGeneric operator, the schema above, and this part
                # accordingly to parse the generic feature strings into
                # input_record.
                ranges = net.LengthsToRanges(
                    value_lengths_blob, net.NextScopedBlob("generics_ranges")
                )
                net.SparseToDenseMask(
                    [feature_ids_blob, ranges, self.zero_range, feature_lengths_blob],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs. Reusing blob names might have weird
                # consequences at delivery time, when the content of the blobs
                # is generated based on the input_specs.
                net.Alias(value_values_blob, self.output_schema[field].values())
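
    # Layout sketch of the densified list-like outputs (illustrative data):
    # for an ID_LIST field with feature_ids = [11, 12] and a batch of two
    # examples, {11: [5, 6]} and {12: [7]}, the outputs would be roughly
    #
    #     values = [5, 6, 7]                  # all ids, concatenated
    #     ranges = [[[0, 2], [0, 0]],         # example 0: 11 -> values[0:2]
    #               [[0, 0], [2, 1]]]         # example 1: 12 -> values[2:3]
    #
    # i.e. ranges[i][j] is an (offset, length) pair into `values` for feature
    # feature_ids[j] of example i; absent features get the ZERO_RANGE (0, 0).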
    def get_metadata(self):
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    {
                        "type": feature_specs.feature_type,
                        "names": feature_specs.feature_names,
                        "ids": feature_specs.feature_ids,
                    },
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types(),
                )
            )
            if feature_specs.feature_type == "FLOAT":
                metadata[-1][0]["cardinality"] = 1
        return metadata

    def get_accessed_features(self):
        accessed_features = defaultdict(list)

        # The features that are accessed are just those features that appear
        # in the input specs.
        for field, feature_specs in self.input_specs:
            accessed_features[field].append(
                AccessedFeatures(
                    feature_specs.feature_type, set(feature_specs.feature_ids)
                )
            )

        return accessed_features
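

if __name__ == "__main__":
    # Standalone sketch (not part of the layer): a numpy emulation of what
    # the SparseToDenseMask operator does for FLOAT features. All names and
    # data below are illustrative.
    feature_ids = [11, 12, 13]  # the mask; also the dense column order
    keys = np.array([12, 11, 13, 11])  # sparse feature ids across the batch
    values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
    lengths = np.array([3, 1])  # 3 (key, value) pairs for example 0, 1 for example 1
    default = 0.0  # or float("NaN"), matching default_dense_value

    dense = np.full((len(lengths), len(feature_ids)), default, dtype=np.float32)
    column = {fid: j for j, fid in enumerate(feature_ids)}
    offset = 0
    for i, n in enumerate(lengths):
        for key, value in zip(keys[offset:offset + n], values[offset:offset + n]):
            if key in column:  # keys outside the mask are dropped
                dense[i, column[key]] = value
        offset += n
    print(dense)  # [[2. 1. 3.] [4. 0. 0.]]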