mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
Summary: **Description** Provide DeepText model with the functionality to load a secondary index (pre-trained char-ngram embedding, e.g. FastText) during training/test. Embeddings of out-of-vocabulary words will be computed on-the-fly during training/test by averaging the char-ngram embeddings. **Approach** This diff provides two custom operators to accomplish this task – ConditionalOp and IndexCharNgramGetOp. We first use IndexCharNgramGetOp to perform char-ngram index lookup and return a sparse tensor segmented by lengths for each token. The sparse tensor is then used to compute the average embedding provided by the char-ngram index. Finally, we use a ConditionalOp to replace those whose embeddings were not found in the original index during the feature apply stage. Please refer to the documentation in the code for more details. Reviewed By: jamesr66a Differential Revision: D5666924 fbshipit-source-id: f76605d093154a014d5b9ebf9510de9d79874eee
30 lines
1.0 KiB
Python
30 lines
1.0 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import numpy as np
|
|
from hypothesis import given
|
|
import hypothesis.strategies as st
|
|
from caffe2.python import core
|
|
import caffe2.python.hypothesis_test_util as hu
|
|
|
|
|
|
class TestConditionalOp(hu.HypothesisTestCase):
    """Reference test for the ``Conditional`` operator."""

    @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only)
    def test_conditional(self, rows_num, gc, dc):
        """Check ``Conditional`` against a row-by-row Python selection.

        Two random float32 tensors of shape ``(rows_num, 10, 20)`` and a
        random boolean vector of length ``rows_num`` are fed to the
        operator. The reference picks, for every row index, the row of the
        first tensor when the flag is true and the row of the second tensor
        otherwise.
        """
        op = core.CreateOperator(
            "Conditional", ["condition", "data_t", "data_f"], "output"
        )

        # Both candidate tensors share one shape; the leading axis is the
        # one the boolean vector indexes.
        shape = (rows_num, 10, 20)
        data_t = np.random.random(shape).astype(np.float32)
        data_f = np.random.random(shape).astype(np.float32)
        condition = np.random.choice(a=[True, False], size=rows_num)

        def select_rows(flags, if_true, if_false):
            # Walk the three inputs in lockstep along the first axis and
            # keep one row per flag.
            picked = [
                row_t if flag else row_f
                for flag, row_t, row_f in zip(flags, if_true, if_false)
            ]
            # The reference result is returned as a 1-tuple (one entry for
            # the operator's single "output" blob).
            return (picked,)

        self.assertReferenceChecks(
            gc, op, [condition, data_t, data_f], select_rows
        )
|