Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48021. Extends the operator schema check from the simple memonger to the dag memonger as well. As part of this, a fix is made to handle in-place ops (ops with at least one output name identical to an input blob name): previously, all output blobs from ops were treated as shareable, which failed the assertion that external input blobs with the same name are not allowed to share. Test Plan: added corresponding unit tests. Reviewed By: hlu1. Differential Revision: D24968862. fbshipit-source-id: b6679a388a82b0d68f65ade64b85560354aaa3ef
import numpy as np

from caffe2.python import workspace, memonger, core, model_helper, brew
from caffe2.proto import caffe2_pb2
import caffe2.python.hypothesis_test_util as hu
from future.utils import viewvalues
import hypothesis.strategies as st
from hypothesis import given, settings
import unittest


def has_blob(proto, needle):
    for op in proto.op:
        for inp in op.input:
            if inp == needle:
                return True
        for outp in op.output:
            if outp == needle:
                return True
    return False


def count_blobs(proto):
    blobs = set()
    for op in proto.op:
        blobs = blobs.union(set(op.input)).union(set(op.output))
    return len(blobs)

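
# Broadly, memonger reduces a net's memory use by letting intermediate blobs
# whose live ranges do not overlap reuse the same underlying storage: it
# remaps blob names in the NetDef onto a smaller pool of shared blobs and
# returns the rewritten proto. The tests below therefore follow a common
# pattern: build a small net, run both the original and the optimized proto,
# check that the outputs are numerically identical, and check that the
# optimized proto references fewer blobs (via count_blobs above).
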
class MemongerTest(hu.HypothesisTestCase):
    @given(input_dim=st.integers(min_value=1, max_value=10),
           output_dim=st.integers(min_value=1, max_value=10),
           batch_size=st.integers(min_value=1, max_value=10),
           do=st.sampled_from(hu.device_options),
           algo=st.sampled_from(memonger.AssignmentAlgorithm))
    @settings(max_examples=5, deadline=None)
    def test_simple_memonger(self, input_dim, output_dim, batch_size, do, algo):
        m = model_helper.ModelHelper()
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)

        fc3.Relu([], fc3)\
           .Softmax([], "pred") \
           .LabelCrossEntropy(["label"], ["xent"]) \
           .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["loss"])
        m.net.Proto().device_option.CopyFrom(do)
        m.param_init_net.Proto().device_option.CopyFrom(do)
        static_blobs = \
            [o for op in m.param_init_net.Proto().op for o in op.output] + \
            ["data", "label", "loss", input_to_grad["fc1_w"]]

        optimization = memonger.optimize_interference(
            m.Proto(), static_blobs, algo=algo)
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("data", data, device_option=do)
        workspace.FeedBlob("label", label, device_option=do)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("loss")
        grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
        workspace.RunNetOnce(optimization.net)
        optimized_loss = workspace.FetchBlob("loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
        stats = memonger.compute_statistics(optimization.assignments)
        self.assertLess(stats.optimized_nbytes, stats.baseline_nbytes)

        # run with blob sizes
        blob_sizes = memonger.collect_blob_sizes(m.Proto())
        optimization1 = memonger.optimize_interference(
            m.Proto(), static_blobs, blob_sizes=blob_sizes, algo=algo)
        workspace.RunNetOnce(optimization1.net)
        optimized_loss = workspace.FetchBlob("loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)
        stats = memonger.compute_statistics(optimization1.assignments)
        self.assertLessEqual(stats.optimized_nbytes, stats.baseline_nbytes)

    @given(input_dim=st.integers(min_value=1, max_value=10),
           output_dim=st.integers(min_value=1, max_value=10),
           batch_size=st.integers(min_value=1, max_value=10),
           do=st.sampled_from(hu.device_options))
    @settings(max_examples=5, deadline=None)
    def test_fast_memonger(self, input_dim, output_dim, batch_size, do):
        m = model_helper.ModelHelper()
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)

        fc3.Relu([], fc3)\
           .Softmax([], "pred") \
           .LabelCrossEntropy(["label"], ["xent"]) \
           .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["loss"])
        m.net.Proto().device_option.CopyFrom(do)
        m.param_init_net.Proto().device_option.CopyFrom(do)
        static_blobs = \
            [o for op in m.param_init_net.Proto().op for o in op.output] + \
            ["data", "label", "loss", input_to_grad["fc1_w"]]

        optimized_net = memonger.optimize_inference_fast(
            m.Proto(), static_blobs)
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("data", data, device_option=do)
        workspace.FeedBlob("label", label, device_option=do)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("loss")
        grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
        workspace.RunNetOnce(optimized_net)
        optimized_loss = workspace.FetchBlob("loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        self.assertLess(count_blobs(optimized_net), count_blobs(m.Proto()))

    def test_fast_memonger_unique_outputs(self):
        m = model_helper.ModelHelper()
        fc = []
        for i in range(2):
            z = brew.fc(
                m, "data{}".format(i), "fc{}".format(i), dim_in=2, dim_out=2)
            fc.append(z)
        r = []
        # The trick here is to have the same input appear twice in one Sum
        for x in fc:
            for y in fc:
                r.append(brew.sum(m, [x, y], 1))
        concated = brew.concat(m, r, "concated")
        brew.relu(m, concated, "merged")

        static_blobs = \
            [o for op in m.param_init_net.Proto().op for o in op.output] + \
            ["merged"] + ["data{}".format(i) for i in range(len(fc))]

        optimized_net = memonger.optimize_inference_fast(
            m.Proto(), static_blobs)
        for op in optimized_net.op:
            self.assertEqual(len(op.output), len(set(op.output)), str(op))

    @given(input_dim=st.integers(min_value=1, max_value=4),
           output_dim=st.integers(min_value=1, max_value=4),
           batch_size=st.integers(min_value=1, max_value=4))
    def test_gradient_optim(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5)\
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        input_to_grad = m.AddGradientOperators(["name_x/loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=False,
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        optim_proto_wacts = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss"],
            set(viewvalues(m.param_to_grad)),
            "name_x/",
            share_activations=True,
            dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
        )
        blobs_wact_optim = count_blobs(optim_proto_wacts)
        self.assertLessEqual(blobs_wact_optim, blobs_after)

        # Check that the last activations are not shared
        self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
        self.assertTrue(
            has_blob(optim_proto_wacts, "name_x/fc5"),
            "Don't remap final activation",
        )

        # Test that the networks produce exactly the same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss = workspace.FetchBlob("name_x/loss")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.RunNetOnce(optim_proto)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

        # Overwrite the gradient blob so the next fetch cannot read a stale value
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        # Run with the forward optimization
        workspace.RunNetOnce(optim_proto_wacts)
        optimized_loss = workspace.FetchBlob("name_x/loss")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss, optimized_loss)
        np.testing.assert_almost_equal(grad, optimized_grad)

    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
    def test_memonger_mix_cpu_gpu(self):
        '''
        Check that memonger does not make blobs cross the CPU/GPU boundary
        '''
        m = model_helper.ModelHelper()
        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
            fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2)
            fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu")
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2)
            fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2)
            fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2)
            fc7_cpu.Relu([], fc7_cpu) \
               .Softmax([], "pred") \
               .LabelCrossEntropy(["label"], ["xent"]) \
               .AveragedLoss([], "loss")
        m.AddGradientOperators(["loss"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["loss"],
            set(viewvalues(m.param_to_grad)),
            "",
            share_activations=True,
            dont_share_blobs=set(),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Collect the sets of blobs used on the CPU side and on the GPU side
        # (ignoring the explicit copy ops) and check that they do not overlap
        device_blobs = {caffe2_pb2.CPU: set(), workspace.GpuDeviceType: set()}
        for op in optim_proto.op:
            if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
                dev = op.device_option.device_type
                for b in list(op.input) + list(op.output):
                    device_blobs[dev].add(b)

        device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
            device_blobs[workspace.GpuDeviceType]
        )
        self.assertEqual(device_crossers, set())

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    @settings(deadline=1000)
    def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
            fc5.Relu([], fc5) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")
        input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            m.net,
            ["name_x/loss1", "name_x/loss2"],
            set(viewvalues(m.param_to_grad)),
            "name_x",  # would be "name_x//shared_gradinp_0_shared" if using "name_x/"
            share_activations=True,
            dont_share_blobs=set(['name_x/fc6', 'name_x/fc5',
                                  str(input_to_grad["name_x/fc1_w"])]),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)
        self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

        # Test that the networks produce exactly the same gradients
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)
        np.testing.assert_almost_equal(grad, optimized_grad)

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    @settings(deadline=1000)
    def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4

        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
            fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
            fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")

            fc5.Relu([], fc5sum) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data"], "name_x"
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Test that the networks produce exactly the same results
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    @settings(deadline=10000)
    def test_forward_optim_tree_harder(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.net.Proto().type = "dag"
        m.net.Proto().num_workers = 4
        m.net.AddExternalInput("label")
        m.net.AddExternalInput("data")

        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
            fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
            fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
            fc5sum.Relu([], "relu1") \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data"], "name_x/"
        )

        blobs_after = count_blobs(optim_proto)

        # Extra test for when one of the parameters is also an input.
        # This caused a bug before.
        optim_proto_extra_input = memonger.optimize_inference_for_dag(
            m.net, ["name_x/data", "name_x/fc1_w"], "name_x/"
        )
        blobs_after_extra_input = count_blobs(optim_proto_extra_input)
        self.assertEqual(blobs_after, blobs_after_extra_input)

        self.assertLess(blobs_after, blobs_before)

        # Test that the networks produce exactly the same results
        data = np.random.randn(batch_size, input_dim).astype(np.float32)
        label = np.random.randint(
            low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
        workspace.RunNetOnce(m.param_init_net)
        workspace.FeedBlob("name_x/data", data)
        workspace.FeedBlob("name_x/label", label)
        workspace.RunNetOnce(m.net)
        loss1 = workspace.FetchBlob("name_x/loss1")
        loss2 = workspace.FetchBlob("name_x/loss2")
        workspace.RunNetOnce(optim_proto)
        optimized_loss1 = workspace.FetchBlob("name_x/loss1")
        optimized_loss2 = workspace.FetchBlob("name_x/loss2")
        np.testing.assert_almost_equal(loss1, optimized_loss1)
        np.testing.assert_almost_equal(loss2, optimized_loss2)

    # This test reproduces a scenario where the dag traversal for finding
    # shared blobs did not always start from ops with an in-degree of 0
    @settings(deadline=10000)
    def test_forward_optim_tree_dag_traversal(self):
        input_dim = 4
        output_dim = 4
        batch_size = 4

        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4

        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)

            fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
            fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
            fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

            # Branch
            fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
            fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
            fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

            fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")

            fc5.Relu([], fc5sum) \
               .Softmax([], "pred1") \
               .LabelCrossEntropy(["label"], ["xent1"]) \
               .AveragedLoss([], "loss1")
            fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
            fc6.Relu([], fc6) \
               .Softmax([], "pred2") \
               .LabelCrossEntropy(["label"], ["xent2"]) \
               .AveragedLoss([], "loss2")

        blobs_before = count_blobs(m.net.Proto())
        # Add name_x/fc5_w (which belongs to a non-root op) to the heads
        # to make sure that the dag traversal always starts from the root ops
        optim_proto = memonger.optimize_inference_for_dag(
            m.net, ["name_x/fc5_w", "name_x/data"], "name_x"
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

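    # Background for the two tests below (paraphrasing the commit summary at
    # the top of this file): some Caffe2 operators, such as IndexFreeze,
    # declare in their schema that an output must reuse an input blob's name
    # (an "enforced in-place" op). The dag memonger consults the op schema so
    # that such outputs are not treated as freely shareable blobs; sharing
    # them would trip the assertion that an external input blob of the same
    # name may not be remapped.
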
    # This specifically verifies the op schema check done in memonger
    def test_forward_optim_tree_enforce_inplace_op_invalid(self):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4

        net = m.net
        # IndexFreeze's schema enforces in-place use; "A" -> "B" violates it
        net.IndexFreeze("A", "B")
        net.Sum(["B", "B"], "C")
        net.Relu("C", "D")
        net.Sum(["D", "D"], "E")

        with self.assertRaises(RuntimeError):
            memonger.optimize_inference_for_dag(net, ["A"], "")

    # Here the in-place op is specifically a root op, to reproduce the
    # scenario where the dag memonger could treat all of an op's output blobs
    # as shareable and then fail the assertion that an input blob with the
    # same name is not allowed to share
    def test_forward_optim_tree_enforce_inplace_op_valid_and_as_head(self):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4

        net = m.net
        net.IndexFreeze("A", "A")  # in-place, as the op schema enforces
        net.Sum(["A", "A"], "B")
        net.Relu("B", "C")
        net.Relu("C", "D")
        net.Sum(["D", "D"], "E")

        blobs_before = count_blobs(m.net.Proto())
        optim_proto = memonger.optimize_inference_for_dag(
            net, ["A"], ""
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

    def test_rnn(self):
        from caffe2.python import rnn_cell
        T = 5
        model = model_helper.ModelHelper()
        seq_lengths, labels = \
            model.net.AddExternalInputs(
                'seq_lengths', 'labels',
            )
        init_blobs = []
        for i in range(2):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])
        model.param_init_net.ConstantFill([], ["input"], shape=[T, 4, 10])
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob="input",
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=10,
            dim_out=[10, 10],
            scope="lstm1",
            forward_only=False,
            drop_states=True,
            return_last_layer_only=True,
        )
        softmax, loss = model.net.SoftmaxWithLoss(
            [model.Flatten(output), "labels"],
            ['softmax', 'loss'],
        )

        model.AddGradientOperators([loss])
        blobs_before = count_blobs(model.net.Proto())
        optim_proto = memonger.share_grad_blobs(
            model.net,
            ["loss"],
            set(viewvalues(model.param_to_grad)),
            "",
            share_activations=True,
            dont_share_blobs=set(),
        )
        blobs_after = count_blobs(optim_proto)
        self.assertLess(blobs_after, blobs_before)

        # Run once to see that all blobs are set up correctly
        for init_blob in init_blobs:
            workspace.FeedBlob(init_blob, np.zeros(
                [1, 4, 10], dtype=np.float32
            ))
        workspace.FeedBlob("seq_lengths", np.array([T] * 4, dtype=np.int32))
        workspace.FeedBlob("labels", np.random.rand(T).astype(np.int32))

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

    def test_compute_interference_graph_inplace_ops(self):
        m = model_helper.ModelHelper()
        m.Copy("b1", "b1")
        m.Copy("b1", "b1")
        m.Copy("b1", "b1")
        g = memonger.compute_interference_graph(m.net.Proto().op)
        self.assertEqual(list(g.edges()), [(0, 1), (0, 2), (1, 2)])

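    # A sketch of why the expected edge set above is [(0, 1), (0, 2), (1, 2)]:
    # all three Copy ops read and write the same blob "b1" in place, so every
    # earlier op interferes with every later one, giving an edge for each
    # ordered pair of the three ops.
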
    def test_topological_sort_longest_path(self):
        m = model_helper.ModelHelper()
        # 0
        m.Copy("conv0_w_comp", "conv0_w")
        # 1
        conv0 = brew.conv(m, "data", "conv0", 32, 32, 4)
        # 2
        m.Copy("conv2_w", "conv2_w")
        # 3
        brew.conv(m, conv0, "conv2", 16, 32, 4)

        g = memonger.compute_interference_graph(m.net.Proto().op)

        orders_org = memonger.topological_sort_traversal(g)
        orders_gt_org = [2, 0, 1, 3]
        self.assertEqual(orders_gt_org, list(orders_org))

        orders = memonger.topological_sort_traversal_longest_path(g)
        # the longer path comes before the shorter one
        orders_gt = [0, 1, 2, 3]
        self.assertEqual(orders_gt, list(orders))

    def test_topological_sort_longest_path_multi_target(self):
        # two outputs: conv2 and data3
        m = model_helper.ModelHelper()
        # 0
        m.Copy("conv0_w_comp", "conv0_w")
        # 1
        conv0 = brew.conv(m, "data", "conv0", 32, 32, 4)
        # 2
        m.Copy("conv2_w", "conv2_w")
        # 3
        brew.conv(m, conv0, "conv2", 16, 32, 4)
        # 4
        m.Copy("data1", "data2")
        # 5
        m.Copy("data2", "data3")

        g = memonger.compute_interference_graph(m.net.Proto().op)

        orders_org = memonger.topological_sort_traversal(g)
        orders_gt_org = [4, 5, 2, 0, 1, 3]
        self.assertEqual(orders_gt_org, list(orders_org))

        orders = memonger.topological_sort_traversal_longest_path(g)
        # the longer path comes before the shorter one
        orders_gt = [0, 1, 2, 3, 4, 5]
        self.assertEqual(orders_gt, list(orders))

    def test_topological_sort_longest_path_single_node(self):
        # single node
        m = model_helper.ModelHelper()
        # 0
        m.Copy("conv0_w_comp", "conv0_w")

        g = memonger.compute_interference_graph(m.net.Proto().op)

        orders_org = memonger.topological_sort_traversal(g)
        orders_gt_org = [0]
        self.assertEqual(orders_gt_org, list(orders_org))

        orders = memonger.topological_sort_traversal_longest_path(g)
        orders_gt = [0]
        self.assertEqual(orders_gt, list(orders))

    def test_compute_assignments_greedy(self):
        LiveRange = memonger.LiveRange
        ranges_sorted = [
            ('b1', LiveRange(1, 3, 10)),
            ('b2', LiveRange(3, 4, 1)),
            ('b3', LiveRange(5, 6, 1)),
            ('b4', LiveRange(5, 7, 10)),
        ]
        assignment_gt = [
            [ranges_sorted[0], ranges_sorted[3]],
            [ranges_sorted[1], ranges_sorted[2]],
        ]

        best = memonger.compute_assignments_greedy(ranges_sorted, None)
        self.assertEqual(memonger.get_memory_usage(best), 11)
        self.assertEqual(best, assignment_gt)

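    # Arithmetic behind the expected usage of 11 above: b1 (size 10, range
    # 1-3) and b4 (size 10, range 5-7) do not overlap, so they share one
    # 10-unit slot; b2 (size 1, range 3-4) and b3 (size 1, range 5-6) share a
    # 1-unit slot; 10 + 1 = 11.
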
    def test_compute_assignments_dp(self):
        LiveRange = memonger.LiveRange
        ranges_sorted = [
            ('b1', LiveRange(1, 3, 10)),
            ('b2', LiveRange(3, 4, 1)),
            ('b3', LiveRange(5, 6, 1)),
            ('b4', LiveRange(5, 7, 10)),
        ]

        best = memonger.compute_assignments_dp(ranges_sorted, None)
        self.assertEqual(memonger.get_memory_usage(best), 11)

    def test_compute_assignments_dp1(self):
        LiveRange = memonger.LiveRange
        ranges_sorted = [
            ('b1', LiveRange(1, 2, 10)),
            ('b2', LiveRange(4, 6, 1)),
            ('b3', LiveRange(5, 6, 10)),
        ]

        best = memonger.compute_assignments_dp(ranges_sorted, [])
        self.assertEqual(memonger.get_memory_usage(best), 11)

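    # Same arithmetic for the ranges above: b1 (size 10, range 1-2) can share
    # a slot with b3 (size 10, range 5-6), while b2 (size 1, range 4-6)
    # overlaps b3 and needs its own slot, for a total of 10 + 1 = 11.
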
    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    def test_verify_graph_equality(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m, [fc2, fc3], "out")

        m2 = model_helper.ModelHelper()
        m2.Proto().type = "dag"
        m2.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m2, "data", "other_x", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m2, fc1, "other_y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m2, fc1, "other_z", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m2, [fc2, fc3], "out")

        self.assertTrue(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    def test_verify_graph_equality_harder(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2a = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc2b = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            fc3a = brew.fc(m, fc2a, "u", dim_in=output_dim, dim_out=output_dim)
            fc3b = brew.fc(m, fc2b, "v", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m, [fc3a, fc3b], "out")

        m2 = model_helper.ModelHelper()
        m2.Proto().type = "dag"
        m2.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2a = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc2b = brew.fc(m2, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            fc3a = brew.fc(m2, fc2a, "y", dim_in=output_dim, dim_out=output_dim)
            fc3b = brew.fc(m2, fc2b, "z", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m2, [fc3a, fc3b], "out")

        self.assertTrue(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    def test_verify_graph_inequality(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m, [fc2, fc3], "out")

        m2 = model_helper.ModelHelper()
        m2.Proto().type = "dag"
        m2.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m2, [fc2, fc3], "out")

        self.assertFalse(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))

    @given(input_dim=st.integers(min_value=4, max_value=4),
           output_dim=st.integers(min_value=4, max_value=4),
           batch_size=st.integers(min_value=4, max_value=4))
    def test_verify_graph_inequality_harder(self, input_dim, output_dim, batch_size):
        m = model_helper.ModelHelper()
        m.Proto().type = "dag"
        m.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2a = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc2b = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
            fc3a = brew.fc(m, fc2a, "u", dim_in=output_dim, dim_out=output_dim)
            fc3b = brew.fc(m, fc2b, "v", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m, [fc3a, fc3b], "out")

        m2 = model_helper.ModelHelper()
        m2.Proto().type = "dag"
        m2.Proto().num_workers = 4
        with core.NameScope("name_x"):
            fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim)
            fc2a = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc2b = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
            fc3a = brew.fc(m2, fc2a, "u", dim_in=output_dim, dim_out=output_dim)
            fc3b = brew.fc(m2, fc2b, "v", dim_in=output_dim, dim_out=output_dim)
            brew.sum(m2, [fc3a, fc3b], "out")

        self.assertFalse(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))

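    # release_blobs_when_used takes a different tack from blob sharing: it
    # inserts Free ops into the proto so that a blob's memory can be released
    # right after its last use. The test below checks the bookkeeping: no
    # double frees, no use of a blob after its Free, and no frees for
    # external outputs or aliased blobs.
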
    def test_release_blobs_when_used(self):
        m = model_helper.ModelHelper()
        fc1 = brew.fc(m, "data", "x", dim_in=2, dim_out=2)
        fc2 = brew.fc(m, fc1, "y", dim_in=2, dim_out=2)
        fc3 = brew.fc(m, fc1, "z", dim_in=2, dim_out=2)
        fc4 = brew.fc(m, fc2, "u", dim_in=2, dim_out=2)
        m.net.Alias(["u"], ["u_alias"])

        brew.sum(m, [fc3, fc4], "out")

        with_frees = memonger.release_blobs_when_used(m.net.Proto(), {"data"})

        # "out" is an external output and "u" is aliased, so neither can be freed
        expect_frees = {"x", "y", "z"}
        found_frees = set()
        for op in with_frees.op:
            if op.type == "Free":
                self.assertFalse(op.input[0] in found_frees)  # no double frees
                found_frees.add(op.input[0])
            else:
                # Check that a freed blob is not used anymore
                for inp in op.input:
                    self.assertFalse(inp in found_frees)
                for outp in op.output:
                    self.assertFalse(outp in found_frees)

        self.assertEqual(expect_frees, found_frees)


if __name__ == '__main__':
    unittest.main()