pytorch/caffe2/python/memonger_test.py
Aapo Kyrola 95b3309a87 Gradient Input memory sharing using memonger blob sharing
Summary:
This diff brings us to roughly par with Torch on ResNet memory usage. On batch size 32, Resnet-50 took 7497MiB, after this 5010 MiB. This will thus allow us to handle 64 images / GPU, or 256 images / 4 GPUs.

In addition, I added a special argument to DagNet that causes it to run only one thread for the first iteration. This is needed since there are allocations on the first iteration's backward pass due to gradient sharing, and this will cause NCCL to deadlock.

The sharing of gradient buffers requires inferring which gradients can share memory (i.e that they are not used concurrently). Previous memonger code uses topological sort, but rbgirshick showed that it does not work with tree-like models. Thus, I wrote a new optimization algorithm based on DFS. It takes about 0.25 secs / GPU on resnet-50, so is clearly fast enough.

Module data_parallel_model supports this feature natively.

Reviewed By: prigoyal

Differential Revision: D4363209

fbshipit-source-id: 73b11e7610438098bb11bff0af8075ab0cf2c0f1
2017-01-09 19:44:23 -08:00

167 lines
7.6 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import workspace, cnn, memonger, core
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
from hypothesis import given
class MemongerTest(hu.HypothesisTestCase):
@given(input_dim=st.integers(min_value=1, max_value=10),
output_dim=st.integers(min_value=1, max_value=10),
batch_size=st.integers(min_value=1, max_value=10),
do=st.sampled_from(hu.device_options))
def test_simple_memonger(self, input_dim, output_dim, batch_size, do):
m = cnn.CNNModelHelper()
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc3.Relu([], fc3)\
.Softmax([], "pred") \
.LabelCrossEntropy(["label"], ["xent"]) \
.AveragedLoss([], "loss")
input_to_grad = m.AddGradientOperators(["loss"])
m.net.Proto().device_option.CopyFrom(do)
m.param_init_net.Proto().device_option.CopyFrom(do)
static_blobs = \
[o for op in m.param_init_net.Proto().op for o in op.output] + \
["data", "label", "loss", input_to_grad["fc1_w"]]
optimization = memonger.optimize_interference(m.Proto(), static_blobs)
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("data", data, device_option=do)
workspace.FeedBlob("label", label, device_option=do)
workspace.RunNetOnce(m.net)
loss = workspace.FetchBlob("loss")
grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
workspace.RunNetOnce(optimization.net)
optimized_loss = workspace.FetchBlob("loss")
optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
np.testing.assert_almost_equal(loss, optimized_loss)
np.testing.assert_almost_equal(grad, optimized_grad)
stats = memonger.compute_statistics(optimization.assignments)
self.assertLess(stats.optimized_nbytes, stats.baseline_nbytes)
@given(input_dim=st.integers(min_value=1, max_value=4),
output_dim=st.integers(min_value=1, max_value=4),
batch_size=st.integers(min_value=1, max_value=4))
def test_gradient_optim(self, input_dim, output_dim, batch_size):
m = cnn.CNNModelHelper()
with core.NameScope("name_x"):
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
fc5.Relu([], fc5)\
.Softmax([], "pred") \
.LabelCrossEntropy(["label"], ["xent"]) \
.AveragedLoss([], "loss")
input_to_grad = m.AddGradientOperators(["name_x/loss"])
def count_blobs(proto):
blob_set = set()
for op in proto.op:
for inp in op.input:
blob_set.add(inp)
for outp in op.output:
blob_set.add(outp)
return len(blob_set)
blobs_before = count_blobs(m.net.Proto())
optim_proto = memonger.share_grad_blobs(
m.net,
["name_x/loss"],
set(m.param_to_grad.values()),
"name_x/",
)
blobs_after = count_blobs(optim_proto)
self.assertLess(blobs_after, blobs_before)
# Test networks produce exactly same gradients
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("name_x/data", data)
workspace.FeedBlob("name_x/label", label)
workspace.RunNetOnce(m.net)
loss = workspace.FetchBlob("name_x/loss")
grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
workspace.RunNetOnce(optim_proto)
optimized_loss = workspace.FetchBlob("name_x/loss")
optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
np.testing.assert_almost_equal(loss, optimized_loss)
np.testing.assert_almost_equal(grad, optimized_grad)
@given(input_dim=st.integers(min_value=4, max_value=4),
output_dim=st.integers(min_value=4, max_value=4),
batch_size=st.integers(min_value=4, max_value=4))
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
m = cnn.CNNModelHelper()
with core.NameScope("name_x"):
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
fc5.Relu([], fc5) \
.Softmax([], "pred1") \
.LabelCrossEntropy(["label"], ["xent1"]) \
.AveragedLoss([], "loss1")
fc6 = m.FC(fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
fc6.Relu([], fc6) \
.Softmax([], "pred2") \
.LabelCrossEntropy(["label"], ["xent2"]) \
.AveragedLoss([], "loss2")
input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])
def count_blobs(proto):
blob_set = set()
for op in proto.op:
for inp in op.input:
blob_set.add(inp)
for outp in op.output:
blob_set.add(outp)
return len(blob_set)
blobs_before = count_blobs(m.net.Proto())
optim_proto = memonger.share_grad_blobs(
m.net,
["name_x/loss1", "name_x/loss2"],
set(m.param_to_grad.values()),
"name_x", # "name_x//shared_gradinp_0_shared" if using "name_x/"
)
blobs_after = count_blobs(optim_proto)
self.assertLess(blobs_after, blobs_before)
print(str(optim_proto))
# Test networks produce exactly same gradients
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("name_x/data", data)
workspace.FeedBlob("name_x/label", label)
workspace.RunNetOnce(m.net)
loss1 = workspace.FetchBlob("name_x/loss1")
loss2 = workspace.FetchBlob("name_x/loss2")
grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
workspace.RunNetOnce(optim_proto)
optimized_loss1 = workspace.FetchBlob("name_x/loss1")
optimized_loss2 = workspace.FetchBlob("name_x/loss2")
optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
np.testing.assert_almost_equal(loss1, optimized_loss1)
np.testing.assert_almost_equal(loss2, optimized_loss2)
np.testing.assert_almost_equal(grad, optimized_grad)