pytorch/caffe2/python/memonger_test.py
Peizhao Zhang a54000dc6a Added an ordering function to reduce live spans of computed blobs.
Summary:
Added an ordering function (topological_sort_traversal_longest_path()) to reduce live spans of computed blobs. The idea is to sort the ops based on the length of the execution path so that ops in longer path will be used first.

Tested on segmentation model with on-the-fly decoder and reduced memory usage from 21.7MB to 14MB (original size is 33MB with compressed parameters and without considering the conv buffer), comparing to use topological_sort_traversal() as the ordering function.

It is a general ordering function so I put it in memonger.py directly.

Reviewed By: ajtulloch

Differential Revision: D4790135

fbshipit-source-id: e661b45c1640de44ce1a9fdd009a4fba38f8e042
2017-04-06 12:20:39 -07:00

233 lines
9.7 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
from caffe2.python import workspace, cnn, memonger, core
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
from hypothesis import given
class MemongerTest(hu.HypothesisTestCase):
@given(input_dim=st.integers(min_value=1, max_value=10),
output_dim=st.integers(min_value=1, max_value=10),
batch_size=st.integers(min_value=1, max_value=10),
do=st.sampled_from(hu.device_options))
def test_simple_memonger(self, input_dim, output_dim, batch_size, do):
m = cnn.CNNModelHelper()
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc3.Relu([], fc3)\
.Softmax([], "pred") \
.LabelCrossEntropy(["label"], ["xent"]) \
.AveragedLoss([], "loss")
input_to_grad = m.AddGradientOperators(["loss"])
m.net.Proto().device_option.CopyFrom(do)
m.param_init_net.Proto().device_option.CopyFrom(do)
static_blobs = \
[o for op in m.param_init_net.Proto().op for o in op.output] + \
["data", "label", "loss", input_to_grad["fc1_w"]]
optimization = memonger.optimize_interference(m.Proto(), static_blobs)
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("data", data, device_option=do)
workspace.FeedBlob("label", label, device_option=do)
workspace.RunNetOnce(m.net)
loss = workspace.FetchBlob("loss")
grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
workspace.RunNetOnce(optimization.net)
optimized_loss = workspace.FetchBlob("loss")
optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"]))
np.testing.assert_almost_equal(loss, optimized_loss)
np.testing.assert_almost_equal(grad, optimized_grad)
stats = memonger.compute_statistics(optimization.assignments)
self.assertLess(stats.optimized_nbytes, stats.baseline_nbytes)
@given(input_dim=st.integers(min_value=1, max_value=4),
output_dim=st.integers(min_value=1, max_value=4),
batch_size=st.integers(min_value=1, max_value=4))
def test_gradient_optim(self, input_dim, output_dim, batch_size):
m = cnn.CNNModelHelper()
with core.NameScope("name_x"):
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
fc5.Relu([], fc5)\
.Softmax([], "pred") \
.LabelCrossEntropy(["label"], ["xent"]) \
.AveragedLoss([], "loss")
input_to_grad = m.AddGradientOperators(["name_x/loss"])
def count_blobs(proto):
blob_set = set()
for op in proto.op:
for inp in op.input:
blob_set.add(inp)
for outp in op.output:
blob_set.add(outp)
return len(blob_set)
blobs_before = count_blobs(m.net.Proto())
optim_proto = memonger.share_grad_blobs(
m.net,
["name_x/loss"],
set(m.param_to_grad.values()),
"name_x/",
)
blobs_after = count_blobs(optim_proto)
self.assertLess(blobs_after, blobs_before)
# Test networks produce exactly same gradients
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("name_x/data", data)
workspace.FeedBlob("name_x/label", label)
workspace.RunNetOnce(m.net)
loss = workspace.FetchBlob("name_x/loss")
grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
workspace.RunNetOnce(optim_proto)
optimized_loss = workspace.FetchBlob("name_x/loss")
optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
np.testing.assert_almost_equal(loss, optimized_loss)
np.testing.assert_almost_equal(grad, optimized_grad)
@given(input_dim=st.integers(min_value=4, max_value=4),
output_dim=st.integers(min_value=4, max_value=4),
batch_size=st.integers(min_value=4, max_value=4))
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
m = cnn.CNNModelHelper()
with core.NameScope("name_x"):
fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
fc5.Relu([], fc5) \
.Softmax([], "pred1") \
.LabelCrossEntropy(["label"], ["xent1"]) \
.AveragedLoss([], "loss1")
fc6 = m.FC(fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
fc6.Relu([], fc6) \
.Softmax([], "pred2") \
.LabelCrossEntropy(["label"], ["xent2"]) \
.AveragedLoss([], "loss2")
input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])
def count_blobs(proto):
blob_set = set()
for op in proto.op:
for inp in op.input:
blob_set.add(inp)
for outp in op.output:
blob_set.add(outp)
return len(blob_set)
blobs_before = count_blobs(m.net.Proto())
optim_proto = memonger.share_grad_blobs(
m.net,
["name_x/loss1", "name_x/loss2"],
set(m.param_to_grad.values()),
"name_x", # "name_x//shared_gradinp_0_shared" if using "name_x/"
)
blobs_after = count_blobs(optim_proto)
self.assertLess(blobs_after, blobs_before)
print(str(optim_proto))
# Test networks produce exactly same gradients
data = np.random.randn(batch_size, input_dim).astype(np.float32)
label = np.random.randint(
low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
workspace.RunNetOnce(m.param_init_net)
workspace.FeedBlob("name_x/data", data)
workspace.FeedBlob("name_x/label", label)
workspace.RunNetOnce(m.net)
loss1 = workspace.FetchBlob("name_x/loss1")
loss2 = workspace.FetchBlob("name_x/loss2")
grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
workspace.RunNetOnce(optim_proto)
optimized_loss1 = workspace.FetchBlob("name_x/loss1")
optimized_loss2 = workspace.FetchBlob("name_x/loss2")
optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
np.testing.assert_almost_equal(loss1, optimized_loss1)
np.testing.assert_almost_equal(loss2, optimized_loss2)
np.testing.assert_almost_equal(grad, optimized_grad)
def test_topological_sort_longest_path(self):
m = cnn.CNNModelHelper()
# 0
m.Copy("conv0_w_comp", "conv0_w")
# 1
conv0 = m.Conv("data", "conv0", 32, 32, 4)
# 2
m.Copy("conv2_w", "conv2_w")
# 3
m.Conv(conv0, "conv2", 16, 32, 4)
g = memonger.compute_interference_graph(m.net.Proto().op)
orders_org = memonger.topological_sort_traversal(g)
orders_gt_org = [2, 0, 1, 3]
self.assertEqual(orders_gt_org, orders_org)
orders = memonger.topological_sort_traversal_longest_path(g)
# longer path is in front of the shorter one
orders_gt = [0, 1, 2, 3]
self.assertEqual(orders_gt, orders)
def test_topological_sort_longest_path_multi_target(self):
# two outputs: conv2 and data4
m = cnn.CNNModelHelper()
# 0
m.Copy("conv0_w_comp", "conv0_w")
# 1
conv0 = m.Conv("data", "conv0", 32, 32, 4)
# 2
m.Copy("conv2_w", "conv2_w")
# 3
m.Conv(conv0, "conv2", 16, 32, 4)
# 4
m.Copy("data1", "data2")
# 5
m.Copy("data2", "data3")
g = memonger.compute_interference_graph(m.net.Proto().op)
orders_org = memonger.topological_sort_traversal(g)
orders_gt_org = [4, 5, 2, 0, 1, 3]
self.assertEqual(orders_gt_org, orders_org)
orders = memonger.topological_sort_traversal_longest_path(g)
# longer path is in front of the shorter one
orders_gt = [0, 1, 2, 3, 4, 5]
self.assertEqual(orders_gt, orders)
def test_topological_sort_longest_path_single_node(self):
# single node
m = cnn.CNNModelHelper()
# 0
m.Copy("conv0_w_comp", "conv0_w")
g = memonger.compute_interference_graph(m.net.Proto().op)
orders_org = memonger.topological_sort_traversal(g)
orders_gt_org = [0]
self.assertEqual(orders_gt_org, orders_org)
orders = memonger.topological_sort_traversal_longest_path(g)
# longer path is in front of the shorter one
orders_gt = [0]
self.assertEqual(orders_gt, orders)