Summary: Major improvements. Previously we only synced the "params" and "computed params" of the model after initialization and after loading a checkpoint. What we actually want is to sync all blobs generated in the param_init_net: for example, the _momentum blobs were missed by the previous implementation and had to be included manually during checkpoint finalization. I also added GetCheckpointParams() to data_parallel_model, since it is now fully general, and added a unit test.

Reviewed By: andrewwdye

Differential Revision: D5093689

fbshipit-source-id: 8154ded0c73cd6a0f54ee024dc5f2c6826ed7e42
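For context, a minimal sketch of how the new GetCheckpointParams() helper could feed a checkpoint writer. The save_checkpoint function, the pickle format, and the output path are illustrative assumptions, not part of this diff:

```python
# Illustrative only: serialize the blobs GetCheckpointParams() reports
# (the first GPU's params, their optimizer state such as *_momentum, and
# iteration blobs like ITER, per the unit test below).
import pickle

from caffe2.python import data_parallel_model, workspace


def save_checkpoint(model, path="checkpoint.pkl"):
    blobs = data_parallel_model.GetCheckpointParams(model)
    arrays = {str(b): workspace.FetchBlob(str(b)) for b in blobs}
    with open(path, "wb") as f:
        pickle.dump(arrays, f)
```
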
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import unittest
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model, cnn, rnn_cell
from caffe2.python.test_util import TestCase


@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
class GPUDataParallelModelTest(TestCase):

    def run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            for param in model.GetParams():
                grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, grad, LR], param)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(gpu_devices),
        )
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            print(i, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5])
            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("gpu_0/fc_w")

    def test_equiv(self):
        '''
        Test that the model produces exactly the same results given the
        total batch size, independent of the number of GPUs.
        '''
        result_2gpus = self.run_model([0, 1])
        result_1gpus = self.run_model([0])

        self.assertTrue(np.allclose(result_1gpus, result_2gpus))

        if workspace.NumCudaDevices() >= 4:
            result_4gpus = self.run_model(range(4))
            self.assertTrue(np.allclose(result_1gpus, result_4gpus))

        if workspace.NumCudaDevices() >= 8:
            result_8gpus = self.run_model(range(8))
            self.assertTrue(np.allclose(result_1gpus, result_8gpus))

    def test_checkpoint_params(self):
        def add_input_ops(model):
            pass

        def add_model_ops(model, loss_scale):
            model.NHWC2NCHW("data", "data_nchw")
            model.Conv("data_nchw", 'conv1', 3, 64,
                       weight_init=("MSRAFill", {}), kernel=7,
                       stride=2, pad=3, no_bias=0)
            model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3)
            model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
            model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
            model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100)
            model.Sigmoid('fc', 'fc_sigm')
            model.Softmax('fc_sigm', 'softmax')
            model.LabelCrossEntropy(['softmax', 'label'], 'xent')
            loss = model.AveragedLoss('xent', 'loss')
            # Return the loss so that Parallelize_GPU adds the gradient
            # operators, as the other builder functions in this file do.
            return [loss]

        def add_parameter_update_ops(model):
            model.Iter("ITER")
            LR = model.param_init_net.ConstantFill(
                [], 'LR', shape=[1], value=0.1
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                param_momentum = model.param_init_net.ConstantFill(
                    [param], param + '_momentum', value=0.0
                )
                model.net.MomentumSGDUpdate(
                    [param_grad, param_momentum, LR, param],
                    [param_grad, param_momentum, param],
                )

        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test",
        )
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=add_input_ops,
            forward_pass_builder_fun=add_model_ops,
            param_update_builder_fun=add_parameter_update_ops,
            devices=[1, 2, 3],
        )

        # Only gpu_1 params should be returned (gpu_1 is the first gpu)
        checkpoint_params = data_parallel_model.GetCheckpointParams(model)
        for p in model.GetParams("gpu_1/"):
            self.assertTrue(p in checkpoint_params)
            self.assertTrue(p + "_momentum" in checkpoint_params)
        for p in model.GetParams("gpu_2/"):
            self.assertFalse(p in checkpoint_params)
        for c in model.GetComputedParams("gpu_1/"):
            self.assertFalse(c in checkpoint_params)
        for c in model.GetComputedParams("gpu_2/"):
            self.assertFalse(c in checkpoint_params)
        self.assertFalse(core.BlobReference("gpu_1/data") in checkpoint_params)
        self.assertTrue(core.BlobReference("gpu_1/ITER") in checkpoint_params)


@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
class RecurrentNetworkParallelTest(TestCase):

    def run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32)
            )
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )

            output, _last_hidden, _, _last_state, = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            name="recurrent_test{}".format(gpu_devices),
        )

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(gpu_devices)

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
            optimize_gradient_memory=True,
        )

        # Change all initializations to ConstantFill so that
        # everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has same input, independent of number of gpus
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(
                self.T, self.batch_size, self.hidden_dim
            )

            for (j, g) in enumerate(gpu_devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/target".format(g), targets)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("gpu_0/partest/i2h_w")

    def test_equiv_recurrent(self):
        '''
        Test that the model produces exactly the same results given the
        total batch size, independent of the number of GPUs.
        '''
        result_2gpus = self.run_model([0, 1])
        result_1gpus = self.run_model([0])

        print("result 1", result_1gpus.flatten()[:5])
        print("result 2", result_2gpus.flatten()[:5])

        self.assertTrue(np.allclose(result_1gpus, result_2gpus))

        if workspace.NumCudaDevices() >= 4:
            result_4gpus = self.run_model(range(4))
            self.assertTrue(np.allclose(result_1gpus, result_4gpus))

        if workspace.NumCudaDevices() >= 8:
            result_8gpus = self.run_model(range(8))
            self.assertTrue(np.allclose(result_1gpus, result_8gpus))


@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
class SparseDataParallelModelTest(TestCase):

    '''
    Create and run the model. We try storing the indices for Gather
    both on CPU and on GPU.
    '''

    def run_model(self, V, gpu_devices, cpu_indices):

        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather(
                        [self.vecs, 'indices'], 'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs, "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [
                            param_grad.values, param_momentum, param
                        ],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill(
                    [], "vecs", shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [], "ONE_CPU", shape=[1], value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([param, self.ONE_CPU,
                                                  param_grad.indices,
                                                  param_grad.values,
                                                  self.LR],
                                                 self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16
            )
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(
                    self.vecs,
                    orig_vecs
                )
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                open("dump.txt", "w").write(str(model.net.Proto()))
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(
            np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
        return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
                workspace.FetchBlob("gpu_0/fc_w")]

    def _test_equiv_sparse(self, cpu_indices):
        '''
        Test that the model produces exactly the same results given the
        total batch size, independent of the number of GPUs.
        '''
        V = 10000
        result_2gpus = self.run_model(V, [0, 1], cpu_indices)
        result_1gpus = self.run_model(V, [0], cpu_indices)

        self.assertTrue(np.allclose(result_1gpus[0], result_2gpus[0]))
        self.assertTrue(np.allclose(result_1gpus[1], result_2gpus[1]))

        if workspace.NumCudaDevices() >= 4:
            result_4gpus = self.run_model(V, range(4), cpu_indices)
            self.assertTrue(np.allclose(result_1gpus[0], result_4gpus[0]))
            self.assertTrue(np.allclose(result_1gpus[1], result_4gpus[1]))

        if workspace.NumCudaDevices() >= 8:
            result_8gpus = self.run_model(V, range(8), cpu_indices)
            self.assertTrue(np.allclose(result_1gpus[0], result_8gpus[0]))
            self.assertTrue(np.allclose(result_1gpus[1], result_8gpus[1]))

    def test_equiv_sparse(self):
        self._test_equiv_sparse(True)
        self._test_equiv_sparse(False)


@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
class ParallelizeGPUBMUFTest(TestCase):

    def _run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

    def _model_build_fun(self, model, loss_scale):
        fc = model.FC(
            "data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})
        )
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def _param_update_fun(self, model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(self, gpu_devices):
        np.random.seed(26)
        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    def test_parallelize_gpu_bmuf(self):
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test"
        )
        gpu_ids = [0, 1]

        def input_builder_fun(model):
            return None

        self._generate_data(gpu_ids)

        data_parallel_model.Parallelize_GPU_BMUF(
            model,
            input_builder_fun,
            self._model_build_fun,
            self._param_update_fun,
            devices=gpu_ids,
        )

        data_parallel_model.RunInitNet(model)

        # Check initial momentum params are zeros
        self.assertEqual(model._device_grouped_blobs.keys(), ['fc_w', 'fc_b'])
        self.assertEqual(workspace.FetchBlob('gpu_0/fc_b_v'), 0)
        np.testing.assert_equal(
            workspace.FetchBlob('gpu_0/fc_w_v'),
            np.zeros(16).astype(np.float32).reshape(1, 16)
        )

        # Run the algorithm for one iteration to have non-zero params.
        data_parallel_model.RunNet(model, 1)

        # Save iteration momentum and post local update params
        v_b_ = workspace.FetchBlob('gpu_0/fc_b_v')
        v_w_ = workspace.FetchBlob('gpu_0/fc_w_v')

        workspace.RunNetOnce(model.net)

        b_0_ = workspace.FetchBlob('gpu_0/fc_b')
        w_0_ = workspace.FetchBlob('gpu_0/fc_w')
        b_1_ = workspace.FetchBlob('gpu_1/fc_b')
        w_1_ = workspace.FetchBlob('gpu_1/fc_w')

        def getBlockAvg(param_name):
            param_0 = workspace.FetchBlob("gpu_0/{}".format(param_name))
            param_1 = workspace.FetchBlob("gpu_1/{}".format(param_name))
            return (param_0 + param_1) / 2

        # Compute block gradients.
        b_g_ = workspace.FetchBlob('gpu_0/fc_b_g')
        w_g_ = workspace.FetchBlob('gpu_0/fc_w_g')
        workspace.RunNetOnce(model._global_model_param_updates_net)

        g_b = (b_0_ + b_1_) / 2 - b_g_
        g_w = (w_0_ + w_1_) / 2 - w_g_
        v_b = workspace.FetchBlob('gpu_0/fc_b_v')
        v_w = workspace.FetchBlob('gpu_0/fc_w_v')

        w_g = workspace.FetchBlob('gpu_0/fc_w_g')
        b_g = workspace.FetchBlob('gpu_0/fc_b_g')
        w_0 = workspace.FetchBlob('gpu_0/fc_w')
        b_0 = workspace.FetchBlob('gpu_0/fc_b')
        w_1 = workspace.FetchBlob('gpu_1/fc_w')
        b_1 = workspace.FetchBlob('gpu_1/fc_b')

        # Check momentum update step
        np.testing.assert_equal(v_b, 0.5 * v_b_ + g_b)
        np.testing.assert_equal(v_w, 0.5 * v_w_ + g_w)

        np.testing.assert_equal(w_g, w_0)
        np.testing.assert_equal(w_g, w_1)
        np.testing.assert_equal(b_g, b_0)
        np.testing.assert_equal(b_g, b_1)

        # Check params update step
        np.testing.assert_equal(w_0, w_g_ + v_w)
        np.testing.assert_equal(b_0, b_g_ + v_b)
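
As a side note, the assertions in test_parallelize_gpu_bmuf encode a block-momentum update. Below is a small standalone sketch of that arithmetic; the helper name and the fixed 0.5 momentum come from the test's expectations, not from the data_parallel_model API:

```python
# Illustrative only: the block update verified by the assertions above,
# for a single parameter w with momentum v.
import numpy as np


def bmuf_block_update(w_prev, v_prev, per_gpu_params, momentum=0.5):
    # Block "gradient": average of the per-GPU params after their local
    # updates, minus the previous global params (the *_g blobs above).
    g = np.mean(per_gpu_params, axis=0) - w_prev
    # Momentum update (the *_v blobs): v = momentum * v_prev + g
    v = momentum * v_prev + g
    # Global parameter update, then broadcast back to every GPU.
    w = w_prev + v
    return w, v
```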