pytorch/caffe2/mpi/mpi_gpu_test.cc
Jerry Zhang aebf3b47ae Remove template parameter from Tensor (#9939)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9939

Pull Request resolved: https://github.com/facebookresearch/weakly-supervised-action-detection/pull/13

Pull Request resolved: https://github.com/pytorch/translate/pull/166

Pull Request resolved: https://github.com/pytorch/pytorch/pull/9125

Closes https://github.com/pytorch/pytorch/pull/9125

Use inheritance for polymorphism, and remove template parameter
This is to change the templating in call sites, the core implementations will change later

Before Caffe2 Tensor class was compile-time fixed to bind to a particular device/context. With this change, we're making it a runtime property (stored inside the tensor), but preserve the same semantics. For example, one has to specify device type in order to create a Tensor - there are no uninitialized tensors. More specifically the changes are:

1. We added an extra argument *DeviceType* to most of the constructors of the tensor, e.g. (Tensor(DeviceType type)),
2. The semantics of the constructor Tensor(const Tensor<SrcContext>& src, ContextForCopy* context) have changed: the second context is passed in so that we can call the templated Copy function. Previously it could belong to a different device than the source and target; now we enforce that the context, if provided, has the same device type as src.
3. To preserve 'get-or-construct' semantics of Blob, we added specialized getter Blob::GetMutableTensor that verifies both that Blob contains a Tensor and that it's of a correct type
4. Specifically, Tensor type is not default-constructible any more (as we don't have unknown device tensors) and thus some of the code handling STL containers needs to change

Note: Some changes are postponed just to keep this diff a bit smaller. Please see `TODO`s.

Reviewed By: ezyang, houseroad

Differential Revision: D9024330

fbshipit-source-id: e0b8295d2dc6ebe2963383ded5af799ad17164ba
2018-07-27 10:56:39 -07:00

336 lines
7.6 KiB
C++

#include "caffe2/core/init.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
#include <gtest/gtest.h>
// Command-line flag giving the root of the caffe test data folder.
// (Not read anywhere in this file.)
CAFFE2_DEFINE_string(
caffe_test_root, "gen/", "The root of the caffe test folder.");
namespace caffe2 {
// Protobuf-text NetDef: creates an MPI common world, constant-fills a
// 10-element float tensor "X", then broadcasts "X" in place from the
// configured root rank. device_type: 1 selects the CUDA device
// (consistent with the TensorCUDA accesses below).
const char kBcastNet[] = R"NET(
name: "bcast"
op {
output: "comm"
type: "MPICreateCommonWorld"
}
op {
output: "X"
type: "ConstantFill"
arg {
name: "shape"
ints: 10
}
arg {
name: "value"
f: 0.0
}
}
op {
input: "comm"
input: "X"
output: "X"
type: "MPIBroadcast"
arg {
name: "root"
i: 0
}
}
device_option {
device_type: 1
}
)NET";
// Each rank fills "X" with its own rank id, then broadcasts from every
// possible root in turn; afterwards every rank must hold the root's value.
TEST(MPITest, TestMPIBroadcast) {
  NetDef net_def;
  CHECK(TextFormat::ParseFromString(string(kBcastNet), &net_def));
  // Set the network's constant fill value to be the mpi rank, so each
  // process starts with distinguishable tensor contents.
  auto* arg = net_def.mutable_op(1)->mutable_arg(1);
  CAFFE_ENFORCE_EQ(arg->name(), "value");
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  arg->set_f(rank);
  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  for (int root = 0; root < size; ++root) {
    net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
    Workspace ws;
    unique_ptr<NetBase> net(CreateNet(net_def, &ws));
    EXPECT_NE(nullptr, net.get());
    EXPECT_TRUE(net->Run());
    // Fetch as TensorCUDA for consistency with the other tests in this
    // file (was Get<Tensor>()), and copy to CPU for inspection.
    auto& X = ws.GetBlob("X")->Get<TensorCUDA>();
    Tensor X_cpu(X, CPU);
    EXPECT_EQ(X.size(), 10);
    for (int i = 0; i < X.size(); ++i) {
      EXPECT_EQ(X_cpu.data<float>()[i], root);
    }
  }
}
// Protobuf-text NetDef: creates an MPI common world, constant-fills a
// 10-element float tensor "X", then reduces all ranks' "X" into
// "X_reduced" on the configured root rank.
const char kReduceNet[] = R"NET(
name: "reduce"
op {
output: "comm"
type: "MPICreateCommonWorld"
}
op {
output: "X"
type: "ConstantFill"
arg {
name: "shape"
ints: 10
}
arg {
name: "value"
f: 0.0
}
}
op {
input: "comm"
input: "X"
output: "X_reduced"
type: "MPIReduce"
arg {
name: "root"
i: 0
}
}
device_option {
device_type: 1
}
)NET";
// Each rank fills "X" with its own rank id and reduces to every possible
// root in turn; the root must observe the sum of all rank ids.
TEST(MPITest, TestMPIReduce) {
  NetDef net_def;
  CHECK(TextFormat::ParseFromString(string(kReduceNet), &net_def));
  // Set the network's constant fill value to be the mpi rank.
  auto* arg = net_def.mutable_op(1)->mutable_arg(1);
  CAFFE_ENFORCE_EQ(arg->name(), "value");
  // Query rank/size once; the original re-queried them on every loop
  // iteration (as rank/size in addition to rank0/size0), which was
  // redundant — they cannot change after MPI_Init.
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  arg->set_f(rank);
  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  for (int root = 0; root < size; ++root) {
    net_def.mutable_op(2)->mutable_arg(0)->set_i(root);
    Workspace ws;
    unique_ptr<NetBase> net(CreateNet(net_def, &ws));
    EXPECT_NE(nullptr, net.get());
    EXPECT_TRUE(net->Run());
    // Only the root rank receives the reduced result.
    if (rank == root) {
      auto& X = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
      EXPECT_EQ(X.size(), 10);
      // Sum of ranks 0 + 1 + ... + (size - 1).
      int expected_result = size * (size - 1) / 2;
      Tensor X_cpu(X, CPU);
      for (int i = 0; i < X.size(); ++i) {
        EXPECT_EQ(X_cpu.data<float>()[i], expected_result);
      }
    }
  }
}
// Protobuf-text NetDef: creates an MPI common world, constant-fills a
// 2x10 float tensor "X", then allgathers every rank's "X" into
// "X_gathered" (blocks stacked along the first dimension).
const char kMPIAllgatherNet[] = R"NET(
name: "allgather"
op {
output: "comm"
type: "MPICreateCommonWorld"
}
op {
output: "X"
type: "ConstantFill"
arg {
name: "shape"
ints: 2
ints: 10
}
arg {
name: "value"
f: 0.0
}
}
op {
input: "comm"
input: "X"
output: "X_gathered"
type: "MPIAllgather"
}
device_option {
device_type: 1
}
)NET";
// Each rank fills a 2x10 tensor with its own rank id; after allgather,
// the result stacks every rank's block along the first dimension.
TEST(MPITest, TestMPIAllgather) {
  NetDef net_def;
  CHECK(TextFormat::ParseFromString(string(kMPIAllgatherNet), &net_def));
  int my_rank = 0;
  int comm_size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
  // Make the constant fill value the rank id so the gathered output
  // identifies which rank contributed each block.
  auto* fill_value = net_def.mutable_op(1)->mutable_arg(1);
  CAFFE_ENFORCE_EQ(fill_value->name(), "value");
  fill_value->set_f(my_rank);
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Run());
  // The local input is 2x10 = 20 elements, all equal to this rank's id.
  auto& local = ws.GetBlob("X")->Get<TensorCUDA>();
  Tensor local_cpu(local, CPU);
  EXPECT_EQ(local.size(), 20);
  for (int i = 0; i < local.size(); ++i) {
    EXPECT_EQ(local_cpu.data<float>()[i], my_rank);
  }
  // The gathered tensor is (2 * size) x 10; element i came from the rank
  // that contributed block i / 20.
  auto& gathered = ws.GetBlob("X_gathered")->Get<TensorCUDA>();
  EXPECT_EQ(gathered.size(), 20 * comm_size);
  EXPECT_EQ(gathered.dim(0), 2 * comm_size);
  EXPECT_EQ(gathered.dim(1), 10);
  Tensor gathered_cpu(gathered, CPU);
  for (int i = 0; i < gathered.size(); ++i) {
    EXPECT_EQ(gathered_cpu.data<float>()[i], i / 20);
  }
}
// Protobuf-text NetDef: creates an MPI common world, constant-fills a
// 10-element float tensor "X", then allreduces into a separate output
// "X_reduced" (out-of-place; "X" is left intact).
const char kMPIAllreduceNet[] = R"NET(
name: "allreduce"
op {
output: "comm"
type: "MPICreateCommonWorld"
}
op {
output: "X"
type: "ConstantFill"
arg {
name: "shape"
ints: 10
}
arg {
name: "value"
f: 0.0
}
}
op {
input: "comm"
input: "X"
output: "X_reduced"
type: "MPIAllreduce"
}
device_option {
device_type: 1
}
)NET";
// Each rank fills "X" with its own rank id; after the out-of-place
// allreduce, every rank's "X_reduced" holds the sum of all rank ids
// while "X" is left unchanged.
TEST(MPITest, TestMPIAllreduce) {
  NetDef net_def;
  CHECK(TextFormat::ParseFromString(string(kMPIAllreduceNet), &net_def));
  int my_rank = 0;
  int comm_size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
  // Fill value = rank id, so the reduced sum is predictable.
  auto* fill_value = net_def.mutable_op(1)->mutable_arg(1);
  CAFFE_ENFORCE_EQ(fill_value->name(), "value");
  fill_value->set_f(my_rank);
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Run());
  // The input must still contain this rank's own value.
  auto& input = ws.GetBlob("X")->Get<TensorCUDA>();
  EXPECT_EQ(input.size(), 10);
  Tensor input_cpu(input, CPU);
  for (int i = 0; i < input.size(); ++i) {
    EXPECT_EQ(input_cpu.data<float>()[i], my_rank);
  }
  // Every rank sees the sum 0 + 1 + ... + (size - 1).
  auto& reduced = ws.GetBlob("X_reduced")->Get<TensorCUDA>();
  EXPECT_EQ(reduced.size(), 10);
  int expected_result = comm_size * (comm_size - 1) / 2;
  Tensor reduced_cpu(reduced, CPU);
  for (int i = 0; i < reduced.size(); ++i) {
    EXPECT_EQ(reduced_cpu.data<float>()[i], expected_result);
  }
}
// Protobuf-text NetDef: same as kMPIAllreduceNet but the allreduce
// writes back into "X" (in-place: input and output share the name).
const char kInPlaceMPIAllreduceNet[] = R"NET(
name: "allreduce"
op {
output: "comm"
type: "MPICreateCommonWorld"
}
op {
output: "X"
type: "ConstantFill"
arg {
name: "shape"
ints: 10
}
arg {
name: "value"
f: 0.0
}
}
op {
input: "comm"
input: "X"
output: "X"
type: "MPIAllreduce"
}
device_option {
device_type: 1
}
)NET";
// Each rank fills "X" with its own rank id; the in-place allreduce
// overwrites "X" on every rank with the sum of all rank ids.
TEST(MPITest, TestInPlaceMPIAllreduce) {
  NetDef net_def;
  CHECK(TextFormat::ParseFromString(string(kInPlaceMPIAllreduceNet), &net_def));
  int my_rank = 0;
  int comm_size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
  // Fill value = rank id, so the reduced sum is predictable.
  auto* fill_value = net_def.mutable_op(1)->mutable_arg(1);
  CAFFE_ENFORCE_EQ(fill_value->name(), "value");
  fill_value->set_f(my_rank);
  Workspace ws;
  unique_ptr<NetBase> net(CreateNet(net_def, &ws));
  EXPECT_NE(nullptr, net.get());
  EXPECT_TRUE(net->Run());
  // "X" now holds the sum 0 + 1 + ... + (size - 1) on every rank.
  auto& reduced = ws.GetBlob("X")->Get<TensorCUDA>();
  EXPECT_EQ(reduced.size(), 10);
  int expected_result = comm_size * (comm_size - 1) / 2;
  Tensor reduced_cpu(reduced, CPU);
  for (int i = 0; i < reduced.size(); ++i) {
    EXPECT_EQ(reduced_cpu.data<float>()[i], expected_result);
  }
}
} // namespace caffe2
// Custom gtest main: MPI must be initialized (with full thread support
// requested) before any test runs, and finalized after all tests finish.
GTEST_API_ int main(int argc, char **argv) {
  // MPI_Init_thread's last argument receives the thread level the MPI
  // library actually *provides* — it is not a return/error code, so name
  // it accordingly (was misleadingly called mpi_ret).
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_thread_provided);
  // NOTE(review): mpi_thread_provided is not checked against
  // MPI_THREAD_MULTIPLE; the MPI ops may misbehave if the library grants
  // a lower thread level — confirm whether a check should be added.
  testing::InitGoogleTest(&argc, argv);
  caffe2::GlobalInit(&argc, &argv);
  int test_result = RUN_ALL_TESTS();
  MPI_Finalize();
  return test_result;
}