Summary:
This PR adds the functional version of `DataParallel` (i.e. `data_parallel`) to the C++ frontend.
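For orientation, here is roughly what a call looks like; this is a minimal sketch modelled on the tests in this PR, and the toy module and function names (`Doubler`, `data_parallel_sketch`) are illustrative, not part of the change:

```cpp
#include <torch/nn/module.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/tensor.h>

#include <memory>

// Illustrative toy module, written the same way the tests in this PR write theirs.
struct Doubler : torch::nn::Cloneable<Doubler> {
  void reset() override {}
  torch::Tensor forward(torch::Tensor input) {
    return input * 2;
  }
};

void data_parallel_sketch() {
  auto module = std::make_shared<Doubler>();
  auto input = torch::ones({10, 3});
  // Replicates the module onto all available CUDA devices, scatters `input`
  // along dim 0, runs the replicas in parallel, and gathers the outputs.
  auto output = torch::nn::parallel::data_parallel(module, input);
}
```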
For this, I had to:
1. Add "differentiable" versions of scatter and gather, which perform their inverse operation in the backward pass, to C++ (see the sketch after this list). I've added them under `torch/csrc/autograd/functions/comm.{h,cpp}`. I had to move some utilities from `VariableType.cpp` into `torch/csrc/autograd/functions/utils.h`, and changed them a bit to fix the `const_cast`s for which there were `TODO`s.
2. Implement the `replicate`, `parallel_apply` and the combining `data_parallel` functions in C++.
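To make point 1 concrete, here is a hedged sketch of the two primitives exactly as the tests exercise them (the function name `scatter_gather_sketch` is illustrative). `Scatter` splits a tensor across devices in its forward pass and gathers the chunk gradients back in its backward pass; `Gather` does the reverse:

```cpp
#include <torch/csrc/autograd/functions/comm.h>
#include <torch/tensor.h>

void scatter_gather_sketch() {
  // Split a CPU tensor into equal chunks, one per CUDA device. The backward
  // pass gathers the chunk gradients back onto the input's device.
  torch::autograd::Scatter scatter(
      {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});
  auto input = torch::ones(10, torch::requires_grad(true));
  auto chunks = scatter.apply({input});  // chunks[0] on cuda:0, chunks[1] on cuda:1

  // Concatenate per-device tensors onto a single target device. The backward
  // pass scatters the gradient back to where each piece came from.
  torch::autograd::Gather gather(torch::Device(torch::kCUDA, 1));
  auto gathered = gather.apply({chunks[0], chunks[1]});
}
```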
`replicate` is implemented based on our existing `clone()` interface, along with the ability to set the current device via `at::OptionsGuard` (so nice).
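A minimal sketch of what `replicate` does, mirroring the `Parallel/Replicate` test below (the function name is illustrative):

```cpp
#include <torch/nn/modules/linear.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/tensor.h>

void replicate_sketch() {
  torch::nn::Linear linear(3, 4);
  // One clone per device; the clones share no parameter storage with the
  // original, and each clone's parameters are placed on its device.
  auto replicas = torch::nn::parallel::replicate(
      linear,
      {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});
  // replicas[0] lives on cuda:0, replicas[1] on cuda:1.
}
```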
`parallel_apply` is implemented using `at::parallel_for` (CC cpuhrsch) and [follows the code from PyTorch](https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/parallel_apply.py).
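And a sketch of `parallel_apply`, based on the `Parallel/ParallelApply` test below (variable and function names are illustrative): each (module, input) pair runs concurrently, and the outputs come back in order, on the device each module ran on:

```cpp
#include <torch/nn/modules/linear.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/tensor.h>

#include <memory>
#include <vector>

void parallel_apply_sketch() {
  torch::nn::Linear cpu_model(3, 4);
  torch::nn::Linear gpu_model(
      std::static_pointer_cast<torch::nn::LinearImpl>(cpu_model->clone()));
  gpu_model->to({torch::kCUDA, 0});

  std::vector<torch::nn::Linear> modules = {cpu_model, gpu_model};
  std::vector<torch::Tensor> inputs = {
      torch::ones({2, 3}),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 0}))};

  // Runs modules[i]->forward(inputs[i]) for all i concurrently and returns the
  // results in the same order; outputs[1] stays on cuda:0.
  auto outputs = torch::nn::parallel::parallel_apply(modules, inputs);
}
```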
Added lots of tests for these things.
apaszke ezyang ebetica colesbury
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9234
Differential Revision: D8865182
Pulled By: goldsborough
fbshipit-source-id: 4f1fecf2b3f3bc1540c071dfb2d23dd45de433e4
231 lines
7.4 KiB
C++
#include <catch.hpp>

#include <torch/csrc/autograd/functions/comm.h>
#include <torch/nn/module.h>
#include <torch/nn/modules/linear.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/nn/pimpl.h>
#include <torch/tensor.h>

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

using Catch::StartsWith;

using namespace torch::autograd;
using namespace torch::nn;

TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") {
  Scatter scatter(
      {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});

  auto input = torch::ones(10, torch::requires_grad(true));
  auto output = scatter.apply({input});

  REQUIRE(output.size() == 2);
  REQUIRE(output[0].size(0) == 5);
  REQUIRE(output[1].size(0) == 5);

  REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)})
              .allclose(input));

  auto sum = output[0].to({torch::kCUDA, 1}) + output[1];
  sum.backward();

  REQUIRE(input.grad().defined());
  REQUIRE(input.grad().device().is_cpu());
  REQUIRE(input.grad().sum().toCInt() == 10);
}

TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") {
  Gather gather(torch::Device(torch::kCUDA, 1));

  auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0}));
  auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1}));

  auto outputs = gather.apply({a, b});
  REQUIRE(outputs.size() == 1);
  auto& output = outputs.front();

  REQUIRE(output.size(0) == 10);
  REQUIRE(output.device() == torch::Device(torch::kCUDA, 1));

  auto chunks = output.chunk(2);
  REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a));
  REQUIRE(chunks[1].allclose(b));

  output.backward();

  REQUIRE(a.grad().defined());
  REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0));
  REQUIRE(a.grad().sum().toCInt() == 5);

  REQUIRE(b.grad().defined());
  REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1));
  REQUIRE(b.grad().sum().toCInt() == 5);
}

TEST_CASE("Parallel/Replicate", "[multi-cuda]") {
  Linear linear(3, 4);
  auto replicas = parallel::replicate(
      linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});
  REQUIRE(replicas.size() == 2);

  auto original_parameters = linear->parameters();

  auto replica1_parameters = replicas[0]->parameters();
  for (auto& parameter : replica1_parameters) {
    REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0));
  }
  replicas[0]->to(torch::kCPU);
  REQUIRE(replica1_parameters.size() == original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i]));
    REQUIRE(
        replica1_parameters[i]->data().data<float>() !=
        original_parameters[i]->data().data<float>());
  }

  auto replica2_parameters = replicas[1]->parameters();
  for (auto& parameter : replica2_parameters) {
    REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1));
  }
  replicas[1]->to(torch::kCPU);
  REQUIRE(replica2_parameters.size() == original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i]));
    REQUIRE(
        replica2_parameters[i]->data().data<float>() !=
        original_parameters[i]->data().data<float>());
  }
}

TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") {
  Linear a(3, 4);

  Linear b(std::static_pointer_cast<LinearImpl>(a->clone()));
  b->to({torch::kCUDA, 0});

  Linear c(std::static_pointer_cast<LinearImpl>(a->clone()));
  c->to({torch::kCUDA, 1});

  std::vector<Linear> modules = {a, b, c};
  std::vector<torch::Tensor> inputs = {
      torch::ones({2, 3}),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 0})),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 1}))};

  auto outputs = parallel::parallel_apply(modules, inputs);

  REQUIRE(outputs.size() == 3);
  REQUIRE(outputs[0].device().is_cpu());

  REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0));
  REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0]));

  REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1));
  REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0]));
}

TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") {
  struct M : torch::nn::Module {
    torch::Tensor forward(torch::Tensor input) {
      return torch::ones({5}, torch::dtype(torch::kInt32));
    }
  };

  std::vector<std::shared_ptr<M>> modules = {
      std::make_shared<M>(), std::make_shared<M>(), std::make_shared<M>()};
  std::vector<torch::Tensor> inputs = {
      torch::empty({}), torch::empty({}), torch::empty({})};
  std::vector<torch::Device> devices = {
      {torch::kCUDA, 1}, {torch::kCUDA, 0}, {torch::kCPU}};

  auto outputs = parallel::parallel_apply(modules, inputs, devices);

  REQUIRE(outputs.size() == 3);
  REQUIRE(outputs[0].device().is_cuda());
  REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1));

  REQUIRE(outputs[1].device().is_cuda());
  REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0));

  REQUIRE(outputs[2].device().is_cpu());
}

TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      throw std::runtime_error("Badness!");
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  REQUIRE_THROWS_WITH(
      parallel::data_parallel(m, input), StartsWith("Badness!"));
}

TEST_CASE(
    "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice",
    "[multi-cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      // Intermediate tensors should be on the replica's current device.
      intermediate_tensor = torch::rand(5);
      // The returned tensor should be on the output device.
      return torch::ones(3);
    }
    torch::Tensor intermediate_tensor;
  };
  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  {
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/at::nullopt,
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    REQUIRE(output.defined());
    REQUIRE(output.device().is_cuda());
    REQUIRE(output.device().index() == 1);
  }
  {
    // Verify for the single-device case (where we don't scatter/gather).
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/std::vector<torch::Device>{torch::Device(torch::kCUDA, 0)},
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    REQUIRE(m->intermediate_tensor.defined());
    REQUIRE(m->intermediate_tensor.device().is_cuda());
    REQUIRE(m->intermediate_tensor.device().index() == 0);
    REQUIRE(output.defined());
    REQUIRE(output.device().is_cuda());
    REQUIRE(output.device().index() == 1);
  }
}

TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      return torch::tensor(torch::DefaultTensorOptions::get().device().index());
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  auto output = parallel::data_parallel(m, input);

  const auto device_count = torch::cuda::device_count();
  REQUIRE(output.numel() == device_count);
  for (size_t i = 0; i < device_count; ++i) {
    REQUIRE(output[i].toCInt() == i);
  }
}