mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.
I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.
I used the following script to do the canonicalization:
```
import subprocess
import re
import os.path
files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
for fn in files:
if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
continue
if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
continue
with open(fn, 'r') as f:
c = f.read()
def fmt(p):
return "#include <{}>".format(p)
def repl(m):
p = m.group(1)
if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
return fmt(p)
if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
return fmt(p)
for root in ["aten/src", "torch/lib", ""]:
for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
new_p = os.path.relpath(os.path.join(bad_root, p), root)
if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
return fmt(new_p)
print("ERROR: ", fn, p)
return m.group(0)
new_c = re.sub(r'#include "([^"]+)"', repl, c)
if new_c != c:
print(fn)
with open(fn, 'w') as f:
f.write(new_c)
```
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849
Reviewed By: dzhulgakov
Differential Revision: D13363445
Pulled By: ezyang
fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
169 lines · 5.2 KiB · C++
#pragma once
|
|
|
|
#include <THD/base/ChannelType.h>
|
|
#include <THD/base/ChannelUtils.hpp>
|
|
#include <THD/base/DataChannel.h>
|
|
#include <THD/base/Scalar.hpp>
|
|
#include <THD/base/init_methods/InitMethod.hpp>
|
|
|
|
#include <ATen/ATen.h>
|
|
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
// Make these enum-like types usable as keys of std::unordered_map (e.g. the
// rank maps below) by generating std::hash specializations.  MAKE_HASHABLE is
// presumably declared in ChannelUtils.hpp and hashes the given cast
// expression over the value `t` — TODO(review): confirm against the macro's
// definition.
MAKE_HASHABLE(THDReduceOp, static_cast<int>(t));
MAKE_HASHABLE(thd::RPCType, static_cast<char>(t));
MAKE_HASHABLE(at::ScalarType, static_cast<int>(t));
|
|
|
|
namespace thd {
|
|
|
|
struct DataChannel {
|
|
struct Request {
|
|
Request(){};
|
|
virtual ~Request(){};
|
|
|
|
// Checks if request has completed. Non-blocking operation.
|
|
virtual bool isCompleted() = 0;
|
|
// Waits until request completes. Blocking operation.
|
|
virtual void wait() = 0;
|
|
};
|
|
|
|
struct Group {
|
|
Group();
|
|
/*
|
|
* Constructs `Group` from provided `ranks` and checks if all ranks are
|
|
* in range: [0, `max_rank`].
|
|
*
|
|
* `ranks` vector should have mapping from new ranks to old ranks (global
|
|
* ranks) eg. ranks = {[0] = 6, [1] = 2} which means that 0 and 1 are new
|
|
* ranks in group and 6, 2 are global ranks corresponding to 0 and 1
|
|
* respectively.
|
|
*/
|
|
Group(std::vector<rank_type> ranks, rank_type max_rank);
|
|
virtual ~Group();
|
|
|
|
rank_type size() const;
|
|
|
|
/*
|
|
* In contrast to `getGroupRank` this function throws `std::logic_error`
|
|
* when rank is member of this group.
|
|
*/
|
|
rank_type mustGetGroupRank(rank_type global_rank) const;
|
|
std::pair<rank_type, bool> getGroupRank(rank_type global_rank) const;
|
|
|
|
/*
|
|
* In contrast to `getGlobalRank` this function throws `std::logic_error`
|
|
* when provided `group_rank` is not in range of group.
|
|
*/
|
|
rank_type mustGetGlobalRank(rank_type group_rank) const;
|
|
std::pair<rank_type, bool> getGlobalRank(rank_type group_rank) const;
|
|
|
|
private:
|
|
// maps new group ranks to old ranks (global ranks)
|
|
std::vector<rank_type> _new2old;
|
|
|
|
// maps old ranks (global ranks) to new group ranks
|
|
std::unordered_map<rank_type, rank_type> _old2new;
|
|
};
|
|
|
|
DataChannel(){};
|
|
virtual ~DataChannel(){};
|
|
|
|
virtual bool init() = 0;
|
|
|
|
/**
|
|
* This is required for NCCL backend, since the destroy cannot be done before
|
|
* CUDA is unloaded since DataChannel is a static object.
|
|
*/
|
|
virtual void destroy() = 0;
|
|
|
|
virtual rank_type getRank() = 0;
|
|
virtual rank_type getNumProcesses() = 0;
|
|
|
|
/**
|
|
* All gather inputs from multiple GPUs, each Tensor in input vector should be
|
|
* on a separate GPU.
|
|
*
|
|
* Also note that the output vector is a 1D vector (flattened from 2D),
|
|
* with the size of input.size() * world_size.
|
|
*
|
|
* For instance, rank i 's input[k] tensor would be in
|
|
* output[i * input.size() + k].
|
|
*/
|
|
virtual void allGather(
|
|
std::vector<at::Tensor>& output,
|
|
std::vector<at::Tensor>& input,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void allGather(
|
|
std::vector<at::Tensor>& output,
|
|
at::Tensor& input,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void gather(
|
|
std::vector<at::Tensor>& output,
|
|
at::Tensor& input,
|
|
rank_type dst_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void scatter(
|
|
std::vector<at::Tensor>& input,
|
|
at::Tensor& output,
|
|
rank_type src_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
// All reduce multiple GPUs on a number of nodes
|
|
virtual void allReduce(
|
|
std::vector<at::Tensor>& data,
|
|
THDReduceOp operation,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void allReduce(
|
|
at::Tensor& data,
|
|
THDReduceOp operation,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
/**
|
|
* Reduce multiple GPUs on a number of nodes
|
|
* data[0]'s GPU in dstRank will receive the result
|
|
*/
|
|
virtual void reduce(
|
|
std::vector<at::Tensor>& data,
|
|
THDReduceOp operation,
|
|
rank_type dstRank,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void reduce(
|
|
at::Tensor& data,
|
|
THDReduceOp operation,
|
|
rank_type dst_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
/**
|
|
* Broadcast multiple GPUs on a number of nodes
|
|
* data[0]'s GPU in srcRank will be the source to broadcast
|
|
*/
|
|
virtual void broadcast(
|
|
std::vector<at::Tensor>& data,
|
|
rank_type srcRank,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void broadcast(
|
|
at::Tensor& data,
|
|
rank_type src_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void send(Scalar& value, rank_type src_rank) = 0;
|
|
virtual void send(at::Tensor& data, rank_type dst_rank) = 0;
|
|
virtual void receive(Scalar& value, rank_type src_rank) = 0;
|
|
virtual rank_type receive(at::Tensor& data) = 0; // receive from any source
|
|
virtual void receive(at::Tensor& data, rank_type src_rank) = 0;
|
|
virtual Request* isend(at::Tensor& data, rank_type dst_rank) = 0;
|
|
virtual Request* ireceive(at::Tensor& data, rank_type src_rank) = 0;
|
|
|
|
virtual void barrier(THDGroup group_id = THDGroupWORLD) = 0;
|
|
|
|
virtual THDGroup newGroup(const std::vector<rank_type>& ranks) = 0;
|
|
virtual void clearGroupCache(THDGroup group_id = THDGroupWORLD) = 0;
|
|
|
|
static DataChannel* newChannel(
|
|
THDChannelType type,
|
|
std::string init_method,
|
|
int world_size,
|
|
std::string group_name,
|
|
int rank);
|
|
};
|
|
|
|
} // namespace thd
|