pytorch/torch/csrc/distributed/c10d/Functional.hpp
Simon Fan 95bc3da9f8 [c10d] support dynamic shapes for all_to_all_single_autograd (#157521)
`all_to_all_single_autograd` is not an op, so all the code executed up to the `all_to_all_single` dispatch is visible to the compiler. This means the `all_to_all_single_autograd` wrapper code must support symints in order to be traceable with dynamic shapes.

FIXES https://github.com/pytorch/pytorch/issues/157479

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157521
Approved by: https://github.com/wconstab
2025-07-08 23:19:59 +00:00

79 lines
2.2 KiB
C++

#pragma once
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
namespace c10d {
// Exported declaration of c10d::all_reduce_; the definition is not in this
// header.
//
// The signature takes `input` by non-const reference and returns an
// at::Tensor&.
// NOTE(review): together with the trailing underscore this looks like an
// in-place all-reduce that returns `input` itself -- confirm against the
// definition.
//
// Parameters:
//   input      - tensor passed by mutable reference.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor& all_reduce_(
at::Tensor& input,
std::string reduce_op,
std::string group_name);
// Exported declaration of c10d::all_reduce; the definition is not in this
// header.
//
// The signature takes `input` by const reference and returns an at::Tensor by
// value.
// NOTE(review): this looks like the non-mutating counterpart of all_reduce_
// (result in a separate tensor) -- confirm against the definition.
//
// Parameters:
//   input      - tensor passed by const reference.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor all_reduce(
const at::Tensor& input,
std::string reduce_op,
std::string group_name);
// Exported declaration of c10d::all_reduce_coalesced_; the definition is not
// in this header.
//
// Takes a vector of tensors by value and returns a vector of tensors by value.
// NOTE(review): the trailing underscore suggests the tensors in `inputs` are
// reduced in place and handed back -- confirm against the definition.
//
// Parameters:
//   inputs     - tensors to operate on, passed as a std::vector by value.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
//
// Each NOLINTNEXTLINE marker below suppresses clang-tidy's
// performance-unnecessary-value-param check for the parameter on the line
// directly after it, so the marker and its parameter must stay adjacent.
C10_EXPORT std::vector<at::Tensor> all_reduce_coalesced_(
std::vector<at::Tensor> inputs,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string reduce_op,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string group_name);
// Exported declaration of c10d::all_reduce_coalesced; the definition is not in
// this header.
//
// Takes a vector of tensors by value and returns a vector of tensors by value.
// NOTE(review): presumably the non-mutating counterpart of
// all_reduce_coalesced_, returning one result per input -- confirm against the
// definition.
//
// Parameters:
//   inputs     - tensors to operate on, passed as a std::vector by value. The
//                NOLINTNEXTLINE marker directly above it suppresses
//                clang-tidy's performance-unnecessary-value-param check for
//                this parameter and must stay adjacent to it.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT std::vector<at::Tensor> all_reduce_coalesced(
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::vector<at::Tensor> inputs,
std::string reduce_op,
std::string group_name);
// Exported declaration of c10d::all_gather_into_tensor_coalesced; the
// definition is not in this header.
//
// Takes a vector of tensors by value and returns a vector of tensors by value.
// NOTE(review): presumably performs an all-gather for each input and returns
// one gathered tensor per input -- confirm against the definition.
//
// Parameters:
//   inputs     - tensors to operate on, passed as a std::vector by value.
//   group_size - 64-bit integer; presumably the number of ranks in the group,
//                used to size the outputs (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition). The
//                NOLINTNEXTLINE marker directly above it suppresses
//                clang-tidy's performance-unnecessary-value-param check and
//                must stay adjacent to it.
C10_EXPORT std::vector<at::Tensor> all_gather_into_tensor_coalesced(
std::vector<at::Tensor> inputs,
int64_t group_size,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string group_name);
// Exported declaration of c10d::all_gather_into_tensor; the definition is not
// in this header.
//
// Takes `input` by const reference and returns an at::Tensor by value.
// NOTE(review): presumably gathers `input` from every rank into a single
// newly returned tensor -- confirm against the definition.
//
// Parameters:
//   input      - tensor passed by const reference.
//   group_size - 64-bit integer; presumably the number of ranks in the group,
//                used to size the output (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor all_gather_into_tensor(
const at::Tensor& input,
int64_t group_size,
std::string group_name);
// Exported declaration of c10d::all_gather_into_tensor_out; the definition is
// not in this header.
//
// Unlike the other declarations in this header, `group_name` is taken by const
// reference and the caller supplies an `output` tensor by mutable reference.
// The function returns an at::Tensor&.
// NOTE(review): presumably the gathered result is written into `output` and
// the returned reference is `output` -- confirm against the definition.
//
// Parameters:
//   input      - tensor passed by non-const reference.
//   group_size - 64-bit integer; presumably the number of ranks in the group
//                (TODO confirm).
//   group_name - string passed by const reference; presumably identifies the
//                process group to communicate over (verify in the definition).
//   output     - tensor passed by mutable reference; see the note above.
C10_EXPORT at::Tensor& all_gather_into_tensor_out(
at::Tensor& input,
int64_t group_size,
const std::string& group_name,
at::Tensor& output);
// Exported declaration of c10d::reduce_scatter_tensor_coalesced; the
// definition is not in this header.
//
// Takes a vector of tensors by value and returns a vector of tensors by value.
// NOTE(review): presumably performs a reduce-scatter for each input and
// returns one result per input -- confirm against the definition.
//
// Parameters:
//   inputs     - tensors to operate on, passed as a std::vector by value.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_size - 64-bit integer; presumably the number of ranks in the group,
//                used to size the outputs (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
//
// Each NOLINTNEXTLINE marker below suppresses clang-tidy's
// performance-unnecessary-value-param check for the parameter on the line
// directly after it, so the marker and its parameter must stay adjacent.
C10_EXPORT std::vector<at::Tensor> reduce_scatter_tensor_coalesced(
std::vector<at::Tensor> inputs,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string reduce_op,
int64_t group_size,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string group_name);
// Exported declaration of c10d::reduce_scatter_tensor; the definition is not
// in this header.
//
// Takes `input` by const reference and returns an at::Tensor by value.
// NOTE(review): presumably reduces `input` across ranks and returns this
// rank's portion as a new tensor -- confirm against the definition.
//
// Parameters:
//   input      - tensor passed by const reference.
//   reduce_op  - reduction selector passed as a string by value; the accepted
//                spellings are not visible here (TODO confirm).
//   group_size - 64-bit integer; presumably the number of ranks in the group,
//                used to size the output (TODO confirm).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor reduce_scatter_tensor(
const at::Tensor& input,
std::string reduce_op,
int64_t group_size,
std::string group_name);
// Exported declaration of c10d::all_to_all_single; the definition is not in
// this header.
//
// Takes `input` by const reference and returns an at::Tensor by value. Both
// split-size lists are typed at::SymIntArrayRef rather than a plain integer
// array.
// NOTE(review): presumably this lets the split sizes be symbolic when the
// call is traced with dynamic shapes -- confirm against the definition and
// its callers.
//
// Parameters:
//   input              - tensor passed by const reference.
//   output_split_sizes - presumably the per-rank sizes of the chunks this rank
//                        receives (TODO confirm).
//   input_split_sizes  - presumably the per-rank sizes of the chunks this rank
//                        sends (TODO confirm).
//   group_name         - string passed by value; presumably identifies the
//                        process group to communicate over (verify in the
//                        definition). The NOLINTNEXTLINE marker directly above
//                        it suppresses clang-tidy's
//                        performance-unnecessary-value-param check and must
//                        stay adjacent to it.
C10_EXPORT at::Tensor all_to_all_single(
const at::Tensor& input,
at::SymIntArrayRef output_split_sizes,
at::SymIntArrayRef input_split_sizes,
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::string group_name);
// Exported declaration of c10d::broadcast_; the definition is not in this
// header.
//
// The signature takes `input` by non-const reference and returns an
// at::Tensor&.
// NOTE(review): together with the trailing underscore this looks like an
// in-place broadcast that returns `input` itself -- confirm against the
// definition.
//
// Parameters:
//   input      - tensor passed by mutable reference.
//   src        - 64-bit integer; presumably the rank whose tensor is broadcast
//                (TODO confirm, including whether it is a group-local or
//                global rank).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor& broadcast_(
at::Tensor& input,
int64_t src,
std::string group_name);
// Exported declaration of c10d::broadcast; the definition is not in this
// header.
//
// The signature takes `input` by const reference and returns an at::Tensor by
// value.
// NOTE(review): this looks like the non-mutating counterpart of broadcast_
// (result in a separate tensor) -- confirm against the definition.
//
// Parameters:
//   input      - tensor passed by const reference.
//   src        - 64-bit integer; presumably the rank whose tensor is broadcast
//                (TODO confirm, including whether it is a group-local or
//                global rank).
//   group_name - string passed by value; presumably identifies the process
//                group to communicate over (verify in the definition).
C10_EXPORT at::Tensor broadcast(
const at::Tensor& input,
int64_t src,
std::string group_name);
} // namespace c10d