mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Summary:
Anywhere we used #include "foo.h", we now say #include <foo.h>
Paths are adjusted to be rooted out of aten/src, torch/lib, or
the root level directory.
I modified CMakeLists.txt by hand to remove TH and THC from
the include paths.
I used the following script to do the canonicalization:
```
import subprocess
import re
import os.path
files = subprocess.check_output(['git', 'ls-files']).decode('utf-8').rstrip().split('\n')
for fn in files:
if not any(fn.endswith(suff) for suff in ['.cu', '.cpp', '.in', '.h', '.hpp', '.cu', '.cuh', '.cc']):
continue
if not any(fn.startswith(pref) for pref in ["aten/", "torch/"]):
continue
with open(fn, 'r') as f:
c = f.read()
def fmt(p):
return "#include <{}>".format(p)
def repl(m):
p = m.group(1)
if p in ["dlfcn.h", "unistd.h", "nvrtc.h", "cuda.h", "cuda_runtime.h", "cstdint", "cudnn.h", "Python.h", "cusparse.h", "cuda_runtime_api.h", "cuda_fp16.h", "cublas_v2.h", "stdint.h", "curand_kernel.h"]:
return fmt(p)
if any(p.startswith(pref) for pref in ["torch/csrc", "c10/", "ATen/", "caffe2/", "TH/", "THC/", "Eigen/", "gtest/", "zdl/", "gloo/", "onnx/", "miopen/"]):
return fmt(p)
for root in ["aten/src", "torch/lib", ""]:
for bad_root in [os.path.dirname(fn), "aten/src/TH", "aten/src/THC", "torch/csrc"]:
new_p = os.path.relpath(os.path.join(bad_root, p), root)
if not new_p.startswith("../") and (os.path.exists(os.path.join(root, new_p)) or os.path.exists(os.path.join(root, new_p + ".in"))):
return fmt(new_p)
print("ERROR: ", fn, p)
return m.group(0)
new_c = re.sub(r'#include "([^"]+)"', repl, c)
if new_c != c:
print(fn)
with open(fn, 'w') as f:
f.write(new_c)
```
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14849
Reviewed By: dzhulgakov
Differential Revision: D13363445
Pulled By: ezyang
fbshipit-source-id: 52361f878a672785f9306c9e9ab2513128092b68
169 lines · 5.2 KiB · C++
#pragma once
|
|
|
|
#include <THD/base/ChannelType.h>
|
|
#include <THD/base/ChannelUtils.hpp>
|
|
#include <THD/base/DataChannel.h>
|
|
#include <THD/base/Scalar.hpp>
|
|
#include <THD/base/init_methods/InitMethod.hpp>
|
|
|
|
#include <ATen/ATen.h>
|
|
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
// Make these enum-like types usable as keys of std::unordered_map (e.g. the
// rank maps below) by generating std::hash specializations.  MAKE_HASHABLE is
// presumably declared in ChannelUtils.hpp and hashes the given cast
// expression over the value `t` — TODO(review): confirm against the macro's
// definition.
MAKE_HASHABLE(THDReduceOp, static_cast<int>(t));
MAKE_HASHABLE(thd::RPCType, static_cast<char>(t));
MAKE_HASHABLE(at::ScalarType, static_cast<int>(t));
|
|
|
|
namespace thd {
|
|
|
|
struct DataChannel {
|
|
struct Request {
|
|
Request(){};
|
|
virtual ~Request(){};
|
|
|
|
// Checks if request has completed. Non-blocking operation.
|
|
virtual bool isCompleted() = 0;
|
|
// Waits until request completes. Blocking operation.
|
|
virtual void wait() = 0;
|
|
};
|
|
|
|
struct Group {
|
|
Group();
|
|
/*
|
|
* Constructs `Group` from provided `ranks` and checks if all ranks are
|
|
* in range: [0, `max_rank`].
|
|
*
|
|
* `ranks` vector should have mapping from new ranks to old ranks (global
|
|
* ranks) eg. ranks = {[0] = 6, [1] = 2} which means that 0 and 1 are new
|
|
* ranks in group and 6, 2 are global ranks corresponding to 0 and 1
|
|
* respectively.
|
|
*/
|
|
Group(std::vector<rank_type> ranks, rank_type max_rank);
|
|
virtual ~Group();
|
|
|
|
rank_type size() const;
|
|
|
|
/*
|
|
* In contrast to `getGroupRank` this function throws `std::logic_error`
|
|
* when rank is member of this group.
|
|
*/
|
|
rank_type mustGetGroupRank(rank_type global_rank) const;
|
|
std::pair<rank_type, bool> getGroupRank(rank_type global_rank) const;
|
|
|
|
/*
|
|
* In contrast to `getGlobalRank` this function throws `std::logic_error`
|
|
* when provided `group_rank` is not in range of group.
|
|
*/
|
|
rank_type mustGetGlobalRank(rank_type group_rank) const;
|
|
std::pair<rank_type, bool> getGlobalRank(rank_type group_rank) const;
|
|
|
|
private:
|
|
// maps new group ranks to old ranks (global ranks)
|
|
std::vector<rank_type> _new2old;
|
|
|
|
// maps old ranks (global ranks) to new group ranks
|
|
std::unordered_map<rank_type, rank_type> _old2new;
|
|
};
|
|
|
|
DataChannel(){};
|
|
virtual ~DataChannel(){};
|
|
|
|
virtual bool init() = 0;
|
|
|
|
/**
|
|
* This is required for NCCL backend, since the destroy cannot be done before
|
|
* CUDA is unloaded since DataChannel is a static object.
|
|
*/
|
|
virtual void destroy() = 0;
|
|
|
|
virtual rank_type getRank() = 0;
|
|
virtual rank_type getNumProcesses() = 0;
|
|
|
|
/**
|
|
* All gather inputs from multiple GPUs, each Tensor in input vector should be
|
|
* on a separate GPU.
|
|
*
|
|
* Also note that the output vector is a 1D vector (flattened from 2D),
|
|
* with the size of input.size() * world_size.
|
|
*
|
|
* For instance, rank i 's input[k] tensor would be in
|
|
* output[i * input.size() + k].
|
|
*/
|
|
virtual void allGather(
|
|
std::vector<at::Tensor>& output,
|
|
std::vector<at::Tensor>& input,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void allGather(
|
|
std::vector<at::Tensor>& output,
|
|
at::Tensor& input,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void gather(
|
|
std::vector<at::Tensor>& output,
|
|
at::Tensor& input,
|
|
rank_type dst_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void scatter(
|
|
std::vector<at::Tensor>& input,
|
|
at::Tensor& output,
|
|
rank_type src_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
// All reduce multiple GPUs on a number of nodes
|
|
virtual void allReduce(
|
|
std::vector<at::Tensor>& data,
|
|
THDReduceOp operation,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void allReduce(
|
|
at::Tensor& data,
|
|
THDReduceOp operation,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
/**
|
|
* Reduce multiple GPUs on a number of nodes
|
|
* data[0]'s GPU in dstRank will receive the result
|
|
*/
|
|
virtual void reduce(
|
|
std::vector<at::Tensor>& data,
|
|
THDReduceOp operation,
|
|
rank_type dstRank,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void reduce(
|
|
at::Tensor& data,
|
|
THDReduceOp operation,
|
|
rank_type dst_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
/**
|
|
* Broadcast multiple GPUs on a number of nodes
|
|
* data[0]'s GPU in srcRank will be the source to broadcast
|
|
*/
|
|
virtual void broadcast(
|
|
std::vector<at::Tensor>& data,
|
|
rank_type srcRank,
|
|
THDGroup groupId = THDGroupWORLD) = 0;
|
|
virtual void broadcast(
|
|
at::Tensor& data,
|
|
rank_type src_rank,
|
|
THDGroup group_id = THDGroupWORLD) = 0;
|
|
virtual void send(Scalar& value, rank_type src_rank) = 0;
|
|
virtual void send(at::Tensor& data, rank_type dst_rank) = 0;
|
|
virtual void receive(Scalar& value, rank_type src_rank) = 0;
|
|
virtual rank_type receive(at::Tensor& data) = 0; // receive from any source
|
|
virtual void receive(at::Tensor& data, rank_type src_rank) = 0;
|
|
virtual Request* isend(at::Tensor& data, rank_type dst_rank) = 0;
|
|
virtual Request* ireceive(at::Tensor& data, rank_type src_rank) = 0;
|
|
|
|
virtual void barrier(THDGroup group_id = THDGroupWORLD) = 0;
|
|
|
|
virtual THDGroup newGroup(const std::vector<rank_type>& ranks) = 0;
|
|
virtual void clearGroupCache(THDGroup group_id = THDGroupWORLD) = 0;
|
|
|
|
static DataChannel* newChannel(
|
|
THDChannelType type,
|
|
std::string init_method,
|
|
int world_size,
|
|
std::string group_name,
|
|
int rank);
|
|
};
|
|
|
|
} // namespace thd
|