diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h
index 4342cf72fae..cb45756847d 100644
--- a/aten/src/ATen/cuda/ATenCUDAGeneral.h
+++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h
@@ -6,4 +6,4 @@
 
 #include <c10/macros/Macros.h>
 
-#define AT_CUDA_API CAFFE2_API
+// Use TORCH_CUDA_API for exports from this folder
diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h
index 5afcac93a23..7e7599e3965 100644
--- a/aten/src/ATen/cuda/CUDAContext.h
+++ b/aten/src/ATen/cuda/CUDAContext.h
@@ -52,17 +52,17 @@
 inline bool is_available() {
   return c10::cuda::device_count() > 0;
 }
 
-CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties();
+TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
 
-CAFFE2_API int warp_size();
+TORCH_CUDA_API int warp_size();
 
-CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device);
+TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
 
-CAFFE2_API Allocator* getCUDADeviceAllocator();
+TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
 
 /* Handles */
-CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle();
-CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle();
+TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
+TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
 
 } // namespace cuda
diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h
index e93c3acc84b..bd1e645b271 100644
--- a/aten/src/ATen/cuda/CUDAEvent.h
+++ b/aten/src/ATen/cuda/CUDAEvent.h
@@ -24,7 +24,7 @@ namespace at { namespace cuda {
 * called before the event is ever recorded, it will use the current device.
 * Later streams that record the event must match this device.
 */
-struct AT_CUDA_API CUDAEvent {
+struct TORCH_CUDA_API CUDAEvent {
   // Constructors
   // Default value for `flags` is specified below - it's cudaEventDisableTiming
   CUDAEvent() {}
diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.h b/aten/src/ATen/cuda/PinnedMemoryAllocator.h
index b7448dcaef5..e980908857f 100644
--- a/aten/src/ATen/cuda/PinnedMemoryAllocator.h
+++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h
@@ -4,6 +4,6 @@
 
 namespace at { namespace cuda {
 
-CAFFE2_API at::Allocator* getPinnedMemoryAllocator();
+TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
 
 }} // namespace at::cuda
diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh
index ed29e1eeee5..315897ae655 100644
--- a/aten/src/ATen/cuda/detail/IndexUtils.cuh
+++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh
@@ -8,8 +8,8 @@
 
 namespace at { namespace cuda { namespace detail {
 
-CAFFE2_API bool maybeOverlappingIndices(const at::Tensor& t);
-CAFFE2_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
+TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
+TORCH_CUDA_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
 
 template <typename scalar, typename IndexType>
 TensorInfo<scalar, IndexType>
diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
index 7f831e5cdf7..b76f9328d3e 100644
--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
@@ -89,6 +89,6 @@ extern "C" typedef struct NVRTC {
 #undef CREATE_MEMBER
 } NVRTC;
 
-extern "C" AT_CUDA_API NVRTC* load_nvrtc();
+extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
 
 }} // at::cuda
diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
index c65bf2351b4..6853017cde4 100644
--- a/aten/src/ATen/cudnn/Descriptors.h
+++ b/aten/src/ATen/cudnn/Descriptors.h
@@ -64,7 +64,7 @@ struct DescriptorDeleter {
 // initialized the first time you call set() or any other initializing
 // function.
 template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class AT_CUDA_API Descriptor
+class TORCH_CUDA_API Descriptor
 {
 public:
   // TODO: Figure out why const-correctness doesn't work here
@@ -93,7 +93,7 @@ private:
   std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };
 
-class AT_CUDA_API TensorDescriptor
+class TORCH_CUDA_API TensorDescriptor
   : public Descriptor<cudnnTensorStruct,
                       &cudnnCreateTensorDescriptor,
                       &cudnnDestroyTensorDescriptor>
@@ -145,7 +145,7 @@ private:
   }
 };
 
-struct AT_CUDA_API ConvolutionDescriptor
+struct TORCH_CUDA_API ConvolutionDescriptor
   : public Descriptor<cudnnConvolutionStruct,
                       &cudnnCreateConvolutionDescriptor,
                       &cudnnDestroyConvolutionDescriptor>
@@ -164,7 +164,7 @@ struct AT_CUDA_API ConvolutionDescriptor
   }
 };
 
-struct AT_CUDA_API SpatialTransformerDescriptor
+struct TORCH_CUDA_API SpatialTransformerDescriptor
   : public Descriptor<cudnnSpatialTransformerStruct,
                       &cudnnCreateSpatialTransformerDescriptor,
                       &cudnnDestroySpatialTransformerDescriptor>
@@ -174,7 +174,7 @@ struct AT_CUDA_API SpatialTransformerDescriptor
   }
 };
 
-struct AT_CUDA_API DropoutDescriptor
+struct TORCH_CUDA_API DropoutDescriptor
   : public Descriptor<cudnnDropoutStruct,
                       &cudnnCreateDropoutDescriptor,
                       &cudnnDestroyDropoutDescriptor>
@@ -216,7 +216,7 @@ struct AT_CUDA_API DropoutDescriptor
   }
 };
 
-struct AT_CUDA_API RNNDescriptor
+struct TORCH_CUDA_API RNNDescriptor
   : public Descriptor<cudnnRNNStruct,
                       &cudnnCreateRNNDescriptor,
                       &cudnnDestroyRNNDescriptor>
@@ -252,7 +252,7 @@ struct AT_CUDA_API RNNDescriptor
   }
 };
 
-struct AT_CUDA_API CTCLossDescriptor
+struct TORCH_CUDA_API CTCLossDescriptor
   : public Descriptor<cudnnCTCLossStruct,
                       &cudnnCreateCTCLossDescriptor,
                       &cudnnDestroyCTCLossDescriptor>
diff --git a/aten/src/ATen/cudnn/Handle.h b/aten/src/ATen/cudnn/Handle.h
index 7526a0701a8..89dfc8ca559 100644
--- a/aten/src/ATen/cudnn/Handle.h
+++ b/aten/src/ATen/cudnn/Handle.h
@@ -5,6 +5,6 @@
 
 namespace at { namespace native {
 
-AT_CUDA_API cudnnHandle_t getCudnnHandle();
+TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
 
 }} // namespace
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh
index ed800fcb936..d67d9d1f060 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cuh
@@ -4,21 +4,21 @@
 
 namespace at { namespace native { namespace sparse { namespace cuda {
 
-AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
+TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
 
 /* Level 3 */
-AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
-AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
+TORCH_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
+TORCH_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
 
 // overloaded version
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { Scsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { Dcsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 
 /* format conversion */
-AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
-AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
-AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
-AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
-AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
+TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
+TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
+TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
 
 }}}} // namespace at::native::sparse::cuda
diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h
index c0e75263882..a749ee53fea 100644
--- a/aten/src/THC/THCAllocator.h
+++ b/aten/src/THC/THCAllocator.h
@@ -5,7 +5,7 @@
 
 // IPC doesn't support (re)allocation
 
-class CAFFE2_API THCIpcDeleter {
+class TORCH_CUDA_API THCIpcDeleter {
 public:
   THCIpcDeleter(std::shared_ptr<void> basePtr);
   ~THCIpcDeleter();
diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in
index abfaf3fe843..ced6af17363 100644
--- a/aten/src/THC/THCGeneral.h.in
+++ b/aten/src/THC/THCGeneral.h.in
@@ -17,8 +17,9 @@
 # define THC_EXTERNC extern "C"
 
 // TH & THC are now part of the same library as ATen and Caffe2
-#define THC_API THC_EXTERNC CAFFE2_API
-#define THC_CLASS CAFFE2_API
+// NB: However, we are planning to split it out to a torch_cuda library
+#define THC_API THC_EXTERNC TORCH_CUDA_API
+#define THC_CLASS TORCH_CUDA_API
 
 #ifndef THAssert
 #define THAssert(exp)                                                  \
diff --git a/c10/macros/Export.h b/c10/macros/Export.h
index c4880408a25..0a213edd7f3 100644
--- a/c10/macros/Export.h
+++ b/c10/macros/Export.h
@@ -91,11 +91,20 @@
 #define C10_API C10_IMPORT
 #endif
 
-// This one is being used by libcaffe2.so
+// This one is being used by libtorch.so
+// TODO: rename this to TORCH_API
 #ifdef CAFFE2_BUILD_MAIN_LIB
 #define CAFFE2_API C10_EXPORT
 #else
 #define CAFFE2_API C10_IMPORT
 #endif
 
+// This one will eventually be used by libtorch_cuda.so, but for
+// now it has the same function as CAFFE2_API
+#ifdef CAFFE2_BUILD_MAIN_LIB
+#define TORCH_CUDA_API C10_EXPORT
+#else
+#define TORCH_CUDA_API C10_IMPORT
+#endif
+
 #endif // C10_MACROS_MACROS_H_
diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h
index 1f20967947f..4ec1a5dd742 100644
--- a/torch/csrc/autograd/functions/comm.h
+++ b/torch/csrc/autograd/functions/comm.h
@@ -14,8 +14,7 @@
 namespace torch { namespace autograd {
 
-//TODO: change it to TORCH_API when we merge the libs
-struct AT_CUDA_API Scatter : public Node {
+struct TORCH_CUDA_API Scatter : public Node {
   explicit Scatter(
       std::vector<at::Device> devices,
       const c10::optional<std::vector<int64_t>>& chunk_sizes =
          c10::nullopt,
@@ -34,7 +33,7 @@ struct AT_CUDA_API Scatter : public Node {
   bool unsqueeze_scalars_;
 };
 
-struct AT_CUDA_API Gather : public Node {
+struct TORCH_CUDA_API Gather : public Node {
   explicit Gather(const at::Device& destination_device, int64_t dim = 0);
   ~Gather() override;
 
diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h
index 5a93daceb8a..2c3e2e1f9a8 100644
--- a/torch/csrc/cuda/comm.h
+++ b/torch/csrc/cuda/comm.h
@@ -13,11 +13,11 @@ namespace torch { namespace cuda {
 
 using tensor_list2d = std::vector<std::vector<at::Tensor>>;
 
-TORCH_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
-TORCH_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
+TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
+TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
     size_t buffer_size);
 
-TORCH_API std::vector<at::Tensor> scatter(
+TORCH_CUDA_API std::vector<at::Tensor> scatter(
     const at::Tensor& tensor,
     at::IntArrayRef devices,
     const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@@ -25,7 +25,7 @@ TORCH_API std::vector<at::Tensor> scatter(
     const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
         c10::nullopt);
 
-TORCH_API at::Tensor gather(
+TORCH_CUDA_API at::Tensor gather(
     at::TensorList tensors,
     int64_t dim,
     c10::optional<int32_t> destination_index);
diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h
index 1e92850043d..e243fc7bfdc 100644
--- a/torch/csrc/cuda/nccl.h
+++ b/torch/csrc/cuda/nccl.h
@@ -42,31 +42,31 @@ struct AutoNcclGroup {
   }
 };
 
-TORCH_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
-TORCH_API void check_inputs(
+TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
+TORCH_CUDA_API void check_inputs(
     at::TensorList inputs,
     at::TensorList outputs,
     int input_multiplier,
     int output_multiplier);
-TORCH_API ncclDataType_t get_data_type(const at::Tensor& t);
+TORCH_CUDA_API ncclDataType_t get_data_type(const at::Tensor& t);
 
 } // namespace detail
 
 using comm_list = std::vector<ncclComm_t>;
 using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;
 
-TORCH_API std::uint64_t version();
+TORCH_CUDA_API std::uint64_t version();
 
 bool is_available(at::TensorList tensors);
 
-TORCH_API void broadcast(
+TORCH_CUDA_API void broadcast(
     at::TensorList tensors,
     const stream_list& streams = {},
     const comm_list& user_comms = {});
 
 size_t get_max_count();
 
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     const std::vector<at::Tensor>& inputs,
     std::vector<at::Tensor>& outputs,
     int32_t root = 0,
@@ -74,7 +74,7 @@ TORCH_API void reduce(
     const stream_list& streams = {},
     const comm_list& user_comms = {});
 
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     std::vector<at::Tensor>& inputs,
     int32_t root = 0,
     int32_t op = ncclSum,
diff --git a/torch/csrc/jit/fuser/cuda/fused_kernel.h b/torch/csrc/jit/fuser/cuda/fused_kernel.h
index 09e0995eae5..6c2e2fa1327 100644
--- a/torch/csrc/jit/fuser/cuda/fused_kernel.h
+++ b/torch/csrc/jit/fuser/cuda/fused_kernel.h
@@ -19,7 +19,7 @@ namespace cuda {
 
 // A class holding metadata for an actual CUDA function.
 // Note: CUDA functions are per device.
-struct TORCH_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
+struct TORCH_CUDA_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
  FusedKernelCUDA(
      int16_t device,
      std::string name,
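
Note on the export-macro pattern (an illustrative sketch, not part of the patch; the MYLIB_* names below are hypothetical). The TORCH_CUDA_API macro added to c10/macros/Export.h above follows the standard shared-library export/import idiom: translation units that build the library see the symbol marked for export, while consumers including the same header see it marked for import. For now TORCH_CUDA_API keys off CAFFE2_BUILD_MAIN_LIB, so it expands identically to CAFFE2_API; once the CUDA code moves into its own libtorch_cuda.so (as the NB comment in THCGeneral.h.in anticipates), it can switch to that library's own build flag. A minimal standalone version of the same mechanism:

// mylib.h -- hypothetical header mirroring the TORCH_CUDA_API definition
#pragma once

#ifdef _WIN32
#define MYLIB_EXPORT __declspec(dllexport)
#define MYLIB_IMPORT __declspec(dllimport)
#else
// On GCC/Clang there is no import/export distinction; exported symbols
// simply get default visibility (which matters when the library is built
// with -fvisibility=hidden).
#define MYLIB_EXPORT __attribute__((__visibility__("default")))
#define MYLIB_IMPORT MYLIB_EXPORT
#endif

// Only the translation units that build the library itself are compiled
// with MYLIB_BUILD_MAIN_LIB defined, so they export the symbol; every
// consumer includes this same header without the define and imports it.
#ifdef MYLIB_BUILD_MAIN_LIB
#define MYLIB_API MYLIB_EXPORT
#else
#define MYLIB_API MYLIB_IMPORT
#endif

MYLIB_API int answer();  // defined inside the library, callable by consumers

The library would be built with something like "g++ -fPIC -fvisibility=hidden -DMYLIB_BUILD_MAIN_LIB -shared mylib.cpp -o libmylib.so", while clients just include mylib.h and link against it. This is why the rename in this diff is mechanical today yet meaningful later: each annotated declaration now records which library will own the symbol after the libtorch/libtorch_cuda split.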