Introducing TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API to the code (#50627)

Summary:
This is a sub-step of my effort to split up the torch_cuda library, which has grown very large. See https://github.com/pytorch/pytorch/issues/49050 for details on the split and on which files go into which target.

This PR introduces two new macros for Windows DLL export/import purposes, TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API. Both are defined as TORCH_CUDA_API for the time being.
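
For context, a minimal sketch of the aliasing described above. This is not the actual contents of the export-macro header (and the `TORCH_CUDA_BUILD_MAIN_LIB` flag name is an assumption used only for illustration); it just shows the idea of both new macros resolving to the existing TORCH_CUDA_API until the library is actually split:

```cpp
// Sketch only -- not the real export-macro header.
// Exporting/importing symbols from the (still monolithic) torch_cuda DLL:
#if defined(_WIN32)
#if defined(TORCH_CUDA_BUILD_MAIN_LIB) // assumed name of the build-side flag
#define TORCH_CUDA_API __declspec(dllexport)
#else
#define TORCH_CUDA_API __declspec(dllimport)
#endif
#else
#define TORCH_CUDA_API __attribute__((visibility("default")))
#endif

// For now both split-specific macros alias the existing one; once torch_cuda
// is split into a C++ target and a CUDA (.cu) target, each can export from
// its own DLL without touching the annotated declarations again.
#define TORCH_CUDA_CPP_API TORCH_CUDA_API
#define TORCH_CUDA_CU_API TORCH_CUDA_API
```

Declarations in the headers then pick whichever macro matches the target their definitions will live in (as the diff below shows), so the eventual split only needs to change the macro definitions.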

Pull Request resolved: https://github.com/pytorch/pytorch/pull/50627

Reviewed By: mruberry

Differential Revision: D25955441

Pulled By: janeyx99

fbshipit-source-id: ff226026833b8fb2fb7c77df6f2d6c824f006869
Author: Jane Xu
Date: 2021-01-21 19:03:02 -08:00
Committed by: Facebook GitHub Bot
Parent: 3aed177484
Commit: 533cb9530e
89 changed files with 1316 additions and 799 deletions


@ -119,7 +119,7 @@ struct PhiloxCudaState {
bool captured_ = false;
};
struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
// Constructors
CUDAGeneratorImpl(DeviceIndex device_index = -1);
~CUDAGeneratorImpl() = default;
@ -155,10 +155,10 @@ private:
namespace cuda {
namespace detail {
TORCH_CUDA_API const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1);
TORCH_CUDA_API Generator createCUDAGenerator(DeviceIndex device_index = -1);
TORCH_CUDA_CPP_API const Generator& getDefaultCUDAGenerator(
DeviceIndex device_index = -1);
TORCH_CUDA_CPP_API Generator createCUDAGenerator(DeviceIndex device_index = -1);
} // namespace detail
} // namespace cuda
} // namespace at


@ -6,4 +6,4 @@
#include <c10/macros/Export.h>
// Use TORCH_CUDA_API for exports from this folder
// Use TORCH_CUDA_CPP_API or TORCH_CUDA_CU_API for exports from this folder


@ -56,22 +56,24 @@ inline bool is_available() {
return c10::cuda::device_count() > 0;
}
TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties();
TORCH_CUDA_API int warp_size();
TORCH_CUDA_CPP_API int warp_size();
TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(int64_t device);
TORCH_CUDA_API bool canDeviceAccessPeer(int64_t device, int64_t peer_device);
TORCH_CUDA_CPP_API bool canDeviceAccessPeer(
int64_t device,
int64_t peer_device);
TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
TORCH_CUDA_CPP_API Allocator* getCUDADeviceAllocator();
/* Handles */
TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
#ifdef CUDART_VERSION
TORCH_CUDA_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
#endif
} // namespace cuda


@ -24,7 +24,7 @@ namespace at { namespace cuda {
* called before the event is ever recorded, it will use the current device.
* Later streams that record the event must match this device.
*/
struct TORCH_CUDA_API CUDAEvent {
struct TORCH_CUDA_CPP_API CUDAEvent {
// Constructors
// Default value for `flags` is specified below - it's cudaEventDisableTiming
CUDAEvent() {}


@ -21,7 +21,7 @@
namespace at { namespace cuda {
struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future {
struct TORCH_CUDA_CPP_API CUDAFuture : at::ivalue::Future {
public:
using at::ivalue::Future::Future;


@ -6,7 +6,7 @@
namespace at {
namespace cuda {
struct TORCH_CUDA_API CUDAGraph {
struct TORCH_CUDA_CPP_API CUDAGraph {
CUDAGraph();
~CUDAGraph();


@ -4,6 +4,5 @@
namespace at { namespace cuda {
TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator();
}} // namespace at::cuda


@ -8,7 +8,7 @@ namespace at {
namespace cuda {
namespace detail {
TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::Tensor& t);
using at::native::canUse32BitIndexMath;
template <typename scalar, typename IndexType>


@ -105,6 +105,5 @@ extern "C" typedef struct NVRTC {
#undef CREATE_MEMBER
} NVRTC;
extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
extern "C" TORCH_CUDA_CPP_API NVRTC* load_nvrtc();
}} // at::cuda


@ -80,9 +80,8 @@ struct DescriptorDeleter {
// initialized the first time you call set() or any other initializing
// function.
template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
class TORCH_CUDA_API Descriptor
{
public:
class TORCH_CUDA_CU_API Descriptor {
public:
// TODO: Figure out why const-correctness doesn't work here
// Use desc() to access the underlying descriptor pointer in
@ -109,12 +108,11 @@ private:
std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};
class TORCH_CUDA_API TensorDescriptor
: public Descriptor<cudnnTensorStruct,
&cudnnCreateTensorDescriptor,
&cudnnDestroyTensorDescriptor>
{
public:
class TORCH_CUDA_CU_API TensorDescriptor : public Descriptor<
cudnnTensorStruct,
&cudnnCreateTensorDescriptor,
&cudnnDestroyTensorDescriptor> {
public:
TensorDescriptor() {}
explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
set(t, pad);
@ -149,12 +147,11 @@ private:
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
class TORCH_CUDA_API FilterDescriptor
: public Descriptor<cudnnFilterStruct,
&cudnnCreateFilterDescriptor,
&cudnnDestroyFilterDescriptor>
{
public:
class TORCH_CUDA_CU_API FilterDescriptor : public Descriptor<
cudnnFilterStruct,
&cudnnCreateFilterDescriptor,
&cudnnDestroyFilterDescriptor> {
public:
void set(const at::Tensor &t, int64_t pad = 0, bool force_nhwc = false);
void print();
@ -166,11 +163,11 @@ private:
std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d);
struct TORCH_CUDA_API ConvolutionDescriptor
: public Descriptor<cudnnConvolutionStruct,
&cudnnCreateConvolutionDescriptor,
&cudnnDestroyConvolutionDescriptor>
{
struct TORCH_CUDA_CU_API ConvolutionDescriptor
: public Descriptor<
cudnnConvolutionStruct,
&cudnnCreateConvolutionDescriptor,
&cudnnDestroyConvolutionDescriptor> {
void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool allow_tf32) {
cudnnDataType_t mathType = dataType;
if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
@ -189,21 +186,21 @@ struct TORCH_CUDA_API ConvolutionDescriptor
}
};
struct TORCH_CUDA_API SpatialTransformerDescriptor
: public Descriptor<cudnnSpatialTransformerStruct,
&cudnnCreateSpatialTransformerDescriptor,
&cudnnDestroySpatialTransformerDescriptor>
{
struct TORCH_CUDA_CU_API SpatialTransformerDescriptor
: public Descriptor<
cudnnSpatialTransformerStruct,
&cudnnCreateSpatialTransformerDescriptor,
&cudnnDestroySpatialTransformerDescriptor> {
void set(cudnnDataType_t dataType, int dim, int* size) {
AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size));
}
};
struct TORCH_CUDA_API DropoutDescriptor
: public Descriptor<cudnnDropoutStruct,
&cudnnCreateDropoutDescriptor,
&cudnnDestroyDropoutDescriptor>
{
struct TORCH_CUDA_CU_API DropoutDescriptor
: public Descriptor<
cudnnDropoutStruct,
&cudnnCreateDropoutDescriptor,
&cudnnDestroyDropoutDescriptor> {
at::Tensor state;
// Initialize a dropout descriptor's RNG state.
@ -238,11 +235,10 @@ struct TORCH_CUDA_API DropoutDescriptor
}
};
struct TORCH_CUDA_API RNNDescriptor
: public Descriptor<cudnnRNNStruct,
&cudnnCreateRNNDescriptor,
&cudnnDestroyRNNDescriptor>
{
struct TORCH_CUDA_CU_API RNNDescriptor : public Descriptor<
cudnnRNNStruct,
&cudnnCreateRNNDescriptor,
&cudnnDestroyRNNDescriptor> {
DropoutDescriptor dropout_desc_;
void set(cudnnHandle_t handle, int hidden_size, int proj_size, int num_layers, DropoutDescriptor&& dropout_desc,
cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional,
@ -286,11 +282,11 @@ struct TORCH_CUDA_API RNNDescriptor
}
};
struct TORCH_CUDA_API CTCLossDescriptor
: public Descriptor<cudnnCTCLossStruct,
&cudnnCreateCTCLossDescriptor,
&cudnnDestroyCTCLossDescriptor>
{
struct TORCH_CUDA_CU_API CTCLossDescriptor
: public Descriptor<
cudnnCTCLossStruct,
&cudnnCreateCTCLossDescriptor,
&cudnnDestroyCTCLossDescriptor> {
void set(cudnnDataType_t datatype) {
AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
}


@ -5,6 +5,5 @@
namespace at { namespace native {
TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
TORCH_CUDA_CU_API cudnnHandle_t getCudnnHandle();
}} // namespace at::native


@ -5,7 +5,8 @@
namespace at { namespace native {
TORCH_CUDA_API cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
TORCH_CUDA_CU_API cudnnDataType_t
getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
cudnnDataType_t getCudnnDataType(const at::Tensor& tensor);
int64_t cudnn_version();


@ -774,79 +774,94 @@ namespace {
// Utilities exposed in RNNUtils.h
namespace cudnn_rnn {
TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_views(
TensorList weight_arr,
int64_t weight_stride0,
int64_t input_size,
int64_t mode,
int64_t hidden_size,
int64_t proj_size,
int64_t num_layers,
bool batch_first,
bool bidirectional,
const cudnnDataType_t flat_buf_datatype,
const TensorOptions& flat_buf_options,
bool set_orig_weights_to_flat_buf,
bool allow_type_change/*=false*/,
bool include_bias/*=true*/) {
// flat_buf_datatype is accepted as a separate argument (rather than extracted from flat_buf_options)
// because to extract flat_buf_datatype from flat_buf_options, we'd need to say
// auto flat_buf_datatype = getCudnnDataTypeFromScalarType(typeMetaToScalarType(options.dtype()));
// typeMetaToScalarType is a surprisingly nontrivial function. We should avoid it if we can.
TORCH_CHECK(weight_arr.size() > 0,
"copy_weights_to_flat_buf_views: cannot flatten empty weight list");
TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
copy_weights_to_flat_buf_views(
TensorList weight_arr,
int64_t weight_stride0,
int64_t input_size,
int64_t mode,
int64_t hidden_size,
int64_t proj_size,
int64_t num_layers,
bool batch_first,
bool bidirectional,
const cudnnDataType_t flat_buf_datatype,
const TensorOptions& flat_buf_options,
bool set_orig_weights_to_flat_buf,
bool allow_type_change /*=false*/,
bool include_bias /*=true*/) {
// flat_buf_datatype is accepted as a separate argument (rather than extracted
// from flat_buf_options) because to extract flat_buf_datatype from
// flat_buf_options, we'd need to say auto flat_buf_datatype =
// getCudnnDataTypeFromScalarType(typeMetaToScalarType(options.dtype()));
// typeMetaToScalarType is a surprisingly nontrivial function. We should
// avoid it if we can.
TORCH_CHECK(
weight_arr.size() > 0,
"copy_weights_to_flat_buf_views: cannot flatten empty weight list");
RNNDescriptorParams rnn;
rnn.set(mode, hidden_size, proj_size, num_layers, bidirectional, promote_rnn_math_type(flat_buf_datatype), flat_buf_datatype);
RNNDescriptorParams rnn;
rnn.set(
mode,
hidden_size,
proj_size,
num_layers,
bidirectional,
promote_rnn_math_type(flat_buf_datatype),
flat_buf_datatype);
auto handle = getCudnnHandle();
RNNDescriptor rnn_desc = rnn.descriptor(handle);
auto handle = getCudnnHandle();
RNNDescriptor rnn_desc = rnn.descriptor(handle);
TensorGeometry x_geom({1, input_size});
TensorDescriptor x_desc;
// Why do we pad to 5 dims here (and elsewhere)?
// https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNForwardTraining
// expects descriptors padded to 3 dimensions.
x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5);
TensorGeometry x_geom({1, input_size});
TensorDescriptor x_desc;
// Why do we pad to 5 dims here (and elsewhere)?
// https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNForwardTraining
// expects descriptors padded to 3 dimensions.
x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5);
auto num_weights = get_num_weights(handle, rnn_desc, x_desc, flat_buf_datatype);
auto weight_buf = at::zeros(num_weights, flat_buf_options);
auto num_weights =
get_num_weights(handle, rnn_desc, x_desc, flat_buf_datatype);
auto weight_buf = at::zeros(num_weights, flat_buf_options);
FilterDescriptor w_desc;
w_desc.set(weight_buf, 3);
FilterDescriptor w_desc;
w_desc.set(weight_buf, 3);
// Slice off views into weight_buf
std::vector<Tensor> params_arr;
size_t params_stride0;
std::tie(params_arr, params_stride0) = get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias);
MatrixRef<Tensor> weight{weight_arr, static_cast<size_t>(weight_stride0)},
params{params_arr, params_stride0};
// Slice off views into weight_buf
std::vector<Tensor> params_arr;
size_t params_stride0;
std::tie(params_arr, params_stride0) = get_parameters(
handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias);
MatrixRef<Tensor> weight{weight_arr, static_cast<size_t>(weight_stride0)},
params{params_arr, params_stride0};
// Copy weights
_viewOrCopyParams(weight, params, /*copy=*/true, allow_type_change);
if (set_orig_weights_to_flat_buf) {
// Update the storage
for (size_t i = 0; i < weight.size(0); i++) {
// There is a special case for LSTM with projections and no bias,
// where weight copy is done in 0->0, 1->1, 2->4 layout
if (weight[i].size() == 3 && params[i].size() == 5) {
weight[i][0].set_(params[i][0].view_as(weight[i][0]));
weight[i][1].set_(params[i][1].view_as(weight[i][1]));
weight[i][2].set_(params[i][4].view_as(weight[i][2]));
} else {
for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin();
orig_param_it != weight[i].end() && new_param_it != params[i].end();
orig_param_it++, new_param_it++) {
auto orig_param = *orig_param_it, new_param = *new_param_it;
orig_param.set_(new_param.view_as(orig_param));
}
// Copy weights
_viewOrCopyParams(weight, params, /*copy=*/true, allow_type_change);
if (set_orig_weights_to_flat_buf) {
// Update the storage
for (size_t i = 0; i < weight.size(0); i++) {
// There is a special case for LSTM with projections and no bias,
// where weight copy is done in 0->0, 1->1, 2->4 layout
if (weight[i].size() == 3 && params[i].size() == 5) {
weight[i][0].set_(params[i][0].view_as(weight[i][0]));
weight[i][1].set_(params[i][1].view_as(weight[i][1]));
weight[i][2].set_(params[i][4].view_as(weight[i][2]));
} else {
for (auto orig_param_it = weight[i].begin(),
new_param_it = params[i].begin();
orig_param_it != weight[i].end() &&
new_param_it != params[i].end();
orig_param_it++, new_param_it++) {
auto orig_param = *orig_param_it, new_param = *new_param_it;
orig_param.set_(new_param.view_as(orig_param));
}
}
}
return std::make_tuple(weight_buf, params_arr);
}
return std::make_tuple(weight_buf, params_arr);
}
} // namespace cudnn_rnn
using namespace cudnn_rnn;


@ -8,7 +8,8 @@ namespace at {
namespace native {
namespace cudnn_rnn {
TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_views(
TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
copy_weights_to_flat_buf_views(
TensorList weight_arr,
int64_t weight_stride0,
int64_t input_size,
@ -21,8 +22,8 @@ TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_
const cudnnDataType_t flat_buf_datatype,
const TensorOptions& flat_buf_options,
bool set_orig_weights_to_flat_buf,
bool allow_type_change=false,
bool include_bias=true);
bool allow_type_change = false,
bool include_bias = true);
} // namespace cudnn_rnn
} // namespace native


@ -4,17 +4,61 @@
namespace at { namespace native { namespace sparse { namespace cuda {
TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
TORCH_CUDA_CU_API void Xcoo2csr(
const int* coorowind,
int64_t nnz,
int64_t m,
int* csrrowptr);
/* Level 3 */
template<typename T>
TORCH_CUDA_API void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, T alpha, T *csrvala, int *csrrowptra, int *csrcolinda, T *b, int64_t ldb, T beta, T *c, int64_t ldc);
template <typename T>
TORCH_CUDA_CU_API void csrmm2(
char transa,
char transb,
int64_t m,
int64_t n,
int64_t k,
int64_t nnz,
T alpha,
T* csrvala,
int* csrrowptra,
int* csrcolinda,
T* b,
int64_t ldb,
T beta,
T* c,
int64_t ldc);
/* format conversion */
TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
TORCH_CUDA_CU_API void CreateIdentityPermutation(int64_t nnz, int* P);
TORCH_CUDA_CU_API void Xcsrsort_bufferSizeExt(
int64_t m,
int64_t n,
int64_t nnz,
const int* csrRowPtr,
const int* csrColInd,
size_t* pBufferSizeInBytes);
TORCH_CUDA_CU_API void Xcsrsort(
int64_t m,
int64_t n,
int64_t nnz,
const int* csrRowPtr,
int* csrColInd,
int* P,
void* pBuffer);
TORCH_CUDA_CU_API void Xcoosort_bufferSizeExt(
int64_t m,
int64_t n,
int64_t nnz,
const int* cooRows,
const int* cooCols,
size_t* pBufferSizeInBytes);
TORCH_CUDA_CU_API void XcoosortByRow(
int64_t m,
int64_t n,
int64_t nnz,
int* cooRows,
int* cooCols,
int* P,
void* pBuffer);
}}}} // namespace at::native::sparse::cuda


@ -5,7 +5,7 @@
// IPC doesn't support (re)allocation
class TORCH_CUDA_API THCIpcDeleter {
class TORCH_CUDA_CU_API THCIpcDeleter {
public:
THCIpcDeleter(std::shared_ptr<void> basePtr);
~THCIpcDeleter();


@ -21,13 +21,14 @@
// Note that this allocator does not split larger allocations into smaller
// blocks, unlike the caching device allocator.
//
TORCH_CUDA_API c10::Allocator* getTHCCachingHostAllocator(void);
TORCH_CUDA_CU_API c10::Allocator* getTHCCachingHostAllocator(void);
// Records an event in the specified stream. The allocation 'ptr' will not be
// re-used until the event has occurred.
TORCH_CUDA_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, at::cuda::CUDAStream stream);
TORCH_CUDA_CU_API cudaError_t
THCCachingHostAllocator_recordEvent(void* ptr, at::cuda::CUDAStream stream);
// Releases cached pinned memory allocations via cudaHostFree
TORCH_CUDA_API void THCCachingHostAllocator_emptyCache(void);
TORCH_CUDA_CU_API void THCCachingHostAllocator_emptyCache(void);
#endif


@ -31,22 +31,22 @@ typedef struct _THCCudaResourcesPerDevice {
size_t scratchSpacePerStream;
} THCCudaResourcesPerDevice;
TORCH_CUDA_API THCState* THCState_alloc(void);
TORCH_CUDA_API void THCState_free(THCState* state);
TORCH_CUDA_CU_API THCState* THCState_alloc(void);
TORCH_CUDA_CU_API void THCState_free(THCState* state);
TORCH_CUDA_API void THCudaInit(THCState* state);
TORCH_CUDA_API void THCudaShutdown(THCState* state);
TORCH_CUDA_CU_API void THCudaInit(THCState* state);
TORCH_CUDA_CU_API void THCudaShutdown(THCState* state);
/* If device `dev` can access allocations on device `devToAccess`, this will return */
/* 1; otherwise, 0. */
TORCH_CUDA_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
TORCH_CUDA_CU_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
TORCH_CUDA_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);
TORCH_CUDA_CU_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);
TORCH_CUDA_API void THCMagma_init(THCState *state);
TORCH_CUDA_CU_API void THCMagma_init(THCState *state);
/* For the current device and stream, returns the allocated scratch space */
TORCH_CUDA_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
TORCH_CUDA_CU_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
#define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs")
#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
@ -54,16 +54,16 @@ TORCH_CUDA_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__)
TORCH_CUDA_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
TORCH_CUDA_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
TORCH_CUDA_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
TORCH_CUDA_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
TORCH_CUDA_CU_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
TORCH_CUDA_CU_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
TORCH_CUDA_CU_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
TORCH_CUDA_CU_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
TORCH_CUDA_API void* THCudaMalloc(THCState *state, size_t size);
TORCH_CUDA_API void THCudaFree(THCState *state, void* ptr);
TORCH_CUDA_CU_API void* THCudaMalloc(THCState *state, size_t size);
TORCH_CUDA_CU_API void THCudaFree(THCState *state, void* ptr);
at::DataPtr THCudaHostAlloc(THCState *state, size_t size);
TORCH_CUDA_API void THCudaHostRecord(THCState *state, void *ptr);
TORCH_CUDA_CU_API void THCudaHostRecord(THCState *state, void *ptr);
#endif


@ -147,6 +147,6 @@ __device__ T reduceBlockWithNThreadLocalReductions(T *smem,
void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);
// Produces a grid with at least one point per tile
TORCH_CUDA_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
TORCH_CUDA_CU_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
#endif // THC_REDUCE_APPLY_UTILS_INC


@ -5,6 +5,6 @@
#include <time.h>
// enqueues a kernel that spins for the specified number of cycles
TORCH_CUDA_API void THC_sleep(THCState* state, int64_t cycles);
TORCH_CUDA_CU_API void THC_sleep(THCState* state, int64_t cycles);
#endif


@ -13,17 +13,19 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>
TORCH_CUDA_API THCStorage* THCStorage_new(THCState* state);
TORCH_CUDA_CU_API THCStorage* THCStorage_new(THCState* state);
TORCH_CUDA_API void THCStorage_retain(THCState *state, THCStorage *storage);
TORCH_CUDA_CU_API void THCStorage_retain(THCState* state, THCStorage* storage);
TORCH_CUDA_API void THCStorage_resizeBytes(
TORCH_CUDA_CU_API void THCStorage_resizeBytes(
THCState* state,
THCStorage* storage,
ptrdiff_t size_bytes);
TORCH_CUDA_API int THCStorage_getDevice(THCState* state, const THCStorage* storage);
TORCH_CUDA_CU_API int THCStorage_getDevice(
THCState* state,
const THCStorage* storage);
TORCH_CUDA_API THCStorage* THCStorage_newWithDataAndAllocator(
TORCH_CUDA_CU_API THCStorage* THCStorage_newWithDataAndAllocator(
THCState* state,
at::DataPtr&& data,
ptrdiff_t size,


@ -9,9 +9,8 @@
#define THC_DESC_BUFF_LEN 64
typedef struct TORCH_CUDA_API THCDescBuff
{
char str[THC_DESC_BUFF_LEN];
typedef struct TORCH_CUDA_CU_API THCDescBuff {
char str[THC_DESC_BUFF_LEN];
} THCDescBuff;
#include <THC/generic/THCTensor.h>


@ -12,46 +12,110 @@
#include <ATen/ATen.h>
// See [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
TORCH_CUDA_API int THCTensor_nDimension(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API int THCTensor_nDimension(
THCState* state,
const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_nDimensionLegacyNoScalars(
THCState* state,
const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_nDimensionLegacyAll(
THCState* state,
const THCTensor* self);
TORCH_CUDA_API int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_CU_API int64_t
THCTensor_size(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t
THCTensor_sizeLegacyNoScalars(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t
THCTensor_stride(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t THCTensor_strideLegacyNoScalars(
THCState* state,
const THCTensor* self,
int dim);
TORCH_CUDA_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta);
TORCH_CUDA_CU_API THCTensor* THCTensor_new(
THCState* state,
caffe2::TypeMeta type_meta);
TORCH_CUDA_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntArrayRef size, at::IntArrayRef stride);
TORCH_CUDA_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
TORCH_CUDA_API void THCTensor_resizeAs(THCState *state, THCTensor *tensor, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_resize(
THCState* state,
THCTensor* tensor,
at::IntArrayRef size,
at::IntArrayRef stride);
TORCH_CUDA_CU_API void THCTensor_resizeNd(
THCState* state,
THCTensor* tensor,
int nDimension,
const int64_t* size,
const int64_t* stride);
TORCH_CUDA_CU_API void THCTensor_resizeAs(
THCState* state,
THCTensor* tensor,
THCTensor* src);
TORCH_CUDA_API void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_API void THCTensor_setStorage(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_);
TORCH_CUDA_CU_API void THCTensor_set(
THCState* state,
THCTensor* self,
THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_setStorage(
THCState* state,
THCTensor* self,
THCStorage* storage_,
ptrdiff_t storageOffset_,
at::IntArrayRef size_,
at::IntArrayRef stride_);
TORCH_CUDA_API void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_API void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_CU_API void THCTensor_squeeze1d(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);
TORCH_CUDA_CU_API void THCTensor_unsqueeze1d(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);
TORCH_CUDA_API bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs);
TORCH_CUDA_API ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API bool THCTensor_allContiguous(
THCState* state,
THCTensor** inputs,
int numInputs);
TORCH_CUDA_CU_API ptrdiff_t
THCTensor_nElement(THCState* state, const THCTensor* self);
TORCH_CUDA_API void THCTensor_retain(THCState *state, THCTensor *self);
TORCH_CUDA_API void THCTensor_free(THCState *state, THCTensor *self);
TORCH_CUDA_CU_API void THCTensor_retain(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_free(THCState* state, THCTensor* self);
TORCH_CUDA_API int THCTensor_getDevice(THCState* state, const THCTensor* tensor);
TORCH_CUDA_API bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs);
TORCH_CUDA_CU_API int THCTensor_getDevice(
THCState* state,
const THCTensor* tensor);
TORCH_CUDA_CU_API bool THCTensor_allSameDevice(
THCState* state,
THCTensor** inputs,
int numInputs);
/* Can we use 32 bit math for indexing? */
TORCH_CUDA_API bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem=INT32_MAX);
TORCH_CUDA_CU_API bool THCTensor_canUse32BitIndexMath(
THCState* state,
const THCTensor* t,
ptrdiff_t max_elem = INT32_MAX);
/* Are all tensors 32-bit indexable? */
TORCH_CUDA_API bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs);
TORCH_CUDA_API void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, int in_dims,
int64_t dimension, int keepdim);
TORCH_CUDA_CU_API bool THCTensor_all32BitIndexable(
THCState* state,
THCTensor** inputs,
int numInputs);
TORCH_CUDA_CU_API void THCTensor_preserveReduceDimSemantics(
THCState* state,
THCTensor* tensor,
int in_dims,
int64_t dimension,
int keepdim);
/* Returns false if there is no possibility that the tensor */
/* has more than one index that references the same datapoint, */
/* true otherwise. */
TORCH_CUDA_API bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t);
TORCH_CUDA_CU_API bool THCTensor_maybeOverlappingIndices(
THCState* state,
const THCTensor* t);
#include <THC/generic/THCTensor.hpp>
#include <THC/THCGenerateAllTypes.h>


@ -19,34 +19,51 @@
#define THCudaComplexFloatStorage THCStorage
#define THCudaComplexDoubleStorage THCStorage
TORCH_CUDA_API scalar_t* THCStorage_(data)(THCState *state, const THCStorage*);
TORCH_CUDA_API int THCStorage_(elementSize)(THCState *state);
TORCH_CUDA_CU_API scalar_t* THCStorage_(
data)(THCState* state, const THCStorage*);
TORCH_CUDA_CU_API int THCStorage_(elementSize)(THCState* state);
/* slow access -- checks everything */
TORCH_CUDA_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, scalar_t);
TORCH_CUDA_API scalar_t THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t);
TORCH_CUDA_CU_API void THCStorage_(
set)(THCState* state, THCStorage*, ptrdiff_t, scalar_t);
TORCH_CUDA_CU_API scalar_t
THCStorage_(get)(THCState* state, const THCStorage*, ptrdiff_t);
TORCH_CUDA_API THCStorage* THCStorage_(new)(THCState *state);
TORCH_CUDA_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size);
TORCH_CUDA_API THCStorage* THCStorage_(newWithSize1)(THCState *state, scalar_t);
TORCH_CUDA_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared);
TORCH_CUDA_CU_API THCStorage* THCStorage_(new)(THCState* state);
TORCH_CUDA_CU_API THCStorage* THCStorage_(
newWithSize)(THCState* state, ptrdiff_t size);
TORCH_CUDA_CU_API THCStorage* THCStorage_(
newWithSize1)(THCState* state, scalar_t);
TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithMapping)(
THCState* state,
const char* filename,
ptrdiff_t size,
int shared);
TORCH_CUDA_API THCStorage* THCStorage_(newWithAllocator)(
THCState *state, ptrdiff_t size,
at::Allocator* allocator);
TORCH_CUDA_API THCStorage* THCStorage_(newWithDataAndAllocator)(
THCState *state, at::DataPtr&& data, ptrdiff_t size,
at::Allocator* allocator);
TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithAllocator)(
THCState* state,
ptrdiff_t size,
at::Allocator* allocator);
TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithDataAndAllocator)(
THCState* state,
at::DataPtr&& data,
ptrdiff_t size,
at::Allocator* allocator);
TORCH_CUDA_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag);
TORCH_CUDA_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag);
TORCH_CUDA_API void THCStorage_(retain)(THCState *state, THCStorage *storage);
TORCH_CUDA_CU_API void THCStorage_(
setFlag)(THCState* state, THCStorage* storage, const char flag);
TORCH_CUDA_CU_API void THCStorage_(
clearFlag)(THCState* state, THCStorage* storage, const char flag);
TORCH_CUDA_CU_API void THCStorage_(
retain)(THCState* state, THCStorage* storage);
TORCH_CUDA_API void THCStorage_(free)(THCState *state, THCStorage *storage);
TORCH_CUDA_API void THCStorage_(
TORCH_CUDA_CU_API void THCStorage_(free)(THCState* state, THCStorage* storage);
TORCH_CUDA_CU_API void THCStorage_(
resizeBytes)(THCState* state, THCStorage* storage, ptrdiff_t size_bytes);
TORCH_CUDA_API void THCStorage_(fill)(THCState *state, THCStorage *storage, scalar_t value);
TORCH_CUDA_CU_API void THCStorage_(
fill)(THCState* state, THCStorage* storage, scalar_t value);
TORCH_CUDA_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage);
TORCH_CUDA_CU_API int THCStorage_(
getDevice)(THCState* state, const THCStorage* storage);
#endif


@ -4,57 +4,146 @@
/* Support for copy between different Storage types */
TORCH_CUDA_API void THCStorage_(copy)(THCState *state, THCStorage *storage, THCStorage *src);
TORCH_CUDA_CU_API void THCStorage_(
copy)(THCState* state, THCStorage* storage, THCStorage* src);
#if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
TORCH_CUDA_API void THCStorage_(copyByte)(THCState *state, THCStorage *storage, struct THByteStorage *src);
TORCH_CUDA_API void THCStorage_(copyChar)(THCState *state, THCStorage *storage, struct THCharStorage *src);
TORCH_CUDA_API void THCStorage_(copyShort)(THCState *state, THCStorage *storage, struct THShortStorage *src);
TORCH_CUDA_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct THIntStorage *src);
TORCH_CUDA_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src);
TORCH_CUDA_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src);
TORCH_CUDA_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src);
TORCH_CUDA_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src);
TORCH_CUDA_API void THCStorage_(copyBool)(THCState *state, THCStorage *storage, struct THBoolStorage *src);
TORCH_CUDA_API void THCStorage_(copyBFloat16)(THCState *state, THCStorage *storage, struct THBFloat16Storage *src);
TORCH_CUDA_CU_API void THCStorage_(
copyByte)(THCState* state, THCStorage* storage, struct THByteStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyChar)(THCState* state, THCStorage* storage, struct THCharStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyShort)(
THCState* state,
THCStorage* storage,
struct THShortStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyInt)(THCState* state, THCStorage* storage, struct THIntStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyLong)(THCState* state, THCStorage* storage, struct THLongStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyFloat)(
THCState* state,
THCStorage* storage,
struct THFloatStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyDouble)(
THCState* state,
THCStorage* storage,
struct THDoubleStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyHalf)(THCState* state, THCStorage* storage, struct THHalfStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyBool)(THCState* state, THCStorage* storage, struct THBoolStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyBFloat16)(
THCState* state,
THCStorage* storage,
struct THBFloat16Storage* src);
#else
TORCH_CUDA_API void THCStorage_(copyComplexFloat)(THCState *state, THCStorage *storage, struct THComplexFloatStorage *src);
TORCH_CUDA_API void THCStorage_(copyComplexDouble)(THCState *state, THCStorage *storage, struct THComplexDoubleStorage *src);
TORCH_CUDA_CU_API void THCStorage_(copyComplexFloat)(
THCState* state,
THCStorage* storage,
struct THComplexFloatStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyComplexDouble)(
THCState* state,
THCStorage* storage,
struct THComplexDoubleStorage* src);
#endif
#if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
TORCH_CUDA_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaShort)(THCState *state, THCStorage *storage, struct THCudaShortStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, struct THCudaIntStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaBool)(THCState *state, THCStorage *storage, struct THCudaBoolStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaBFloat16)(THCState *state, THCStorage *storage, struct THCudaBFloat16Storage *src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaByte)(
THCState* state,
THCStorage* storage,
struct THCudaByteStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaChar)(
THCState* state,
THCStorage* storage,
struct THCudaCharStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaShort)(
THCState* state,
THCStorage* storage,
struct THCudaShortStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaInt)(
THCState* state,
THCStorage* storage,
struct THCudaIntStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaLong)(
THCState* state,
THCStorage* storage,
struct THCudaLongStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaFloat)(
THCState* state,
THCStorage* storage,
struct THCudaStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaDouble)(
THCState* state,
THCStorage* storage,
struct THCudaDoubleStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaHalf)(
THCState* state,
THCStorage* storage,
struct THCudaHalfStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaBool)(
THCState* state,
THCStorage* storage,
struct THCudaBoolStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaBFloat16)(
THCState* state,
THCStorage* storage,
struct THCudaBFloat16Storage* src);
#else
TORCH_CUDA_API void THCStorage_(copyCudaComplexFloat)(THCState *state, THCStorage *storage, struct THCudaComplexFloatStorage *src);
TORCH_CUDA_API void THCStorage_(copyCudaComplexDouble)(THCState *state, THCStorage *storage, struct THCudaComplexDoubleStorage *src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaComplexFloat)(
THCState* state,
THCStorage* storage,
struct THCudaComplexFloatStorage* src);
TORCH_CUDA_CU_API void THCStorage_(copyCudaComplexDouble)(
THCState* state,
THCStorage* storage,
struct THCudaComplexDoubleStorage* src);
#endif
#if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
TORCH_CUDA_API void TH_CONCAT_2(THByteStorage_copyCuda , Real)(THCState *state, THByteStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THCharStorage_copyCuda , Real)(THCState *state, THCharStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THShortStorage_copyCuda , Real)(THCState *state, THShortStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THBoolStorage_copyCuda, Real)(THCState *state, THBoolStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THBFloat16Storage_copyCuda, Real)(THCState *state, THBFloat16Storage *self, struct THCStorage *src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THByteStorage_copyCuda,
Real)(THCState* state, THByteStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THCharStorage_copyCuda,
Real)(THCState* state, THCharStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THShortStorage_copyCuda,
Real)(THCState* state, THShortStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THIntStorage_copyCuda,
Real)(THCState* state, THIntStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THLongStorage_copyCuda,
Real)(THCState* state, THLongStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THFloatStorage_copyCuda,
Real)(THCState* state, THFloatStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THDoubleStorage_copyCuda,
Real)(THCState* state, THDoubleStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THHalfStorage_copyCuda,
Real)(THCState* state, THHalfStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THBoolStorage_copyCuda,
Real)(THCState* state, THBoolStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THBFloat16Storage_copyCuda,
Real)(THCState* state, THBFloat16Storage* self, struct THCStorage* src);
#else
TORCH_CUDA_API void TH_CONCAT_2(THComplexFloatStorage_copyCuda , Real)(THCState *state, THComplexFloatStorage *self, struct THCStorage *src);
TORCH_CUDA_API void TH_CONCAT_2(THComplexDoubleStorage_copyCuda, Real)(THCState *state, THComplexDoubleStorage *self, struct THCStorage *src);
TORCH_CUDA_CU_API void TH_CONCAT_2(
THComplexFloatStorage_copyCuda,
Real)(THCState* state, THComplexFloatStorage* self, struct THCStorage* src);
TORCH_CUDA_CU_API void TH_CONCAT_2(THComplexDoubleStorage_copyCuda, Real)(
THCState* state,
THComplexDoubleStorage* self,
struct THCStorage* src);
#endif
TORCH_CUDA_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src);
TORCH_CUDA_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src);
TORCH_CUDA_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src);
TORCH_CUDA_CU_API void THStorage_(
copyCuda)(THCState* state, THStorage* self, THCStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyCuda)(THCState* state, THCStorage* self, THCStorage* src);
TORCH_CUDA_CU_API void THCStorage_(
copyCPU)(THCState* state, THCStorage* self, THStorage* src);
#endif


@ -20,87 +20,215 @@
#define THCudaComplexDoubleTensor THCTensor
/**** access methods ****/
TORCH_CUDA_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self);
TORCH_CUDA_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API THCStorage* THCTensor_(
storage)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API ptrdiff_t
THCTensor_(storageOffset)(THCState* state, const THCTensor* self);
// See [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
TORCH_CUDA_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API int THCTensor_(
nDimension)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
nDimensionLegacyNoScalars)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
nDimensionLegacyAll)(THCState* state, const THCTensor* self);
TORCH_CUDA_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim);
TORCH_CUDA_API scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self);
TORCH_CUDA_API void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag);
TORCH_CUDA_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag);
TORCH_CUDA_CU_API int64_t
THCTensor_(size)(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t THCTensor_(
sizeLegacyNoScalars)(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t
THCTensor_(stride)(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API int64_t THCTensor_(
strideLegacyNoScalars)(THCState* state, const THCTensor* self, int dim);
TORCH_CUDA_CU_API scalar_t* THCTensor_(
data)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_(
setFlag)(THCState* state, THCTensor* self, const char flag);
TORCH_CUDA_CU_API void THCTensor_(
clearFlag)(THCState* state, THCTensor* self, const char flag);
/**** creation methods ****/
TORCH_CUDA_API THCTensor *THCTensor_(new)(THCState *state);
TORCH_CUDA_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor);
TORCH_CUDA_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
int64_t size0_, int64_t stride0_);
TORCH_CUDA_CU_API THCTensor* THCTensor_(new)(THCState* state);
TORCH_CUDA_CU_API THCTensor* THCTensor_(
newWithTensor)(THCState* state, THCTensor* tensor);
TORCH_CUDA_CU_API THCTensor* THCTensor_(newWithStorage1d)(
THCState* state,
THCStorage* storage_,
ptrdiff_t storageOffset_,
int64_t size0_,
int64_t stride0_);
/* stride might be NULL */
TORCH_CUDA_API THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0_);
TORCH_CUDA_CU_API THCTensor* THCTensor_(
newWithSize1d)(THCState* state, int64_t size0_);
TORCH_CUDA_API THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self);
TORCH_CUDA_API THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *tensor);
TORCH_CUDA_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_);
TORCH_CUDA_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_);
TORCH_CUDA_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
TORCH_CUDA_API THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input);
TORCH_CUDA_CU_API THCTensor* THCTensor_(
newClone)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API THCTensor* THCTensor_(
newContiguous)(THCState* state, THCTensor* tensor);
TORCH_CUDA_CU_API THCTensor* THCTensor_(newSelect)(
THCState* state,
THCTensor* tensor,
int dimension_,
int64_t sliceIndex_);
TORCH_CUDA_CU_API THCTensor* THCTensor_(newNarrow)(
THCState* state,
THCTensor* tensor,
int dimension_,
int64_t firstIndex_,
int64_t size_);
TORCH_CUDA_CU_API THCTensor* THCTensor_(newTranspose)(
THCState* state,
THCTensor* tensor,
int dimension1_,
int dimension2_);
TORCH_CUDA_CU_API THCTensor* THCTensor_(
newFoldBatchDim)(THCState* state, THCTensor* input);
// resize* methods simply resize the storage. So they may not retain the current data at current indices.
// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the
// values, unless you are doing some size and stride tricks, do not use resize*.
TORCH_CUDA_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
TORCH_CUDA_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
TORCH_CUDA_API void THCTensor_(resize0d)(THCState *state, THCTensor *tensor);
TORCH_CUDA_API void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0_);
TORCH_CUDA_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_);
TORCH_CUDA_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_);
TORCH_CUDA_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_);
TORCH_CUDA_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_);
TORCH_CUDA_CU_API void THCTensor_(resizeNd)(
THCState* state,
THCTensor* tensor,
int nDimension,
const int64_t* size,
const int64_t* stride);
TORCH_CUDA_CU_API void THCTensor_(
resizeAs)(THCState* state, THCTensor* tensor, THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_(resize0d)(THCState* state, THCTensor* tensor);
TORCH_CUDA_CU_API void THCTensor_(
resize1d)(THCState* state, THCTensor* tensor, int64_t size0_);
TORCH_CUDA_CU_API void THCTensor_(resize2d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_);
TORCH_CUDA_CU_API void THCTensor_(resize3d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_);
TORCH_CUDA_CU_API void THCTensor_(resize4d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_,
int64_t size3_);
TORCH_CUDA_CU_API void THCTensor_(resize5d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_,
int64_t size3_,
int64_t size4_);
TORCH_CUDA_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
set)(THCState* state, THCTensor* self, THCTensor* src);
TORCH_CUDA_API void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t firstIndex_, int64_t size_);
TORCH_CUDA_API void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t sliceIndex_);
TORCH_CUDA_API void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1_, int dimension2_);
TORCH_CUDA_CU_API void THCTensor_(narrow)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_,
int64_t firstIndex_,
int64_t size_);
TORCH_CUDA_CU_API void THCTensor_(select)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_,
int64_t sliceIndex_);
TORCH_CUDA_CU_API void THCTensor_(transpose)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension1_,
int dimension2_);
TORCH_CUDA_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_CU_API void THCTensor_(squeeze1d)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);
TORCH_CUDA_CU_API void THCTensor_(unsqueeze1d)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);
TORCH_CUDA_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
TORCH_CUDA_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API int THCTensor_(
isContiguous)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
isSameSizeAs)(THCState* state, const THCTensor* self, const THCTensor* src);
TORCH_CUDA_CU_API ptrdiff_t
THCTensor_(nElement)(THCState* state, const THCTensor* self);
TORCH_CUDA_API void THCTensor_(retain)(THCState *state, THCTensor *self);
TORCH_CUDA_API void THCTensor_(free)(THCState *state, THCTensor *self);
TORCH_CUDA_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst);
TORCH_CUDA_CU_API void THCTensor_(retain)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_(free)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_(
freeCopyTo)(THCState* state, THCTensor* self, THCTensor* dst);
/* Slow access methods [check everything] */
TORCH_CUDA_API void THCTensor_(set0d)(THCState *state, THCTensor *tensor, scalar_t value);
TORCH_CUDA_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, scalar_t value);
TORCH_CUDA_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, scalar_t value);
TORCH_CUDA_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value);
TORCH_CUDA_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
set0d)(THCState* state, THCTensor* tensor, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
set1d)(THCState* state, THCTensor* tensor, int64_t x0, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set2d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set3d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set4d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
int64_t x3,
scalar_t value);
TORCH_CUDA_API scalar_t THCTensor_(get0d)(THCState *state, const THCTensor *tensor);
TORCH_CUDA_API scalar_t THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0);
TORCH_CUDA_API scalar_t THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1);
TORCH_CUDA_API scalar_t THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2);
TORCH_CUDA_API scalar_t THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3);
TORCH_CUDA_CU_API scalar_t
THCTensor_(get0d)(THCState* state, const THCTensor* tensor);
TORCH_CUDA_CU_API scalar_t
THCTensor_(get1d)(THCState* state, const THCTensor* tensor, int64_t x0);
TORCH_CUDA_CU_API scalar_t THCTensor_(
get2d)(THCState* state, const THCTensor* tensor, int64_t x0, int64_t x1);
TORCH_CUDA_CU_API scalar_t THCTensor_(get3d)(
THCState* state,
const THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2);
TORCH_CUDA_CU_API scalar_t THCTensor_(get4d)(
THCState* state,
const THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
int64_t x3);
/* CUDA-specific functions */
TORCH_CUDA_API int THCTensor_(getDevice)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...);
TORCH_CUDA_CU_API int THCTensor_(
getDevice)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
checkGPU)(THCState* state, unsigned int nTensors, ...);
/* debug methods */
TORCH_CUDA_API THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor);
TORCH_CUDA_CU_API THCDescBuff
THCTensor_(sizeDesc)(THCState* state, const THCTensor* tensor);
#endif


@ -8,9 +8,18 @@
// NOTE: functions exist here only to support dispatch via Declarations.cwrap. You probably don't want to put
// new functions in here, they should probably be un-genericized.
TORCH_CUDA_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
at::IntArrayRef size_, at::IntArrayRef stride_);
TORCH_CUDA_CU_API void THCTensor_(setStorage)(
THCState* state,
THCTensor* self,
THCStorage* storage_,
ptrdiff_t storageOffset_,
at::IntArrayRef size_,
at::IntArrayRef stride_);
TORCH_CUDA_API void THCTensor_(resize)(THCState *state, THCTensor *self, at::IntArrayRef size, at::IntArrayRef stride);
TORCH_CUDA_CU_API void THCTensor_(resize)(
THCState* state,
THCTensor* self,
at::IntArrayRef size,
at::IntArrayRef stride);
#endif


@ -2,10 +2,14 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorCopy.h"
#else
TORCH_CUDA_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
copy)(THCState* state, THCTensor* self, THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_(
copyIgnoringOverlaps)(THCState* state, THCTensor* self, THCTensor* src);
TORCH_CUDA_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src);
TORCH_CUDA_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
copyAsyncCPU)(THCState* state, THCTensor* self, THTensor* src);
TORCH_CUDA_CU_API void THTensor_(
copyAsyncCuda)(THCState* state, THTensor* self, THCTensor* src);
#endif


@ -2,10 +2,34 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorIndex.h"
#else
TORCH_CUDA_API void THCTensor_(indexCopy)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src);
TORCH_CUDA_API void THCTensor_(indexFill)(THCState *state, THCTensor *tensor, int dim, THCudaLongTensor *index, scalar_t val);
TORCH_CUDA_API void THCTensor_(indexSelect)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
TORCH_CUDA_API void THCTensor_(take)(THCState *state, THCTensor *res_, THCTensor *src, THCudaLongTensor *index);
TORCH_CUDA_API void THCTensor_(put)(THCState *state, THCTensor *res_, THCudaLongTensor *indices, THCTensor *src, int accumulate);
TORCH_CUDA_CU_API void THCTensor_(indexCopy)(
THCState* state,
THCTensor* res_,
int dim,
THCudaLongTensor* indices,
THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_(indexFill)(
THCState* state,
THCTensor* tensor,
int dim,
THCudaLongTensor* index,
scalar_t val);
TORCH_CUDA_CU_API void THCTensor_(indexSelect)(
THCState* state,
THCTensor* tensor,
THCTensor* src,
int dim,
THCudaLongTensor* index);
TORCH_CUDA_CU_API void THCTensor_(take)(
THCState* state,
THCTensor* res_,
THCTensor* src,
THCudaLongTensor* index);
TORCH_CUDA_CU_API void THCTensor_(put)(
THCState* state,
THCTensor* res_,
THCudaLongTensor* indices,
THCTensor* src,
int accumulate);
#endif


@ -2,37 +2,42 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMasked.h"
#else
TORCH_CUDA_API void THCTensor_(maskedFill)(THCState *state,
THCTensor *tensor,
THCudaByteTensor *mask,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(maskedFill)(
THCState* state,
THCTensor* tensor,
THCudaByteTensor* mask,
scalar_t value);
TORCH_CUDA_API void THCTensor_(maskedFillBool)(THCState *state,
THCTensor *tensor,
THCudaBoolTensor *mask,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(maskedFillBool)(
THCState* state,
THCTensor* tensor,
THCudaBoolTensor* mask,
scalar_t value);
// FIXME: remove now that we have THCudaByteTensor?
TORCH_CUDA_API void THCTensor_(maskedFillByte)(THCState *state,
THCTensor *tensor,
THByteTensor *mask,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(maskedFillByte)(
THCState* state,
THCTensor* tensor,
THByteTensor* mask,
scalar_t value);
TORCH_CUDA_API void THCTensor_(maskedCopy)(THCState *state,
THCTensor *tensor,
THCudaByteTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopy)(
THCState* state,
THCTensor* tensor,
THCudaByteTensor* mask,
THCTensor* src);
TORCH_CUDA_API void THCTensor_(maskedCopyBool)(THCState *state,
THCTensor *tensor,
THCudaBoolTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopyBool)(
THCState* state,
THCTensor* tensor,
THCudaBoolTensor* mask,
THCTensor* src);
// FIXME: remove now that we have THCudaByteTensor?
TORCH_CUDA_API void THCTensor_(maskedCopyByte)(THCState *state,
THCTensor *tensor,
THByteTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopyByte)(
THCState* state,
THCTensor* tensor,
THByteTensor* mask,
THCTensor* src);
#endif


@ -2,9 +2,9 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMath.h"
#else
TORCH_CUDA_API void THCTensor_(fill)(THCState *state, THCTensor *self, scalar_t value);
TORCH_CUDA_API void THCTensor_(zero)(THCState *state, THCTensor *self);
TORCH_CUDA_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t);
TORCH_CUDA_CU_API void THCTensor_(
fill)(THCState* state, THCTensor* self, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(zero)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API ptrdiff_t THCTensor_(numel)(THCState* state, THCTensor* t);
#endif


@ -5,9 +5,16 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
// MAGMA (i.e. CUDA implementation of LAPACK functions)
TORCH_CUDA_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
TORCH_CUDA_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, bool upper);
TORCH_CUDA_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_);
TORCH_CUDA_CU_API void THCTensor_(gels)(
THCState* state,
THCTensor* rb_,
THCTensor* ra_,
THCTensor* b_,
THCTensor* a_);
TORCH_CUDA_CU_API void THCTensor_(
potri)(THCState* state, THCTensor* ra_, THCTensor* a, bool upper);
TORCH_CUDA_CU_API void THCTensor_(
geqrf)(THCState* state, THCTensor* ra_, THCTensor* rtau_, THCTensor* a_);
#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)


@ -2,11 +2,13 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h"
#else
TORCH_CUDA_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API int THCTensor_(
equal)(THCState* state, THCTensor* self, THCTensor* src);
#if !defined(THC_REAL_IS_BOOL)
TORCH_CUDA_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
mul)(THCState* state, THCTensor* self, THCTensor* src, scalar_t value);
#endif


@ -4,7 +4,12 @@
#if !defined(THC_REAL_IS_BOOL)
TORCH_CUDA_API void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension);
TORCH_CUDA_CU_API void THCTensor_(crossKernel)(
THCState* state,
THCTensor* self,
THCTensor* src1,
THCTensor* src2,
int dimension);
#endif
#endif


@ -6,16 +6,36 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
TORCH_CUDA_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t max_norm);
TORCH_CUDA_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, int keepdim);
TORCH_CUDA_CU_API void THCTensor_(renorm)(
THCState* state,
THCTensor* self,
THCTensor* src,
scalar_t value,
int dimension,
scalar_t max_norm);
TORCH_CUDA_CU_API void THCTensor_(norm)(
THCState* state,
THCTensor* self,
THCTensor* src,
scalar_t value,
int dimension,
int keepdim);
TORCH_CUDA_API accreal THCTensor_(std_all)(THCState *state, THCTensor *self, bool unbiased);
TORCH_CUDA_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t value);
TORCH_CUDA_API accreal THCTensor_(var_all)(THCState *state, THCTensor *self, bool unbiased);
TORCH_CUDA_CU_API accreal
THCTensor_(std_all)(THCState* state, THCTensor* self, bool unbiased);
TORCH_CUDA_CU_API accreal
THCTensor_(normall)(THCState* state, THCTensor* self, scalar_t value);
TORCH_CUDA_CU_API accreal
THCTensor_(var_all)(THCState* state, THCTensor* self, bool unbiased);
#endif
TORCH_CUDA_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim);
TORCH_CUDA_CU_API void THCTensor_(prod)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dim,
int keepdim);
#endif


@ -4,11 +4,12 @@
/* Returns the mode, and index of the mode, for the set of values
* along a given dimension in the input tensor. */
TORCH_CUDA_API void THCTensor_(mode)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *input,
int dimension,
int keepdim);
TORCH_CUDA_CU_API void THCTensor_(mode)(
THCState* state,
THCTensor* values,
THCudaLongTensor* indices,
THCTensor* input,
int dimension,
int keepdim);
#endif // THC_GENERIC_FILE


@ -6,8 +6,18 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
TORCH_CUDA_API void THCTensor_(multinomialAliasSetup)(struct THCState *state, THCTensor *probs, THCudaLongTensor *J, THCTensor *q);
TORCH_CUDA_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCTensor *_q, THCudaLongTensor *_J, int n_sample, c10::optional<at::Generator> gen_);
TORCH_CUDA_CU_API void THCTensor_(multinomialAliasSetup)(
struct THCState* state,
THCTensor* probs,
THCudaLongTensor* J,
THCTensor* q);
TORCH_CUDA_CU_API void THCTensor_(multinomialAliasDraw)(
THCState* state,
THCudaLongTensor* self,
THCTensor* _q,
THCudaLongTensor* _J,
int n_sample,
c10::optional<at::Generator> gen_);
#endif
#endif


@ -2,6 +2,11 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorScatterGather.h"
#else
TORCH_CUDA_API void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
TORCH_CUDA_CU_API void THCTensor_(gather)(
THCState* state,
THCTensor* tensor,
THCTensor* src,
int dim,
THCudaLongTensor* index);
#endif


@ -4,17 +4,21 @@
/* Performs an in-place sort of (keys, values). Only works for slice sizes
<= 2048 at the moment (slice size == size of keys/values dim `dim`) */
TORCH_CUDA_API void THCTensor_(sortKeyValueInplace)(THCState* state,
THCTensor* keys,
THCudaLongTensor* values,
int dim, bool dir);
TORCH_CUDA_CU_API void THCTensor_(sortKeyValueInplace)(
THCState* state,
THCTensor* keys,
THCudaLongTensor* values,
int dim,
bool dir);
/* Performs an out-of-place sort of `input`, returning the per-slice indices
in `indices` and the sorted values in `sorted` */
TORCH_CUDA_API void THCTensor_(sort)(THCState* state,
THCTensor* sorted,
THCudaLongTensor* indices,
THCTensor* input,
int dim, int order);
TORCH_CUDA_CU_API void THCTensor_(sort)(
THCState* state,
THCTensor* sorted,
THCudaLongTensor* indices,
THCTensor* input,
int dim,
int order);
#endif


@ -4,10 +4,14 @@
/* Returns the set of all kth smallest (or largest) elements, depending */
/* on `dir` */
TORCH_CUDA_API void THCTensor_(topk)(THCState* state,
THCTensor* topK,
THCudaLongTensor* indices,
THCTensor* input,
int64_t k, int dim, int dir, int sorted);
TORCH_CUDA_CU_API void THCTensor_(topk)(
THCState* state,
THCTensor* topK,
THCudaLongTensor* indices,
THCTensor* input,
int64_t k,
int dim,
int dir,
int sorted);
#endif // THC_GENERIC_FILE


@ -5,200 +5,221 @@
#include <ATen/core/Reduction.h>
#include <ATen/Generator.h>
TORCH_CUDA_API void THNN_(ClassNLLCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
int64_t ignore_index);
TORCH_CUDA_CU_API void THNN_(ClassNLLCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);
TORCH_CUDA_API void THNN_(ClassNLLCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
int64_t ignore_index);
TORCH_CUDA_CU_API void THNN_(ClassNLLCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);
TORCH_CUDA_API void THNN_(GatedLinear_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
int dim);
TORCH_CUDA_CU_API void THNN_(GatedLinear_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
int dim);
TORCH_CUDA_API void THNN_(GatedLinear_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
int dim);
TORCH_CUDA_CU_API void THNN_(GatedLinear_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
int dim);
TORCH_CUDA_API void THNN_(LogSigmoid_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *buffer);
TORCH_CUDA_CU_API void THNN_(LogSigmoid_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* buffer);
TORCH_CUDA_API void THNN_(LogSigmoid_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *buffer);
TORCH_CUDA_CU_API void THNN_(LogSigmoid_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* buffer);
TORCH_CUDA_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
THCTensor *is_target,
int64_t reduction);
TORCH_CUDA_CU_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
THCTensor* is_target,
int64_t reduction);
TORCH_CUDA_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *is_target,
int64_t reduction);
TORCH_CUDA_CU_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* is_target,
int64_t reduction);
TORCH_CUDA_API void THNN_(MultiMarginCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
int64_t reduction,
int p,
THCTensor *weights, // [OPTIONAL]
accreal margin);
TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
int p,
THCTensor* weights, // [OPTIONAL]
accreal margin);
TORCH_CUDA_API void THNN_(MultiMarginCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
int64_t reduction,
int p,
THCTensor *weights, // [OPTIONAL]
accreal margin);
TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
int p,
THCTensor* weights, // [OPTIONAL]
accreal margin);
TORCH_CUDA_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
int64_t ignore_index);
TORCH_CUDA_CU_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);
TORCH_CUDA_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
int64_t ignore_index);
TORCH_CUDA_CU_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);
TORCH_CUDA_API void THNN_(SpatialConvolutionMM_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* weight,
THCTensor* bias, // [OPTIONAL]
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH);
TORCH_CUDA_API void THNN_(SpatialConvolutionMM_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* weight,
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH);
TORCH_CUDA_API void THNN_(SpatialConvolutionMM_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
accreal scale);
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_accGradParameters)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradWeight,
THCTensor* gradBias, // [OPTIONAL]
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
accreal scale);
TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias, // [OPTIONAL]
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* weight,
THCTensor* bias, // [OPTIONAL]
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);
TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* weight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);
TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradWeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);
TORCH_CUDA_API void THNN_(RReLU_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *noise,
double lower,
double upper,
bool train,
bool inplace,
c10::optional<at::Generator> generator);
TORCH_CUDA_CU_API void THNN_(RReLU_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* noise,
double lower,
double upper,
bool train,
bool inplace,
c10::optional<at::Generator> generator);
TORCH_CUDA_API void THNN_(RReLU_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *noise,
double lower,
double upper,
bool train,
bool inplace);
TORCH_CUDA_CU_API void THNN_(RReLU_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* noise,
double lower,
double upper,
bool train,
bool inplace);
#endif


@ -106,6 +106,10 @@
#define TORCH_CUDA_API C10_IMPORT
#endif
// This is in preparation for the imminent torch_cuda split
#define TORCH_CUDA_CU_API TORCH_CUDA_API
#define TORCH_CUDA_CPP_API TORCH_CUDA_API
#if defined(TORCH_HIP_BUILD_MAIN_LIB)
#define TORCH_HIP_API C10_EXPORT
#else

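The hunk above is the crux of this change: both new macros currently expand to the existing TORCH_CUDA_API, so for now they only annotate which half of the planned torch_cuda split a symbol is expected to land in. A minimal sketch of the intended annotation style (hypothetical names; the cpp/cu assignment below is an assumption read off the macro names, not something this diff enforces):

// Hypothetical declarations, for illustration only.
TORCH_CUDA_CPP_API void example_cpp_side_helper();  // defined in a C++-compiled (.cpp) part of torch_cuda
TORCH_CUDA_CU_API void example_cu_side_launcher();  // defined in an nvcc-compiled (.cu) part of torch_cuda
// Until the split lands, both declarations behave exactly like TORCH_CUDA_API ones,
// since both macros are #defined to TORCH_CUDA_API above.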

@ -26,7 +26,13 @@
#include <c10/cuda/CUDAGuard.h>
#define CAFFE2_CUDA_EXPORT C10_EXPORT
#define CAFFE2_CUDA_API TORCH_CUDA_API
// CAFFE2_CUDA_API gets translated to CAFFE2_HIP_API in hipify script, which
// causes a marco redefinition issue with the later definition of
// CAFFE2_HIP_API, so we exclude this definition when HIP is specified
#ifndef __HIP_PLATFORM_HCC__
#define CAFFE2_CUDA_API TORCH_CUDA_CPP_API
#endif // __HIP_PLATFORM_HCC__
#define CAFFE2_HIP_EXPORT C10_EXPORT
#define CAFFE2_HIP_API TORCH_HIP_API

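The #ifndef __HIP_PLATFORM_HCC__ guard above is easiest to see schematically; a hedged sketch of the clash it prevents (the exact hipified right-hand side is an assumption, so it is left abstract):

// Without the guard, hipify would translate the CAFFE2_CUDA_API line too, leaving
// roughly this in the generated header:
//
//   #define CAFFE2_HIP_API <hipified replacement>   // translated from CAFFE2_CUDA_API
//   #define CAFFE2_HIP_API TORCH_HIP_API            // the pre-existing definition
//
// Two #define lines for the same macro with different replacement lists are a macro
// redefinition, so CAFFE2_CUDA_API is simply left undefined when building for HIP.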

@ -178,8 +178,15 @@ def parse_header(path):
else:
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l.startswith('TORCH_CUDA_API void THNN_'):
fn_name = l[len('TORCH_CUDA_API void THNN_'):]
elif l.startswith('TORCH_CUDA_CPP_API void THNN_'):
fn_name = l[len('TORCH_CUDA_CPP_API void THNN_'):]
if fn_name[0] == '(' and fn_name[-2] == ')':
fn_name = fn_name[1:-2]
else:
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l.startswith('TORCH_CUDA_CU_API void THNN_'):
fn_name = l[len('TORCH_CUDA_CU_API void THNN_'):]
if fn_name[0] == '(' and fn_name[-2] == ')':
fn_name = fn_name[1:-2]
else:


@ -14,7 +14,7 @@
namespace torch {
namespace autograd {
struct TORCH_CUDA_API Scatter : public Node {
struct TORCH_CUDA_CU_API Scatter : public Node {
explicit Scatter(
std::vector<at::Device> devices,
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@ -33,7 +33,7 @@ struct TORCH_CUDA_API Scatter : public Node {
bool unsqueeze_scalars_;
};
struct TORCH_CUDA_API Gather : public Node {
struct TORCH_CUDA_CU_API Gather : public Node {
explicit Gather(const at::Device& destination_device, int64_t dim = 0);
~Gather() override;


@ -13,34 +13,39 @@ namespace torch { namespace cuda {
using tensor_list2d = std::vector<std::vector<at::Tensor>>;
TORCH_CUDA_API std::vector<at::Tensor>& broadcast_out(const at::Tensor& tensor, std::vector<at::Tensor>& out_tensors);
TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
size_t buffer_size);
TORCH_CUDA_CU_API std::vector<at::Tensor>& broadcast_out(
const at::Tensor& tensor,
std::vector<at::Tensor>& out_tensors);
TORCH_CUDA_CU_API std::vector<at::Tensor> broadcast(
const at::Tensor& tensor,
at::IntArrayRef devices);
TORCH_CUDA_CU_API tensor_list2d broadcast_coalesced(
at::TensorList tensors,
at::IntArrayRef devices,
size_t buffer_size);
TORCH_CUDA_API std::vector<at::Tensor>& scatter_out(
TORCH_CUDA_CU_API std::vector<at::Tensor>& scatter_out(
const at::Tensor& tensor,
std::vector<at::Tensor>& out_tensors,
int64_t dim = 0,
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
c10::nullopt);
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>&
streams = c10::nullopt);
TORCH_CUDA_API std::vector<at::Tensor> scatter(
TORCH_CUDA_CU_API std::vector<at::Tensor> scatter(
const at::Tensor& tensor,
at::IntArrayRef devices,
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
int64_t dim = 0,
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
c10::nullopt);
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>&
streams = c10::nullopt);
TORCH_CUDA_API at::Tensor& gather_out(
TORCH_CUDA_CU_API at::Tensor& gather_out(
at::TensorList tensors,
at::Tensor& out_tensor,
int64_t dim);
TORCH_CUDA_API at::Tensor gather(
TORCH_CUDA_CU_API at::Tensor gather(
at::TensorList tensors,
int64_t dim,
c10::optional<int32_t> destination_index);
}}


@ -60,7 +60,7 @@ enum class ncclDataType {
// Don't use them outside of these files.
namespace detail {
TORCH_CUDA_API void throw_nccl_error(ncclResult status);
TORCH_CUDA_CPP_API void throw_nccl_error(ncclResult status);
static inline void NCCL_CHECK(ncclResult status) {
if (status != ncclResult::Success) {
@ -68,13 +68,14 @@ static inline void NCCL_CHECK(ncclResult status) {
}
}
TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
TORCH_CUDA_API void check_inputs(
TORCH_CUDA_CPP_API at::ArrayRef<ncclComm_t> get_communicators(
at::TensorList inputs);
TORCH_CUDA_CPP_API void check_inputs(
at::TensorList inputs,
at::TensorList outputs,
int input_multiplier,
int output_multiplier);
TORCH_CUDA_API void check_inputs(
TORCH_CUDA_CPP_API void check_inputs(
at::TensorList inputs,
const at::Tensor& output,
int root,
@ -86,22 +87,23 @@ TORCH_CUDA_API void check_inputs(
using comm_list = std::vector<ncclComm_t>;
using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;
TORCH_CUDA_API std::uint64_t version();
TORCH_CUDA_CPP_API std::uint64_t version();
bool is_available(at::TensorList tensors);
TORCH_CUDA_API void get_unique_id(ncclUniqueId& id);
TORCH_CUDA_API ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank);
TORCH_CUDA_API void comm_destroy(ncclComm_t comm);
TORCH_CUDA_CPP_API void get_unique_id(ncclUniqueId& id);
TORCH_CUDA_CPP_API ncclComm_t
comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank);
TORCH_CUDA_CPP_API void comm_destroy(ncclComm_t comm);
TORCH_CUDA_API void broadcast(
TORCH_CUDA_CPP_API void broadcast(
at::TensorList tensors,
const stream_list& streams = {},
const comm_list& user_comms = {});
size_t get_max_count();
TORCH_CUDA_API void reduce(
TORCH_CUDA_CPP_API void reduce(
const std::vector<at::Tensor>& inputs,
at::Tensor& output,
int32_t root = 0,
@ -109,41 +111,41 @@ TORCH_CUDA_API void reduce(
const stream_list& streams = {},
const comm_list& user_comms = {});
TORCH_CUDA_API void reduce(
TORCH_CUDA_CPP_API void reduce(
std::vector<at::Tensor>& inputs,
int32_t root = 0,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});
TORCH_CUDA_API void all_reduce(
TORCH_CUDA_CPP_API void all_reduce(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});
TORCH_CUDA_API void reduce_scatter(
TORCH_CUDA_CPP_API void reduce_scatter(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});
TORCH_CUDA_API void all_gather(
TORCH_CUDA_CPP_API void all_gather(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
const stream_list& streams = {},
const comm_list& user_comms = {});
TORCH_CUDA_API void all2all_single_equal_split(
TORCH_CUDA_CPP_API void all2all_single_equal_split(
at::Tensor& input,
at::Tensor& output,
int size,
ncclComm_t comm,
at::cuda::CUDAStream& stream);
TORCH_CUDA_API void all2all_single_unequal_split(
TORCH_CUDA_CPP_API void all2all_single_unequal_split(
void* sendbuff,
const size_t* sendcounts,
const size_t* senddispls,
@ -155,19 +157,19 @@ TORCH_CUDA_API void all2all_single_unequal_split(
ncclComm_t comm,
at::cuda::CUDAStream& stream);
TORCH_CUDA_API void all2all(
TORCH_CUDA_CPP_API void all2all(
std::vector<at::Tensor>& outputTensors,
std::vector<at::Tensor>& inputTensors,
ncclComm_t _comm,
at::cuda::CUDAStream& stream);
TORCH_CUDA_API void send(
TORCH_CUDA_CPP_API void send(
const at::Tensor& input,
ncclComm_t comm,
at::cuda::CUDAStream stream,
int dst);
TORCH_CUDA_API void recv(
TORCH_CUDA_CPP_API void recv(
at::Tensor& output,
ncclComm_t comm,
at::cuda::CUDAStream stream,


@ -286,7 +286,7 @@ class TensorPipeAgent : public RpcAgent {
#ifdef USE_CUDA_NOT_ROCM
// An RPC-specific CUDAFuture subclass. It overrides the extractDataPtrs
// function to handle and only handle RPC Messages.
struct TORCH_CUDA_API RpcCUDAFuture final : at::cuda::CUDAFuture {
struct TORCH_CUDA_CPP_API RpcCUDAFuture final : at::cuda::CUDAFuture {
public:
using at::cuda::CUDAFuture::CUDAFuture;


@ -61,7 +61,7 @@ inline std::shared_ptr<LazyStreamContext> createLazyStreamContext() {
#else
// CUDA is available. Implement CUDA-related operations.
struct TORCH_CUDA_API CudaLazyStreamContext : public LazyStreamContext {
struct TORCH_CUDA_CPP_API CudaLazyStreamContext : public LazyStreamContext {
using LazyStreamContext::LazyStreamContext;
// let streams in this context wait for current streams.


@ -243,7 +243,7 @@ TensorView* arithOpOverloads(
}
} // namespace
TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
TORCH_CUDA_CU_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
auto vals = maybeBroadcast({v1, v2});
Val* out = newOutputVal({vals[0], vals[1]});
if (is_logical_op(type)) {
@ -593,7 +593,7 @@ TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* v3) {
return arithOpOverloads(sub_alpha, v1, v2, v3);
}
// lerp
TORCH_CUDA_API Val* lerp(Val* start, Val* end, Val* weight) {
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight) {
auto vals = maybeBroadcast({start, end, weight});
Val* intrm1 = binaryOp(BinaryOpType::Sub, vals[1], vals[0]);
Val* intrm2 = binaryOp(BinaryOpType::Mul, vals[2], intrm1);


@ -20,148 +20,154 @@ namespace fuser {
namespace cuda {
// Insertion of casting op to dtype, returns new resulting val
TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_API TensorView* castOp(DataType dtype, TensorView* v1);
TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1);
// Perform unary op type and return the output
TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);
// Perform binary op type on v1 and v2 and return a type promoted output.
// Mod, CeilDiv, and LT are considered Int only output operations for now.
TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* binaryOp(BinaryOpType type, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* binaryOp(BinaryOpType type, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* binaryOp(
TORCH_CUDA_CU_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2);
// Perform a reduction operation on v1, initial value for reduction is init,
// reduces across axes, and reduction operation defined by BinaryOp.
TORCH_CUDA_API TensorView* reductionOp(
TORCH_CUDA_CU_API TensorView* reductionOp(
BinaryOpType reduction_op_type,
const std::vector<int>& axes,
Val* init,
TensorView* v1);
// UNARY OPERATIONS
TORCH_CUDA_API Val* neg(Val* v);
TORCH_CUDA_API TensorView* neg(TensorView* v);
TORCH_CUDA_CU_API Val* neg(Val* v);
TORCH_CUDA_CU_API TensorView* neg(TensorView* v);
// Broadcasts v1 based on bool vector. Size of broadcast bool vector should be
// the number of dims desired in the broadcasted tensor. This vector should be
// true if output dim should be a broadcasted dim, and false if it is not a
// broadcasted dim. Number of false entries must match the number of input dims.
TORCH_CUDA_API TensorView* broadcast(
TORCH_CUDA_CU_API TensorView* broadcast(
TensorView* inp,
const std::vector<bool>& is_broadcast_dim);
// BINARY OPERATIONS
// add
TORCH_CUDA_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* add(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, TensorView* v2);
// sub
TORCH_CUDA_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* sub(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, TensorView* v2);
// mul
TORCH_CUDA_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* mul(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, TensorView* v2);
// div
TORCH_CUDA_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* div(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
// mod
TORCH_CUDA_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* mod(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, TensorView* v2);
// lt
TORCH_CUDA_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* lt(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, TensorView* v2);
// eq
TORCH_CUDA_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* eq(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, TensorView* v2);
// ceilDiv
TORCH_CUDA_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
// andOp
TORCH_CUDA_API Val* andOp(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* andOp(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* andOp(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* andOp(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* andOp(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* andOp(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, TensorView* v2);
// REDUCTION OPERATIONS
TORCH_CUDA_API TensorView* sum(
TORCH_CUDA_CU_API TensorView* sum(
TensorView* v1,
const std::vector<int>& reduction_axes);
// COMPOUND OPERATIONS
// add_alpha
TORCH_CUDA_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
// sub_alpha
TORCH_CUDA_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
// lerp
TORCH_CUDA_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
Val* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
Val* end,
TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
Val* start,
TensorView* end,
TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
TensorView* weight);
// addcmul
TORCH_CUDA_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
Val* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
Val* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
Val* v1,
TensorView* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
TensorView* v3,
@ -169,20 +175,26 @@ TORCH_CUDA_API TensorView* addcmul(
// TERNARY OPERATIONS
// where
TORCH_CUDA_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(
TensorView* c,
TensorView* v1,
TensorView* v2);
// threshold
TORCH_CUDA_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_API TensorView* threshold(TensorView* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API TensorView* threshold(
TensorView* in,
Val* thresh,
Val* value);
// clamp
TORCH_CUDA_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);
} // namespace cuda
} // namespace fuser

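As a reading aid for the broadcast() comment in the hunk above, a minimal sketch of the bool-vector convention (fragment only; tv is a hypothetical 2-D TensorView obtained elsewhere from the fuser API):

// One entry per output dim: true adds a broadcast dim, each false maps to the
// next input dim, so the number of false entries equals the input's dim count.
TensorView* tv3d = broadcast(tv, {true, false, false});
// tv3d is 3-D: a leading broadcast dim followed by the two original dims of tv.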

@ -12,7 +12,7 @@ namespace cuda {
namespace codegen {
//! Generates a CUDA kernel definition for the given kernel
TORCH_CUDA_API std::string generateCudaKernel(
TORCH_CUDA_CU_API std::string generateCudaKernel(
const Kernel* kernel,
const std::string& kernel_name = "CUDAGeneratedKernel");


@ -107,7 +107,7 @@ class Sync;
* By default, all IR nodes are handled in this dispatch, and will call an empty
* function on all nodes.
*/
class TORCH_CUDA_API OptOutConstDispatch {
class TORCH_CUDA_CU_API OptOutConstDispatch {
public:
virtual ~OptOutConstDispatch() = default;
OptOutConstDispatch() = default;
@ -167,7 +167,7 @@ class TORCH_CUDA_API OptOutConstDispatch {
virtual void handle(const kir::Sync*) {}
};
class TORCH_CUDA_API OptOutDispatch {
class TORCH_CUDA_CU_API OptOutDispatch {
public:
virtual ~OptOutDispatch() = default;
OptOutDispatch() = default;
@ -227,7 +227,7 @@ class TORCH_CUDA_API OptOutDispatch {
virtual void handle(kir::Sync*) {}
};
class TORCH_CUDA_API OptInConstDispatch {
class TORCH_CUDA_CU_API OptInConstDispatch {
public:
virtual ~OptInConstDispatch() = default;
OptInConstDispatch() = default;
@ -361,7 +361,7 @@ class TORCH_CUDA_API OptInConstDispatch {
}
};
class TORCH_CUDA_API OptInDispatch {
class TORCH_CUDA_CU_API OptInDispatch {
public:
virtual ~OptInDispatch() = default;
OptInDispatch() = default;
@ -494,7 +494,7 @@ class TORCH_CUDA_API OptInDispatch {
}
};
class TORCH_CUDA_API OptOutMutator {
class TORCH_CUDA_CU_API OptOutMutator {
public:
virtual ~OptOutMutator() = default;
OptOutMutator() = default;
@ -559,7 +559,7 @@ class TORCH_CUDA_API OptOutMutator {
virtual Statement* mutate(kir::Sync*);
};
class TORCH_CUDA_API OptInMutator {
class TORCH_CUDA_CU_API OptInMutator {
public:
virtual ~OptInMutator() = default;
OptInMutator() = default;


@ -17,11 +17,11 @@ namespace fuser {
namespace cuda {
// TODO: Should this actually be in launch params?
struct TORCH_CUDA_API CompileOptions {
struct TORCH_CUDA_CU_API CompileOptions {
c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
};
class TORCH_CUDA_API FusionExecutor : public NonCopyable {
class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
public:
// Unsafe compilation that's useful for debugging kernels, iterating over
// slight modifications of a generated kernel


@ -6,7 +6,7 @@ namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_API LaunchParams {
class TORCH_CUDA_CU_API LaunchParams {
public:
static constexpr int64_t UNINITIALIZED_VAL = -1;


@ -14,7 +14,7 @@ namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_API StatefulExpressionEvaluator : private OptOutDispatch {
class TORCH_CUDA_CU_API StatefulExpressionEvaluator : private OptOutDispatch {
public:
explicit StatefulExpressionEvaluator(Fusion* fusion) : fusion_(fusion) {}


@ -45,7 +45,7 @@ class TensorView;
// Fusion Guard is our "context manager". It holds the active fusion and allows
// it to be accessed anywhere through FusionGuard::getCurFusion().
class TORCH_CUDA_API FusionGuard {
class TORCH_CUDA_CU_API FusionGuard {
public:
Fusion* prev_fusion;
@ -66,7 +66,7 @@ class TORCH_CUDA_API FusionGuard {
*
* The Fusion owns the whole IR graph (Vals and Exprs)
*/
class TORCH_CUDA_API Fusion final {
class TORCH_CUDA_CU_API Fusion final {
public:
Fusion() = default;


@ -61,7 +61,7 @@ class IrCloner;
* Basically being able to succinctly traverse down the inheritance stack of
* a Statement at runtime. This is currently implemented in dispatch.h
*/
class TORCH_CUDA_API Statement : public NonCopyable, public PolymorphicBase {
class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase {
friend void swap(Fusion&, Fusion&) noexcept;
public:
@ -163,7 +163,7 @@ class TORCH_CUDA_API Statement : public NonCopyable, public PolymorphicBase {
* 5) An enum value must be added to ValType in type.h
* 6) A string entry must be added in val_type_string_map
*/
class TORCH_CUDA_API Val : public Statement {
class TORCH_CUDA_CU_API Val : public Statement {
public:
virtual ~Val() = default;
@ -286,7 +286,7 @@ class TORCH_CUDA_API Val : public Statement {
// 7) A string entry must be added in expr_type_string_map
// 8) Entry added to ir_graphviz .cpp/.h
class TORCH_CUDA_API Expr : public Statement {
class TORCH_CUDA_CU_API Expr : public Statement {
public:
Expr() = delete;
explicit Expr(ExprType _type);


@ -14,7 +14,7 @@ namespace cuda {
class Fusion;
// Clones nodes from an existing Fusion
class TORCH_CUDA_API IrCloner : private OptInConstDispatch {
class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
friend class Statement;
public:


@ -33,7 +33,7 @@ namespace cuda {
// for example you can't use "~/temp/ir.dot" ("/home/user/temp/ir.dot"
// must be used instead)
//
class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch {
class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
public:
enum class DetailLevel {
ComputeOnly, // Only dataflow (compute) nodes


@ -23,7 +23,7 @@ namespace cuda {
* This value can be a symbolic value (defined after the kernel
* is compiled) or a constant value (inlined into the kernel definition).
*/
class TORCH_CUDA_API Bool : public Val {
class TORCH_CUDA_CU_API Bool : public Val {
public:
~Bool() = default;
@ -61,7 +61,7 @@ class TORCH_CUDA_API Bool : public Val {
* Float32. This value can be a symbolic value (defined after the kernel
* is compiled) or a constant value (inlined into the kernel definition).
*/
class TORCH_CUDA_API Float : public Val {
class TORCH_CUDA_CU_API Float : public Val {
public:
using ScalarType = double;
@ -101,7 +101,7 @@ class TORCH_CUDA_API Float : public Val {
* This value can be a symbolic value (defined after the kernel
* is compiled) or a constant value (inlined into the kernel definition).
*/
class TORCH_CUDA_API Half : public Val {
class TORCH_CUDA_CU_API Half : public Val {
public:
~Half() = default;
@ -136,7 +136,7 @@ class TORCH_CUDA_API Half : public Val {
// An Int64 value. If used for indexing it's set as size_t. Otherwise it's an
// inlined literal in the kernel.
class TORCH_CUDA_API Int : public Val {
class TORCH_CUDA_CU_API Int : public Val {
public:
using ScalarType = int64_t;
@ -200,7 +200,7 @@ class TVDomainGuard;
// that should be const, const. Gave this a try but expanded really quickly.
// getComputeAtAxis not being const because it can return a TV that some expect
// to be non-const is the biggest headache.
class TORCH_CUDA_API TensorView : public Val {
class TORCH_CUDA_CU_API TensorView : public Val {
public:
~TensorView() = default;
@ -352,9 +352,9 @@ class TORCH_CUDA_API TensorView : public Val {
void setMemoryType(MemoryType mt);
friend TORCH_CUDA_API TransformReplay;
friend TORCH_CUDA_API OptOutMutator;
friend TORCH_CUDA_API LoopNestGenerator;
friend TORCH_CUDA_CU_API TransformReplay;
friend TORCH_CUDA_CU_API OptOutMutator;
friend TORCH_CUDA_CU_API LoopNestGenerator;
friend ComputeAt;
friend void IrFixComputeAt(Fusion*);
friend void adjustMemoryTypes(Fusion* fusion);


@ -35,7 +35,7 @@ bool areEqualScalars(Val* v1, Val* v2);
* 3) Reduction across a dimension i.e. val.sum(axis=2)
* 4) split/merge
*/
class TORCH_CUDA_API UnaryOp : public Expr {
class TORCH_CUDA_CU_API UnaryOp : public Expr {
public:
~UnaryOp() = default;
UnaryOp(UnaryOpType _type, Val* _out, Val* _in);
@ -73,7 +73,7 @@ class TORCH_CUDA_API UnaryOp : public Expr {
* 1) Add/mul/div/mod/sub (A * B)
* 2) LT (A < B)
*/
class TORCH_CUDA_API BinaryOp : public Expr {
class TORCH_CUDA_CU_API BinaryOp : public Expr {
public:
~BinaryOp() = default;
BinaryOp(BinaryOpType _type, Val* _out, Val* _lhs, Val* _rhs);
@ -113,7 +113,7 @@ class TORCH_CUDA_API BinaryOp : public Expr {
* Broadcast _in to match _out. broadcast_dims are relative to out. Where
* broadcast_dims.size() + _in->nDims() == _out->nDims().
*/
class TORCH_CUDA_API BroadcastOp : public Expr {
class TORCH_CUDA_CU_API BroadcastOp : public Expr {
public:
~BroadcastOp() = default;
BroadcastOp(Val* _out, Val* _in);
@ -147,7 +147,7 @@ class TORCH_CUDA_API BroadcastOp : public Expr {
* tensor. The output tensors size will be the size of all
* non-reduction/non-broadcast dimensions.
*/
class TORCH_CUDA_API ReductionOp : public Expr {
class TORCH_CUDA_CU_API ReductionOp : public Expr {
public:
~ReductionOp() = default;
ReductionOp(BinaryOpType _reduction_op_type, Val* _init, Val* _out, Val* _in);
@ -183,7 +183,7 @@ class TORCH_CUDA_API ReductionOp : public Expr {
Val* const in_ = nullptr;
};
class TORCH_CUDA_API TernaryOp : public Expr {
class TORCH_CUDA_CU_API TernaryOp : public Expr {
public:
~TernaryOp() = default;
TernaryOp(TernaryOpType _type, Val* _out, Val* _in1, Val* _in2, Val* _in3);
@ -228,7 +228,7 @@ class TORCH_CUDA_API TernaryOp : public Expr {
// TensorDomains, which represent how to iterate over a tensor, are made up of
// IterDomains to form an ND iterable. We directly set parallelization strategies
// on IterDomains.
class TORCH_CUDA_API IterDomain : public Val {
class TORCH_CUDA_CU_API IterDomain : public Val {
public:
IterDomain(
Val* _start,
@ -363,7 +363,7 @@ class TORCH_CUDA_API IterDomain : public Val {
* operations that take in a TensorDomain, applies a transformation and outputs
* a tensor domain.
*/
class TORCH_CUDA_API TensorDomain : public Val {
class TORCH_CUDA_CU_API TensorDomain : public Val {
public:
TensorDomain() = delete;
~TensorDomain() = default;
@ -556,7 +556,7 @@ class TORCH_CUDA_API TensorDomain : public Val {
* Represents a split on an IterDomain by "factor"
* TODO: Implement split by nparts
*/
class TORCH_CUDA_API Split : public Expr {
class TORCH_CUDA_CU_API Split : public Expr {
public:
~Split() = default;
@ -598,7 +598,7 @@ class TORCH_CUDA_API Split : public Expr {
* if there is one.
* TODO: Should this be a unary op type?
*/
class TORCH_CUDA_API Merge : public Expr {
class TORCH_CUDA_CU_API Merge : public Expr {
public:
~Merge() = default;
Merge(IterDomain* _out, IterDomain* _outer, IterDomain* _inner);
@ -636,7 +636,7 @@ class TORCH_CUDA_API Merge : public Expr {
* - blockDim.z
* - T3.stride[2]
*/
class TORCH_CUDA_API NamedScalar : public Val {
class TORCH_CUDA_CU_API NamedScalar : public Val {
public:
~NamedScalar() = default;
NamedScalar() = delete;


@ -16,7 +16,7 @@ namespace cuda {
//! This class is intended for debug printing, so it attempts
//! to handle invalid states as well.
//!
class TORCH_CUDA_API IrPrinter : public OptInConstDispatch {
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch {
public:
explicit IrPrinter(std::ostream& os) : os_(os) {}
@ -107,12 +107,12 @@ class TORCH_CUDA_API IrPrinter : public OptInConstDispatch {
int indent_size_ = 0;
};
TORCH_CUDA_API std::ostream& operator<<(
TORCH_CUDA_CU_API std::ostream& operator<<(
std::ostream& os,
const Statement* stmt);
TORCH_CUDA_API std::ostream& operator<<(std::ostream& os, Fusion* f);
TORCH_CUDA_API std::ostream& operator<<(std::ostream& os, Fusion& f);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion* f);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion& f);
// TODO(kir): catch accidental << printing of Kernel IR nodes
// (use kir::toString(node) instead)


@ -25,7 +25,7 @@ namespace cuda {
//
//! \sa IrTransformPrinter
//!
class TORCH_CUDA_API IrMathPrinter : public IrPrinter {
class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
public:
IrMathPrinter(std::ostream& os) : IrPrinter(os) {}
@ -41,7 +41,7 @@ class TORCH_CUDA_API IrMathPrinter : public IrPrinter {
//!
//! \sa IrMathPrinter
//!
class TORCH_CUDA_API IrTransformPrinter : public IrPrinter {
class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
public:
IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}


@ -30,7 +30,7 @@ namespace cuda {
* TODO: We may want to have ordering of outputs to inputs. I'm not sure why we
* would want this, but seems like it would be a reasonable request.
*/
class TORCH_CUDA_API IterVisitor : public OptOutDispatch {
class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
public:
virtual ~IterVisitor() = default;
@ -147,7 +147,7 @@ class TORCH_CUDA_API IterVisitor : public OptOutDispatch {
* outputs to guarantee that we will traverse all outputs of all exprs during
* the backward traversal.
*/
class TORCH_CUDA_API BackwardVisitor : public OptOutDispatch {
class TORCH_CUDA_CU_API BackwardVisitor : public OptOutDispatch {
public:
virtual ~BackwardVisitor() = default;
@ -207,7 +207,7 @@ class TORCH_CUDA_API BackwardVisitor : public OptOutDispatch {
bool traverseAllPaths = false);
};
class TORCH_CUDA_API DependencyCheck {
class TORCH_CUDA_CU_API DependencyCheck {
public:
// Returns if "dependency" is a dependency of "of".
static bool isDependencyOf(Val* dependency, Val* of);


@ -53,7 +53,7 @@ struct KernelSummary {
//! by a Fusion object. The goal is to have the Kernel object
//! own the Kernel IR nodes
//!
class TORCH_CUDA_API Kernel final : public NonCopyable {
class TORCH_CUDA_CU_API Kernel final : public NonCopyable {
public:
Kernel() = default;


@ -26,7 +26,7 @@ namespace cuda {
//! \note the uniqueness of the id generated for a given input set is only
//! local to the instance of `InputsIdLookup`.
//!
class TORCH_CUDA_API InputsIdLookup {
class TORCH_CUDA_CU_API InputsIdLookup {
public:
//! constructor where maximum cache size is fixed during init
explicit InputsIdLookup(size_t max_cache_size = 10)


@ -34,7 +34,7 @@ class Passkey {
Passkey() {}
};
class TORCH_CUDA_API NamedScalar : public Val {
class TORCH_CUDA_CU_API NamedScalar : public Val {
public:
NamedScalar(Passkey, std::string name, DataType dtype)
: Val(ValType::KirNamedScalar, dtype, true, true), name_(name) {}
@ -64,7 +64,7 @@ class TORCH_CUDA_API NamedScalar : public Val {
std::string name_;
};
class TORCH_CUDA_API Bool : public Val {
class TORCH_CUDA_CU_API Bool : public Val {
public:
explicit Bool(Passkey, const c10::optional<bool>& value)
: Val(ValType::KirScalar, DataType::Bool, true, true),
@ -87,7 +87,7 @@ class TORCH_CUDA_API Bool : public Val {
const c10::optional<bool> maybe_value_;
};
class TORCH_CUDA_API Float : public Val {
class TORCH_CUDA_CU_API Float : public Val {
public:
using ScalarType = double;
@ -112,7 +112,7 @@ class TORCH_CUDA_API Float : public Val {
const c10::optional<ScalarType> maybe_value_;
};
class TORCH_CUDA_API Half : public Val {
class TORCH_CUDA_CU_API Half : public Val {
public:
explicit Half(Passkey, const c10::optional<float>& value)
: Val(ValType::KirScalar, DataType::Half, true, true),
@ -135,7 +135,7 @@ class TORCH_CUDA_API Half : public Val {
const c10::optional<float> maybe_value_;
};
class TORCH_CUDA_API Int : public Val {
class TORCH_CUDA_CU_API Int : public Val {
public:
using ScalarType = int64_t;
@ -163,7 +163,7 @@ class TORCH_CUDA_API Int : public Val {
const c10::optional<ScalarType> maybe_value_;
};
class TORCH_CUDA_API IterDomain : public Val {
class TORCH_CUDA_CU_API IterDomain : public Val {
public:
IterDomain(Passkey, Val* start, Val* extent);
@ -233,7 +233,7 @@ class TORCH_CUDA_API IterDomain : public Val {
bool is_rfactor_domain_ = false;
};
class TORCH_CUDA_API TensorDomain : public Val {
class TORCH_CUDA_CU_API TensorDomain : public Val {
public:
explicit TensorDomain(Passkey, std::vector<IterDomain*> domain);
@ -304,7 +304,7 @@ class TORCH_CUDA_API TensorDomain : public Val {
const std::vector<bool> contiguity_;
};
class TORCH_CUDA_API TensorView : public Val {
class TORCH_CUDA_CU_API TensorView : public Val {
public:
explicit TensorView(Passkey, const fuser::cuda::TensorView* tv);
@ -329,7 +329,7 @@ class TORCH_CUDA_API TensorView : public Val {
const fuser::cuda::TensorView* fuser_tv_ = nullptr;
};
class TORCH_CUDA_API UnaryOp : public Expr {
class TORCH_CUDA_CU_API UnaryOp : public Expr {
public:
UnaryOp(Passkey, UnaryOpType type, Val* out, Val* in);
@ -351,7 +351,7 @@ class TORCH_CUDA_API UnaryOp : public Expr {
Val* const in_ = nullptr;
};
class TORCH_CUDA_API BinaryOp : public Expr {
class TORCH_CUDA_CU_API BinaryOp : public Expr {
public:
BinaryOp(Passkey, BinaryOpType type, Val* out, Val* lhs, Val* rhs);
@ -378,7 +378,7 @@ class TORCH_CUDA_API BinaryOp : public Expr {
Val* const rhs_ = nullptr;
};
class TORCH_CUDA_API TernaryOp : public Expr {
class TORCH_CUDA_CU_API TernaryOp : public Expr {
public:
TernaryOp(
Passkey,
@ -416,7 +416,7 @@ class TORCH_CUDA_API TernaryOp : public Expr {
Val* const in3_ = nullptr;
};
class TORCH_CUDA_API ReductionOp : public Expr {
class TORCH_CUDA_CU_API ReductionOp : public Expr {
public:
ReductionOp(
Passkey,
@ -460,7 +460,7 @@ class TORCH_CUDA_API ReductionOp : public Expr {
Bool* const pred_ = nullptr;
};
class TORCH_CUDA_API TensorIndex : public Val {
class TORCH_CUDA_CU_API TensorIndex : public Val {
public:
TensorIndex(
Passkey,
@ -486,7 +486,7 @@ class TORCH_CUDA_API TensorIndex : public Val {
std::vector<Val*> indices_;
};
class TORCH_CUDA_API BroadcastOp : public Expr {
class TORCH_CUDA_CU_API BroadcastOp : public Expr {
public:
BroadcastOp(Passkey, Val* out, Val* in);
@ -510,7 +510,7 @@ class TORCH_CUDA_API BroadcastOp : public Expr {
//
// TODO: The components of Allocate like Type and Name could be separated from
// the associated TensorView. Perhaps that is more appropriate?
class TORCH_CUDA_API Allocate : public Expr {
class TORCH_CUDA_CU_API Allocate : public Expr {
public:
explicit Allocate(
Passkey,
@ -560,7 +560,7 @@ class TORCH_CUDA_API Allocate : public Expr {
};
// Sync represents __syncthreads barrier for block level coordination.
class TORCH_CUDA_API Sync : public Expr {
class TORCH_CUDA_CU_API Sync : public Expr {
public:
explicit Sync(Passkey, bool war_sync = false);
@ -574,7 +574,7 @@ class TORCH_CUDA_API Sync : public Expr {
};
// TODO(kir): promote to IR node
class TORCH_CUDA_API Scope {
class TORCH_CUDA_CU_API Scope {
public:
Scope() = default;
@ -633,7 +633,7 @@ class TORCH_CUDA_API Scope {
//
// TODO(kir): this is not a real expression
//
class TORCH_CUDA_API ForLoop : public Expr {
class TORCH_CUDA_CU_API ForLoop : public Expr {
public:
ForLoop(Passkey, Val* index, IterDomain* iter_domain, Expr* parent_scope);
@ -673,7 +673,7 @@ class TORCH_CUDA_API ForLoop : public Expr {
//
// TODO(kir): this is not a real expression
//
class TORCH_CUDA_API IfThenElse : public Expr {
class TORCH_CUDA_CU_API IfThenElse : public Expr {
public:
explicit IfThenElse(Passkey, Bool* cond, Expr* parent_scope);
@ -717,7 +717,7 @@ class TORCH_CUDA_API IfThenElse : public Expr {
// explicitly mark a grid reduction and the buffer allocation needed to do it.
// This node provides FusionExecutor the information it needs to allocate the
// reduction and sync buffers.
class TORCH_CUDA_API GridReduction : public Expr {
class TORCH_CUDA_CU_API GridReduction : public Expr {
public:
explicit GridReduction(Passkey, ReductionOp* reduction_op);
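
The Sync and GridReduction nodes above model the synchronization and buffering that reductions need in the generated kernels. The following is a generic, hand-written CUDA sketch (not the fuser's generated code): the __syncthreads() barriers correspond to what a Sync node represents, and the per-block partial results at the end are what a grid reduction would combine through the work and semaphore buffers GridReduction asks FusionExecutor to allocate.

// Illustrative only: a plain block-wide sum. Assumes blockDim.x is a power of
// two and the kernel is launched with blockDim.x * sizeof(float) bytes of
// dynamic shared memory.
__global__ void block_sum(const float* in, float* out, int n) {
  extern __shared__ float buf[];
  const int tid = threadIdx.x;
  const int i = blockIdx.x * blockDim.x + tid;
  buf[tid] = (i < n) ? in[i] : 0.0f;
  __syncthreads();  // all loads must land before the tree reduction starts
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) {
      buf[tid] += buf[tid + stride];
    }
    __syncthreads();  // barrier between reduction steps (what a Sync node models)
  }
  if (tid == 0) {
    out[blockIdx.x] = buf[0];  // per-block partials; a grid reduction combines these
  }
}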

View File

@ -20,7 +20,7 @@ namespace kir {
//! This class is intended for debug printing, so it attempts
//! to handle invalid IR states as much as possible.
//!
class TORCH_CUDA_API IrPrinter : private OptInConstDispatch {
class TORCH_CUDA_CU_API IrPrinter : private OptInConstDispatch {
static constexpr char* kTab = " ";
public:

View File

@ -148,7 +148,7 @@ Kernel* GpuLower::kernel() const {
//
// TODO(kir): this is an interim solution for easing the Kernel IR splitting
//
class TORCH_CUDA_API GpuLower::KernelIrMapper : private OptInConstDispatch {
class TORCH_CUDA_CU_API GpuLower::KernelIrMapper : private OptInConstDispatch {
public:
explicit KernelIrMapper(GpuLower* gpu_lower)
: gpu_lower_(gpu_lower), ir_builder_(gpu_lower->kernel()) {}

View File

@ -14,7 +14,7 @@ namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_API GpuLower {
class TORCH_CUDA_CU_API GpuLower {
class KernelIrMapper;
public:

View File

@ -14,7 +14,7 @@ namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_API IndexLowering : public OptInDispatch {
class TORCH_CUDA_CU_API IndexLowering : public OptInDispatch {
public:
static std::vector<Expr*> getIndexedExprs(
Fusion* fusion,

View File

@ -29,7 +29,7 @@ namespace cuda {
* nests to initialize reduction buffers.
*
*/
class TORCH_CUDA_API LoopNestGenerator : public OptOutDispatch {
class TORCH_CUDA_CU_API LoopNestGenerator : public OptOutDispatch {
public:
static std::vector<Expr*> loweredExprs(
Fusion* fusion,

View File

@ -22,7 +22,7 @@ namespace cuda {
//! If we follow a reduction parallelized on TIDx with a broadcast on TIDx we
//! no longer need the predicate and can reset the bit accordingly
//!
class TORCH_CUDA_API ThreadPredicateMap {
class TORCH_CUDA_CU_API ThreadPredicateMap {
public:
using SourceMapType = std::unordered_map<
ParallelType,

View File

@ -49,7 +49,7 @@ namespace cuda {
* corners.
*/
class TORCH_CUDA_API UnrollPass : public OptOutDispatch {
class TORCH_CUDA_CU_API UnrollPass : public OptOutDispatch {
private:
// Wrapper to access thread_predicates_ based on an output TV
kir::Bool* getThreadPredicate(TensorView*);
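
For context on the "corner" cases mentioned in the comment above, here is a generic hand-written CUDA sketch (not the fuser's output) of the pattern UnrollPass and UnrollPredicate manage: the unrolled main path runs under a single tile-wide predicate, while a guarded remainder path handles the boundary.

// Illustrative only: unrolled main path behind one tile-level predicate, with a
// per-element guarded path for the boundary ("corner") tile.
__global__ void scale_by(float* data, float alpha, int n) {
  constexpr int kUnroll = 4;
  const int base = (blockIdx.x * blockDim.x + threadIdx.x) * kUnroll;
  if (base + kUnroll <= n) {  // one predicate covers the whole unrolled tile
#pragma unroll
    for (int j = 0; j < kUnroll; ++j) {
      data[base + j] *= alpha;
    }
  } else {  // corner tile: predicate each element individually
    for (int j = 0; j < kUnroll; ++j) {
      if (base + j < n) {
        data[base + j] *= alpha;
      }
    }
  }
}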

View File

@ -22,15 +22,17 @@ namespace cuda {
// Get fusion_node ready for execution.
// find or compile `CudaKernel` for graph stored in `attr::Subgraph`
// this function assigns `attr::cache_id` to `fusion_node`
TORCH_CUDA_API void compileCudaFusionGroup(Node* fusion_node);
TORCH_CUDA_CU_API void compileCudaFusionGroup(Node* fusion_node);
// Execute fusion_node.
// Current protocol is that the function allocates output tensors and appends them to
// `stack` after execution.
// TODO: support shape inference. Right now we only handle static shapes
TORCH_CUDA_API void runCudaFusionGroup(const Node* fusion_node, Stack& stack);
TORCH_CUDA_CU_API void runCudaFusionGroup(
const Node* fusion_node,
Stack& stack);
TORCH_CUDA_API void CudaFuseGraph(std::shared_ptr<Graph>& graph);
TORCH_CUDA_CU_API void CudaFuseGraph(std::shared_ptr<Graph>& graph);
} // namespace cuda
} // namespace fuser
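
Taken together, the two entry points above follow the protocol in the comments: compile (or look up) a kernel for the subgraph, then run it with the inputs on the stack. A minimal host-side sketch, assuming the declaring header is included and `fusion_node` already carries attr::Subgraph; the caller shown here is hypothetical, the real call sites are the JIT's CudaFusionGroup operator.

// Illustrative only.
using namespace torch::jit;

void compileAndRun(Node* fusion_node, Stack& stack) {
  // Finds or compiles the CudaKernel for attr::Subgraph and assigns attr::cache_id.
  fuser::cuda::compileCudaFusionGroup(fusion_node);
  // Executes the fused kernel; per the protocol above, outputs are allocated by
  // the callee and appended to `stack`.
  fuser::cuda::runCudaFusionGroup(fusion_node, stack);
}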

View File

@ -30,15 +30,15 @@ constexpr int kFcdReductionThreadX = 128;
constexpr int kNonFcdReductionThreadX = 32;
constexpr int kNonFcdReductionThreadY = 32;
TORCH_CUDA_API bool hasReductionNode(const Block* block);
TORCH_CUDA_CU_API bool hasReductionNode(const Block* block);
TORCH_CUDA_API bool isReductionNode(const Node* node);
TORCH_CUDA_CU_API bool isReductionNode(const Node* node);
// returns whether or not a parsing function exists for the given node type.
TORCH_CUDA_API bool isNodeParsible(const Node* node);
TORCH_CUDA_CU_API bool isNodeParsible(const Node* node);
// lowers PyTorch jit graph to `Fusion`.
TORCH_CUDA_API std::unique_ptr<Fusion> parseJitIR(
TORCH_CUDA_CU_API std::unique_ptr<Fusion> parseJitIR(
const std::shared_ptr<Graph>& graph);
} // namespace cuda
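
A hedged sketch of how the parsing entry points above might be combined (the helper name is hypothetical; the real call sites live inside the fuser's compilation path): lower the JIT graph to a Fusion only if every node has a registered parsing rule.

// Illustrative only.
using namespace torch::jit;

std::unique_ptr<fuser::cuda::Fusion> tryLower(const std::shared_ptr<Graph>& graph) {
  for (const Node* node : graph->nodes()) {
    if (!fuser::cuda::isNodeParsible(node)) {
      return nullptr;  // at least one op has no parsing rule registered
    }
  }
  return fuser::cuda::parseJitIR(graph);  // lowers the whole graph to a Fusion
}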

View File

@ -19,10 +19,10 @@ namespace jit {
namespace fuser {
namespace cuda {
TORCH_CUDA_API bool isFusableCudaFusionGroup(const Node* node);
TORCH_CUDA_CU_API bool isFusableCudaFusionGroup(const Node* node);
// consider if `node` could be fused into `fusion`
TORCH_CUDA_API bool isFusableCudaFusionGroup(
TORCH_CUDA_CU_API bool isFusableCudaFusionGroup(
const Node* fusion,
const Node* node);
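
The two overloads above answer slightly different questions: the first appears to check whether a node is fusable at all, while the comment says the second asks whether `node` could be fused into an existing `fusion`. A hedged sketch of a caller (hypothetical helper; the real logic lives in the fuser's graph-fuser pass):

// Illustrative only.
using namespace torch::jit;

bool canFuse(const Node* existing_group, const Node* candidate) {
  if (existing_group == nullptr) {
    // No group yet: can this node seed a new CUDA fusion group?
    return fuser::cuda::isFusableCudaFusionGroup(candidate);
  }
  // Otherwise: can it be merged into the existing group?
  return fuser::cuda::isFusableCudaFusionGroup(existing_group, candidate);
}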

View File

@ -50,7 +50,7 @@ class PredicateCompute {
bool ignore_block_grid_reductions = true);
};
class TORCH_CUDA_API UnrollPredicate {
class TORCH_CUDA_CU_API UnrollPredicate {
public:
static kir::Bool* get(
const std::vector<kir::ForLoop*>& outer_loops,

View File

@ -288,7 +288,7 @@ ReductionParams reductionHeuristic(
}
} // anonymous namespace
TORCH_CUDA_API c10::optional<ReductionParams> getReductionHeuristics(
TORCH_CUDA_CU_API c10::optional<ReductionParams> getReductionHeuristics(
Fusion* fusion,
const at::ArrayRef<c10::IValue>& fusion_inputs,
TensorView* red_tv) {

View File

@ -11,7 +11,7 @@ namespace fuser {
namespace cuda {
// Returns whether the given fusion can be scheduled;
TORCH_CUDA_API bool scheduleFusion(
TORCH_CUDA_CU_API bool scheduleFusion(
Fusion* fusion,
const at::ArrayRef<c10::IValue> inputs);
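
A hedged usage sketch for this header's scheduling entry points (including getReductionHeuristics and scheduleReduction in the next hunk). The wrapper and variable names are hypothetical, and scheduleReduction's trailing parameters are not visible in this diff, so its call is only indicated.

// Illustrative only.
using namespace torch::jit;

void scheduleForInputs(
    fuser::cuda::Fusion* fusion,
    const at::ArrayRef<c10::IValue>& inputs,
    fuser::cuda::TensorView* red_tv) {
  if (red_tv == nullptr) {
    // Returns whether the fusion could be scheduled (see the comment above).
    fuser::cuda::scheduleFusion(fusion, inputs);
    return;
  }
  auto params = fuser::cuda::getReductionHeuristics(fusion, inputs, red_tv);
  if (params.has_value()) {
    // fuser::cuda::scheduleReduction(fusion, *params, red_tv, ...);
    // (remaining arguments omitted; they are not shown in this diff)
  }
}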
@ -56,12 +56,12 @@ class ReductionParamsHash {
}
};
TORCH_CUDA_API c10::optional<ReductionParams> getReductionHeuristics(
TORCH_CUDA_CU_API c10::optional<ReductionParams> getReductionHeuristics(
Fusion* fusion,
const at::ArrayRef<c10::IValue>& fusion_inputs,
TensorView* red_tv);
TORCH_CUDA_API void scheduleReduction(
TORCH_CUDA_CU_API void scheduleReduction(
Fusion* fusion,
const ReductionParams& rparams,
TensorView* red_tv,

View File

@ -8,7 +8,7 @@ namespace jit {
namespace fuser {
namespace cuda {
TORCH_CUDA_API void TypePropagate(std::shared_ptr<Graph>& graph);
TORCH_CUDA_CU_API void TypePropagate(std::shared_ptr<Graph>& graph);
} // namespace cuda
} // namespace fuser

View File

@ -39,7 +39,7 @@ struct id_int_lt {
//
// If error_on_failure = false, replay will replay everything it can, and ignore
// operations it can't.
class TORCH_CUDA_API ReplayTransformations : public IterVisitor {
class TORCH_CUDA_CU_API ReplayTransformations : public IterVisitor {
protected:
const std::vector<IterDomain*>& target_domain_;
std::unordered_map<IterDomain*, IterDomain*> id_map_;
@ -148,7 +148,7 @@ class TORCH_CUDA_API ReplayTransformations : public IterVisitor {
* history replay_domain
*/
class TORCH_CUDA_API BestEffortReplay {
class TORCH_CUDA_CU_API BestEffortReplay {
private:
std::unordered_map<IterDomain*, IterDomain*> id_map_;
std::unordered_map<IterDomain*, size_t> leaf_ids_;

View File

@ -120,7 +120,7 @@ namespace cuda {
class TensorDomain;
class TensorView;
class TORCH_CUDA_API TransformReplay {
class TORCH_CUDA_CU_API TransformReplay {
public:
// Replay producer as consumer, returns {producer, producer_compute_at_axis}.
static std::pair<TensorDomain*, unsigned int> replayPasC(

View File

@ -15,7 +15,7 @@ namespace cuda {
// TODO: Only replay dispatch is really borrowed from TransformIter; we should
// reevaluate the reuse of dispatch for classes that inherit TransformIter.
class TORCH_CUDA_API TransformRFactor {
class TORCH_CUDA_CU_API TransformRFactor {
public:
// Create a copy of td, change its history by preserving axes so they appear in
// the root domain

View File

@ -489,11 +489,13 @@ std::ostream& operator<<(std::ostream& out, const MemoryType mtype) {
return out << memory_type2string(mtype);
}
TORCH_CUDA_API std::ostream& operator<<(std::ostream& out, const IterType bt) {
TORCH_CUDA_CU_API std::ostream& operator<<(
std::ostream& out,
const IterType bt) {
return out << iter_type2string(bt);
}
TORCH_CUDA_API c10::optional<std::string> inline_op_str(
TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(
const UnaryOpType uotype) {
const char* str = unary_op_type_inline_op2string(uotype);
return str != nullptr ? c10::optional<std::string>(std::string(str))

View File

@ -169,23 +169,23 @@ bool is_logical_op(const BinaryOpType& bot);
DataType aten_to_data_type(const at::ScalarType& scalar_type);
at::ScalarType data_type_to_aten(const DataType& data_type);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ValType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const DataType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ExprType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const UnaryOpType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const BinaryOpType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const TernaryOpType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ParallelType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const MemoryType);
TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const IterType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ValType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const DataType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ExprType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const UnaryOpType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const BinaryOpType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const TernaryOpType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ParallelType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const MemoryType);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const IterType);
std::string stringifyThreadSize(const ParallelType);
std::string stringifyThread(const ParallelType);
TORCH_CUDA_API c10::optional<std::string> inline_op_str(const UnaryOpType);
TORCH_CUDA_API c10::optional<std::string> inline_op_str(const BinaryOpType);
TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(const UnaryOpType);
TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(const BinaryOpType);
TORCH_CUDA_API c10::optional<std::string> cast_func_str(
TORCH_CUDA_CU_API c10::optional<std::string> cast_func_str(
const std::pair<DataType, DataType>&);
size_t dataTypeSize(DataType type);
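
A hedged sketch of how the exported stream operators and inline_op_str above might be used for debug messages. DataType::Half and DataType::Bool appear elsewhere in this diff; BinaryOpType::Add and the resulting string are assumptions, and the code assumes it sits inside (or uses) the fuser's namespace.

#include <sstream>

// Illustrative only.
std::string describeAdd() {
  std::ostringstream msg;
  msg << DataType::Half << " + " << DataType::Bool;  // operator<< overloads above
  if (auto op = inline_op_str(BinaryOpType::Add)) {  // assumed to yield "+"
    msg << "  (inline form: '" << *op << "')";
  }
  return msg.str();
}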

View File

@ -17,14 +17,14 @@ namespace jit {
namespace fuser {
namespace cuda {
TORCH_CUDA_API void getMajorMinor(
TORCH_CUDA_CU_API void getMajorMinor(
const cudaDeviceProp* const prop,
int& major,
int& minor);
// A class holding metadata for an actual CUDA function.
// Note: CUDA functions are per device.
struct TORCH_CUDA_API FusedKernelCUDA
struct TORCH_CUDA_CU_API FusedKernelCUDA
: public ::torch::jit::fuser::FusedKernel {
FusedKernelCUDA(
at::DeviceIndex device,

View File

@ -185,7 +185,7 @@ class CudaPrinter : public IRPrinter {
// Construct Cuda C from the buffer and tensor input, and invoke the kernel
// when real arguments are provided.
class TORCH_CUDA_API CudaCodeGen : public CodeGen {
class TORCH_CUDA_CU_API CudaCodeGen : public CodeGen {
public:
template <typename... Ts>
CudaCodeGen(Stmt* stmt, Ts... ts)