Introducing TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API to the code (#50627)
Summary: Sub-step of my attempt to split up the torch_cuda library, as it is huge. Please look at https://github.com/pytorch/pytorch/issues/49050 for details on the split and which files are in which target. This PR introduces two new macros for Windows DLL purposes, TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API. Both are defined as TORCH_CUDA_API for the time being.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/50627

Reviewed By: mruberry

Differential Revision: D25955441

Pulled By: janeyx99

fbshipit-source-id: ff226026833b8fb2fb7c77df6f2d6c824f006869
parent 3aed177484
commit 533cb9530e
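For orientation before the hunks: the change is mechanical, since both new macros are interchangeable with the old one for now. Below is a minimal sketch of what the interim definitions amount to. The two alias lines follow directly from the summary above; the surrounding Windows import/export guard (the TORCH_CUDA_BUILD_MAIN_LIB guard name and the C10_EXPORT/C10_IMPORT helpers) is an illustrative assumption about the DLL setup, not a quote of the patch.

// Sketch only - assumed shape of the interim definitions (Export.h-style).
// Assumption: TORCH_CUDA_BUILD_MAIN_LIB is defined only while compiling the
// torch_cuda library itself, flipping the macro between export and import.
#ifdef _WIN32
#ifdef TORCH_CUDA_BUILD_MAIN_LIB
#define TORCH_CUDA_API C10_EXPORT // building torch_cuda: dllexport
#else
#define TORCH_CUDA_API C10_IMPORT // consuming torch_cuda: dllimport
#endif
#else
#define TORCH_CUDA_API C10_EXPORT // non-Windows: default visibility
#endif

// Introduced by this PR: plain aliases for the time being, so adopting the
// split names in headers is behavior-preserving until the library split lands.
#define TORCH_CUDA_CPP_API TORCH_CUDA_API
#define TORCH_CUDA_CU_API TORCH_CUDA_API

In the hunks that follow, TORCH_CUDA_CPP_API marks declarations in plain C++ translation units (generator, context, event, future, graph), while TORCH_CUDA_CU_API marks declarations compiled into CUDA (.cu) translation units (cuDNN descriptors, sparse, THC); many declarations were also rewrapped by clang-format in the same change.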
@@ -119,7 +119,7 @@ struct PhiloxCudaState {
   bool captured_ = false;
 };

-struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
+struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   // Constructors
   CUDAGeneratorImpl(DeviceIndex device_index = -1);
   ~CUDAGeneratorImpl() = default;

@@ -155,10 +155,10 @@ private:
 namespace cuda {
 namespace detail {

-TORCH_CUDA_API const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1);
-TORCH_CUDA_API Generator createCUDAGenerator(DeviceIndex device_index = -1);
+TORCH_CUDA_CPP_API const Generator& getDefaultCUDAGenerator(
+    DeviceIndex device_index = -1);
+TORCH_CUDA_CPP_API Generator createCUDAGenerator(DeviceIndex device_index = -1);

 } // namespace detail
 } // namespace cuda
 } // namespace at

@@ -6,4 +6,4 @@

 #include <c10/macros/Export.h>

-// Use TORCH_CUDA_API for exports from this folder
+// Use TORCH_CUDA_CPP_API or TORCH_CUDA_CU_API for exports from this folder

@@ -56,22 +56,24 @@ inline bool is_available() {
   return c10::cuda::device_count() > 0;
 }

-TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
+TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties();

-TORCH_CUDA_API int warp_size();
+TORCH_CUDA_CPP_API int warp_size();

-TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
+TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(int64_t device);

-TORCH_CUDA_API bool canDeviceAccessPeer(int64_t device, int64_t peer_device);
+TORCH_CUDA_CPP_API bool canDeviceAccessPeer(
+    int64_t device,
+    int64_t peer_device);

-TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
+TORCH_CUDA_CPP_API Allocator* getCUDADeviceAllocator();

 /* Handles */
-TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
-TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
+TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
+TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();

 #ifdef CUDART_VERSION
-TORCH_CUDA_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
+TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
 #endif

 } // namespace cuda

@@ -24,7 +24,7 @@ namespace at { namespace cuda {
  * called before the event is ever recorded, it will use the current device.
  * Later streams that record the event must match this device.
  */
-struct TORCH_CUDA_API CUDAEvent {
+struct TORCH_CUDA_CPP_API CUDAEvent {
   // Constructors
   // Default value for `flags` is specified below - it's cudaEventDisableTiming
   CUDAEvent() {}

@@ -21,7 +21,7 @@

 namespace at { namespace cuda {

-struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future {
+struct TORCH_CUDA_CPP_API CUDAFuture : at::ivalue::Future {
  public:
   using at::ivalue::Future::Future;

@@ -6,7 +6,7 @@
 namespace at {
 namespace cuda {

-struct TORCH_CUDA_API CUDAGraph {
+struct TORCH_CUDA_CPP_API CUDAGraph {
   CUDAGraph();
   ~CUDAGraph();

@@ -4,6 +4,5 @@

 namespace at { namespace cuda {

-TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
-
+TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator();
 }} // namespace at::cuda

@@ -8,7 +8,7 @@ namespace at {
 namespace cuda {
 namespace detail {

-TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
+TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::Tensor& t);
 using at::native::canUse32BitIndexMath;

 template <typename scalar, typename IndexType>

@@ -105,6 +105,5 @@ extern "C" typedef struct NVRTC {
 #undef CREATE_MEMBER
 } NVRTC;

-extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
-
+extern "C" TORCH_CUDA_CPP_API NVRTC* load_nvrtc();
 }} // at::cuda

@@ -80,9 +80,8 @@ struct DescriptorDeleter {
 // initialized the first time you call set() or any other initializing
 // function.
 template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class TORCH_CUDA_API Descriptor
-{
-public:
+class TORCH_CUDA_CU_API Descriptor {
+ public:
   // TODO: Figure out why const-correctness doesn't work here

   // Use desc() to access the underlying descriptor pointer in

@@ -109,12 +108,11 @@ private:
   std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };

-class TORCH_CUDA_API TensorDescriptor
-    : public Descriptor<cudnnTensorStruct,
+class TORCH_CUDA_CU_API TensorDescriptor : public Descriptor<
+    cudnnTensorStruct,
     &cudnnCreateTensorDescriptor,
-    &cudnnDestroyTensorDescriptor>
-{
-public:
+    &cudnnDestroyTensorDescriptor> {
+ public:
   TensorDescriptor() {}
   explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
     set(t, pad);

@@ -149,12 +147,11 @@ private:

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);

-class TORCH_CUDA_API FilterDescriptor
-    : public Descriptor<cudnnFilterStruct,
+class TORCH_CUDA_CU_API FilterDescriptor : public Descriptor<
+    cudnnFilterStruct,
     &cudnnCreateFilterDescriptor,
-    &cudnnDestroyFilterDescriptor>
-{
-public:
+    &cudnnDestroyFilterDescriptor> {
+ public:
   void set(const at::Tensor &t, int64_t pad = 0, bool force_nhwc = false);

   void print();

@@ -166,11 +163,11 @@ private:

 std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d);

-struct TORCH_CUDA_API ConvolutionDescriptor
-    : public Descriptor<cudnnConvolutionStruct,
+struct TORCH_CUDA_CU_API ConvolutionDescriptor
+    : public Descriptor<
+    cudnnConvolutionStruct,
     &cudnnCreateConvolutionDescriptor,
-    &cudnnDestroyConvolutionDescriptor>
-{
+    &cudnnDestroyConvolutionDescriptor> {
   void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool allow_tf32) {
     cudnnDataType_t mathType = dataType;
     if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;

@@ -189,21 +186,21 @@ struct TORCH_CUDA_API ConvolutionDescriptor
   }
 };

-struct TORCH_CUDA_API SpatialTransformerDescriptor
-    : public Descriptor<cudnnSpatialTransformerStruct,
+struct TORCH_CUDA_CU_API SpatialTransformerDescriptor
+    : public Descriptor<
+    cudnnSpatialTransformerStruct,
     &cudnnCreateSpatialTransformerDescriptor,
-    &cudnnDestroySpatialTransformerDescriptor>
-{
+    &cudnnDestroySpatialTransformerDescriptor> {
   void set(cudnnDataType_t dataType, int dim, int* size) {
     AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size));
   }
 };

-struct TORCH_CUDA_API DropoutDescriptor
-    : public Descriptor<cudnnDropoutStruct,
+struct TORCH_CUDA_CU_API DropoutDescriptor
+    : public Descriptor<
+    cudnnDropoutStruct,
     &cudnnCreateDropoutDescriptor,
-    &cudnnDestroyDropoutDescriptor>
-{
+    &cudnnDestroyDropoutDescriptor> {
   at::Tensor state;

   // Initialize a dropout descriptor's RNG state.

@@ -238,11 +235,10 @@ struct TORCH_CUDA_API DropoutDescriptor
   }
 };

-struct TORCH_CUDA_API RNNDescriptor
-    : public Descriptor<cudnnRNNStruct,
+struct TORCH_CUDA_CU_API RNNDescriptor : public Descriptor<
+    cudnnRNNStruct,
     &cudnnCreateRNNDescriptor,
-    &cudnnDestroyRNNDescriptor>
-{
+    &cudnnDestroyRNNDescriptor> {
   DropoutDescriptor dropout_desc_;
   void set(cudnnHandle_t handle, int hidden_size, int proj_size, int num_layers, DropoutDescriptor&& dropout_desc,
       cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional,

@@ -286,11 +282,11 @@ struct TORCH_CUDA_API RNNDescriptor
   }
 };

-struct TORCH_CUDA_API CTCLossDescriptor
-    : public Descriptor<cudnnCTCLossStruct,
+struct TORCH_CUDA_CU_API CTCLossDescriptor
+    : public Descriptor<
+    cudnnCTCLossStruct,
     &cudnnCreateCTCLossDescriptor,
-    &cudnnDestroyCTCLossDescriptor>
-{
+    &cudnnDestroyCTCLossDescriptor> {
   void set(cudnnDataType_t datatype) {
     AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
   }

@@ -5,6 +5,5 @@

 namespace at { namespace native {

-TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
-
+TORCH_CUDA_CU_API cudnnHandle_t getCudnnHandle();
 }} // namespace at::native

@@ -5,7 +5,8 @@

 namespace at { namespace native {

-TORCH_CUDA_API cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
+TORCH_CUDA_CU_API cudnnDataType_t
+getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
 cudnnDataType_t getCudnnDataType(const at::Tensor& tensor);

 int64_t cudnn_version();

@@ -774,7 +774,8 @@ namespace {
 // Utilities exposed in RNNUtils.h
 namespace cudnn_rnn {

-TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_views(
+TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
+copy_weights_to_flat_buf_views(
     TensorList weight_arr,
     int64_t weight_stride0,
     int64_t input_size,

@@ -787,17 +788,27 @@ namespace cudnn_rnn {
     const cudnnDataType_t flat_buf_datatype,
     const TensorOptions& flat_buf_options,
     bool set_orig_weights_to_flat_buf,
-    bool allow_type_change/*=false*/,
-    bool include_bias/*=true*/) {
-  // flat_buf_datatype is accepted as a separate argument (rather than extracted from flat_buf_options)
-  // because to extract flat_buf_datatype from flat_buf_options, we'd need to say
-  // auto flat_buf_datatype = getCudnnDataTypeFromScalarType(typeMetaToScalarType(options.dtype()));
-  // typeMetaToScalarType is a surprisingly nontrivial function. We should avoid it if we can.
-  TORCH_CHECK(weight_arr.size() > 0,
+    bool allow_type_change /*=false*/,
+    bool include_bias /*=true*/) {
+  // flat_buf_datatype is accepted as a separate argument (rather than extracted
+  // from flat_buf_options) because to extract flat_buf_datatype from
+  // flat_buf_options, we'd need to say auto flat_buf_datatype =
+  // getCudnnDataTypeFromScalarType(typeMetaToScalarType(options.dtype()));
+  // typeMetaToScalarType is a surprisingly nontrivial function. We should
+  // avoid it if we can.
+  TORCH_CHECK(
+      weight_arr.size() > 0,
       "copy_weights_to_flat_buf_views: cannot flatten empty weight list");

   RNNDescriptorParams rnn;
-  rnn.set(mode, hidden_size, proj_size, num_layers, bidirectional, promote_rnn_math_type(flat_buf_datatype), flat_buf_datatype);
+  rnn.set(
+      mode,
+      hidden_size,
+      proj_size,
+      num_layers,
+      bidirectional,
+      promote_rnn_math_type(flat_buf_datatype),
+      flat_buf_datatype);

   auto handle = getCudnnHandle();
   RNNDescriptor rnn_desc = rnn.descriptor(handle);

@@ -809,7 +820,8 @@ namespace cudnn_rnn {
   // expects descriptors padded to 3 dimensions.
   x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5);

-  auto num_weights = get_num_weights(handle, rnn_desc, x_desc, flat_buf_datatype);
+  auto num_weights =
+      get_num_weights(handle, rnn_desc, x_desc, flat_buf_datatype);
   auto weight_buf = at::zeros(num_weights, flat_buf_options);

   FilterDescriptor w_desc;

@@ -818,7 +830,8 @@ namespace cudnn_rnn {
   // Slice off views into weight_buf
   std::vector<Tensor> params_arr;
   size_t params_stride0;
-  std::tie(params_arr, params_stride0) = get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias);
+  std::tie(params_arr, params_stride0) = get_parameters(
+      handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias);
   MatrixRef<Tensor> weight{weight_arr, static_cast<size_t>(weight_stride0)},
       params{params_arr, params_stride0};

@@ -834,8 +847,10 @@ namespace cudnn_rnn {
       weight[i][1].set_(params[i][1].view_as(weight[i][1]));
       weight[i][2].set_(params[i][4].view_as(weight[i][2]));
     } else {
-      for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin();
-           orig_param_it != weight[i].end() && new_param_it != params[i].end();
+      for (auto orig_param_it = weight[i].begin(),
+                new_param_it = params[i].begin();
+           orig_param_it != weight[i].end() &&
+               new_param_it != params[i].end();
            orig_param_it++, new_param_it++) {
         auto orig_param = *orig_param_it, new_param = *new_param_it;
         orig_param.set_(new_param.view_as(orig_param));

@@ -845,7 +860,7 @@ namespace cudnn_rnn {
   }

   return std::make_tuple(weight_buf, params_arr);
-  }
+}

 } // namespace cudnn_rnn

@@ -8,7 +8,8 @@ namespace at {
 namespace native {
 namespace cudnn_rnn {

-TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_views(
+TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
+copy_weights_to_flat_buf_views(
     TensorList weight_arr,
     int64_t weight_stride0,
     int64_t input_size,

@@ -21,8 +22,8 @@ TORCH_CUDA_API std::tuple<Tensor, std::vector<Tensor>> copy_weights_to_flat_buf_
     const cudnnDataType_t flat_buf_datatype,
     const TensorOptions& flat_buf_options,
     bool set_orig_weights_to_flat_buf,
-    bool allow_type_change=false,
-    bool include_bias=true);
+    bool allow_type_change = false,
+    bool include_bias = true);

 } // namespace cudnn_rnn
 } // namespace native

@@ -4,17 +4,61 @@

 namespace at { namespace native { namespace sparse { namespace cuda {

-TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
+TORCH_CUDA_CU_API void Xcoo2csr(
+    const int* coorowind,
+    int64_t nnz,
+    int64_t m,
+    int* csrrowptr);

 /* Level 3 */
-template<typename T>
-TORCH_CUDA_API void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, T alpha, T *csrvala, int *csrrowptra, int *csrcolinda, T *b, int64_t ldb, T beta, T *c, int64_t ldc);
+template <typename T>
+TORCH_CUDA_CU_API void csrmm2(
+    char transa,
+    char transb,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    int64_t nnz,
+    T alpha,
+    T* csrvala,
+    int* csrrowptra,
+    int* csrcolinda,
+    T* b,
+    int64_t ldb,
+    T beta,
+    T* c,
+    int64_t ldc);

 /* format conversion */
-TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
-TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
-TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
-TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
-TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
-
+TORCH_CUDA_CU_API void CreateIdentityPermutation(int64_t nnz, int* P);
+TORCH_CUDA_CU_API void Xcsrsort_bufferSizeExt(
+    int64_t m,
+    int64_t n,
+    int64_t nnz,
+    const int* csrRowPtr,
+    const int* csrColInd,
+    size_t* pBufferSizeInBytes);
+TORCH_CUDA_CU_API void Xcsrsort(
+    int64_t m,
+    int64_t n,
+    int64_t nnz,
+    const int* csrRowPtr,
+    int* csrColInd,
+    int* P,
+    void* pBuffer);
+TORCH_CUDA_CU_API void Xcoosort_bufferSizeExt(
+    int64_t m,
+    int64_t n,
+    int64_t nnz,
+    const int* cooRows,
+    const int* cooCols,
+    size_t* pBufferSizeInBytes);
+TORCH_CUDA_CU_API void XcoosortByRow(
+    int64_t m,
+    int64_t n,
+    int64_t nnz,
+    int* cooRows,
+    int* cooCols,
+    int* P,
+    void* pBuffer);
 }}}} // namespace at::native::sparse::cuda

@@ -5,7 +5,7 @@

 // IPC doesn't support (re)allocation

-class TORCH_CUDA_API THCIpcDeleter {
+class TORCH_CUDA_CU_API THCIpcDeleter {
  public:
   THCIpcDeleter(std::shared_ptr<void> basePtr);
   ~THCIpcDeleter();

@@ -21,13 +21,14 @@
 // Note that this allocator does not split larger allocations into smaller
 // blocks, unlike the caching device allocator.
 //
-TORCH_CUDA_API c10::Allocator* getTHCCachingHostAllocator(void);
+TORCH_CUDA_CU_API c10::Allocator* getTHCCachingHostAllocator(void);

 // Records an event in the specified stream. The allocation 'ptr' will not be
 // re-used until the event has occurred.
-TORCH_CUDA_API cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, at::cuda::CUDAStream stream);
+TORCH_CUDA_CU_API cudaError_t
+THCCachingHostAllocator_recordEvent(void* ptr, at::cuda::CUDAStream stream);

 // Releases cached pinned memory allocations via cudaHostFree
-TORCH_CUDA_API void THCCachingHostAllocator_emptyCache(void);
+TORCH_CUDA_CU_API void THCCachingHostAllocator_emptyCache(void);

 #endif

@@ -31,22 +31,22 @@ typedef struct _THCCudaResourcesPerDevice {
   size_t scratchSpacePerStream;
 } THCCudaResourcesPerDevice;

-TORCH_CUDA_API THCState* THCState_alloc(void);
-TORCH_CUDA_API void THCState_free(THCState* state);
+TORCH_CUDA_CU_API THCState* THCState_alloc(void);
+TORCH_CUDA_CU_API void THCState_free(THCState* state);

-TORCH_CUDA_API void THCudaInit(THCState* state);
-TORCH_CUDA_API void THCudaShutdown(THCState* state);
+TORCH_CUDA_CU_API void THCudaInit(THCState* state);
+TORCH_CUDA_CU_API void THCudaShutdown(THCState* state);

 /* If device `dev` can access allocations on device `devToAccess`, this will return */
 /* 1; otherwise, 0. */
-TORCH_CUDA_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
+TORCH_CUDA_CU_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);

-TORCH_CUDA_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);
+TORCH_CUDA_CU_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);

-TORCH_CUDA_API void THCMagma_init(THCState *state);
+TORCH_CUDA_CU_API void THCMagma_init(THCState *state);

 /* For the current device and stream, returns the allocated scratch space */
-TORCH_CUDA_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
+TORCH_CUDA_CU_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);

 #define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs")
 #define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)

@@ -54,16 +54,16 @@ TORCH_CUDA_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
 #define THCublasCheck(err) __THCublasCheck(err,  __FILE__, __LINE__)
 #define THCusparseCheck(err) __THCusparseCheck(err,  __FILE__, __LINE__)

-TORCH_CUDA_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
-TORCH_CUDA_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
-TORCH_CUDA_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
-TORCH_CUDA_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
+TORCH_CUDA_CU_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
+TORCH_CUDA_CU_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
+TORCH_CUDA_CU_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+TORCH_CUDA_CU_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);

-TORCH_CUDA_API void* THCudaMalloc(THCState *state, size_t size);
-TORCH_CUDA_API void THCudaFree(THCState *state, void* ptr);
+TORCH_CUDA_CU_API void* THCudaMalloc(THCState *state, size_t size);
+TORCH_CUDA_CU_API void THCudaFree(THCState *state, void* ptr);

 at::DataPtr THCudaHostAlloc(THCState *state, size_t size);

-TORCH_CUDA_API void THCudaHostRecord(THCState *state, void *ptr);
+TORCH_CUDA_CU_API void THCudaHostRecord(THCState *state, void *ptr);

 #endif

@@ -147,6 +147,6 @@ __device__ T reduceBlockWithNThreadLocalReductions(T *smem,
 void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);

 // Produces a grid with at least one point per tile
-TORCH_CUDA_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
+TORCH_CUDA_CU_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);

 #endif // THC_REDUCE_APPLY_UTILS_INC

@@ -5,6 +5,6 @@
 #include <time.h>

 // enqueues a kernel that spins for the specified number of cycles
-TORCH_CUDA_API void THC_sleep(THCState* state, int64_t cycles);
+TORCH_CUDA_CU_API void THC_sleep(THCState* state, int64_t cycles);

 #endif

@@ -13,17 +13,19 @@
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>

-TORCH_CUDA_API THCStorage* THCStorage_new(THCState* state);
+TORCH_CUDA_CU_API THCStorage* THCStorage_new(THCState* state);

-TORCH_CUDA_API void THCStorage_retain(THCState *state, THCStorage *storage);
+TORCH_CUDA_CU_API void THCStorage_retain(THCState* state, THCStorage* storage);

-TORCH_CUDA_API void THCStorage_resizeBytes(
+TORCH_CUDA_CU_API void THCStorage_resizeBytes(
     THCState* state,
     THCStorage* storage,
     ptrdiff_t size_bytes);
-TORCH_CUDA_API int THCStorage_getDevice(THCState* state, const THCStorage* storage);
+TORCH_CUDA_CU_API int THCStorage_getDevice(
+    THCState* state,
+    const THCStorage* storage);

-TORCH_CUDA_API THCStorage* THCStorage_newWithDataAndAllocator(
+TORCH_CUDA_CU_API THCStorage* THCStorage_newWithDataAndAllocator(
     THCState* state,
     at::DataPtr&& data,
     ptrdiff_t size,

@@ -9,8 +9,7 @@

 #define THC_DESC_BUFF_LEN 64

-typedef struct TORCH_CUDA_API THCDescBuff
-{
+typedef struct TORCH_CUDA_CU_API THCDescBuff {
   char str[THC_DESC_BUFF_LEN];
 } THCDescBuff;

@@ -12,46 +12,110 @@
 #include <ATen/ATen.h>

 // See [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
-TORCH_CUDA_API int THCTensor_nDimension(THCState *state, const THCTensor *self);
-TORCH_CUDA_API int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self);
-TORCH_CUDA_API int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self);
+TORCH_CUDA_CU_API int THCTensor_nDimension(
+    THCState* state,
+    const THCTensor* self);
+TORCH_CUDA_CU_API int THCTensor_nDimensionLegacyNoScalars(
+    THCState* state,
+    const THCTensor* self);
+TORCH_CUDA_CU_API int THCTensor_nDimensionLegacyAll(
+    THCState* state,
+    const THCTensor* self);

-TORCH_CUDA_API int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim);
-TORCH_CUDA_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor *self, int dim);
-TORCH_CUDA_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim);
-TORCH_CUDA_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim);
+TORCH_CUDA_CU_API int64_t
+THCTensor_size(THCState* state, const THCTensor* self, int dim);
+TORCH_CUDA_CU_API int64_t
+THCTensor_sizeLegacyNoScalars(THCState* state, const THCTensor* self, int dim);
+TORCH_CUDA_CU_API int64_t
+THCTensor_stride(THCState* state, const THCTensor* self, int dim);
+TORCH_CUDA_CU_API int64_t THCTensor_strideLegacyNoScalars(
+    THCState* state,
+    const THCTensor* self,
+    int dim);

-TORCH_CUDA_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta);
+TORCH_CUDA_CU_API THCTensor* THCTensor_new(
+    THCState* state,
+    caffe2::TypeMeta type_meta);

-TORCH_CUDA_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntArrayRef size, at::IntArrayRef stride);
-TORCH_CUDA_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
-TORCH_CUDA_API void THCTensor_resizeAs(THCState *state, THCTensor *tensor, THCTensor *src);
+TORCH_CUDA_CU_API void THCTensor_resize(
+    THCState* state,
+    THCTensor* tensor,
+    at::IntArrayRef size,
+    at::IntArrayRef stride);
+TORCH_CUDA_CU_API void THCTensor_resizeNd(
+    THCState* state,
+    THCTensor* tensor,
+    int nDimension,
+    const int64_t* size,
+    const int64_t* stride);
+TORCH_CUDA_CU_API void THCTensor_resizeAs(
+    THCState* state,
+    THCTensor* tensor,
+    THCTensor* src);

-TORCH_CUDA_API void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src);
-TORCH_CUDA_API void THCTensor_setStorage(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, at::IntArrayRef size_, at::IntArrayRef stride_);
+TORCH_CUDA_CU_API void THCTensor_set(
+    THCState* state,
+    THCTensor* self,
+    THCTensor* src);
+TORCH_CUDA_CU_API void THCTensor_setStorage(
+    THCState* state,
+    THCTensor* self,
+    THCStorage* storage_,
+    ptrdiff_t storageOffset_,
+    at::IntArrayRef size_,
+    at::IntArrayRef stride_);

-TORCH_CUDA_API void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
-TORCH_CUDA_API void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
+TORCH_CUDA_CU_API void THCTensor_squeeze1d(
+    THCState* state,
+    THCTensor* self,
+    THCTensor* src,
+    int dimension_);
+TORCH_CUDA_CU_API void THCTensor_unsqueeze1d(
+    THCState* state,
+    THCTensor* self,
+    THCTensor* src,
+    int dimension_);

-TORCH_CUDA_API bool THCTensor_allContiguous(THCState *state, THCTensor **inputs, int numInputs);
-TORCH_CUDA_API ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self);
+TORCH_CUDA_CU_API bool THCTensor_allContiguous(
+    THCState* state,
+    THCTensor** inputs,
+    int numInputs);
+TORCH_CUDA_CU_API ptrdiff_t
+THCTensor_nElement(THCState* state, const THCTensor* self);

-TORCH_CUDA_API void THCTensor_retain(THCState *state, THCTensor *self);
-TORCH_CUDA_API void THCTensor_free(THCState *state, THCTensor *self);
+TORCH_CUDA_CU_API void THCTensor_retain(THCState* state, THCTensor* self);
+TORCH_CUDA_CU_API void THCTensor_free(THCState* state, THCTensor* self);

-TORCH_CUDA_API int THCTensor_getDevice(THCState* state, const THCTensor* tensor);
-TORCH_CUDA_API bool THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs);
+TORCH_CUDA_CU_API int THCTensor_getDevice(
+    THCState* state,
+    const THCTensor* tensor);
+TORCH_CUDA_CU_API bool THCTensor_allSameDevice(
+    THCState* state,
+    THCTensor** inputs,
+    int numInputs);

 /* Can we use 32 bit math for indexing? */
-TORCH_CUDA_API bool THCTensor_canUse32BitIndexMath(THCState* state, const THCTensor* t, ptrdiff_t max_elem=INT32_MAX);
+TORCH_CUDA_CU_API bool THCTensor_canUse32BitIndexMath(
+    THCState* state,
+    const THCTensor* t,
+    ptrdiff_t max_elem = INT32_MAX);
 /* Are all tensors 32-bit indexable? */
-TORCH_CUDA_API bool THCTensor_all32BitIndexable(THCState* state, THCTensor** inputs, int numInputs);
-TORCH_CUDA_API void THCTensor_preserveReduceDimSemantics(THCState *state, THCTensor *tensor, int in_dims,
-    int64_t dimension, int keepdim);
+TORCH_CUDA_CU_API bool THCTensor_all32BitIndexable(
+    THCState* state,
+    THCTensor** inputs,
+    int numInputs);
+TORCH_CUDA_CU_API void THCTensor_preserveReduceDimSemantics(
+    THCState* state,
+    THCTensor* tensor,
+    int in_dims,
+    int64_t dimension,
+    int keepdim);
 /* Returns false if there is no possibility that the tensor */
 /* has more than one index that references the same datapoint, */
 /* true otherwise. */
-TORCH_CUDA_API bool THCTensor_maybeOverlappingIndices(THCState* state, const THCTensor* t);
+TORCH_CUDA_CU_API bool THCTensor_maybeOverlappingIndices(
+    THCState* state,
+    const THCTensor* t);

 #include <THC/generic/THCTensor.hpp>
 #include <THC/THCGenerateAllTypes.h>

@@ -19,34 +19,51 @@
 #define THCudaComplexFloatStorage THCStorage
 #define THCudaComplexDoubleStorage THCStorage

-TORCH_CUDA_API scalar_t* THCStorage_(data)(THCState *state, const THCStorage*);
-TORCH_CUDA_API int THCStorage_(elementSize)(THCState *state);
+TORCH_CUDA_CU_API scalar_t* THCStorage_(
+    data)(THCState* state, const THCStorage*);
+TORCH_CUDA_CU_API int THCStorage_(elementSize)(THCState* state);

 /* slow access -- checks everything */
-TORCH_CUDA_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, scalar_t);
-TORCH_CUDA_API scalar_t THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t);
+TORCH_CUDA_CU_API void THCStorage_(
+    set)(THCState* state, THCStorage*, ptrdiff_t, scalar_t);
+TORCH_CUDA_CU_API scalar_t
+THCStorage_(get)(THCState* state, const THCStorage*, ptrdiff_t);

-TORCH_CUDA_API THCStorage* THCStorage_(new)(THCState *state);
-TORCH_CUDA_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size);
-TORCH_CUDA_API THCStorage* THCStorage_(newWithSize1)(THCState *state, scalar_t);
-TORCH_CUDA_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared);
+TORCH_CUDA_CU_API THCStorage* THCStorage_(new)(THCState* state);
+TORCH_CUDA_CU_API THCStorage* THCStorage_(
+    newWithSize)(THCState* state, ptrdiff_t size);
+TORCH_CUDA_CU_API THCStorage* THCStorage_(
+    newWithSize1)(THCState* state, scalar_t);
+TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithMapping)(
+    THCState* state,
+    const char* filename,
+    ptrdiff_t size,
+    int shared);

-TORCH_CUDA_API THCStorage* THCStorage_(newWithAllocator)(
-    THCState *state, ptrdiff_t size,
+TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithAllocator)(
+    THCState* state,
+    ptrdiff_t size,
     at::Allocator* allocator);
-TORCH_CUDA_API THCStorage* THCStorage_(newWithDataAndAllocator)(
-    THCState *state, at::DataPtr&& data, ptrdiff_t size,
+TORCH_CUDA_CU_API THCStorage* THCStorage_(newWithDataAndAllocator)(
+    THCState* state,
+    at::DataPtr&& data,
+    ptrdiff_t size,
     at::Allocator* allocator);

-TORCH_CUDA_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag);
-TORCH_CUDA_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag);
-TORCH_CUDA_API void THCStorage_(retain)(THCState *state, THCStorage *storage);
+TORCH_CUDA_CU_API void THCStorage_(
+    setFlag)(THCState* state, THCStorage* storage, const char flag);
+TORCH_CUDA_CU_API void THCStorage_(
+    clearFlag)(THCState* state, THCStorage* storage, const char flag);
+TORCH_CUDA_CU_API void THCStorage_(
+    retain)(THCState* state, THCStorage* storage);

-TORCH_CUDA_API void THCStorage_(free)(THCState *state, THCStorage *storage);
-TORCH_CUDA_API void THCStorage_(
+TORCH_CUDA_CU_API void THCStorage_(free)(THCState* state, THCStorage* storage);
+TORCH_CUDA_CU_API void THCStorage_(
     resizeBytes)(THCState* state, THCStorage* storage, ptrdiff_t size_bytes);
-TORCH_CUDA_API void THCStorage_(fill)(THCState *state, THCStorage *storage, scalar_t value);
+TORCH_CUDA_CU_API void THCStorage_(
+    fill)(THCState* state, THCStorage* storage, scalar_t value);

-TORCH_CUDA_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage);
+TORCH_CUDA_CU_API int THCStorage_(
+    getDevice)(THCState* state, const THCStorage* storage);

 #endif

@@ -4,57 +4,146 @@

 /* Support for copy between different Storage types */

-TORCH_CUDA_API void THCStorage_(copy)(THCState *state, THCStorage *storage, THCStorage *src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copy)(THCState* state, THCStorage* storage, THCStorage* src);
 #if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
-TORCH_CUDA_API void THCStorage_(copyByte)(THCState *state, THCStorage *storage, struct THByteStorage *src);
-TORCH_CUDA_API void THCStorage_(copyChar)(THCState *state, THCStorage *storage, struct THCharStorage *src);
-TORCH_CUDA_API void THCStorage_(copyShort)(THCState *state, THCStorage *storage, struct THShortStorage *src);
-TORCH_CUDA_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct THIntStorage *src);
-TORCH_CUDA_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src);
-TORCH_CUDA_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src);
-TORCH_CUDA_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src);
-TORCH_CUDA_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src);
-TORCH_CUDA_API void THCStorage_(copyBool)(THCState *state, THCStorage *storage, struct THBoolStorage *src);
-TORCH_CUDA_API void THCStorage_(copyBFloat16)(THCState *state, THCStorage *storage, struct THBFloat16Storage *src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyByte)(THCState* state, THCStorage* storage, struct THByteStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyChar)(THCState* state, THCStorage* storage, struct THCharStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyShort)(
+    THCState* state,
+    THCStorage* storage,
+    struct THShortStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyInt)(THCState* state, THCStorage* storage, struct THIntStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyLong)(THCState* state, THCStorage* storage, struct THLongStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyFloat)(
+    THCState* state,
+    THCStorage* storage,
+    struct THFloatStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyDouble)(
+    THCState* state,
+    THCStorage* storage,
+    struct THDoubleStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyHalf)(THCState* state, THCStorage* storage, struct THHalfStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyBool)(THCState* state, THCStorage* storage, struct THBoolStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyBFloat16)(
+    THCState* state,
+    THCStorage* storage,
+    struct THBFloat16Storage* src);
 #else
-TORCH_CUDA_API void THCStorage_(copyComplexFloat)(THCState *state, THCStorage *storage, struct THComplexFloatStorage *src);
-TORCH_CUDA_API void THCStorage_(copyComplexDouble)(THCState *state, THCStorage *storage, struct THComplexDoubleStorage *src);
+TORCH_CUDA_CU_API void THCStorage_(copyComplexFloat)(
+    THCState* state,
+    THCStorage* storage,
+    struct THComplexFloatStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyComplexDouble)(
+    THCState* state,
+    THCStorage* storage,
+    struct THComplexDoubleStorage* src);
 #endif

 #if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
-TORCH_CUDA_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaShort)(THCState *state, THCStorage *storage, struct THCudaShortStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, struct THCudaIntStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaBool)(THCState *state, THCStorage *storage, struct THCudaBoolStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaBFloat16)(THCState *state, THCStorage *storage, struct THCudaBFloat16Storage *src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaByte)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaByteStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaChar)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaCharStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaShort)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaShortStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaInt)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaIntStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaLong)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaLongStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaFloat)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaDouble)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaDoubleStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaHalf)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaHalfStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaBool)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaBoolStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaBFloat16)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaBFloat16Storage* src);
 #else
-TORCH_CUDA_API void THCStorage_(copyCudaComplexFloat)(THCState *state, THCStorage *storage, struct THCudaComplexFloatStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCudaComplexDouble)(THCState *state, THCStorage *storage, struct THCudaComplexDoubleStorage *src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaComplexFloat)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaComplexFloatStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(copyCudaComplexDouble)(
+    THCState* state,
+    THCStorage* storage,
+    struct THCudaComplexDoubleStorage* src);
 #endif

 #if !defined(THC_REAL_IS_COMPLEXFLOAT) && !defined(THC_REAL_IS_COMPLEXDOUBLE)
-TORCH_CUDA_API void TH_CONCAT_2(THByteStorage_copyCuda , Real)(THCState *state, THByteStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THCharStorage_copyCuda , Real)(THCState *state, THCharStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THShortStorage_copyCuda , Real)(THCState *state, THShortStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THBoolStorage_copyCuda, Real)(THCState *state, THBoolStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THBFloat16Storage_copyCuda, Real)(THCState *state, THBFloat16Storage *self, struct THCStorage *src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THByteStorage_copyCuda,
+    Real)(THCState* state, THByteStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THCharStorage_copyCuda,
+    Real)(THCState* state, THCharStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THShortStorage_copyCuda,
+    Real)(THCState* state, THShortStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THIntStorage_copyCuda,
+    Real)(THCState* state, THIntStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THLongStorage_copyCuda,
+    Real)(THCState* state, THLongStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THFloatStorage_copyCuda,
+    Real)(THCState* state, THFloatStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THDoubleStorage_copyCuda,
+    Real)(THCState* state, THDoubleStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THHalfStorage_copyCuda,
+    Real)(THCState* state, THHalfStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THBoolStorage_copyCuda,
+    Real)(THCState* state, THBoolStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THBFloat16Storage_copyCuda,
+    Real)(THCState* state, THBFloat16Storage* self, struct THCStorage* src);
 #else
-TORCH_CUDA_API void TH_CONCAT_2(THComplexFloatStorage_copyCuda , Real)(THCState *state, THComplexFloatStorage *self, struct THCStorage *src);
-TORCH_CUDA_API void TH_CONCAT_2(THComplexDoubleStorage_copyCuda, Real)(THCState *state, THComplexDoubleStorage *self, struct THCStorage *src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(
+    THComplexFloatStorage_copyCuda,
+    Real)(THCState* state, THComplexFloatStorage* self, struct THCStorage* src);
+TORCH_CUDA_CU_API void TH_CONCAT_2(THComplexDoubleStorage_copyCuda, Real)(
+    THCState* state,
+    THComplexDoubleStorage* self,
+    struct THCStorage* src);
 #endif

-TORCH_CUDA_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src);
-TORCH_CUDA_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src);
+TORCH_CUDA_CU_API void THStorage_(
+    copyCuda)(THCState* state, THStorage* self, THCStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyCuda)(THCState* state, THCStorage* self, THCStorage* src);
+TORCH_CUDA_CU_API void THCStorage_(
+    copyCPU)(THCState* state, THCStorage* self, THStorage* src);

 #endif

@ -20,87 +20,215 @@
|
|||
#define THCudaComplexDoubleTensor THCTensor
|
||||
|
||||
/**** access methods ****/
|
||||
TORCH_CUDA_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self);
|
||||
TORCH_CUDA_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self);
|
||||
TORCH_CUDA_CU_API THCStorage* THCTensor_(
|
||||
storage)(THCState* state, const THCTensor* self);
|
||||
TORCH_CUDA_CU_API ptrdiff_t
|
||||
THCTensor_(storageOffset)(THCState* state, const THCTensor* self);
|
||||
|
||||
// See [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
|
||||
TORCH_CUDA_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self);
|
||||
TORCH_CUDA_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTensor *self);
|
||||
TORCH_CUDA_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self);
|
||||
TORCH_CUDA_CU_API int THCTensor_(
|
||||
nDimension)(THCState* state, const THCTensor* self);
|
||||
TORCH_CUDA_CU_API int THCTensor_(
|
||||
nDimensionLegacyNoScalars)(THCState* state, const THCTensor* self);
|
||||
TORCH_CUDA_CU_API int THCTensor_(
|
||||
nDimensionLegacyAll)(THCState* state, const THCTensor* self);
|
||||
|
||||
TORCH_CUDA_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim);
|
||||
TORCH_CUDA_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim);
|
||||
TORCH_CUDA_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim);
|
||||
TORCH_CUDA_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim);
|
||||
TORCH_CUDA_API scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self);
|
||||
|
||||
TORCH_CUDA_API void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag);
|
||||
TORCH_CUDA_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag);
|
||||
TORCH_CUDA_CU_API int64_t
|
||||
THCTensor_(size)(THCState* state, const THCTensor* self, int dim);
|
||||
TORCH_CUDA_CU_API int64_t THCTensor_(
|
||||
sizeLegacyNoScalars)(THCState* state, const THCTensor* self, int dim);
|
||||
TORCH_CUDA_CU_API int64_t
|
||||
THCTensor_(stride)(THCState* state, const THCTensor* self, int dim);
|
||||
TORCH_CUDA_CU_API int64_t THCTensor_(
|
||||
strideLegacyNoScalars)(THCState* state, const THCTensor* self, int dim);
|
||||
TORCH_CUDA_CU_API scalar_t* THCTensor_(
|
||||
data)(THCState* state, const THCTensor* self);
|
||||
|
||||
TORCH_CUDA_CU_API void THCTensor_(
|
||||
setFlag)(THCState* state, THCTensor* self, const char flag);
|
||||
TORCH_CUDA_CU_API void THCTensor_(
|
||||
clearFlag)(THCState* state, THCTensor* self, const char flag);
|
||||
|
||||
/**** creation methods ****/
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(new)(THCState *state);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
|
||||
int64_t size0_, int64_t stride0_);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(new)(THCState* state);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(
|
||||
newWithTensor)(THCState* state, THCTensor* tensor);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(newWithStorage1d)(
|
||||
THCState* state,
|
||||
THCStorage* storage_,
|
||||
ptrdiff_t storageOffset_,
|
||||
int64_t size0_,
|
||||
int64_t stride0_);
|
||||
|
||||
/* stride might be NULL */
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newWithSize1d)(THCState *state, int64_t size0_);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(
|
||||
newWithSize1d)(THCState* state, int64_t size0_);
|
||||
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newClone)(THCState *state, THCTensor *self);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newContiguous)(THCState *state, THCTensor *tensor);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newSelect)(THCState *state, THCTensor *tensor, int dimension_, int64_t sliceIndex_);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newNarrow)(THCState *state, THCTensor *tensor, int dimension_, int64_t firstIndex_, int64_t size_);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newTranspose)(THCState *state, THCTensor *tensor, int dimension1_, int dimension2_);
|
||||
TORCH_CUDA_API THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(
|
||||
newClone)(THCState* state, THCTensor* self);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(
|
||||
newContiguous)(THCState* state, THCTensor* tensor);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(newSelect)(
|
||||
THCState* state,
|
||||
THCTensor* tensor,
|
||||
int dimension_,
|
||||
int64_t sliceIndex_);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(newNarrow)(
|
||||
THCState* state,
|
||||
THCTensor* tensor,
|
||||
int dimension_,
|
||||
int64_t firstIndex_,
|
||||
int64_t size_);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(newTranspose)(
|
||||
THCState* state,
|
||||
THCTensor* tensor,
|
||||
int dimension1_,
|
||||
int dimension2_);
|
||||
TORCH_CUDA_CU_API THCTensor* THCTensor_(
|
||||
newFoldBatchDim)(THCState* state, THCTensor* input);
|
||||
|
||||
// resize* methods simply resize the storage. So they may not retain the current data at current indices.
|
||||
// This is especially likely to happen when the tensor is not contiguous. In general, if you still need the
|
||||
// values, unless you are doing some size and stride tricks, do not use resize*.
|
||||
TORCH_CUDA_API void THCTensor_(resizeNd)(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride);
|
||||
TORCH_CUDA_API void THCTensor_(resizeAs)(THCState *state, THCTensor *tensor, THCTensor *src);
|
||||
TORCH_CUDA_API void THCTensor_(resize0d)(THCState *state, THCTensor *tensor);
|
||||
TORCH_CUDA_API void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0_);
|
||||
TORCH_CUDA_API void THCTensor_(resize2d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_);
|
||||
TORCH_CUDA_API void THCTensor_(resize3d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_);
|
||||
TORCH_CUDA_API void THCTensor_(resize4d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_);
|
||||
TORCH_CUDA_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, int64_t size0_, int64_t size1_, int64_t size2_, int64_t size3_, int64_t size4_);
|
||||
TORCH_CUDA_CU_API void THCTensor_(resizeNd)(
|
||||
THCState* state,
|
||||
THCTensor* tensor,
|
||||
int nDimension,
|
||||
const int64_t* size,
|
||||
const int64_t* stride);
|
||||
TORCH_CUDA_CU_API void THCTensor_(
|
||||
resizeAs)(THCState* state, THCTensor* tensor, THCTensor* src);
|
||||
TORCH_CUDA_CU_API void THCTensor_(resize0d)(THCState* state, THCTensor* tensor);
|
||||
TORCH_CUDA_CU_API void THCTensor_(
|
||||
resize1d)(THCState* state, THCTensor* tensor, int64_t size0_);
|
||||
TORCH_CUDA_CU_API void THCTensor_(resize2d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_);
TORCH_CUDA_CU_API void THCTensor_(resize3d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_);
TORCH_CUDA_CU_API void THCTensor_(resize4d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_,
int64_t size3_);
TORCH_CUDA_CU_API void THCTensor_(resize5d)(
THCState* state,
THCTensor* tensor,
int64_t size0_,
int64_t size1_,
int64_t size2_,
int64_t size3_,
int64_t size4_);

TORCH_CUDA_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
set)(THCState* state, THCTensor* self, THCTensor* src);

TORCH_CUDA_API void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t firstIndex_, int64_t size_);
TORCH_CUDA_API void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension_, int64_t sliceIndex_);
TORCH_CUDA_API void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1_, int dimension2_);
TORCH_CUDA_CU_API void THCTensor_(narrow)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_,
int64_t firstIndex_,
int64_t size_);
TORCH_CUDA_CU_API void THCTensor_(select)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_,
int64_t sliceIndex_);
TORCH_CUDA_CU_API void THCTensor_(transpose)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension1_,
int dimension2_);

TORCH_CUDA_API void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_API void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension_);
TORCH_CUDA_CU_API void THCTensor_(squeeze1d)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);
TORCH_CUDA_CU_API void THCTensor_(unsqueeze1d)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dimension_);

TORCH_CUDA_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
TORCH_CUDA_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self);
TORCH_CUDA_CU_API int THCTensor_(
isContiguous)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
isSameSizeAs)(THCState* state, const THCTensor* self, const THCTensor* src);
TORCH_CUDA_CU_API ptrdiff_t
THCTensor_(nElement)(THCState* state, const THCTensor* self);

TORCH_CUDA_API void THCTensor_(retain)(THCState *state, THCTensor *self);
TORCH_CUDA_API void THCTensor_(free)(THCState *state, THCTensor *self);
TORCH_CUDA_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst);
TORCH_CUDA_CU_API void THCTensor_(retain)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_(free)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API void THCTensor_(
freeCopyTo)(THCState* state, THCTensor* self, THCTensor* dst);

/* Slow access methods [check everything] */
TORCH_CUDA_API void THCTensor_(set0d)(THCState *state, THCTensor *tensor, scalar_t value);
TORCH_CUDA_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, scalar_t value);
TORCH_CUDA_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, scalar_t value);
TORCH_CUDA_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, scalar_t value);
TORCH_CUDA_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
set0d)(THCState* state, THCTensor* tensor, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
set1d)(THCState* state, THCTensor* tensor, int64_t x0, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set2d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set3d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(set4d)(
THCState* state,
THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
int64_t x3,
scalar_t value);

TORCH_CUDA_API scalar_t THCTensor_(get0d)(THCState *state, const THCTensor *tensor);
TORCH_CUDA_API scalar_t THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0);
TORCH_CUDA_API scalar_t THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1);
TORCH_CUDA_API scalar_t THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2);
TORCH_CUDA_API scalar_t THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3);
TORCH_CUDA_CU_API scalar_t
THCTensor_(get0d)(THCState* state, const THCTensor* tensor);
TORCH_CUDA_CU_API scalar_t
THCTensor_(get1d)(THCState* state, const THCTensor* tensor, int64_t x0);
TORCH_CUDA_CU_API scalar_t THCTensor_(
get2d)(THCState* state, const THCTensor* tensor, int64_t x0, int64_t x1);
TORCH_CUDA_CU_API scalar_t THCTensor_(get3d)(
THCState* state,
const THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2);
TORCH_CUDA_CU_API scalar_t THCTensor_(get4d)(
THCState* state,
const THCTensor* tensor,
int64_t x0,
int64_t x1,
int64_t x2,
int64_t x3);

/* CUDA-specific functions */
TORCH_CUDA_API int THCTensor_(getDevice)(THCState *state, const THCTensor *self);
TORCH_CUDA_API int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...);
TORCH_CUDA_CU_API int THCTensor_(
getDevice)(THCState* state, const THCTensor* self);
TORCH_CUDA_CU_API int THCTensor_(
checkGPU)(THCState* state, unsigned int nTensors, ...);

/* debug methods */
TORCH_CUDA_API THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor);
TORCH_CUDA_CU_API THCDescBuff
THCTensor_(sizeDesc)(THCState* state, const THCTensor* tensor);

#endif
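
Note: THCTensor_(NAME) in these generic headers is a token-pasting macro, so each declaration above is stamped out once per scalar type when the header is re-included by the type-generation machinery; that is why the export macro has to sit on the generic declaration itself. A rough sketch of the expansion (the exact concatenation helper shown here is an assumption for illustration, not part of this diff):

// Assumed shape of the THC generic machinery, for illustration only:
//   #define THCTensor_(NAME) TH_CONCAT_4(TH, CReal, Tensor_, NAME)
// With CReal == Cuda (the float instantiation), the declaration
//   TORCH_CUDA_CU_API void THCTensor_(resize2d)(THCState*, THCTensor*, int64_t, int64_t);
// would expand to roughly
//   TORCH_CUDA_CU_API void THCudaTensor_resize2d(THCState*, THCudaTensor*, int64_t, int64_t);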

@ -8,9 +8,18 @@
// NOTE: functions exist here only to support dispatch via Declarations.cwrap. You probably don't want to put
// new functions in here, they should probably be un-genericized.

TORCH_CUDA_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
at::IntArrayRef size_, at::IntArrayRef stride_);
TORCH_CUDA_CU_API void THCTensor_(setStorage)(
THCState* state,
THCTensor* self,
THCStorage* storage_,
ptrdiff_t storageOffset_,
at::IntArrayRef size_,
at::IntArrayRef stride_);

TORCH_CUDA_API void THCTensor_(resize)(THCState *state, THCTensor *self, at::IntArrayRef size, at::IntArrayRef stride);
TORCH_CUDA_CU_API void THCTensor_(resize)(
THCState* state,
THCTensor* self,
at::IntArrayRef size,
at::IntArrayRef stride);

#endif

@ -2,10 +2,14 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorCopy.h"
#else

TORCH_CUDA_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
copy)(THCState* state, THCTensor* self, THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_(
copyIgnoringOverlaps)(THCState* state, THCTensor* self, THCTensor* src);

TORCH_CUDA_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src);
TORCH_CUDA_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(
copyAsyncCPU)(THCState* state, THCTensor* self, THTensor* src);
TORCH_CUDA_CU_API void THTensor_(
copyAsyncCuda)(THCState* state, THTensor* self, THCTensor* src);

#endif

@ -2,10 +2,34 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorIndex.h"
#else

TORCH_CUDA_API void THCTensor_(indexCopy)(THCState *state, THCTensor *res_, int dim, THCudaLongTensor *indices, THCTensor *src);
TORCH_CUDA_API void THCTensor_(indexFill)(THCState *state, THCTensor *tensor, int dim, THCudaLongTensor *index, scalar_t val);
TORCH_CUDA_API void THCTensor_(indexSelect)(THCState *state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
TORCH_CUDA_API void THCTensor_(take)(THCState *state, THCTensor *res_, THCTensor *src, THCudaLongTensor *index);
TORCH_CUDA_API void THCTensor_(put)(THCState *state, THCTensor *res_, THCudaLongTensor *indices, THCTensor *src, int accumulate);
TORCH_CUDA_CU_API void THCTensor_(indexCopy)(
THCState* state,
THCTensor* res_,
int dim,
THCudaLongTensor* indices,
THCTensor* src);
TORCH_CUDA_CU_API void THCTensor_(indexFill)(
THCState* state,
THCTensor* tensor,
int dim,
THCudaLongTensor* index,
scalar_t val);
TORCH_CUDA_CU_API void THCTensor_(indexSelect)(
THCState* state,
THCTensor* tensor,
THCTensor* src,
int dim,
THCudaLongTensor* index);
TORCH_CUDA_CU_API void THCTensor_(take)(
THCState* state,
THCTensor* res_,
THCTensor* src,
THCudaLongTensor* index);
TORCH_CUDA_CU_API void THCTensor_(put)(
THCState* state,
THCTensor* res_,
THCudaLongTensor* indices,
THCTensor* src,
int accumulate);

#endif

@ -2,37 +2,42 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMasked.h"
#else

TORCH_CUDA_API void THCTensor_(maskedFill)(THCState *state,
THCTensor *tensor,
THCudaByteTensor *mask,
TORCH_CUDA_CU_API void THCTensor_(maskedFill)(
THCState* state,
THCTensor* tensor,
THCudaByteTensor* mask,
scalar_t value);

TORCH_CUDA_API void THCTensor_(maskedFillBool)(THCState *state,
THCTensor *tensor,
THCudaBoolTensor *mask,
TORCH_CUDA_CU_API void THCTensor_(maskedFillBool)(
THCState* state,
THCTensor* tensor,
THCudaBoolTensor* mask,
scalar_t value);

// FIXME: remove now that we have THCudaByteTensor?
TORCH_CUDA_API void THCTensor_(maskedFillByte)(THCState *state,
THCTensor *tensor,
THByteTensor *mask,
TORCH_CUDA_CU_API void THCTensor_(maskedFillByte)(
THCState* state,
THCTensor* tensor,
THByteTensor* mask,
scalar_t value);

TORCH_CUDA_API void THCTensor_(maskedCopy)(THCState *state,
THCTensor *tensor,
THCudaByteTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopy)(
THCState* state,
THCTensor* tensor,
THCudaByteTensor* mask,
THCTensor* src);

TORCH_CUDA_API void THCTensor_(maskedCopyBool)(THCState *state,
THCTensor *tensor,
THCudaBoolTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopyBool)(
THCState* state,
THCTensor* tensor,
THCudaBoolTensor* mask,
THCTensor* src);

// FIXME: remove now that we have THCudaByteTensor?
TORCH_CUDA_API void THCTensor_(maskedCopyByte)(THCState *state,
THCTensor *tensor,
THByteTensor *mask,
THCTensor *src);
TORCH_CUDA_CU_API void THCTensor_(maskedCopyByte)(
THCState* state,
THCTensor* tensor,
THByteTensor* mask,
THCTensor* src);

#endif

@ -2,9 +2,9 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMath.h"
#else

TORCH_CUDA_API void THCTensor_(fill)(THCState *state, THCTensor *self, scalar_t value);
TORCH_CUDA_API void THCTensor_(zero)(THCState *state, THCTensor *self);
TORCH_CUDA_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t);

TORCH_CUDA_CU_API void THCTensor_(
fill)(THCState* state, THCTensor* self, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(zero)(THCState* state, THCTensor* self);
TORCH_CUDA_CU_API ptrdiff_t THCTensor_(numel)(THCState* state, THCTensor* t);

#endif

@ -5,9 +5,16 @@
#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)

// MAGMA (i.e. CUDA implementation of LAPACK functions)
TORCH_CUDA_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_);
TORCH_CUDA_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, bool upper);
TORCH_CUDA_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_);
TORCH_CUDA_CU_API void THCTensor_(gels)(
THCState* state,
THCTensor* rb_,
THCTensor* ra_,
THCTensor* b_,
THCTensor* a_);
TORCH_CUDA_CU_API void THCTensor_(
potri)(THCState* state, THCTensor* ra_, THCTensor* a, bool upper);
TORCH_CUDA_CU_API void THCTensor_(
geqrf)(THCState* state, THCTensor* ra_, THCTensor* rtau_, THCTensor* a_);

#endif // defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)

@ -2,11 +2,13 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h"
#else

TORCH_CUDA_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src);
TORCH_CUDA_CU_API int THCTensor_(
equal)(THCState* state, THCTensor* self, THCTensor* src);

#if !defined(THC_REAL_IS_BOOL)

TORCH_CUDA_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value);
TORCH_CUDA_CU_API void THCTensor_(
mul)(THCState* state, THCTensor* self, THCTensor* src, scalar_t value);

#endif

@ -4,7 +4,12 @@

#if !defined(THC_REAL_IS_BOOL)

TORCH_CUDA_API void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension);
TORCH_CUDA_CU_API void THCTensor_(crossKernel)(
THCState* state,
THCTensor* self,
THCTensor* src1,
THCTensor* src2,
int dimension);

#endif
#endif

@ -6,16 +6,36 @@

#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)

TORCH_CUDA_API void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t max_norm);
TORCH_CUDA_API void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, int keepdim);
TORCH_CUDA_CU_API void THCTensor_(renorm)(
THCState* state,
THCTensor* self,
THCTensor* src,
scalar_t value,
int dimension,
scalar_t max_norm);
TORCH_CUDA_CU_API void THCTensor_(norm)(
THCState* state,
THCTensor* self,
THCTensor* src,
scalar_t value,
int dimension,
int keepdim);

TORCH_CUDA_API accreal THCTensor_(std_all)(THCState *state, THCTensor *self, bool unbiased);
TORCH_CUDA_API accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t value);
TORCH_CUDA_API accreal THCTensor_(var_all)(THCState *state, THCTensor *self, bool unbiased);
TORCH_CUDA_CU_API accreal
THCTensor_(std_all)(THCState* state, THCTensor* self, bool unbiased);
TORCH_CUDA_CU_API accreal
THCTensor_(normall)(THCState* state, THCTensor* self, scalar_t value);
TORCH_CUDA_CU_API accreal
THCTensor_(var_all)(THCState* state, THCTensor* self, bool unbiased);

#endif

TORCH_CUDA_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim);
TORCH_CUDA_CU_API void THCTensor_(prod)(
THCState* state,
THCTensor* self,
THCTensor* src,
int dim,
int keepdim);

#endif

@ -4,10 +4,11 @@

/* Returns the mode, and index of the mode, for the set of values
 * along a given dimension in the input tensor. */
TORCH_CUDA_API void THCTensor_(mode)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
THCTensor *input,
TORCH_CUDA_CU_API void THCTensor_(mode)(
THCState* state,
THCTensor* values,
THCudaLongTensor* indices,
THCTensor* input,
int dimension,
int keepdim);

@ -6,8 +6,18 @@

#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)

TORCH_CUDA_API void THCTensor_(multinomialAliasSetup)(struct THCState *state, THCTensor *probs, THCudaLongTensor *J, THCTensor *q);
TORCH_CUDA_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCTensor *_q, THCudaLongTensor *_J, int n_sample, c10::optional<at::Generator> gen_);
TORCH_CUDA_CU_API void THCTensor_(multinomialAliasSetup)(
struct THCState* state,
THCTensor* probs,
THCudaLongTensor* J,
THCTensor* q);
TORCH_CUDA_CU_API void THCTensor_(multinomialAliasDraw)(
THCState* state,
THCudaLongTensor* self,
THCTensor* _q,
THCudaLongTensor* _J,
int n_sample,
c10::optional<at::Generator> gen_);

#endif
#endif

@ -2,6 +2,11 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorScatterGather.h"
#else

TORCH_CUDA_API void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index);
TORCH_CUDA_CU_API void THCTensor_(gather)(
THCState* state,
THCTensor* tensor,
THCTensor* src,
int dim,
THCudaLongTensor* index);

#endif

@ -4,17 +4,21 @@

/* Performs an in-place sort of (keys, values). Only works for slice sizes
<= 2048 at the moment (slice size == size of keys/values dim `dim`) */
TORCH_CUDA_API void THCTensor_(sortKeyValueInplace)(THCState* state,
TORCH_CUDA_CU_API void THCTensor_(sortKeyValueInplace)(
THCState* state,
THCTensor* keys,
THCudaLongTensor* values,
int dim, bool dir);
int dim,
bool dir);

/* Performs an out-of-place sort of `input`, returning the per-slice indices
in `indices` and the sorted values in `sorted` */
TORCH_CUDA_API void THCTensor_(sort)(THCState* state,
TORCH_CUDA_CU_API void THCTensor_(sort)(
THCState* state,
THCTensor* sorted,
THCudaLongTensor* indices,
THCTensor* input,
int dim, int order);
int dim,
int order);

#endif

@ -4,10 +4,14 @@

/* Returns the set of all kth smallest (or largest) elements, depending */
/* on `dir` */
TORCH_CUDA_API void THCTensor_(topk)(THCState* state,
TORCH_CUDA_CU_API void THCTensor_(topk)(
THCState* state,
THCTensor* topK,
THCudaLongTensor* indices,
THCTensor* input,
int64_t k, int dim, int dir, int sorted);
int64_t k,
int dim,
int dir,
int sorted);

#endif // THC_GENERIC_FILE

@ -5,198 +5,219 @@
#include <ATen/core/Reduction.h>
#include <ATen/Generator.h>

TORCH_CUDA_API void THNN_(ClassNLLCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
TORCH_CUDA_CU_API void THNN_(ClassNLLCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);

TORCH_CUDA_API void THNN_(ClassNLLCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
TORCH_CUDA_CU_API void THNN_(ClassNLLCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);

TORCH_CUDA_API void THNN_(GatedLinear_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
TORCH_CUDA_CU_API void THNN_(GatedLinear_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
int dim);

TORCH_CUDA_API void THNN_(GatedLinear_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
TORCH_CUDA_CU_API void THNN_(GatedLinear_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
int dim);

TORCH_CUDA_API void THNN_(LogSigmoid_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *buffer);
TORCH_CUDA_CU_API void THNN_(LogSigmoid_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* buffer);

TORCH_CUDA_API void THNN_(LogSigmoid_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *buffer);
TORCH_CUDA_CU_API void THNN_(LogSigmoid_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* buffer);

TORCH_CUDA_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
THCTensor *is_target,
TORCH_CUDA_CU_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
THCTensor* is_target,
int64_t reduction);

TORCH_CUDA_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *is_target,
TORCH_CUDA_CU_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* is_target,
int64_t reduction);

TORCH_CUDA_API void THNN_(MultiMarginCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
int p,
THCTensor *weights, // [OPTIONAL]
THCTensor* weights, // [OPTIONAL]
accreal margin);

TORCH_CUDA_API void THNN_(MultiMarginCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
TORCH_CUDA_CU_API void THNN_(MultiMarginCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
int p,
THCTensor *weights, // [OPTIONAL]
THCTensor* weights, // [OPTIONAL]
accreal margin);

TORCH_CUDA_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *output,
TORCH_CUDA_CU_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* output,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);

TORCH_CUDA_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
THCState *state,
THCTensor *input,
THCIndexTensor *target,
THCTensor *gradOutput,
THCTensor *gradInput,
TORCH_CUDA_CU_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
THCState* state,
THCTensor* input,
THCIndexTensor* target,
THCTensor* gradOutput,
THCTensor* gradInput,
int64_t reduction,
THCTensor *weights, // [OPTIONAL]
THCTensor *total_weight,
THCTensor* weights, // [OPTIONAL]
THCTensor* total_weight,
int64_t ignore_index);

TORCH_CUDA_API void THNN_(SpatialConvolutionMM_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* weight,
THCTensor* bias, // [OPTIONAL]
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH);

TORCH_CUDA_API void THNN_(SpatialConvolutionMM_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH);
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* weight,
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH);

TORCH_CUDA_API void THNN_(SpatialConvolutionMM_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
THCTensor *gradBias, // [OPTIONAL]
THCTensor *columns,
THCTensor *ones,
int kW, int kH,
int dW, int dH,
int padW, int padH,
TORCH_CUDA_CU_API void THNN_(SpatialConvolutionMM_accGradParameters)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradWeight,
THCTensor* gradBias, // [OPTIONAL]
THCTensor* columns,
THCTensor* ones,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
accreal scale);

TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *weight,
THCTensor *bias, // [OPTIONAL]
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* weight,
THCTensor* bias, // [OPTIONAL]
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);

TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *weight,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* weight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);

TORCH_CUDA_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradWeight,
int kW, int kH,
int dW, int dH,
int padW, int padH,
int dilationW, int dilationH);
TORCH_CUDA_CU_API void THNN_(SpatialDepthwiseConvolution_accGradParameters)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradWeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
int dilationW,
int dilationH);

TORCH_CUDA_API void THNN_(RReLU_updateOutput)(
THCState *state,
THCTensor *input,
THCTensor *output,
THCTensor *noise,
TORCH_CUDA_CU_API void THNN_(RReLU_updateOutput)(
THCState* state,
THCTensor* input,
THCTensor* output,
THCTensor* noise,
double lower,
double upper,
bool train,
bool inplace,
c10::optional<at::Generator> generator);

TORCH_CUDA_API void THNN_(RReLU_updateGradInput)(
THCState *state,
THCTensor *input,
THCTensor *gradOutput,
THCTensor *gradInput,
THCTensor *noise,
TORCH_CUDA_CU_API void THNN_(RReLU_updateGradInput)(
THCState* state,
THCTensor* input,
THCTensor* gradOutput,
THCTensor* gradInput,
THCTensor* noise,
double lower,
double upper,
bool train,

@ -106,6 +106,10 @@
#define TORCH_CUDA_API C10_IMPORT
#endif

// This is in preparation for the imminent torch_cuda split
#define TORCH_CUDA_CU_API TORCH_CUDA_API
#define TORCH_CUDA_CPP_API TORCH_CUDA_API

#if defined(TORCH_HIP_BUILD_MAIN_LIB)
#define TORCH_HIP_API C10_EXPORT
#else
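
Note: both new macros currently alias TORCH_CUDA_API, so this hunk is behavior-preserving; its purpose is to let each symbol be tagged with its eventual home library ahead of time. Once the split actually lands, the two macros would presumably diverge per target along these lines (a hypothetical sketch; the build-flag names below are assumptions and are not part of this PR):

// Hypothetical post-split definitions, for illustration only:
#if defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_CUDA_CPP_API C10_EXPORT // building torch_cuda_cpp itself
#else
#define TORCH_CUDA_CPP_API C10_IMPORT // consuming it from another DLL
#endif
// ...and symmetrically for TORCH_CUDA_CU_API / torch_cuda_cu.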

@ -26,7 +26,13 @@
#include <c10/cuda/CUDAGuard.h>

#define CAFFE2_CUDA_EXPORT C10_EXPORT
#define CAFFE2_CUDA_API TORCH_CUDA_API

// CAFFE2_CUDA_API gets translated to CAFFE2_HIP_API in hipify script, which
// causes a macro redefinition issue with the later definition of
// CAFFE2_HIP_API, so we exclude this definition when HIP is specified
#ifndef __HIP_PLATFORM_HCC__
#define CAFFE2_CUDA_API TORCH_CUDA_CPP_API
#endif // __HIP_PLATFORM_HCC__

#define CAFFE2_HIP_EXPORT C10_EXPORT
#define CAFFE2_HIP_API TORCH_HIP_API
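
Note: the guard above matters because hipify rewrites the token CAFFE2_CUDA_API to CAFFE2_HIP_API. A sketch of what the hipified file would contain without the #ifndef (illustrative, not actual hipify output):

// Without the guard, hipify would leave two conflicting definitions:
//   #define CAFFE2_HIP_API ...            // hipified from the CAFFE2_CUDA_API line above
//   #define CAFFE2_HIP_API TORCH_HIP_API  // the pre-existing definition below
// hence the macro-redefinition issue the comment refers to.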

@ -178,8 +178,15 @@ def parse_header(path):
else:
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l.startswith('TORCH_CUDA_API void THNN_'):
fn_name = l[len('TORCH_CUDA_API void THNN_'):]
elif l.startswith('TORCH_CUDA_CPP_API void THNN_'):
fn_name = l[len('TORCH_CUDA_CPP_API void THNN_'):]
if fn_name[0] == '(' and fn_name[-2] == ')':
fn_name = fn_name[1:-2]
else:
fn_name = fn_name[:-1]
generic_functions.append(Function(fn_name))
elif l.startswith('TORCH_CUDA_CU_API void THNN_'):
fn_name = l[len('TORCH_CUDA_CU_API void THNN_'):]
if fn_name[0] == '(' and fn_name[-2] == ')':
fn_name = fn_name[1:-2]
else:

@ -14,7 +14,7 @@
namespace torch {
namespace autograd {

struct TORCH_CUDA_API Scatter : public Node {
struct TORCH_CUDA_CU_API Scatter : public Node {
explicit Scatter(
std::vector<at::Device> devices,
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,

@ -33,7 +33,7 @@ struct TORCH_CUDA_API Scatter : public Node {
bool unsqueeze_scalars_;
};

struct TORCH_CUDA_API Gather : public Node {
struct TORCH_CUDA_CU_API Gather : public Node {
explicit Gather(const at::Device& destination_device, int64_t dim = 0);
~Gather() override;

@ -13,34 +13,39 @@ namespace torch { namespace cuda {

using tensor_list2d = std::vector<std::vector<at::Tensor>>;

TORCH_CUDA_API std::vector<at::Tensor>& broadcast_out(const at::Tensor& tensor, std::vector<at::Tensor>& out_tensors);
TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
TORCH_CUDA_CU_API std::vector<at::Tensor>& broadcast_out(
const at::Tensor& tensor,
std::vector<at::Tensor>& out_tensors);
TORCH_CUDA_CU_API std::vector<at::Tensor> broadcast(
const at::Tensor& tensor,
at::IntArrayRef devices);
TORCH_CUDA_CU_API tensor_list2d broadcast_coalesced(
at::TensorList tensors,
at::IntArrayRef devices,
size_t buffer_size);

TORCH_CUDA_API std::vector<at::Tensor>& scatter_out(
TORCH_CUDA_CU_API std::vector<at::Tensor>& scatter_out(
const at::Tensor& tensor,
std::vector<at::Tensor>& out_tensors,
int64_t dim = 0,
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
c10::nullopt);
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>&
streams = c10::nullopt);

TORCH_CUDA_API std::vector<at::Tensor> scatter(
TORCH_CUDA_CU_API std::vector<at::Tensor> scatter(
const at::Tensor& tensor,
at::IntArrayRef devices,
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
int64_t dim = 0,
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
c10::nullopt);
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>&
streams = c10::nullopt);

TORCH_CUDA_API at::Tensor& gather_out(
TORCH_CUDA_CU_API at::Tensor& gather_out(
at::TensorList tensors,
at::Tensor& out_tensor,
int64_t dim);

TORCH_CUDA_API at::Tensor gather(
TORCH_CUDA_CU_API at::Tensor gather(
at::TensorList tensors,
int64_t dim,
c10::optional<int32_t> destination_index);

}}
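
Note: only the export macro changes in this file; the broadcast/scatter/gather surface itself is untouched. A minimal usage sketch of the API declared above (device indices and shapes are made up for illustration, and it assumes at least two visible CUDA devices):

// Illustrative only; not part of this diff.
at::Tensor src = at::ones({2, 3}, at::kCUDA);
// Replicate the tensor onto devices 0 and 1.
std::vector<at::Tensor> copies = torch::cuda::broadcast(src, {0, 1});
// Concatenate the per-device tensors back on device 0 along dim 0.
at::Tensor merged =
    torch::cuda::gather(copies, /*dim=*/0, /*destination_index=*/0);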

@ -60,7 +60,7 @@ enum class ncclDataType {
// Don't use them outside of these files.
namespace detail {

TORCH_CUDA_API void throw_nccl_error(ncclResult status);
TORCH_CUDA_CPP_API void throw_nccl_error(ncclResult status);

static inline void NCCL_CHECK(ncclResult status) {
if (status != ncclResult::Success) {

@ -68,13 +68,14 @@ static inline void NCCL_CHECK(ncclResult status) {
}
}

TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
TORCH_CUDA_API void check_inputs(
TORCH_CUDA_CPP_API at::ArrayRef<ncclComm_t> get_communicators(
at::TensorList inputs);
TORCH_CUDA_CPP_API void check_inputs(
at::TensorList inputs,
at::TensorList outputs,
int input_multiplier,
int output_multiplier);
TORCH_CUDA_API void check_inputs(
TORCH_CUDA_CPP_API void check_inputs(
at::TensorList inputs,
const at::Tensor& output,
int root,

@ -86,22 +87,23 @@ TORCH_CUDA_API void check_inputs(
using comm_list = std::vector<ncclComm_t>;
using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;

TORCH_CUDA_API std::uint64_t version();
TORCH_CUDA_CPP_API std::uint64_t version();

bool is_available(at::TensorList tensors);

TORCH_CUDA_API void get_unique_id(ncclUniqueId& id);
TORCH_CUDA_API ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank);
TORCH_CUDA_API void comm_destroy(ncclComm_t comm);
TORCH_CUDA_CPP_API void get_unique_id(ncclUniqueId& id);
TORCH_CUDA_CPP_API ncclComm_t
comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank);
TORCH_CUDA_CPP_API void comm_destroy(ncclComm_t comm);

TORCH_CUDA_API void broadcast(
TORCH_CUDA_CPP_API void broadcast(
at::TensorList tensors,
const stream_list& streams = {},
const comm_list& user_comms = {});

size_t get_max_count();

TORCH_CUDA_API void reduce(
TORCH_CUDA_CPP_API void reduce(
const std::vector<at::Tensor>& inputs,
at::Tensor& output,
int32_t root = 0,

@ -109,41 +111,41 @@ TORCH_CUDA_API void reduce(
const stream_list& streams = {},
const comm_list& user_comms = {});

TORCH_CUDA_API void reduce(
TORCH_CUDA_CPP_API void reduce(
std::vector<at::Tensor>& inputs,
int32_t root = 0,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});

TORCH_CUDA_API void all_reduce(
TORCH_CUDA_CPP_API void all_reduce(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});

TORCH_CUDA_API void reduce_scatter(
TORCH_CUDA_CPP_API void reduce_scatter(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
int32_t op = static_cast<int>(ncclRedOp::Sum),
const stream_list& streams = {},
const comm_list& user_comms = {});

TORCH_CUDA_API void all_gather(
TORCH_CUDA_CPP_API void all_gather(
const std::vector<at::Tensor>& inputs,
std::vector<at::Tensor>& outputs,
const stream_list& streams = {},
const comm_list& user_comms = {});

TORCH_CUDA_API void all2all_single_equal_split(
TORCH_CUDA_CPP_API void all2all_single_equal_split(
at::Tensor& input,
at::Tensor& output,
int size,
ncclComm_t comm,
at::cuda::CUDAStream& stream);

TORCH_CUDA_API void all2all_single_unequal_split(
TORCH_CUDA_CPP_API void all2all_single_unequal_split(
void* sendbuff,
const size_t* sendcounts,
const size_t* senddispls,

@ -155,19 +157,19 @@ TORCH_CUDA_API void all2all_single_unequal_split(
ncclComm_t comm,
at::cuda::CUDAStream& stream);

TORCH_CUDA_API void all2all(
TORCH_CUDA_CPP_API void all2all(
std::vector<at::Tensor>& outputTensors,
std::vector<at::Tensor>& inputTensors,
ncclComm_t _comm,
at::cuda::CUDAStream& stream);

TORCH_CUDA_API void send(
TORCH_CUDA_CPP_API void send(
const at::Tensor& input,
ncclComm_t comm,
at::cuda::CUDAStream stream,
int dst);

TORCH_CUDA_API void recv(
TORCH_CUDA_CPP_API void recv(
at::Tensor& output,
ncclComm_t comm,
at::cuda::CUDAStream stream,

@ -286,7 +286,7 @@ class TensorPipeAgent : public RpcAgent {
#ifdef USE_CUDA_NOT_ROCM
// An RPC-specific CUDAFuture subclass. It overrides the extractDataPtrs
// function to handle and only handle RPC Messages.
struct TORCH_CUDA_API RpcCUDAFuture final : at::cuda::CUDAFuture {
struct TORCH_CUDA_CPP_API RpcCUDAFuture final : at::cuda::CUDAFuture {
public:
using at::cuda::CUDAFuture::CUDAFuture;

@ -61,7 +61,7 @@ inline std::shared_ptr<LazyStreamContext> createLazyStreamContext() {
#else

// CUDA is available. Implement CUDA-related operations.
struct TORCH_CUDA_API CudaLazyStreamContext : public LazyStreamContext {
struct TORCH_CUDA_CPP_API CudaLazyStreamContext : public LazyStreamContext {
using LazyStreamContext::LazyStreamContext;

// let streams in this context wait for current streams.

@ -243,7 +243,7 @@ TensorView* arithOpOverloads(
}
} // namespace

TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
TORCH_CUDA_CU_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
auto vals = maybeBroadcast({v1, v2});
Val* out = newOutputVal({vals[0], vals[1]});
if (is_logical_op(type)) {

@ -593,7 +593,7 @@ TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* v3) {
return arithOpOverloads(sub_alpha, v1, v2, v3);
}
// lerp
TORCH_CUDA_API Val* lerp(Val* start, Val* end, Val* weight) {
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight) {
auto vals = maybeBroadcast({start, end, weight});
Val* intrm1 = binaryOp(BinaryOpType::Sub, vals[1], vals[0]);
Val* intrm2 = binaryOp(BinaryOpType::Mul, vals[2], intrm1);

@ -20,148 +20,154 @@ namespace fuser {
namespace cuda {

// Insertion of casting op to dtype, returns new resulting val
TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_API TensorView* castOp(DataType dtype, TensorView* v1);
TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1);

// Perform unary op type and return the output
TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);

// Perform binary op type on v1 and v2 and return a type promoted output.
// Mod, CeilDiv, and LT are considered Int only output operations for now.
TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* binaryOp(BinaryOpType type, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* binaryOp(BinaryOpType type, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* binaryOp(
TORCH_CUDA_CU_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2);

// Perform a reduction operation on v1, initial value for reduction is init,
// reduces across axes, and reduction operation defined by BinaryOp.
TORCH_CUDA_API TensorView* reductionOp(
TORCH_CUDA_CU_API TensorView* reductionOp(
BinaryOpType reduction_op_type,
const std::vector<int>& axes,
Val* init,
TensorView* v1);

// UNARY OPERATIONS
TORCH_CUDA_API Val* neg(Val* v);
TORCH_CUDA_API TensorView* neg(TensorView* v);
TORCH_CUDA_CU_API Val* neg(Val* v);
TORCH_CUDA_CU_API TensorView* neg(TensorView* v);

// Broadcasts v1 based on bool vector. Size of broadcast bool vector should be
// the number of dims desired in the broadcasted tensor. This vector should be
// true if output dim should be a broadcasted dim, and false if it is not a
// broadcasted dim. Number of false entries must match the number of input dims.
TORCH_CUDA_API TensorView* broadcast(
TORCH_CUDA_CU_API TensorView* broadcast(
TensorView* inp,
const std::vector<bool>& is_broadcast_dim);

// BINARY OPERATIONS
// add
TORCH_CUDA_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* add(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, TensorView* v2);
// sub
TORCH_CUDA_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* sub(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, TensorView* v2);
// mul
TORCH_CUDA_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* mul(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, TensorView* v2);
// div
TORCH_CUDA_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* div(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
// mod
TORCH_CUDA_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* mod(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, TensorView* v2);
// lt
TORCH_CUDA_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* lt(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, TensorView* v2);
// eq
TORCH_CUDA_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* eq(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, TensorView* v2);
// ceilDiv
TORCH_CUDA_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
// andOp
TORCH_CUDA_API Val* andOp(Val* v1, Val* v2);
TORCH_CUDA_API TensorView* andOp(TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* andOp(Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* andOp(TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* andOp(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* andOp(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, TensorView* v2);

// REDUCTION OPERATIONS
TORCH_CUDA_API TensorView* sum(
TORCH_CUDA_CU_API TensorView* sum(
TensorView* v1,
const std::vector<int>& reduction_axes);

// COMPOUND OPERATIONS
// add_alpha
TORCH_CUDA_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
// sub_alpha
TORCH_CUDA_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
// lerp
TORCH_CUDA_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
Val* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
Val* end,
TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
Val* start,
TensorView* end,
TensorView* weight);
TORCH_CUDA_API TensorView* lerp(
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
TensorView* weight);
// addcmul
TORCH_CUDA_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
Val* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
Val* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
Val* v1,
TensorView* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_API TensorView* addcmul(
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
TensorView* v3,

@ -169,20 +175,26 @@ TORCH_CUDA_API TensorView* addcmul(

// TERNARY OPERATIONS
// where
TORCH_CUDA_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_API TensorView* where(TensorView* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(
TensorView* c,
TensorView* v1,
TensorView* v2);
// threshold
TORCH_CUDA_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_API TensorView* threshold(TensorView* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API TensorView* threshold(
TensorView* in,
Val* thresh,
Val* value);
// clamp
TORCH_CUDA_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);

} // namespace cuda
} // namespace fuser
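
Note: the entire arith.h surface moves to TORCH_CUDA_CU_API, consistent with the fuser sources being slated for the CU half of the split. A rough usage sketch of the scalar/TensorView overloads declared above (the constructor details here are assumptions based on the declarations in this diff, not verified against the full fuser sources):

// Illustrative only: overload resolution in the fuser arithmetic DSL.
Fusion fusion;
FusionGuard fg(&fusion);              // make `fusion` the active fusion
TensorView* tv = /* an input TensorView registered with the fusion */;
Val* one = new Float(1.0);            // Float is the fuser's floating-point scalar Val
TensorView* shifted = add(tv, one);   // picks add(TensorView*, Val*)
TensorView* gated = where(lt(tv, one), shifted, tv); // lt/where overloads compose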

@ -12,7 +12,7 @@ namespace cuda {
namespace codegen {

//! Generates a CUDA kernel definition for the given kernel
TORCH_CUDA_API std::string generateCudaKernel(
TORCH_CUDA_CU_API std::string generateCudaKernel(
const Kernel* kernel,
const std::string& kernel_name = "CUDAGeneratedKernel");
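
Note: a one-line usage sketch of the codegen entry point declared above (the Kernel pointer is assumed to come from an already-lowered fusion):

// Illustrative only; kernel_name defaults to "CUDAGeneratedKernel".
std::string cuda_src =
    torch::jit::fuser::cuda::codegen::generateCudaKernel(kernel);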
|
||||
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ class Sync;
|
|||
* By default, all IR nodes are handled in this dispatch, and will call an empty
|
||||
* function on all nodes.
|
||||
*/
|
||||
class TORCH_CUDA_API OptOutConstDispatch {
|
||||
class TORCH_CUDA_CU_API OptOutConstDispatch {
|
||||
public:
|
||||
virtual ~OptOutConstDispatch() = default;
|
||||
OptOutConstDispatch() = default;
|
||||
|
|
@ -167,7 +167,7 @@ class TORCH_CUDA_API OptOutConstDispatch {
|
|||
virtual void handle(const kir::Sync*) {}
|
||||
};
|
||||
|
||||
class TORCH_CUDA_API OptOutDispatch {
|
||||
class TORCH_CUDA_CU_API OptOutDispatch {
|
||||
public:
|
||||
virtual ~OptOutDispatch() = default;
|
||||
OptOutDispatch() = default;
|
||||
|
|
@ -227,7 +227,7 @@ class TORCH_CUDA_API OptOutDispatch {
|
|||
virtual void handle(kir::Sync*) {}
|
||||
};
|
||||
|
||||
class TORCH_CUDA_API OptInConstDispatch {
|
||||
class TORCH_CUDA_CU_API OptInConstDispatch {
|
||||
public:
|
||||
virtual ~OptInConstDispatch() = default;
|
||||
OptInConstDispatch() = default;
|
||||
|
|
@ -361,7 +361,7 @@ class TORCH_CUDA_API OptInConstDispatch {
|
|||
}
|
||||
};
|
||||
|
||||
class TORCH_CUDA_API OptInDispatch {
|
||||
class TORCH_CUDA_CU_API OptInDispatch {
|
||||
public:
|
||||
virtual ~OptInDispatch() = default;
|
||||
OptInDispatch() = default;
|
||||
|
|
@ -494,7 +494,7 @@ class TORCH_CUDA_API OptInDispatch {
|
|||
}
|
||||
};
|
||||
|
||||
class TORCH_CUDA_API OptOutMutator {
|
||||
class TORCH_CUDA_CU_API OptOutMutator {
|
||||
public:
|
||||
virtual ~OptOutMutator() = default;
|
||||
OptOutMutator() = default;
|
||||
|
|
@ -559,7 +559,7 @@ class TORCH_CUDA_API OptOutMutator {
|
|||
virtual Statement* mutate(kir::Sync*);
|
||||
};
|
||||
|
||||
class TORCH_CUDA_API OptInMutator {
|
||||
class TORCH_CUDA_CU_API OptInMutator {
|
||||
public:
|
||||
virtual ~OptInMutator() = default;
|
||||
OptInMutator() = default;
|
||||
|
|
|
|||
|
|
@@ -17,11 +17,11 @@ namespace fuser {
 namespace cuda {
 
 // TODO: Should this actually be in launch params?
-struct TORCH_CUDA_API CompileOptions {
+struct TORCH_CUDA_CU_API CompileOptions {
   c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
 };
 
-class TORCH_CUDA_API FusionExecutor : public NonCopyable {
+class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
  public:
   // Unsafe compilation that's useful for debugging kernels, iterating over
   // slight modifications of a generated kernel

@@ -6,7 +6,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-class TORCH_CUDA_API LaunchParams {
+class TORCH_CUDA_CU_API LaunchParams {
  public:
   static constexpr int64_t UNINITIALIZED_VAL = -1;

@@ -14,7 +14,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-class TORCH_CUDA_API StatefulExpressionEvaluator : private OptOutDispatch {
+class TORCH_CUDA_CU_API StatefulExpressionEvaluator : private OptOutDispatch {
  public:
   explicit StatefulExpressionEvaluator(Fusion* fusion) : fusion_(fusion) {}
@@ -45,7 +45,7 @@ class TensorView;
 
 // Fusion Guard is our "context manager". It holds the active fusion and allows
 // it to be accessed anywhere through FusionGuard::getCurFusion().
-class TORCH_CUDA_API FusionGuard {
+class TORCH_CUDA_CU_API FusionGuard {
  public:
   Fusion* prev_fusion;
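FusionGuard, with its prev_fusion member, has the shape of a standard RAII guard: construction saves the previously active fusion and installs a new one, and destruction restores the old one, so the "current fusion" is scoped to a C++ block. A generic sketch of that pattern under those assumptions (this is not the actual FusionGuard implementation):

    // A scoped "current context" guard; Context stands in for Fusion.
    struct Context {};

    class ContextGuard {
     public:
      explicit ContextGuard(Context* ctx) : prev_(current_) { current_ = ctx; }
      ~ContextGuard() { current_ = prev_; } // restore on scope exit
      static Context* current() { return current_; }

     private:
      Context* prev_;
      static Context* current_;
    };

    Context* ContextGuard::current_ = nullptr;

    void example() {
      Context ctx;
      ContextGuard guard(&ctx);
      // Within this scope ContextGuard::current() == &ctx; when `guard`
      // is destroyed, the previously current context is restored.
    }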
@@ -66,7 +66,7 @@ class TORCH_CUDA_API FusionGuard {
  *
  * The Fusion owns the whole IR graph (Vals and Exprs)
  */
-class TORCH_CUDA_API Fusion final {
+class TORCH_CUDA_CU_API Fusion final {
  public:
   Fusion() = default;
@@ -61,7 +61,7 @@ class IrCloner;
  * Basically being able to succinctly traverse down the inheritance stack of
  * a Statement at runtime. This is currently implemented in dispatch.h
  */
-class TORCH_CUDA_API Statement : public NonCopyable, public PolymorphicBase {
+class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase {
   friend void swap(Fusion&, Fusion&) noexcept;
 
  public:
@@ -163,7 +163,7 @@ class TORCH_CUDA_API Statement : public NonCopyable, public PolymorphicBase {
  * 5) An enum value must be added to ValType in type.h
  * 6) A string entry must be added in val_type_string_map
  */
-class TORCH_CUDA_API Val : public Statement {
+class TORCH_CUDA_CU_API Val : public Statement {
  public:
   virtual ~Val() = default;

@@ -286,7 +286,7 @@ class TORCH_CUDA_API Val : public Statement {
 // 7) A string entry must be added in expr_type_string_map
 // 8) Entry added to ir_graphviz .cpp/.h
 
-class TORCH_CUDA_API Expr : public Statement {
+class TORCH_CUDA_CU_API Expr : public Statement {
  public:
   Expr() = delete;
   explicit Expr(ExprType _type);
@@ -14,7 +14,7 @@ namespace cuda {
 class Fusion;
 
 // Clones nodes from an existing Fusion
-class TORCH_CUDA_API IrCloner : private OptInConstDispatch {
+class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
   friend class Statement;
 
  public:
@@ -33,7 +33,7 @@ namespace cuda {
 // for example you can't use "~/temp/ir.dot" ("/home/user/temp/ir.dot"
 // must be used instead)
 //
-class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch {
+class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
  public:
   enum class DetailLevel {
     ComputeOnly, // Only dataflow (compute) nodes

@@ -23,7 +23,7 @@ namespace cuda {
  * This value can be a symbolic value (defined after the kernel
  * is compiled) or a constant value (inlined into the kernel definition).
  */
-class TORCH_CUDA_API Bool : public Val {
+class TORCH_CUDA_CU_API Bool : public Val {
  public:
   ~Bool() = default;

@@ -61,7 +61,7 @@ class TORCH_CUDA_API Bool : public Val {
  * Float32. This value can be a symbolic value (defined after the kernel
  * is compiled) or a constant value (inlined into the kernel definition).
  */
-class TORCH_CUDA_API Float : public Val {
+class TORCH_CUDA_CU_API Float : public Val {
  public:
   using ScalarType = double;

@@ -101,7 +101,7 @@ class TORCH_CUDA_API Float : public Val {
  * This value can be a symbolic value (defined after the kernel
  * is compiled) or a constant value (inlined into the kernel definition).
  */
-class TORCH_CUDA_API Half : public Val {
+class TORCH_CUDA_CU_API Half : public Val {
  public:
   ~Half() = default;

@@ -136,7 +136,7 @@ class TORCH_CUDA_API Half : public Val {
 
 // An Int64 value. If used for indexing it's set as size_t. Otherwise it's an
 // inlined literal in the kernel.
-class TORCH_CUDA_API Int : public Val {
+class TORCH_CUDA_CU_API Int : public Val {
  public:
   using ScalarType = int64_t;

@@ -200,7 +200,7 @@ class TVDomainGuard;
 // that should be const, const. Gave this a try but expanded really quickly.
 // getComputeAtAxis not being const because it can return a TV that some expect
 // to be non-const is the biggest headache.
-class TORCH_CUDA_API TensorView : public Val {
+class TORCH_CUDA_CU_API TensorView : public Val {
  public:
   ~TensorView() = default;
@@ -352,9 +352,9 @@ class TORCH_CUDA_API TensorView : public Val {
 
   void setMemoryType(MemoryType mt);
 
-  friend TORCH_CUDA_API TransformReplay;
-  friend TORCH_CUDA_API OptOutMutator;
-  friend TORCH_CUDA_API LoopNestGenerator;
+  friend TORCH_CUDA_CU_API TransformReplay;
+  friend TORCH_CUDA_CU_API OptOutMutator;
+  friend TORCH_CUDA_CU_API LoopNestGenerator;
   friend ComputeAt;
   friend void IrFixComputeAt(Fusion*);
   friend void adjustMemoryTypes(Fusion* fusion);

@@ -35,7 +35,7 @@ bool areEqualScalars(Val* v1, Val* v2);
  * 3) Reduction across a dimension i.e. val.sum(axis=2)
  * 4) split/merge
  */
-class TORCH_CUDA_API UnaryOp : public Expr {
+class TORCH_CUDA_CU_API UnaryOp : public Expr {
  public:
   ~UnaryOp() = default;
   UnaryOp(UnaryOpType _type, Val* _out, Val* _in);

@@ -73,7 +73,7 @@ class TORCH_CUDA_API UnaryOp : public Expr {
  * 1) Add/mul/div/mod/sub (A * B)
  * 2) LT (A < B)
  */
-class TORCH_CUDA_API BinaryOp : public Expr {
+class TORCH_CUDA_CU_API BinaryOp : public Expr {
  public:
   ~BinaryOp() = default;
   BinaryOp(BinaryOpType _type, Val* _out, Val* _lhs, Val* _rhs);
@@ -113,7 +113,7 @@ class TORCH_CUDA_API BinaryOp : public Expr {
  * Broadcast _in to match _out. broadcast_dims are relative to out. Where
  * broadcast_dims.size() + _in->nDims() == _out->nDims().
  */
-class TORCH_CUDA_API BroadcastOp : public Expr {
+class TORCH_CUDA_CU_API BroadcastOp : public Expr {
  public:
   ~BroadcastOp() = default;
   BroadcastOp(Val* _out, Val* _in);
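The invariant quoted in the comment, broadcast_dims.size() + _in->nDims() == _out->nDims(), simply says that every output axis is either carried over from the input or newly introduced by the broadcast. A self-contained illustration of the arithmetic (plain integers, not the fuser API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // True when introducing `broadcast_dims` new axes into an input of
    // `in_ndims` axes can yield an output of `out_ndims` axes.
    bool broadcastShapeOk(std::size_t in_ndims,
                          const std::vector<std::size_t>& broadcast_dims,
                          std::size_t out_ndims) {
      return broadcast_dims.size() + in_ndims == out_ndims;
    }

    int main() {
      // Broadcasting [i, j] to [b, i, j] introduces exactly one new axis.
      assert(broadcastShapeOk(2, {0}, 3));
      // Claiming a 3-D output with no new axes declared fails the invariant.
      assert(!broadcastShapeOk(2, {}, 3));
    }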
@@ -147,7 +147,7 @@ class TORCH_CUDA_API BroadcastOp : public Expr {
  * tensor. The output tensor's size will be the size of all
  * non-reduction/non-broadcast dimensions.
  */
-class TORCH_CUDA_API ReductionOp : public Expr {
+class TORCH_CUDA_CU_API ReductionOp : public Expr {
  public:
   ~ReductionOp() = default;
   ReductionOp(BinaryOpType _reduction_op_type, Val* _init, Val* _out, Val* _in);
@@ -183,7 +183,7 @@ class TORCH_CUDA_API ReductionOp : public Expr {
   Val* const in_ = nullptr;
 };
 
-class TORCH_CUDA_API TernaryOp : public Expr {
+class TORCH_CUDA_CU_API TernaryOp : public Expr {
  public:
   ~TernaryOp() = default;
   TernaryOp(TernaryOpType _type, Val* _out, Val* _in1, Val* _in2, Val* _in3);
@@ -228,7 +228,7 @@ class TORCH_CUDA_API TernaryOp : public Expr {
 // TensorDomains, which represent how to iterate over a tensor, are made up of
 // IterDomains that form an ND iterable. We directly set parallelization
 // strategies on IterDomains.
-class TORCH_CUDA_API IterDomain : public Val {
+class TORCH_CUDA_CU_API IterDomain : public Val {
  public:
   IterDomain(
       Val* _start,
@@ -363,7 +363,7 @@ class TORCH_CUDA_API IterDomain : public Val {
  * operations that take in a TensorDomain, apply a transformation, and output
  * a tensor domain.
  */
-class TORCH_CUDA_API TensorDomain : public Val {
+class TORCH_CUDA_CU_API TensorDomain : public Val {
  public:
   TensorDomain() = delete;
   ~TensorDomain() = default;
@@ -556,7 +556,7 @@ class TORCH_CUDA_API TensorDomain : public Val {
  * Representation of a split on an IterDomain by "factor"
  * TODO: Implement split by nparts
  */
-class TORCH_CUDA_API Split : public Expr {
+class TORCH_CUDA_CU_API Split : public Expr {
  public:
   ~Split() = default;
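A Split by "factor" is the classic loop-splitting transformation: an iteration domain of extent N becomes an outer domain of extent ceil(N / factor) and an inner domain of extent factor, with a guard because N need not divide evenly. Written as plain loops (a conceptual sketch, not code the fuser generates):

    // Before: for (int i = 0; i < N; ++i) body(i);
    // After splitting i by `factor`:
    void splitLoop(int N, int factor) {
      int outer_extent = (N + factor - 1) / factor; // ceil-division
      for (int io = 0; io < outer_extent; ++io) {
        for (int ii = 0; ii < factor; ++ii) {
          int i = io * factor + ii;
          if (i < N) {
            // body(i);
          }
        }
      }
    }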
@@ -598,7 +598,7 @@ class TORCH_CUDA_API Split : public Expr {
  * if there is one.
  * TODO: Should this be a unary op type?
  */
-class TORCH_CUDA_API Merge : public Expr {
+class TORCH_CUDA_CU_API Merge : public Expr {
  public:
   ~Merge() = default;
   Merge(IterDomain* _out, IterDomain* _outer, IterDomain* _inner);

@@ -636,7 +636,7 @@ class TORCH_CUDA_API Merge : public Expr {
  * - blockDim.z
  * - T3.stride[2]
  */
-class TORCH_CUDA_API NamedScalar : public Val {
+class TORCH_CUDA_CU_API NamedScalar : public Val {
  public:
   ~NamedScalar() = default;
   NamedScalar() = delete;

@@ -16,7 +16,7 @@ namespace cuda {
 //! This class is intended for debug printing, so it attempts
 //! to handle invalid states as well.
 //!
-class TORCH_CUDA_API IrPrinter : public OptInConstDispatch {
+class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch {
  public:
   explicit IrPrinter(std::ostream& os) : os_(os) {}

@@ -107,12 +107,12 @@ class TORCH_CUDA_API IrPrinter : public OptInConstDispatch {
   int indent_size_ = 0;
 };
 
-TORCH_CUDA_API std::ostream& operator<<(
+TORCH_CUDA_CU_API std::ostream& operator<<(
     std::ostream& os,
     const Statement* stmt);
 
-TORCH_CUDA_API std::ostream& operator<<(std::ostream& os, Fusion* f);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream& os, Fusion& f);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion* f);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion& f);
 
 // TODO(kir): catch accidental << printing of Kernel IR nodes
 // (use kir::toString(node) instead)

@@ -25,7 +25,7 @@ namespace cuda {
 //
 //! \sa IrTransformPrinter
 //!
-class TORCH_CUDA_API IrMathPrinter : public IrPrinter {
+class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
  public:
   IrMathPrinter(std::ostream& os) : IrPrinter(os) {}

@@ -41,7 +41,7 @@ class TORCH_CUDA_API IrMathPrinter : public IrPrinter {
 //!
 //! \sa IrMathPrinter
 //!
-class TORCH_CUDA_API IrTransformPrinter : public IrPrinter {
+class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
  public:
   IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}

@@ -30,7 +30,7 @@ namespace cuda {
  * TODO: We may want to have ordering of outputs to inputs. I'm not sure why we
  * would want this, but seems like it would be a reasonable request.
  */
-class TORCH_CUDA_API IterVisitor : public OptOutDispatch {
+class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
  public:
   virtual ~IterVisitor() = default;
@@ -147,7 +147,7 @@ class TORCH_CUDA_API IterVisitor : public OptOutDispatch {
  * outputs to guarantee that we will traverse all outputs of all exprs during
  * the backward traversal.
  */
-class TORCH_CUDA_API BackwardVisitor : public OptOutDispatch {
+class TORCH_CUDA_CU_API BackwardVisitor : public OptOutDispatch {
  public:
   virtual ~BackwardVisitor() = default;
@@ -207,7 +207,7 @@ class TORCH_CUDA_API BackwardVisitor : public OptOutDispatch {
     bool traverseAllPaths = false);
 };
 
-class TORCH_CUDA_API DependencyCheck {
+class TORCH_CUDA_CU_API DependencyCheck {
  public:
   // Returns if "dependency" is a dependency of "of".
   static bool isDependencyOf(Val* dependency, Val* of);
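A query like DependencyCheck::isDependencyOf comes down to reachability in the expression graph: walk backwards from `of` through the values it is computed from and report whether `dependency` is encountered. A generic sketch with a simplified node type (not the fuser Val/Expr classes):

    #include <unordered_set>
    #include <vector>

    struct Value {
      std::vector<Value*> producers; // values this one is computed from
    };

    // True if `dependency` is reachable from `of` along producer edges.
    bool isDependencyOf(const Value* dependency, const Value* of) {
      std::unordered_set<const Value*> visited;
      std::vector<const Value*> stack(of->producers.begin(),
                                      of->producers.end());
      while (!stack.empty()) {
        const Value* v = stack.back();
        stack.pop_back();
        if (v == dependency)
          return true;
        if (!visited.insert(v).second)
          continue; // already explored
        for (Value* p : v->producers)
          stack.push_back(p);
      }
      return false;
    }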
@@ -53,7 +53,7 @@ struct KernelSummary {
 //! by a Fusion object. The goal is to have the Kernel object
 //! own the Kernel IR nodes
 //!
-class TORCH_CUDA_API Kernel final : public NonCopyable {
+class TORCH_CUDA_CU_API Kernel final : public NonCopyable {
  public:
   Kernel() = default;
@@ -26,7 +26,7 @@ namespace cuda {
 //! \note the uniqueness of the id generated for a given input set is only
 //!       local to the instance of `InputsIdLookup`.
 //!
-class TORCH_CUDA_API InputsIdLookup {
+class TORCH_CUDA_CU_API InputsIdLookup {
  public:
   //! constructor where maximum cache size is fixed during init
   explicit InputsIdLookup(size_t max_cache_size = 10)
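The class name and the max_cache_size parameter suggest the usual shape for this kind of lookup: map an encoding of the inputs to a stable integer id, and evict old entries once the bound is hit. A generic sketch of such a bounded, LRU-evicting id cache (the string key stands in for whatever input encoding the real class actually uses):

    #include <cstdint>
    #include <list>
    #include <string>
    #include <unordered_map>

    class IdLookup {
     public:
      explicit IdLookup(std::size_t max_size = 10) : max_size_(max_size) {}

      // Returns the cached id for `key`, assigning a new id (and possibly
      // evicting the least recently used entry) if the key is unseen.
      int64_t lookupId(const std::string& key) {
        auto it = map_.find(key);
        if (it != map_.end()) {
          lru_.splice(lru_.begin(), lru_, it->second.lru_pos); // mark recent
          return it->second.id;
        }
        if (map_.size() == max_size_) { // evict least recently used
          map_.erase(lru_.back());
          lru_.pop_back();
        }
        lru_.push_front(key);
        map_[key] = Entry{next_id_++, lru_.begin()};
        return map_[key].id;
      }

     private:
      struct Entry {
        int64_t id;
        std::list<std::string>::iterator lru_pos;
      };
      std::size_t max_size_;
      int64_t next_id_ = 0;
      std::list<std::string> lru_;                 // most recent at front
      std::unordered_map<std::string, Entry> map_;
    };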
@@ -34,7 +34,7 @@ class Passkey {
   Passkey() {}
 };
 
-class TORCH_CUDA_API NamedScalar : public Val {
+class TORCH_CUDA_CU_API NamedScalar : public Val {
  public:
   NamedScalar(Passkey, std::string name, DataType dtype)
       : Val(ValType::KirNamedScalar, dtype, true, true), name_(name) {}
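Passkey above, whose constructor sits in the class's private section judging from the hunk, is the C++ passkey idiom: constructors like NamedScalar(Passkey, ...) stay public, yet only code able to mint a Passkey can call them. A minimal sketch of the idiom with hypothetical names:

    class IrBuilder; // the only party allowed to mint keys

    class Passkey {
     private:
      Passkey() {}            // not constructible by arbitrary code...
      friend class IrBuilder; // ...only by the builder
    };

    class Node {
     public:
      // Publicly declared, but uncallable without a Passkey.
      explicit Node(Passkey) {}
    };

    class IrBuilder {
     public:
      static Node* create() { return new Node(Passkey{}); }
    };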
@@ -64,7 +64,7 @@ class TORCH_CUDA_API NamedScalar : public Val {
   std::string name_;
 };
 
-class TORCH_CUDA_API Bool : public Val {
+class TORCH_CUDA_CU_API Bool : public Val {
  public:
   explicit Bool(Passkey, const c10::optional<bool>& value)
       : Val(ValType::KirScalar, DataType::Bool, true, true),

@@ -87,7 +87,7 @@ class TORCH_CUDA_API Bool : public Val {
   const c10::optional<bool> maybe_value_;
 };
 
-class TORCH_CUDA_API Float : public Val {
+class TORCH_CUDA_CU_API Float : public Val {
  public:
   using ScalarType = double;

@@ -112,7 +112,7 @@ class TORCH_CUDA_API Float : public Val {
   const c10::optional<ScalarType> maybe_value_;
 };
 
-class TORCH_CUDA_API Half : public Val {
+class TORCH_CUDA_CU_API Half : public Val {
  public:
   explicit Half(Passkey, const c10::optional<float>& value)
       : Val(ValType::KirScalar, DataType::Half, true, true),

@@ -135,7 +135,7 @@ class TORCH_CUDA_API Half : public Val {
   const c10::optional<float> maybe_value_;
 };
 
-class TORCH_CUDA_API Int : public Val {
+class TORCH_CUDA_CU_API Int : public Val {
  public:
   using ScalarType = int64_t;

@@ -163,7 +163,7 @@ class TORCH_CUDA_API Int : public Val {
   const c10::optional<ScalarType> maybe_value_;
 };
 
-class TORCH_CUDA_API IterDomain : public Val {
+class TORCH_CUDA_CU_API IterDomain : public Val {
  public:
   IterDomain(Passkey, Val* start, Val* extent);

@@ -233,7 +233,7 @@ class TORCH_CUDA_API IterDomain : public Val {
   bool is_rfactor_domain_ = false;
 };
 
-class TORCH_CUDA_API TensorDomain : public Val {
+class TORCH_CUDA_CU_API TensorDomain : public Val {
  public:
   explicit TensorDomain(Passkey, std::vector<IterDomain*> domain);

@@ -304,7 +304,7 @@ class TORCH_CUDA_API TensorDomain : public Val {
   const std::vector<bool> contiguity_;
 };
 
-class TORCH_CUDA_API TensorView : public Val {
+class TORCH_CUDA_CU_API TensorView : public Val {
  public:
   explicit TensorView(Passkey, const fuser::cuda::TensorView* tv);

@@ -329,7 +329,7 @@ class TORCH_CUDA_API TensorView : public Val {
   const fuser::cuda::TensorView* fuser_tv_ = nullptr;
 };
 
-class TORCH_CUDA_API UnaryOp : public Expr {
+class TORCH_CUDA_CU_API UnaryOp : public Expr {
  public:
   UnaryOp(Passkey, UnaryOpType type, Val* out, Val* in);

@@ -351,7 +351,7 @@ class TORCH_CUDA_API UnaryOp : public Expr {
   Val* const in_ = nullptr;
 };
 
-class TORCH_CUDA_API BinaryOp : public Expr {
+class TORCH_CUDA_CU_API BinaryOp : public Expr {
  public:
   BinaryOp(Passkey, BinaryOpType type, Val* out, Val* lhs, Val* rhs);

@@ -378,7 +378,7 @@ class TORCH_CUDA_API BinaryOp : public Expr {
   Val* const rhs_ = nullptr;
 };
 
-class TORCH_CUDA_API TernaryOp : public Expr {
+class TORCH_CUDA_CU_API TernaryOp : public Expr {
  public:
   TernaryOp(
       Passkey,

@@ -416,7 +416,7 @@ class TORCH_CUDA_API TernaryOp : public Expr {
   Val* const in3_ = nullptr;
 };
 
-class TORCH_CUDA_API ReductionOp : public Expr {
+class TORCH_CUDA_CU_API ReductionOp : public Expr {
  public:
   ReductionOp(
       Passkey,

@@ -460,7 +460,7 @@ class TORCH_CUDA_API ReductionOp : public Expr {
   Bool* const pred_ = nullptr;
 };
 
-class TORCH_CUDA_API TensorIndex : public Val {
+class TORCH_CUDA_CU_API TensorIndex : public Val {
  public:
   TensorIndex(
       Passkey,

@@ -486,7 +486,7 @@ class TORCH_CUDA_API TensorIndex : public Val {
   std::vector<Val*> indices_;
 };
 
-class TORCH_CUDA_API BroadcastOp : public Expr {
+class TORCH_CUDA_CU_API BroadcastOp : public Expr {
  public:
   BroadcastOp(Passkey, Val* out, Val* in);
@@ -510,7 +510,7 @@ class TORCH_CUDA_API BroadcastOp : public Expr {
 //
 // TODO: The components of Allocate like Type and Name could be separated from
 // the associated TensorView. Perhaps that is more appropriate?
-class TORCH_CUDA_API Allocate : public Expr {
+class TORCH_CUDA_CU_API Allocate : public Expr {
  public:
   explicit Allocate(
       Passkey,
@@ -560,7 +560,7 @@ class TORCH_CUDA_API Allocate : public Expr {
 };
 
 // Sync represents __syncthreads barrier for block level coordination.
-class TORCH_CUDA_API Sync : public Expr {
+class TORCH_CUDA_CU_API Sync : public Expr {
  public:
   explicit Sync(Passkey, bool war_sync = false);

@@ -574,7 +574,7 @@ class TORCH_CUDA_API Sync : public Expr {
 };
 
 // TODO(kir): promote to IR node
-class TORCH_CUDA_API Scope {
+class TORCH_CUDA_CU_API Scope {
  public:
   Scope() = default;

@@ -633,7 +633,7 @@ class TORCH_CUDA_API Scope {
 //
 // TODO(kir): this is not a real expression
 //
-class TORCH_CUDA_API ForLoop : public Expr {
+class TORCH_CUDA_CU_API ForLoop : public Expr {
  public:
   ForLoop(Passkey, Val* index, IterDomain* iter_domain, Expr* parent_scope);

@@ -673,7 +673,7 @@ class TORCH_CUDA_API ForLoop : public Expr {
 //
 // TODO(kir): this is not a real expression
 //
-class TORCH_CUDA_API IfThenElse : public Expr {
+class TORCH_CUDA_CU_API IfThenElse : public Expr {
  public:
   explicit IfThenElse(Passkey, Bool* cond, Expr* parent_scope);

@@ -717,7 +717,7 @@ class TORCH_CUDA_API IfThenElse : public Expr {
 // explicitly mark a grid reduction and the buffer allocation needed to do it.
 // This node provides FusionExecutor the information it needs to allocate the
 // reduction and sync buffers.
-class TORCH_CUDA_API GridReduction : public Expr {
+class TORCH_CUDA_CU_API GridReduction : public Expr {
  public:
   explicit GridReduction(Passkey, ReductionOp* reduction_op);
@@ -20,7 +20,7 @@ namespace kir {
 //! This class is intended for debug printing, so it attempts
 //! to handle invalid IR states as much as possible.
 //!
-class TORCH_CUDA_API IrPrinter : private OptInConstDispatch {
+class TORCH_CUDA_CU_API IrPrinter : private OptInConstDispatch {
   static constexpr char* kTab = "  ";
 
  public:
@@ -148,7 +148,7 @@ Kernel* GpuLower::kernel() const {
 //
 // TODO(kir): this is an interim solution for easing the Kernel IR splitting
 //
-class TORCH_CUDA_API GpuLower::KernelIrMapper : private OptInConstDispatch {
+class TORCH_CUDA_CU_API GpuLower::KernelIrMapper : private OptInConstDispatch {
  public:
   explicit KernelIrMapper(GpuLower* gpu_lower)
       : gpu_lower_(gpu_lower), ir_builder_(gpu_lower->kernel()) {}
@@ -14,7 +14,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-class TORCH_CUDA_API GpuLower {
+class TORCH_CUDA_CU_API GpuLower {
   class KernelIrMapper;
 
  public:

@@ -14,7 +14,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-class TORCH_CUDA_API IndexLowering : public OptInDispatch {
+class TORCH_CUDA_CU_API IndexLowering : public OptInDispatch {
  public:
   static std::vector<Expr*> getIndexedExprs(
       Fusion* fusion,

@@ -29,7 +29,7 @@ namespace cuda {
  * nests to initialize reduction buffers.
  *
  */
-class TORCH_CUDA_API LoopNestGenerator : public OptOutDispatch {
+class TORCH_CUDA_CU_API LoopNestGenerator : public OptOutDispatch {
  public:
   static std::vector<Expr*> loweredExprs(
       Fusion* fusion,

@@ -22,7 +22,7 @@ namespace cuda {
 //! If we follow a reduction parallelized on TIDx with a broadcast on TIDx we
 //! no longer need the predicate and can reset the bit accordingly
 //!
-class TORCH_CUDA_API ThreadPredicateMap {
+class TORCH_CUDA_CU_API ThreadPredicateMap {
  public:
   using SourceMapType = std::unordered_map<
       ParallelType,

@@ -49,7 +49,7 @@ namespace cuda {
  * corners.
  */
 
-class TORCH_CUDA_API UnrollPass : public OptOutDispatch {
+class TORCH_CUDA_CU_API UnrollPass : public OptOutDispatch {
  private:
   // Wrapper to access thread_predicates_ based on an output TV
   kir::Bool* getThreadPredicate(TensorView*);
@@ -22,15 +22,17 @@ namespace cuda {
 // Get fusion_node ready for execution.
 // find or compile `CudaKernel` for graph stored in `attr::Subgraph`
 // this function assigns `attr::cache_id` to `fusion_node`
-TORCH_CUDA_API void compileCudaFusionGroup(Node* fusion_node);
+TORCH_CUDA_CU_API void compileCudaFusionGroup(Node* fusion_node);
 
 // Execute fusion_node.
 // Current protocol is that the function allocates output tensors and appends
 // them to `stack` after execution.
 // TODO: support shape inferencing. Right now we only handle static shapes
-TORCH_CUDA_API void runCudaFusionGroup(const Node* fusion_node, Stack& stack);
+TORCH_CUDA_CU_API void runCudaFusionGroup(
+    const Node* fusion_node,
+    Stack& stack);
 
-TORCH_CUDA_API void CudaFuseGraph(std::shared_ptr<Graph>& graph);
+TORCH_CUDA_CU_API void CudaFuseGraph(std::shared_ptr<Graph>& graph);
 
 } // namespace cuda
 } // namespace fuser
@@ -30,15 +30,15 @@ constexpr int kFcdReductionThreadX = 128;
 constexpr int kNonFcdReductionThreadX = 32;
 constexpr int kNonFcdReductionThreadY = 32;
 
-TORCH_CUDA_API bool hasReductionNode(const Block* block);
+TORCH_CUDA_CU_API bool hasReductionNode(const Block* block);
 
-TORCH_CUDA_API bool isReductionNode(const Node* node);
+TORCH_CUDA_CU_API bool isReductionNode(const Node* node);
 
 // returns whether or not a parsing function exists for the given node type.
-TORCH_CUDA_API bool isNodeParsible(const Node* node);
+TORCH_CUDA_CU_API bool isNodeParsible(const Node* node);
 
 // lowers PyTorch jit graph to `Fusion`.
-TORCH_CUDA_API std::unique_ptr<Fusion> parseJitIR(
+TORCH_CUDA_CU_API std::unique_ptr<Fusion> parseJitIR(
     const std::shared_ptr<Graph>& graph);
 
 } // namespace cuda

@@ -19,10 +19,10 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-TORCH_CUDA_API bool isFusableCudaFusionGroup(const Node* node);
+TORCH_CUDA_CU_API bool isFusableCudaFusionGroup(const Node* node);
 
 // consider if `node` could be fused into `fusion`
-TORCH_CUDA_API bool isFusableCudaFusionGroup(
+TORCH_CUDA_CU_API bool isFusableCudaFusionGroup(
     const Node* fusion,
     const Node* node);

@@ -50,7 +50,7 @@ class PredicateCompute {
     bool ignore_block_grid_reductions = true);
 };
 
-class TORCH_CUDA_API UnrollPredicate {
+class TORCH_CUDA_CU_API UnrollPredicate {
  public:
   static kir::Bool* get(
      const std::vector<kir::ForLoop*>& outer_loops,

@@ -288,7 +288,7 @@ ReductionParams reductionHeuristic(
 }
 } // anonymous namespace
 
-TORCH_CUDA_API c10::optional<ReductionParams> getReductionHeuristics(
+TORCH_CUDA_CU_API c10::optional<ReductionParams> getReductionHeuristics(
     Fusion* fusion,
     const at::ArrayRef<c10::IValue>& fusion_inputs,
     TensorView* red_tv) {
@@ -11,7 +11,7 @@ namespace fuser {
 namespace cuda {
 
 // returns whether the given fusion can be scheduled
-TORCH_CUDA_API bool scheduleFusion(
+TORCH_CUDA_CU_API bool scheduleFusion(
     Fusion* fusion,
     const at::ArrayRef<c10::IValue> inputs);
@@ -56,12 +56,12 @@ class ReductionParamsHash {
   }
 };
 
-TORCH_CUDA_API c10::optional<ReductionParams> getReductionHeuristics(
+TORCH_CUDA_CU_API c10::optional<ReductionParams> getReductionHeuristics(
    Fusion* fusion,
    const at::ArrayRef<c10::IValue>& fusion_inputs,
    TensorView* red_tv);
 
-TORCH_CUDA_API void scheduleReduction(
+TORCH_CUDA_CU_API void scheduleReduction(
    Fusion* fusion,
    const ReductionParams& rparams,
    TensorView* red_tv,
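ReductionParamsHash above exists so heuristic results can serve as keys in an unordered container, for instance to cache one compiled kernel per parameter set. The general shape of such a functor, using hypothetical fields rather than the actual ReductionParams layout:

    #include <cstddef>
    #include <functional>
    #include <unordered_map>

    // Hypothetical parameter record; the real ReductionParams differs.
    struct Params {
      int block_dim_x;
      bool cross_block;
      bool operator==(const Params& o) const {
        return block_dim_x == o.block_dim_x && cross_block == o.cross_block;
      }
    };

    struct ParamsHash {
      std::size_t operator()(const Params& p) const {
        // Combine per-field hashes; any reasonable mixing works for a cache.
        std::size_t h = std::hash<int>()(p.block_dim_x);
        h ^= std::hash<bool>()(p.cross_block) + 0x9e3779b9 + (h << 6) + (h >> 2);
        return h;
      }
    };

    // A parameter-keyed cache; int stands in for the cached artifact.
    using KernelCache = std::unordered_map<Params, int, ParamsHash>;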
@@ -8,7 +8,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-TORCH_CUDA_API void TypePropagate(std::shared_ptr<Graph>& graph);
+TORCH_CUDA_CU_API void TypePropagate(std::shared_ptr<Graph>& graph);
 
 } // namespace cuda
 } // namespace fuser

@@ -39,7 +39,7 @@ struct id_int_lt {
 //
 // If error_on_failure = false, replay will replay everything it can, and ignore
 // operations it can't.
-class TORCH_CUDA_API ReplayTransformations : public IterVisitor {
+class TORCH_CUDA_CU_API ReplayTransformations : public IterVisitor {
  protected:
   const std::vector<IterDomain*>& target_domain_;
   std::unordered_map<IterDomain*, IterDomain*> id_map_;

@@ -148,7 +148,7 @@ class TORCH_CUDA_API ReplayTransformations : public IterVisitor {
  * history replay_domain
  */
 
-class TORCH_CUDA_API BestEffortReplay {
+class TORCH_CUDA_CU_API BestEffortReplay {
  private:
   std::unordered_map<IterDomain*, IterDomain*> id_map_;
   std::unordered_map<IterDomain*, size_t> leaf_ids_;

@@ -120,7 +120,7 @@ namespace cuda {
 class TensorDomain;
 class TensorView;
 
-class TORCH_CUDA_API TransformReplay {
+class TORCH_CUDA_CU_API TransformReplay {
  public:
   // Replay producer as consumer, returns {producer, producer_compute_at_axis}.
   static std::pair<TensorDomain*, unsigned int> replayPasC(
@@ -15,7 +15,7 @@ namespace cuda {
 
 // TODO: Only replay dispatch is really borrowed from TransformIter, we should
 // reevaluate the reuse of dispatch for classes that inherit TransformIter.
-class TORCH_CUDA_API TransformRFactor {
+class TORCH_CUDA_CU_API TransformRFactor {
  public:
   // Create a copy of td, change its history by preserving axes so they appear
   // in the root domain
@@ -489,11 +489,13 @@ std::ostream& operator<<(std::ostream& out, const MemoryType mtype) {
   return out << memory_type2string(mtype);
 }
 
-TORCH_CUDA_API std::ostream& operator<<(std::ostream& out, const IterType bt) {
+TORCH_CUDA_CU_API std::ostream& operator<<(
+    std::ostream& out,
+    const IterType bt) {
   return out << iter_type2string(bt);
 }
 
-TORCH_CUDA_API c10::optional<std::string> inline_op_str(
+TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(
     const UnaryOpType uotype) {
   const char* str = unary_op_type_inline_op2string(uotype);
   return str != nullptr ? c10::optional<std::string>(std::string(str))
@@ -169,23 +169,23 @@ bool is_logical_op(const BinaryOpType& bot);
 DataType aten_to_data_type(const at::ScalarType& scalar_type);
 at::ScalarType data_type_to_aten(const DataType& data_type);
 
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ValType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const DataType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ExprType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const UnaryOpType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const BinaryOpType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const TernaryOpType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const ParallelType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const MemoryType);
-TORCH_CUDA_API std::ostream& operator<<(std::ostream&, const IterType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ValType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const DataType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ExprType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const UnaryOpType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const BinaryOpType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const TernaryOpType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const ParallelType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const MemoryType);
+TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream&, const IterType);
 
 std::string stringifyThreadSize(const ParallelType);
 std::string stringifyThread(const ParallelType);
 
-TORCH_CUDA_API c10::optional<std::string> inline_op_str(const UnaryOpType);
-TORCH_CUDA_API c10::optional<std::string> inline_op_str(const BinaryOpType);
+TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(const UnaryOpType);
+TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(const BinaryOpType);
 
-TORCH_CUDA_API c10::optional<std::string> cast_func_str(
+TORCH_CUDA_CU_API c10::optional<std::string> cast_func_str(
     const std::pair<DataType, DataType>&);
 
 size_t dataTypeSize(DataType type);
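All of these operator<< declarations follow one pattern, visible in the type.cpp hunk above: a switch-based enum-to-string helper plus a thin streaming overload. A toy version of the pattern:

    #include <iostream>

    enum class ParallelKind { Serial, ThreadX, BlockX };

    // Map each enumerator to a printable name.
    const char* parallel_kind2string(ParallelKind k) {
      switch (k) {
        case ParallelKind::Serial:
          return "serial";
        case ParallelKind::ThreadX:
          return "threadIdx.x";
        case ParallelKind::BlockX:
          return "blockIdx.x";
      }
      return "unknown";
    }

    std::ostream& operator<<(std::ostream& os, ParallelKind k) {
      return os << parallel_kind2string(k);
    }

    int main() {
      std::cout << ParallelKind::ThreadX << "\n"; // prints threadIdx.x
    }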
@@ -17,14 +17,14 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-TORCH_CUDA_API void getMajorMinor(
+TORCH_CUDA_CU_API void getMajorMinor(
     const cudaDeviceProp* const prop,
     int& major,
     int& minor);
 
 // A class holding metadata for an actual CUDA function.
 // Note: CUDA functions are per device.
-struct TORCH_CUDA_API FusedKernelCUDA
+struct TORCH_CUDA_CU_API FusedKernelCUDA
     : public ::torch::jit::fuser::FusedKernel {
   FusedKernelCUDA(
       at::DeviceIndex device,
@@ -185,7 +185,7 @@ class CudaPrinter : public IRPrinter {
 
 // Construct Cuda C from the buffer and tensor input, and invoke the kernel
 // when real arguments are provided.
-class TORCH_CUDA_API CudaCodeGen : public CodeGen {
+class TORCH_CUDA_CU_API CudaCodeGen : public CodeGen {
  public:
   template <typename... Ts>
   CudaCodeGen(Stmt* stmt, Ts... ts)