mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/52240 Test Plan: Sandcastle tests Reviewed By: xush6528 Differential Revision: D26408330 fbshipit-source-id: 60779ba0e38c8f90e0e341c8faa2661e631112dd
139 lines
4.2 KiB
Plaintext
139 lines
4.2 KiB
Plaintext
/**
|
|
* Copyright (c) 2016-present, Facebook, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "caffe2/operators/unique_ops.h"
|
|
|
|
#include <thrust/device_vector.h>
|
|
#include <thrust/sequence.h>
|
|
#include <thrust/sort.h>
|
|
#include <thrust/system/cuda/execution_policy.h>
|
|
#include <thrust/unique.h>
|
|
#include <thrust/version.h>
|
|
#include "caffe2/core/context_gpu.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
#if THRUST_VERSION >= 100800
|
|
namespace {
|
|
/**
 * Writes the "remapping" output of the Unique operator: for every input
 * element, the index of its value within the list of unique values.
 *
 * Work split: one thread per unique value. Thread i owns the run of sorted
 * positions [second_order[i], run_end), where run_end is second_order[i + 1]
 * for i < K - 1 and N for the last run, and stores i at output[order[p]] for
 * every position p in that run.
 *
 * Launch layout: 1-D grid of 1-D blocks with at least K threads in total;
 * threads whose global index is >= K return immediately.
 *
 * Preconditions (established by the caller in this file): the first K entries
 * of second_order are strictly increasing positions in [0, N), and order holds
 * N indices that are valid subscripts of output.
 *
 * @param second_order sorted position at which each unique value's run starts
 * @param order        original input index of each sorted position
 * @param output       destination buffer, written at output[order[p]]
 * @param N            number of input elements
 * @param K            number of unique values
 */
__global__ void remap_kernel(
    thrust::device_ptr<int> second_order,
    thrust::device_ptr<int> order,
    int* output,
    int N,
    int K) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i >= K) {
    return;
  }

  // Read both run boundaries once. The compiler cannot hoist the
  // second_order[i + 1] load out of the loop by itself because the stores
  // through `output` might alias it.
  const int run_begin = second_order[i];
  int run_end = N; // the last unique value's run extends to the end
  if (i != K - 1) {
    run_end = second_order[i + 1];
  }

  for (int idx = run_begin; idx < run_end; ++idx) {
    output[order[idx]] = i;
  }
}
|
|
|
|
} // namespace
|
|
|
|
/**
 * CUDA implementation of the Unique operator for element type T.
 *
 * Reads the 1-D tensor Input(0) and produces:
 *   - Output(UNIQUE): the distinct input values in ascending order;
 *   - Output(REMAPPING), only when that output is requested: an int tensor
 *     with the input's sizes, where entry j is the index into UNIQUE of
 *     input element j.
 *
 * The work is issued on context_.cuda_stream(): the input is copied into
 * thrust_unique_buffer_, sorted together with an index vector, compacted with
 * unique_by_key, and finally remap_kernel fills the remapping.
 *
 * @return true on success; a failed CAFFE_ENFORCE reports an invalid input.
 */
template <>
template <typename T>
bool UniqueOp<CUDAContext>::DoRunWithType() {
  auto& inputTensor = Input(0);
  // Validate the rank before touching dimension 0, so that a non-vector input
  // is reported with the message below instead of failing inside dim32(0).
  CAFFE_ENFORCE_EQ(inputTensor.dim(), 1, "Input should be a vector");
  // use dim32 to enforce that it's fine to have remapping of type int
  int N = inputTensor.dim32(0);

  // The remapping output is optional; it is only allocated when requested.
  int* remapping = nullptr;
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor =
        Output(REMAPPING, inputTensor.sizes(), at::dtype<int>());
    remapping = remappingTensor->template mutable_data<int>();
  }

  if (N <= 0) {
    // if the input is empty, we have nothing to do, not even launch kernel.
    /* auto* uniqueTensor = */ Output(UNIQUE, {0}, at::dtype<T>());
    return true;
  }

  // Work on a copy of the input so that sorting does not modify Input(0).
  const T* input = inputTensor.template data<T>();
  ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(CUDA));
  auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
  context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);

  // Create two vectors of {0, 1, ..., N-1} on CUDA device
  thrust::device_vector<int> order1(N), order2(N);
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order1.begin(),
      order1.end());
  thrust::sequence(
      thrust::cuda::par.on(context_.cuda_stream()),
      order2.begin(),
      order2.end());

  // Sort the input along with the order vector, so that we know where each
  // element was permuted to. For example:
  // input1 = 1,3,5,1,5,7,9
  // order1 = 0,1,2,3,4,5,6
  // Now we have:
  // output = 1,1,3,5,5,7,9
  // order1 = 0,3,1,2,4,5,6
  thrust::sort_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order1.begin());

  // Apply a subsequent unique-by-key pass to obtain a second order buffer
  // holding the sorted position at which each distinct value starts:
  // input2 = 1,1,3,5,5,7,9
  // order2 = 0,1,2,3,4,5,6
  // Now we have:
  // output = 1,3,5,7,9
  // order2 = 0,2,3,5,6
  auto new_last = thrust::unique_by_key(
      thrust::cuda::par.on(context_.cuda_stream()),
      buffer,
      buffer + N,
      order2.begin());
  // Number of distinct values; it is at most N, so it fits in an int.
  int K = static_cast<int>(new_last.first - buffer);

  auto* uniqueTensor = Output(UNIQUE, {K}, at::dtype<T>());
  T* unique = uniqueTensor->template mutable_data<T>();
  context_.CopyItemsSameDevice(thrust_unique_buffer_.meta(), K, buffer, unique);

  // Compute the remapping. For example, for the number 1, if we look at
  // order2[0] and order2[1], we know that input2[0:2) are all 1. They are all
  // remapped to 0 in the final output. And from order1, we know where they
  // come from. The rest is easy.
  if (remapping != nullptr) {
    // record remap: one thread per unique value
    remap_kernel<<<
        CAFFE_GET_BLOCKS(K),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        order2.data(), order1.data(), remapping, N, K);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
  return true;
}
|
|
|
|
// Register UniqueOp<CUDAContext> under the operator name "Unique" through the
// REGISTER_CUDA_OPERATOR macro (defined outside this file).
REGISTER_CUDA_OPERATOR(Unique, UniqueOp<CUDAContext>);
|
|
|
|
#endif // THRUST_VERSION >= 100800
|
|
} // namespace caffe2
|