[SE] Add cudnnTransformTensor to StreamExecutor.
PiperOrigin-RevId: 158062553
This commit is contained in:
parent 827874c307
commit 9e6899720a
@@ -2401,6 +2401,31 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
  return (*transform_scratch)->device_memory();
}

bool CudnnSupport::DoTransformTensor(Stream* stream,
                                     const dnn::BatchDescriptor& input_desc,
                                     const DeviceMemory<float>& input_data,
                                     const dnn::BatchDescriptor& output_desc,
                                     DeviceMemory<float>* output_data) {
  mutex_lock lock{dnn_handle_mutex_};
  float alpha = 1.0f;
  float beta = 0.0f;
  ScopedTensorDescriptor input_tensor_desc(parent_, input_desc,
                                           CUDNN_DATA_FLOAT);
  ScopedTensorDescriptor output_tensor_desc(parent_, output_desc,
                                            CUDNN_DATA_FLOAT);
  cudnnStatus_t status = wrap::cudnnTransformTensor(
      parent_, ToHandle(dnn_handle_), &alpha, input_tensor_desc.handle(),
      input_data.opaque(), &beta, output_tensor_desc.handle(),
      output_data->opaque());
  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "Could not transform a tensor from layout "
               << input_desc.ToShortString() << " to "
               << output_desc.ToShortString();
    return false;
  }
  return true;
}

template <class T>
bool CudnnSupport::DoConvolveBackwardDataImpl(
    Stream* stream,
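For context, the wrapper above bottoms out in cuDNN's cudnnTransformTensor, which computes y = alpha * transform(x) + beta * y over two tensor descriptors that describe the same logical shape in different layouts; with alpha = 1 and beta = 0, as used here, it is a pure relayout. The standalone sketch below is not part of this commit: it shows the raw cuDNN call directly, without StreamExecutor, for an NHWC-to-NCHW transform, with error handling reduced to a single status check.

// Minimal raw-cuDNN sketch (not from this commit): transform a float tensor
// from NHWC to NCHW, which is the call the StreamExecutor wrapper delegates to.
#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int n = 1, c = 3, h = 4, w = 4;
  const size_t bytes = size_t(n) * c * h * w * sizeof(float);

  cudnnHandle_t handle;
  cudnnCreate(&handle);

  // Same logical shape, two layouts: x is NHWC, y is NCHW.
  cudnnTensorDescriptor_t x_desc, y_desc;
  cudnnCreateTensorDescriptor(&x_desc);
  cudnnCreateTensorDescriptor(&y_desc);
  cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, n, c, h, w);
  cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);

  float *x = nullptr, *y = nullptr;
  cudaMalloc(&x, bytes);
  cudaMalloc(&y, bytes);

  // y = alpha * transform(x) + beta * y; alpha = 1, beta = 0 is a plain
  // relayout, matching the constants in CudnnSupport::DoTransformTensor above.
  float alpha = 1.0f, beta = 0.0f;
  cudnnStatus_t status =
      cudnnTransformTensor(handle, &alpha, x_desc, x, &beta, y_desc, y);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::fprintf(stderr, "cudnnTransformTensor failed: %s\n",
                 cudnnGetErrorString(status));
  }

  cudaFree(x);
  cudaFree(y);
  cudnnDestroyTensorDescriptor(x_desc);
  cudnnDestroyTensorDescriptor(y_desc);
  cudnnDestroy(handle);
  return status == CUDNN_STATUS_SUCCESS ? 0 : 1;
}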
@@ -481,6 +481,11 @@ class CudnnSupport : public dnn::DnnSupport {
      std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch)
      EXCLUSIVE_LOCKS_REQUIRED(dnn_handle_mutex_);

  bool DoTransformTensor(Stream* stream, const dnn::BatchDescriptor& input_desc,
                         const DeviceMemory<float>& input_data,
                         const dnn::BatchDescriptor& output_desc,
                         DeviceMemory<float>* output_data) override;

  template <class T>
  bool DoBatchNormalizationForwardImpl(
      Stream* stream, dnn::DataType data_type, const DeviceMemory<T>& x,
@@ -1960,6 +1960,23 @@ class DnnSupport {
    return false;
  }

  // Transforms a tensor into another tensor with a different layout and/or data
  // type.
  //
  // Arguments:
  //  stream: pointer to the stream where this operation should be enqueued to.
  //  input_desc: descriptor for the input tensor.
  //  input_data: the device memory region that contains the input tensor.
  //  output_desc: descriptor for the output tensor.
  //  output_data: the device memory region that contains the output tensor.
  virtual bool DoTransformTensor(Stream* stream,
                                 const dnn::BatchDescriptor& input_desc,
                                 const DeviceMemory<float>& input_data,
                                 const dnn::BatchDescriptor& output_desc,
                                 DeviceMemory<float>* output_data) {
    return false;
  }

 private:
  SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
};
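The input_desc/output_desc arguments documented above are ordinary dnn::BatchDescriptor values; a layout change is expressed simply by giving the two descriptors identical logical dimensions but different layouts. The helper below is an illustration only, not code from this commit: the function name and include path are assumptions, the dimension values are arbitrary, and it relies on the existing BatchDescriptor chained setters and the kBatchYXDepth (NHWC) / kBatchDepthYX (NCHW) layout enumerators.

// Illustration only (not from this commit): build two BatchDescriptors that
// describe the same 1x3x4x4 tensor in NHWC and NCHW. Include path assumed;
// the helper name is hypothetical.
#include <utility>

#include "tensorflow/stream_executor/dnn.h"

namespace gpu = perftools::gputools;

std::pair<gpu::dnn::BatchDescriptor, gpu::dnn::BatchDescriptor>
MakeNhwcToNchwDescriptors() {
  gpu::dnn::BatchDescriptor input_desc;
  input_desc.set_count(1)
      .set_feature_map_count(3)
      .set_height(4)
      .set_width(4)
      .set_layout(gpu::dnn::DataLayout::kBatchYXDepth);  // NHWC

  gpu::dnn::BatchDescriptor output_desc;
  output_desc.set_count(1)
      .set_feature_map_count(3)
      .set_height(4)
      .set_width(4)
      .set_layout(gpu::dnn::DataLayout::kBatchDepthYX);  // NCHW

  return {input_desc, output_desc};
}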
@@ -4389,6 +4389,23 @@ Stream &Stream::ThenRnnBackward(
  return *this;
}

Stream &Stream::ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
                                    const DeviceMemory<float> &input_data,
                                    const dnn::BatchDescriptor &output_desc,
                                    DeviceMemory<float> *output_data) {
  VLOG_CALL(PARAM(input_desc), PARAM(input_data), PARAM(output_desc),
            PARAM(output_data));
  if (ok()) {
    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
      CheckError(dnn->DoTransformTensor(this, input_desc, input_data,
                                        output_desc, output_data));
    } else {
      SetErrorAndLogNoDnnSupport();
    }
  }
  return *this;
}

Stream &Stream::ThenDoHostCallbackForTest(std::function<void()> callback) {
  VLOG_CALL(PARAM(callback));
@@ -1653,6 +1653,13 @@ class Stream {
      DeviceMemory<uint8> *reserve_space_data,
      ScratchAllocator *workspace_allocator);

  // Enqueue onto the stream an operation that transforms a tensor.
  // See DnnSupport::DoTransformTensor for more details.
  Stream &ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
                              const DeviceMemory<float> &input_data,
                              const dnn::BatchDescriptor &output_desc,
                              DeviceMemory<float> *output_data);

  // (Synchronously) block the host code waiting for the operations
  // entrained on the stream (enqueued to this point in program
  // execution) to complete.
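Putting the pieces together, a caller-side sketch of the new entry point might look like the following. This is a hedged example rather than code from the commit: the function name is hypothetical, the include path is assumed, and the stream plus the two DeviceMemory<float> allocations are presumed to have been set up elsewhere, as in any StreamExecutor client.

// Hedged usage sketch (not from this commit). Assumes `stream` is an
// initialized Stream on a CUDA StreamExecutor and that the input and output
// DeviceMemory<float> buffers hold matching element counts.
#include "tensorflow/stream_executor/stream.h"  // include path assumed

void RelayoutOnStream(
    perftools::gputools::Stream* stream,
    const perftools::gputools::dnn::BatchDescriptor& input_desc,
    const perftools::gputools::DeviceMemory<float>& input_mem,
    const perftools::gputools::dnn::BatchDescriptor& output_desc,
    perftools::gputools::DeviceMemory<float>* output_mem) {
  // Enqueue the transform; like other Then* calls this is asynchronous and
  // marks the stream as errored if the DNN backend rejects the request.
  stream->ThenTransformTensor(input_desc, input_mem, output_desc, output_mem);

  // Optional: synchronize before reading the result back on the host.
  stream->BlockHostUntilDone();
}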