pytorch/torch/csrc/jit/codegen/onednn/kernel.cpp
Xia, Weiwen 97a291f6bd [ONEDNN][BC-breaking] update onednn from v2.7.3 to v3.1.1 (#97957)
**Summary**
Update oneDNN from v2.7.3 to v3.1.1.
The change is BC-breaking because some APIs changed on the oneDNN side. Changes include:
- PyTorch code that calls oneDNN directly.
- Submodule `third_party/ideep`, updated to adapt to oneDNN's new API.
- CMake files, updated to fix build issues.

**Test plan**
Build issues and correctness are covered by CI checks.
For performance, we ran TorchBench models to ensure there is no regression. Below is the comparison before and after the oneDNN update.
![image](https://github.com/pytorch/pytorch/assets/12522207/415a4ff0-7566-40c6-aed0-24997a475b0e)

Note:
- Base commit of PyTorch: da322ea
- CPU: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz (Ice Lake)
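
As an extra sanity check that a local build actually links the updated library, the runtime oneDNN version can be queried through the oneDNN C API. Below is a minimal sketch; the `oneapi/dnnl/dnnl.h` include path follows recent oneDNN releases and may differ per build setup:

```cpp
#include <oneapi/dnnl/dnnl.h>

#include <cstdio>

int main() {
  // dnnl_version() returns a pointer to a static struct describing
  // the oneDNN library that was linked in.
  const dnnl_version_t* v = dnnl_version();
  std::printf("oneDNN %d.%d.%d (hash %s)\n", v->major, v->minor, v->patch, v->hash);
  // After this update we expect at least v3.1.x.
  return (v->major > 3 || (v->major == 3 && v->minor >= 1)) ? 0 : 1;
}
```

This is only a local sanity check; the CI builds above remain the source of truth for build and correctness coverage.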

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97957
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
2023-08-25 12:13:18 +00:00

#include <torch/csrc/jit/codegen/onednn/graph_helper.h>
#include <torch/csrc/jit/codegen/onednn/kernel.h>
#include <ATen/core/functional.h>
#include <torch/csrc/jit/jit_log.h>
namespace torch {
namespace jit {
namespace fuser {
namespace onednn {
using namespace dnnl::graph;
using data_type = dnnl::graph::logical_tensor::data_type;
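// The kernel is constructed once per fusion node: it re-runs LLGA graph
// partitioning on the subgraph and expects exactly one partition back.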
LlgaKernel::LlgaKernel(const Node* fusionNode)
: fusionNode_(fusionNode),
graph_(fusionNode->g(attr::Subgraph)),
nGraphInputs_(graph_->inputs().size()),
nOutputs_(graph_->outputs().size()),
debugName_(genDebugName()) {
// TODO: This is a workaround that recreates the partitions here.
// Ideally, the partition serialization API (not yet available in LLGA) would
// carry a serialized string representation from the graph rewrite pass and
// deserialize it here.
auto llgaGraphHelper = LlgaGraphHelper(graph_);
auto partitions = llgaGraphHelper.getPartitions();
tensorIdToValue_ = llgaGraphHelper.getTensorIdToValue();
TORCH_CHECK(
partitions.size() == 1,
"LLGA subgraph should contain only one partition");
partition_ = partitions[0];
nPartitionInputs_ = partition_.get_input_ports().size();
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Initialized ", debugName(), "\n", graph_->toString());
#endif
}
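// Whether the output at `offset` should keep oneDNN Graph's opaque layout,
// as recorded on the fusion node by the graph rewrite pass.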
bool LlgaKernel::useOpaqueLayout(size_t offset) const {
return LlgaNodeWrapper(fusionNode_).useOpaqueLayout(offset);
}
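// Partition input ports that are not graph inputs must come from
// prim::Constant tensors; cache their Values and tensors for execution.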
void LlgaKernel::initializeConstantInputs() {
for (auto& lt : partition_.get_input_ports()) {
auto inputId = lt.get_id();
if (initializedInputIds_.find(inputId) == initializedInputIds_.end()) {
TORCH_CHECK(
tensorIdToValue_.count(inputId) > 0,
"inputs with inputId ",
inputId,
" is missing");
auto* value = tensorIdToValue_[inputId];
TORCH_CHECK(
value->node()->kind() == prim::Constant &&
value->type()->cast<TensorType>(),
"inputs with inputId ",
inputId,
" should be a Constant tensor");
constantValues_.emplace_back(value);
auto const_tensor = toIValue(value)->toTensor();
constantInputs_.emplace_back(const_tensor);
}
}
}
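// Count how many partition input ports each logical tensor id feeds;
// a single graph input may be consumed by multiple ports.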
std::map<size_t, int64_t> LlgaKernel::initializeTensorIdToOccurence() const {
std::map<size_t, int64_t> tensorIdToOccurence;
for (auto& lt : partition_.get_input_ports()) {
auto inputId = lt.get_id();
std::map<size_t, int64_t>::iterator it(tensorIdToOccurence.find(inputId));
if (it != tensorIdToOccurence.end()) {
it->second++;
} else {
tensorIdToOccurence[inputId] = 1;
}
}
return tensorIdToOccurence;
}
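// Build logical-tensor specs for all partition inputs: graph inputs first
// (duplicated per occurrence), followed by constant inputs.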
ArgSpecs LlgaKernel::initializeInputSpecs(const TensorArgs& inputs) {
ArgSpecs inputSpecs;
inputSpecs.reserve(nPartitionInputs_);
GRAPH_DEBUG("Initializing graph input logical tensors");
std::map<size_t, int64_t> tensorIdToOccurence =
initializeTensorIdToOccurence();
for (const auto i : c10::irange(nGraphInputs_)) {
auto spec = ArgSpec(graph_->inputs()[i]).supplementTensorInfo(inputs[i]);
initializedInputIds_.insert(spec.tid());
int64_t occurence = tensorIdToOccurence[spec.tid()];
inputSpecs.insert(inputSpecs.end(), occurence, spec);
runArgsIdx_.insert(runArgsIdx_.end(), occurence, i);
}
GRAPH_DEBUG("Initializing constant input tensors");
initializeConstantInputs();
TORCH_CHECK(
inputSpecs.size() + constantValues_.size() ==
static_cast<size_t>(nPartitionInputs_),
"Partition inputs are missing");
GRAPH_DEBUG(
"Concatenating constant input logical tensors to graph input "
"logical tensors");
for (Value* constant_value : constantValues_) {
ArgSpec constantInputSpec(constant_value);
inputSpecs.emplace_back(constantInputSpec);
constantLogicalTensors_.emplace_back(constantInputSpec.logical_tensor());
}
return inputSpecs;
}
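// Build logical-tensor specs for the partition outputs, requesting an
// opaque ("any") layout where the output stays within oneDNN Graph.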
ArgSpecs LlgaKernel::initializeOutputSpecs() const {
ArgSpecs outputSpecs;
outputSpecs.reserve(nOutputs_);
for (const auto i : c10::irange(nOutputs_)) {
auto spec = ArgSpec(graph_->outputs()[i]);
if (useOpaqueLayout(i)) {
spec = spec.any();
}
outputSpecs.emplace_back(spec);
}
return outputSpecs;
}
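// Bind logical tensors to actual data pointers for one execution:
// allocate or reuse output tensors and assemble the runtime argument lists.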
std::tuple<RunArgs, RunArgs> LlgaKernel::prepareRunArgs(
const TensorArgs& inputs,
TensorArgs& outputs) const {
RunArgs runInputs, runOutputs;
auto numInputs = runArgsIdx_.size();
for (const auto i : c10::irange(numInputs)) {
auto spec = inputSpecs_[i];
auto input = inputs[runArgsIdx_[i]];
runInputs.push_back(
{spec.logical_tensor(), Engine::getEngine(), input.data_ptr()});
}
auto numConstantInputs = constantInputs_.size();
for (size_t i = 0; i < numConstantInputs; i++) {
// constantInputSpecs are placed after graphInputSpecs
auto constantInputSpecIdx = nGraphInputs_ + i;
auto constantInputSpec = inputSpecs_[constantInputSpecIdx];
runInputs.push_back(
{constantLogicalTensors_[i],
Engine::getEngine(),
constantInputs_[i].data_ptr()});
}
for (const auto i : c10::irange(nOutputs_)) {
auto spec = outputSpecs_[i];
auto opt = c10::TensorOptions(spec.aten_scalar_type()).device(device_);
if (spec.reuses_input_tensor()) {
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("inplace computation - input tensor would be reused");
#endif
auto inputTensor = inputs[spec.get_input_tensor_index()];
if (inputTensor.is_mkldnn()) {
auto dataType = spec.dtype();
if (C10_UNLIKELY(!useOpaqueLayout(i))) {
// If the input tensor was between two partitions, it would've been
// wrapped with LlgaTensorImpl. But if it's being reused as the output
// tensor, which is not between two partitions, then we'd have to
// re-wrap it with a sub-class of TensorImpl, as it'd be fed into a
// PyTorch op.
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("rewrap tensors");
#endif
auto llgaImpl =
static_cast<LlgaTensorImpl*>(inputTensor.unsafeGetTensorImpl());
switch (dataType) {
case data_type::f32:
case data_type::bf16:
inputTensor = LlgaTensorImpl::llga_to_aten_tensor(llgaImpl);
break;
case data_type::s32:
default:
TORCH_CHECK(
false, "Invalid data type ", static_cast<size_t>(dataType));
}
}
outputs.push_back(inputTensor);
runOutputs.push_back(
{spec.logical_tensor(),
Engine::getEngine(),
inputTensor.data_ptr()});
return std::make_tuple(runInputs, runOutputs);
}
}
if (useOpaqueLayout(i)) {
// Wrap tensors between partitions with the LlgaTensorImpl wrapper, so that
// we can bypass the guard check, as strides would differ from those
// expected.
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Between two oneDNN Graph partitions");
#endif
auto tensor = empty_llga(spec, opt);
outputs.push_back(tensor);
runOutputs.push_back(llga_from_aten_tensor(tensor));
} else {
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Neither opaque to PyTorch nor inplace-computation");
#endif
auto tensor = at::empty_strided(spec.sizes(), spec.strides(), opt);
outputs.push_back(tensor);
runOutputs.push_back(
{spec.logical_tensor(), Engine::getEngine(), tensor.data_ptr()});
}
}
return std::make_tuple(runInputs, runOutputs);
}
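// Compile the partition for the current input/output specs, then refresh
// the output specs with the queried (possibly opaque) layouts and record
// in-place input/output pairs.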
compiled_partition LlgaKernel::compile(const partition& partition) {
auto inputs = fmap(inputSpecs_, toLogicalTensor);
auto outputs = fmap(outputSpecs_, toLogicalTensor);
auto compilation = partition.compile(inputs, outputs, Engine::getEngine());
// Since the layouts of opaque outputs are only known after compilation,
// we need to query them from the compiled partition and update outputSpecs
for (const auto i : c10::irange(nOutputs_)) {
auto tid = outputSpecs_[i].tid();
outputSpecs_[i] = compilation.query_logical_tensor(tid);
}
// Build static mapping from output id to input offset
// in accordance with available inplace options
for (auto&& option : compilation.get_inplace_ports()) {
size_t inputId = option.first;
size_t outputId = option.second;
auto inputSpecIter =
std::find_if(inputSpecs_.begin(), inputSpecs_.end(), [&](auto& spec) {
return spec.tid() == inputId;
});
TORCH_CHECK(inputSpecIter != inputSpecs_.end(), "In-place input not found");
auto inputOffset = inputSpecIter - inputSpecs_.begin();
auto outputSpecIter =
std::find_if(outputSpecs_.begin(), outputSpecs_.end(), [&](auto& spec) {
return spec.tid() == outputId;
});
auto outputOffset = outputSpecIter - outputSpecs_.begin();
outputSpecs_[outputOffset].set_compute_inplace();
outputSpecs_[outputOffset].set_input_tensor_index(inputOffset);
}
return compilation;
}
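// Execute the fused partition: lazily initialize the specs and compile on
// the first call, prepare runtime tensors, run, and replace the stack
// inputs with the produced outputs.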
void LlgaKernel::run(Stack& stack) {
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("In ", debugName(), "\n");
#endif
// Grab input values from stack
auto stackInputs = last(stack, nGraphInputs_);
auto inputs = fmap(stackInputs, [&](const IValue& v) {
TORCH_CHECK(
v.isTensor(), "Stack values for LLGA partition must be Tensor type");
return v.toTensor();
});
// Even with concurrent threads, the kernel is initialized only once.
// TODO: Try not using an atomic lock
c10::call_once(
initialized_flag,
[&](const TensorArgs& inputs) {
GRAPH_DEBUG("Initializing input logical tensors");
inputSpecs_ = initializeInputSpecs(inputs);
GRAPH_DEBUG("Initializing output logical tensors");
outputSpecs_ = initializeOutputSpecs();
GRAPH_DEBUG("Compiling partition");
compilation_ = compile(partition_);
is_initialized_ = true;
},
inputs);
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Preparing runtime tensors");
#endif
TensorArgs outputs;
RunArgs runInputs, runOutputs;
std::tie(runInputs, runOutputs) = prepareRunArgs(inputs, outputs);
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Executing partition");
#endif
compilation_.execute(Stream::getStream(), runInputs, runOutputs);
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Partition executed");
#endif
// Update the stack.
drop(stack, nGraphInputs_);
for (auto& o : outputs)
push_one(stack, std::move(o));
#ifdef GRAPH_DEBUG_ENABLED
GRAPH_DEBUG("Stack updated");
#endif
}
} // namespace onednn
} // namespace fuser
} // namespace jit
} // namespace torch