Remove -Wno-unused-variable from utils.cmake (take 2) (#75538)
Summary:
[Comment](https://github.com/pytorch/pytorch/pull/62445/files#r680132022) claims it was added for consistency with the top-level CMakeLists.txt, but `-Wno-unused-variable` is not mentioned there. Fix the violations added in the interim across 50+ files by either removing the unused variable or decorating it with `C10_UNUSED` when the local variable is likely kept to extend an object's lifetime until the end of the block. Suppressing the warning caused a preventable revert in https://github.com/pytorch/pytorch/pull/72633#issuecomment-1092300787

Pull Request resolved: https://github.com/pytorch/pytorch/pull/75538

Reviewed By: anjali411

Differential Revision: D35747333

Pulled By: malfet

fbshipit-source-id: 3fc5828e44a4c05ba0e89e92613e6ebbdb260626
(cherry picked from commit c179fba21cfa2a0093fad50ccad5a22dd7cff52c)
This commit is contained in:
parent
29b004be7a
commit
f6c275f55d
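The hunks below all apply one of three fixes once the `-Wno-unused-variable` suppression is gone: delete a local that is never read, keep a variable whose initializer has a needed side effect (registration, lifetime extension) and mark it `C10_UNUSED`, or discard a macro-internal value with a `(void)` cast. A minimal, hypothetical sketch of the pattern, assuming the c10 headers are on the include path; `register_my_ops`, `my_ops_registered`, and `example` are illustrative names, not taken from the PyTorch sources:

#include <c10/macros/Macros.h> // C10_UNUSED
#include <c10/util/irange.h>   // c10::irange

namespace {

// Stand-in for a registration routine whose only purpose is its side effect.
int register_my_ops() {
  return 0;
}

// Keep the variable so registration runs at static-initialization time, and
// mark it C10_UNUSED because it is never read afterwards.
C10_UNUSED static auto my_ops_registered = register_my_ops();

void example(int n) {
  // The induction variable only counts iterations, so it is annotated
  // instead of re-suppressing the warning globally.
  for (C10_UNUSED const auto i : c10::irange(n)) {
  }
  // The macro-style fix: evaluate a value and discard it explicitly so the
  // compiler knows the missing read is intentional.
  const int scratch = n * 2;
  (void)scratch;
}

} // namespace

int main() {
  example(3);
  return 0;
}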
@@ -372,7 +372,7 @@ ivalue::TupleTypeFactory<TupleType>::fallback(const Type& type) {
   for (const auto& elem : dyn.arguments().elems) {
     types.emplace_back(elem.ty);
     if (const auto& name = elem.label) {
-      fields.emplace_back(*elem.label);
+      fields.emplace_back(*name);
     }
   }
   if (const auto& name = dyn.name()) {
@@ -1833,7 +1833,7 @@ DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple_hx_typ
 
 namespace {
 
-static auto ensure_linear_params_registered = register_linear_params();
+static C10_UNUSED auto ensure_linear_params_registered = register_linear_params();
 
 static auto cell_params_base_registry =
     torch::selective_class_<CellParamsBase>("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase"))

@@ -71,7 +71,7 @@ int register_linear_params() {
 }
 
 namespace {
-static auto linear_params = register_linear_params();
+static C10_UNUSED auto linear_params = register_linear_params();
 } // namespace
 
 }} // namespace ao::sparse
@@ -554,9 +554,9 @@ int register_embedding_params() {
 
 namespace {
 
-static auto conv2d_params = register_conv_params<2>();
-static auto conv3d_params = register_conv_params<3>();
-static auto linear_params = register_linear_params();
-static auto embedding_params = register_embedding_params();
+static C10_UNUSED auto conv2d_params = register_conv_params<2>();
+static C10_UNUSED auto conv3d_params = register_conv_params<3>();
+static C10_UNUSED auto linear_params = register_linear_params();
+static C10_UNUSED auto embedding_params = register_embedding_params();
 
 } // namespace
@@ -187,7 +187,6 @@ Descriptor::Set dispatch_prologue(
     const Shader::Descriptor& shader_descriptor,
     const Shader::WorkGroup& local_work_group_size) {
   Context* const context = api::context();
-  const GPU gpu = context->gpu();
   Descriptor& descriptor = context->descriptor();
   Pipeline& pipeline = context->pipeline();
   Shader& shader = context->shader();

@@ -199,7 +199,6 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
 Tensor cat(
     const at::TensorList tensors,
     const int64_t dim) {
-  const auto norm_dim = normalize_dim(dim, 4);
   TORCH_CHECK(
       tensors.size() > 0,
       "Vulkan cat expects at least one tensor");
@@ -348,6 +348,7 @@ _ScopeGuard<T> ScopeGuard(T f) {
         stats.field.groupName.c_str(), \
         __caffe_event_value_, \
         ##__VA_ARGS__); \
+    (void)__caffe_event_value_; \
   }
 
 #define CAFFE_DURATION(stats, field, ...) \
@@ -286,9 +286,6 @@ NO_GRADIENT(BooleanMaskLengths);
 
 } // namespace
 
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-const float minf = -1.0f * std::numeric_limits<float>::infinity();
-
 // Template this on a functor object so we can generate different
 // implementations at compile time and have a better chance of inlining
 template <typename Functor>
@@ -155,7 +155,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
   col_buffer->Resize(buffer_shape);
   T* col_buffer_data = col_buffer->template mutable_data<T>();
   // Im2col, followed by gemm.
-  for (const auto image_id : c10::irange(N)) {
+  for (C10_UNUSED const auto image_id : c10::irange(N)) {
     for (const auto group_id : c10::irange(group_)) {
       DeformableIm2col(
           Xdata + group_id * input_offset,

@@ -342,7 +342,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
     math::Set<T, Context>(dX->numel(), 0, dXdata, &context_);
   }
 
-  for (const auto image_id : c10::irange(N)) {
+  for (C10_UNUSED const auto image_id : c10::irange(N)) {
     for (const auto group_id : c10::irange(group_)) {
       math::Gemm<T, Context>(
          CblasTrans,
@@ -62,7 +62,7 @@ class PiecewiseLinearTransformOp final : public Operator<Context> {
       const int64_t num_bounds_per_group,
       const int64_t num_group) {
     const T* start = bounds;
-    for (const auto i : c10::irange(num_group)) {
+    for (C10_UNUSED const auto i : c10::irange(num_group)) {
       if (!std::is_sorted(start, start + num_bounds_per_group)) {
         return false;
       }

@@ -36,7 +36,7 @@ void Decode(
   }
 
   int sz = output->numel();
-  for (const auto i : c10::irange(sz)) {
+  for (C10_UNUSED const auto i : c10::irange(sz)) {
     DCHECK_LE(*code_ptr, cb_size);
     *out_ptr++ = cb_ptr[*code_ptr++];
   }
@@ -229,8 +229,8 @@ void ROIAlignForward(
       for (const auto pw : c10::irange(pooled_width)) {
         vector<int32_t> acc_buffer(channels, 0);
 
-        for (const auto iy : c10::irange(roi_bin_grid_h)) {
-          for (const auto ix : c10::irange(roi_bin_grid_w)) {
+        for (C10_UNUSED const auto iy : c10::irange(roi_bin_grid_h)) {
+          for (C10_UNUSED const auto ix : c10::irange(roi_bin_grid_w)) {
             PreCalc pc = pre_calc[pre_calc_index];
 
             const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1;

@@ -216,7 +216,7 @@ static void Im2ColNHWC(
     T* data_col_temp =
         data_col + h * width_col * kernel_h * kernel_w * channels;
     int w_pad = -pad_l;
-    for (const auto w : c10::irange(width_col)) {
+    for (C10_UNUSED const auto w : c10::irange(width_col)) {
       int r = 0;
       for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
         int s = 0;
@@ -18,16 +18,11 @@
 namespace caffe2 {
 
 // Constants for user tracepoints
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr int SDT_NONBLOCKING_OP = 0;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr int SDT_BLOCKING_OP = 1;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_ABORT = (uint64_t)-2;
-// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable)
-static constexpr uint64_t SDT_CANCEL = (uint64_t)-3;
+C10_UNUSED static constexpr int SDT_NONBLOCKING_OP = 0;
+C10_UNUSED static constexpr int SDT_BLOCKING_OP = 1;
+C10_UNUSED static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1;
+C10_UNUSED static constexpr uint64_t SDT_ABORT = (uint64_t)-2;
+C10_UNUSED static constexpr uint64_t SDT_CANCEL = (uint64_t)-3;
 
 BlobsQueue::BlobsQueue(
     Workspace* ws,
@@ -66,8 +61,7 @@ bool BlobsQueue::blockingRead(
     float timeout_secs) {
   Timer readTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_read_start, name, (void*)this, SDT_BLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   auto canRead = [this]() {

@@ -76,7 +70,6 @@ bool BlobsQueue::blockingRead(
   };
   // Decrease queue balance before reading to indicate queue read pressure
   // is being increased (-ve queue balance indicates more reads than writes)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, -1);
   if (timeout_secs > 0) {
     std::chrono::milliseconds timeout_ms(int(timeout_secs * 1000));

@@ -99,17 +92,14 @@ bool BlobsQueue::blockingRead(
   CAFFE_ENFORCE(inputs.size() >= result.size());
   for (const auto i : c10::irange(result.size())) {
     auto bytes = BlobStat::sizeBytes(*result[i]);
-    // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
     CAFFE_EVENT(stats_, queue_dequeued_bytes, bytes, i);
     using std::swap;
     swap(*(inputs[i]), *(result[i]));
   }
   CAFFE_SDT(queue_read_end, name, (void*)this, writer_ - reader_);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_dequeued_records);
   ++reader_;
   cv_.notify_all();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, read_time_ns, readTimer.NanoSeconds());
   return true;
 }
@@ -117,8 +107,7 @@ bool BlobsQueue::blockingRead(
 bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
   Timer writeTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_write_start, name, (void*)this, SDT_NONBLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   if (!canWrite()) {

@@ -127,11 +116,9 @@ bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
   }
   // Increase queue balance before writing to indicate queue write pressure is
   // being increased (+ve queue balance indicates more writes than reads)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, 1);
   DCHECK(canWrite());
   doWrite(inputs);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds());
   return true;
 }
@@ -139,13 +126,11 @@ bool BlobsQueue::tryWrite(const std::vector<Blob*>& inputs) {
 bool BlobsQueue::blockingWrite(const std::vector<Blob*>& inputs) {
   Timer writeTimer;
   auto keeper = this->shared_from_this();
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   CAFFE_SDT(queue_write_start, name, (void*)this, SDT_BLOCKING_OP);
   std::unique_lock<std::mutex> g(mutex_);
   // Increase queue balance before writing to indicate queue write pressure is
   // being increased (+ve queue balance indicates more writes than reads)
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, queue_balance, 1);
   cv_.wait(g, [this]() { return closing_ || canWrite(); });
   if (!canWrite()) {

@@ -154,7 +139,6 @@ bool BlobsQueue::blockingWrite(const std::vector<Blob*>& inputs) {
   }
   DCHECK(canWrite());
   doWrite(inputs);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
   CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds());
   return true;
 }
@@ -178,8 +162,7 @@ bool BlobsQueue::canWrite() {
 void BlobsQueue::doWrite(const std::vector<Blob*>& inputs) {
   auto& result = queue_[writer_ % queue_.size()];
   CAFFE_ENFORCE(inputs.size() >= result.size());
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable)
-  const auto& name = name_.c_str();
+  C10_UNUSED const auto& name = name_.c_str();
   for (const auto i : c10::irange(result.size())) {
     using std::swap;
     swap(*(inputs[i]), *(result[i]));
@@ -449,7 +449,6 @@ function(torch_compile_options libname)
       -Wall
       -Wextra
       -Wno-unused-parameter
-      -Wno-unused-variable
       -Wno-unused-function
       -Wno-unused-result
       -Wno-unused-local-typedefs
@@ -5051,7 +5051,6 @@ Tensor group_norm_jvp(
 Tensor group_norm_mean_jvp(
     const Tensor& input_t, const Tensor& mean_p, int64_t groups) {
   int64_t N = input_t.size(0);
-  int64_t C = input_t.size(1);
   std::array<int64_t, 3> view_shape = {1, N * groups, N ? -1 : 1};
   auto input_t_reshaped = input_t.view(view_shape);
   return input_t_reshaped.mean({2}, false).view_as(mean_p);

@@ -5062,7 +5061,6 @@ Tensor group_norm_invstd_jvp(
     const Tensor& mean_p, const Tensor& invstd_p,
     int64_t groups) {
   int64_t N = input_p.size(0);
-  int64_t C = input_p.size(1);
 
   std::vector<int64_t> view_shape = {1, N * groups, N ? -1 : 1};
 
@@ -328,7 +328,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
     // one uint64_t variable as key.
     std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>
         tidSeq2activity;
-    uint64_t fwd_bwd_link_id = 1;
 
     for (const auto idx : c10::irange(cpu_trace->activities.size())) {
       auto& kineto_event = kineto_events_[idx];
@@ -603,7 +602,6 @@ void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) {
         if (!state_ptr) {
           return nullptr;
         }
-        const auto& config = state_ptr->config();
         auto corr_id = next_correlation_id();
         torch::profiler::impl::kineto::pushCorrelationId(corr_id);
         return state_ptr->record_queue_.getSubqueue()->begin_op(fn, corr_id);
@@ -2836,8 +2836,9 @@ void ProcessGroupGloo::monitoredBarrier(
 
   waitLoop(sendWorkMap);
 
-  auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(
-      std::chrono::steady_clock::now() - startTime);
+  using namespace std::chrono;
+  C10_UNUSED auto elapsedTime = duration_cast<milliseconds>(
+      steady_clock::now() - startTime);
 }
 
 void ProcessGroupGloo::setSequenceNumberForGroup() {
@@ -31,8 +31,6 @@ constexpr const char* const kNCCLAbortedCommStoreKey = "NCCLABORTEDCOMM";
 
 namespace {
 
-constexpr int kBytes = 8;
-
 // RAII helper class to manage NCCL group API and CUDA free mutex.
 // The destructor is allowed to throw since this helper class only
 // manages group and lock lifetimes.

@@ -440,10 +438,6 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
 
   // In case of blocking, wait for the operation to complete.
   if (blockingWait_) {
-    // Use the passed in timeout if provided, otherwise use the default
-    // opTimeout for each WorkNCCL object.
-    std::chrono::milliseconds workTimeout =
-        timeout == kNoTimeout ? opTimeout_ : timeout;
     // Wait for the operation to complete.
     while (!isCompleted()) {
       if (timedOut()) {
@@ -1492,7 +1492,6 @@ TensorView* gather(
       ". Padding right: ",
       pad_right);
   const auto out_stop_offset = inp_stop_offset.value() + extent_adjustment;
-  Val* out_axis_dim = nullptr;
   out_root_domains.push_back(IrBuilder::create<IterDomain>(
       FusionGuard::getCurFusion()->zeroVal(),
       inp_axis->extent(),
@@ -938,14 +938,13 @@ class CudaKernelGenerator : private OptOutConstDispatch {
 
     indent() << genMmaOp(mma, true) << "(reinterpret_cast<Array<"
              << mma->out()->getDataType().value() << ","
-             << getOutputRegisterSize(mma->options().macro) << ","
-             << getOutputRegisterSize(mma->options().macro) << ">*>"
+             << getOutputRegisterSize(options.macro) << ","
+             << getOutputRegisterSize(options.macro) << ">*>"
              << "(&" << gen(uop->out()) << "));\n";
   }
 
   void handle(const MmaOp* mma) final {
     auto options = mma->options();
-    auto in_a = mma->inA()->as<kir::TensorIndex>();
     auto out = mma->out()->as<kir::TensorIndex>();
     indent() << genMmaOp(mma) << "(\n";
     indent() << kTab << "reinterpret_cast<Array<"
@@ -967,7 +966,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
 
   void handle(const BroadcastOp* stmt) final {
     TORCH_INTERNAL_ASSERT(stmt->out()->isA<kir::TensorIndex>());
-    const auto tensor_index = stmt->out()->as<kir::TensorIndex>();
 
     const ParallelTypeBitmap parallel_types =
         kernel_->summary().broadcast_parallel_types.at(stmt);

@@ -1313,7 +1311,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     TORCH_INTERNAL_ASSERT(rop->isFused());
 
     const auto out = rop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
 
     const auto data_type = rop->out()->dtype();
     const auto op_type = rop->getReductionOpType();

@@ -1384,11 +1381,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
         parallel_types.hasBID(),
         "GridBroadcast needs to be used with a broadcast op that is parallelized with the BID parallel types");
 
-    const auto out = bop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
-
-    const auto data_type = bop->out()->dtype();
-
     TORCH_INTERNAL_ASSERT(
         grop->broadcast_buffer()->buffer()->isA<TensorView>());
     TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA<TensorView>());

@@ -1499,7 +1491,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     TORCH_INTERNAL_ASSERT(wop->isFused());
 
     const auto out = wop->out()->as<kir::TensorIndex>();
-    const auto domain = out->view()->domain();
 
     const auto data_type = wop->outAvg()->dtype();
     const auto index_type = wop->outN()->dtype();
@@ -548,7 +548,7 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
   FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals");
   GlobalBuffers global_buffers;
   const auto kernel = lowered_->kernel();
-  const auto& kernel_summary = lowered_->kernel()->summary();
+  const auto& kernel_summary = kernel->summary();
   for (auto alloc : kernel_summary.global_allocations) {
     TORCH_INTERNAL_ASSERT(
         alloc->buffer()->isA<TensorView>(),

@@ -611,9 +611,6 @@ void validateAlignedVectorizedTensors(
 
   // Verify extents of aligned vectorized tensors
   for (const auto& vec_info : kernel->summary().vectorized_set_info) {
-    auto in_tv = vec_info.producer_tv;
-    auto out_tv = vec_info.consumer_tv;
-
     if (vec_info.vectorized_leaf_id->getParallelType() ==
         ParallelType::Vectorize) {
       validateAlignedVectorizeExtents(vec_info, expr_eval);
@@ -2303,7 +2303,7 @@ void separateNestedViews(Node* cuda_fusion_group) {
     auto parent = parent_value->node();
 
     auto grandparent_value = parent->input(0);
-    auto grandparent = grandparent_value->node();
+    C10_UNUSED auto grandparent = grandparent_value->node();
 
     // Before: gp -> x -> n
     // After: gp -> x / gp -> n

@@ -814,8 +814,6 @@ indexMapFromTV(
     kir::ForLoop* alloc_loop,
     bool as_consumer,
     kir::ForLoop* double_buffer_loop = nullptr) {
-  const auto gpu_lower = GpuLower::current();
-
   bool within_alloc = false;
   if (alloc_loop == nullptr) {
     within_alloc = true;
@@ -142,7 +142,7 @@ void Scope::insert(size_t pos, Expr* expr) {
 
 void Scope::erase(std::vector<Expr*>::const_iterator pos) {
   // Remove the scope of the expr if this is the scope
-  auto expr = *pos;
+  C10_UNUSED auto expr = *pos;
   exprs_.erase(pos);
 }
 

@@ -58,9 +58,6 @@ class AllocationInserter : public kir::ExprMutator {
   // Fills info.buffer, info.alloc_pos, info.init_for_loop,
   // info.init_place_before, info.alloc_for_loop, info.alloc_place_before
   void fillAllocationInformation(AllocationInformation& info, Expr* expr) {
-    size_t alloc_pos = 0;
-    kir::ForLoop* init_for_loop = nullptr;
-    size_t fl_idx_next = 0;
     auto loop_alloc_info =
         loop_utils::getAllocInformation(info.buffer, for_loops_);
 
@@ -215,8 +215,6 @@ class DoubleBufferLoopCloner : public kir::IrVisitor {
   }
 
   void handle(kir::ForLoop* fl) final {
-    const auto gpu_lower = GpuLower::current();
-
     kir::ForLoop* cloned_loop = fl == double_buffer_loop_
         ? cloned_top_level_loop_
         : IrBuilder::create<kir::ForLoop>(fl);

@@ -326,8 +326,6 @@ void HaloInfo::insertToInheritanceMap(
 void HaloInfo::initializeFromRootAxisInfo(IterDomain* id) {
   TORCH_INTERNAL_ASSERT(hasRootAxisInfo(id));
 
-  auto gpu_lower = GpuLower::current();
-
   const auto& halo_info = getRootAxisInfo(id);
   auto halo_width = halo_info.width();
 

@@ -350,8 +348,6 @@ void HaloInfo::setHaloWidth(IterDomain* id, int halo_width) {
 
 // Propagate extent information from root axes to descendants
 void HaloInfo::build(TensorDomain* td) {
-  auto gpu_lower = GpuLower::current();
-
   auto exprs = DependencyCheck::getAllExprsBetween(
       {td->getMaybeRFactorDomain().begin(), td->getMaybeRFactorDomain().end()},
       {td->domain().begin(), td->domain().end()});
@@ -251,7 +251,7 @@ void OptOutMutator::mutate(MmaOp* mma) {
   auto container = mma->container();
   auto options = mma->options();
   container->removeExpr(mma);
-  auto new_mma =
+  C10_UNUSED auto new_mma =
       IrBuilder::create<MmaOp>(container, out, in_a, in_b, init, options);
 }
 

@@ -357,7 +357,7 @@ void OptOutMutator::mutate(Split* s) {
   auto container = s->container();
   auto inner_split = s->innerSplit();
   container->removeExpr(s);
-  auto new_node = IrBuilder::create<Split>(
+  C10_UNUSED auto new_node = IrBuilder::create<Split>(
      container, ot, inr, in, fact, inner_split, start_offset, stop_offset);
 }
 

@@ -373,7 +373,7 @@ void OptOutMutator::mutate(Merge* m) {
 
   auto container = m->container();
   container->removeExpr(m);
-  auto new_node = IrBuilder::create<Merge>(container, ot, otr, in);
+  C10_UNUSED auto new_node = IrBuilder::create<Merge>(container, ot, otr, in);
 }
 
 void OptOutMutator::mutate(kir::Allocate*) {
@@ -93,8 +93,6 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet(
     const std::unordered_set<IterDomain*>& dom_set) {
   TORCH_INTERNAL_ASSERT(dom_set.size() == 1);
 
-  const auto gpu_lower = GpuLower::current();
-
   // pt is used by only one concrete domain
   auto id = *dom_set.begin();
   auto it = constant_extent_map_.find(id);

@@ -119,8 +117,6 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet(
     const std::unordered_set<IterDomain*>& dom_set) {
   TORCH_INTERNAL_ASSERT(dom_set.size() > 1);
 
-  const auto gpu_lower = GpuLower::current();
-
   bool all_equal = true;
   // Use nullptr to signal it's not initialied yet
   Val* known_dimension = nullptr;
@@ -1369,8 +1369,6 @@ class IrParser {
       REGISTER_PARSE_RULE(
           ptr_op,
           {
-            auto fusion = FusionGuard::getCurFusion();
-
             // TODO: handle channels last
             MemoryFormat format;
             std::list<Val*> list_val;

@@ -8,7 +8,6 @@ namespace fuser {
 namespace cuda {
 
 void PartialSplitMap::build(Fusion* fusion) {
-  const auto gpu_lower = GpuLower::current();
   auto used_vals = ir_utils::allTvs(fusion);
 
   for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
@@ -444,7 +444,6 @@ void UnswitchPredicate::predicateOn(Expr* tv_expr) {
 
   auto ref_pred_info = Index::getReferenceRootPredicates(
       out_tv, for_loops_, unrolled_loop_, false);
-  const ReferenceTensor& reference = ref_pred_info.second;
 
   // If RootPredicateInfo has a static predicate that is more
   // restrictive than the current one, replace the current with the

@@ -109,7 +109,6 @@ bool canValidateIsInnerDim(
     } else if (auto merge = dynamic_cast<Merge*>(expr)) {
       // Might consider just rejecting merge.
       auto outer = merge->outer();
-      auto inner = merge->inner();
       if (outer->isBroadcast()) {
         return false;
       }
@@ -808,7 +808,6 @@ PersistentBufferSizeReturn persistentBufferSize(
   std::vector<bool> persistent_mask(all_buffers.size(), false);
 
   for (auto buffer_i : c10::irange(persistent_buffers.size())) {
-    auto buffer = all_buffers[buffer_i];
     persistent_mask[buffer_i] = true;
   }
 

@@ -855,7 +854,6 @@ PersistentBufferSizeReturn persistentBufferSize(
   int64_t max_persistence_size = 0;
   int64_t max_proj_persistence_size = 0;
   for (const auto& entry : scoped_persistence_factor) {
-    auto val = entry.first;
     auto active_buffers = entry.second;
     auto persistent_buffer_size = masked_dot_product(
         persistent_mask, active_buffers, persistent_buffer_sizes);
@@ -254,7 +254,6 @@ class NaiveTypePropagator {
       }
       case aten::_batch_norm_impl_index_backward:
       case aten::native_batch_norm_backward: {
-        int grad_input_index = 1;
         int weight_index = -1;
         int mask_index = -1;
         if (node->kind() ==

@@ -486,7 +485,6 @@ class NaiveTypePropagator {
         TORCH_CHECK(
             hasTypeAndDevice(in_type),
             "Type and device propagation has failed, or was not provided enough information.");
-        const auto in_scalar_type = in_type->scalarType();
         const auto in_device = in_type->device();
         const auto cuda_enabled = constant_as<bool>(node->input(1));
         const auto cpu_enabled = constant_as<bool>(node->input(2));
@@ -5543,7 +5543,6 @@ std::vector<Function*> CompilationUnit::define(
 
 void eraseListLiterals(std::shared_ptr<Graph>& graph) {
   DepthFirstGraphNodeIterator it(graph);
-  Node* n = nullptr;
 
   for (auto next_node = it.next(); next_node != nullptr;) {
     Node* node = next_node;

@@ -543,6 +543,7 @@ void IRParser::parse() {
       TORCH_INTERNAL_ASSERT(dtype);
       auto options = at::TensorOptions(*device).dtype(*dtype);
       auto t = n->t_(attr::value, at::empty_strided(*sizes, *strides, options));
+      (void)t;
     }
   }
 
@@ -264,8 +264,7 @@ c10::impl::GenericList Function::run(
     const c10::IValue& input = inputs[i];
     const auto& spec = input_specs_[i];
     const auto& input_tensor = input.toTensor();
-    TORCH_CHECK(
-        input_specs_[i].validate(input_tensor), "Invalid input at pos: ", i);
+    TORCH_CHECK(spec.validate(input_tensor), "Invalid input at pos: ", i);
     args[i] = input_tensor.data_ptr();
   }
   offset += inputs.size();

@@ -42,8 +42,7 @@ struct OldOpsReplacerWithUpgraders {
           get_operator_version_map().find(schema_name.value());
       if (version_entry != get_operator_version_map().end()) {
         const auto& entry = version_entry->second;
-        auto upgrader_entry =
-            findUpgrader(version_entry->second, current_version);
+        auto upgrader_entry = findUpgrader(entry, current_version);
         if (!upgrader_entry.has_value()) {
           if (!isOpSymbolCurrent(schema_name.value(), current_version)) {
             TORCH_INTERNAL_ASSERT(
@@ -619,7 +619,7 @@ std::vector<SSArgument> getNodeInputShapes(Node* n, const AliasDb& db) {
   for (size_t node_index = 0; node_index < n->inputs().size(); ++node_index) {
     auto type = n->input(node_index)->type();
 
-    if (auto tt = type->castRaw<TensorType>()) {
+    if (type->castRaw<TensorType>()) {
      input_shapes.push_back(tensorShapeArg(n->input(node_index)));
      continue;
    }

@@ -92,7 +92,7 @@ void RunDecompositions(Block* block) {
 
 void RunDecompositions(std::shared_ptr<Graph> g) {
   RunDecompositions(g->block());
-  for (const auto _ : c10::irange(2)) {
+  for (C10_UNUSED const auto _ : c10::irange(2)) {
     PeepholeOptimize(g, /*disable_shape_peephole*/ true);
     ConstantPropagation(g);
   }
@@ -120,7 +120,6 @@ FusionStrategy setFusionStrategy(FusionStrategy& strategy) {
 }
 
 static std::atomic<size_t> num_profiled_runs{kDefaultNumProfiledRuns};
-static std::atomic<size_t> bailout_depth{kDefaultBailoutDepth};
 
 std::atomic<bool>& getProfilingMode() {
   return profiling_mode;

@@ -92,7 +92,6 @@ bool isUnsupportedOp(Node* node) {
 bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
   // check for sub-blocks
   bool can_support = true;
-  bool has_blocks = false;
   for (auto* node : graph->block()->nodes()) {
     const auto kind = node->kind();
     if (kind == prim::Constant) {
@@ -407,7 +407,6 @@ void StandardMemoryPlanner::deallocateManagedTensors() {
   for (auto& ms : managed_tensors_) {
     const auto& tensors = ms.group();
     size_t max = ms.maxTensorSize();
-    auto tensor_idx = 0;
     for (auto& tensor : tensors) {
       const auto& storage = tensor->storage();
       size_t current_size = compute_aligned_tensor_size(storage.nbytes());
@@ -149,7 +149,6 @@ std::vector<std::pair<BufPtr, BufPtr>> AllocBufsWithMemReuse(
     }
 
     auto start = std::get<0>(buf_ranges.at(buf));
-    auto end = std::get<1>(buf_ranges.at(buf));
 
     // Release memory for buffers whose liveness range ends before the creation
     // time of this buf.
@@ -1281,7 +1281,6 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
   }
   auto const_tensor = toIValue(v)->toTensor();
   auto scalar_type = c10::typeMetaToScalarType(const_tensor.options().dtype());
-  const auto& tt = v->type()->expect<TensorType>();
   auto sizes = const_tensor.sizes();
   std::vector<ExprHandle> te_sizes;
   te_sizes.reserve(sizes.size());
@@ -1743,8 +1743,7 @@ int nnc_lowerings_lazy_registration() {
 } // namespace
 
 NNCLoweringFunction getStandardLoweringFor(const std::string& schema_str) {
-  // NOLINTNEXTLINE
-  static const int once = nnc_lowerings_lazy_registration();
+  C10_UNUSED static const int once = nnc_lowerings_lazy_registration();
   const auto& lowerings = getNNCLoweringRegistry();
   if (auto l = lowerings.find(parseSchema(schema_str))) {
     return *l;
@@ -259,8 +259,6 @@ Tensor computeDequantizeExternalCall(
   }
 
   const BufHandle& qx = c10::get<BufHandle>(inputs[0]);
-  const double qscale = immQScale(qx);
-  const int64_t qzero = immQZero(qx);
   const int64_t qdtype = (int64_t)immQDType(qx);
 
   BufHandle ResultBuf("dequantize", outputShape, dtype);
@@ -168,7 +168,7 @@ torch::lazy::BackendDataPtr TSBackendImpl::CreateDataPlaceholder(
 std::vector<torch::lazy::ComputationPtr> TSBackendImpl::Compile(
     std::vector<torch::lazy::ComputationPtr> instances) const {
   for (const auto& instance : instances) {
-    auto ts_computation =
+    C10_UNUSED auto ts_computation =
         static_cast<torch::lazy::TSComputation*>(instance.get());
   }
   return instances;
@@ -72,7 +72,6 @@ std::list<std::pair<at::RecordFunctionHandle, int>> flattenOpIdList(c10::List<c1
 }
 
 std::list<std::pair<at::RecordFunctionHandle, int>> getInputTensorOpIds(const at::RecordFunction& fn) {
-  int num_inputs = fn.inputs().size();
   std::pair<at::RecordFunctionHandle, int> undefined_op_pair(0,-1);
   std::list<std::pair<at::RecordFunctionHandle, int>> input_producer_ops_;
   auto state_ptr = NVTXThreadLocalState::getTLS();
@@ -35,7 +35,7 @@ ApproximateClockToUnixTimeConverter::measurePair() {
 ApproximateClockToUnixTimeConverter::time_pairs
 ApproximateClockToUnixTimeConverter::measurePairs() {
   static constexpr auto n_warmup = 5;
-  for (const auto _ : c10::irange(n_warmup)) {
+  for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
     getApproximateTime();
     steady_clock_t::now();
   }