[NOOP][clangformat][codemod] Enable CLANGFORMAT for caffe2/caffe2/* (#67624)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/67624

Test Plan: Visual inspection. Sandcastle.

Reviewed By: malfet

Differential Revision: D31986628

fbshipit-source-id: c872bded7325997a2945dbf5d4d052628dcb3659
Parent: e86a5a3a1a
Commit: 06d1be2447
========================================================================
@@ -7,14 +7,14 @@
 #include <cuda.h>
 #include <nvrtc.h>

-#define NVRTC_CHECK(condition) \
-  do { \
-    nvrtcResult result = condition; \
-    if (result != NVRTC_SUCCESS) { \
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
-                 << nvrtcGetErrorString(result); \
-    } \
-  } while(0)
+#define NVRTC_CHECK(condition) \
+  do { \
+    nvrtcResult result = condition; \
+    if (result != NVRTC_SUCCESS) { \
+      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
+                 << nvrtcGetErrorString(result); \
+    } \
+  } while (0)

 namespace caffe2 {

@@ -39,15 +39,14 @@ class CudaRTCFunction {
     VLOG(1) << "function src:\n" << src;
     // Actually do the compiling.
     nvrtcProgram prog;
-    NVRTC_CHECK(nvrtcCreateProgram(
-        &prog, src.c_str(), nullptr, 0, nullptr, nullptr));
+    NVRTC_CHECK(
+        nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr));
     // Compile the program.
     // TODO(Yangqing): how to find the current gpu architecture instead of hard
     // coding it?
-    const char *nvrtc_opts[] = {"--gpu-architecture=compute_35",
-                                "--use_fast_math"};
-    nvrtcResult compile_result = nvrtcCompileProgram(
-        prog, 2, nvrtc_opts);
+    const char* nvrtc_opts[] = {
+        "--gpu-architecture=compute_35", "--use_fast_math"};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts);
     if (compile_result != NVRTC_SUCCESS) {
       size_t log_size;
       NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
@@ -74,21 +73,33 @@ class CudaRTCFunction {
   }

   template <typename... Args>
-  void Launch(unsigned int gx, unsigned int gy, unsigned int gz,
-              unsigned int bx, unsigned int by, unsigned int bz,
-              unsigned int shared_mem, cudaStream_t stream,
-              Args... args) {
+  void Launch(
+      unsigned int gx,
+      unsigned int gy,
+      unsigned int gz,
+      unsigned int bx,
+      unsigned int by,
+      unsigned int bz,
+      unsigned int shared_mem,
+      cudaStream_t stream,
+      Args... args) {
     CAFFE_ENFORCE(
         module_loaded_, "Cannot call Launch before a module is loaded.");
-    void * args_voidp[] = {&args...};
+    void* args_voidp[] = {&args...};
     CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
         kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
   }

-  void LaunchEx(unsigned int gx, unsigned int gy, unsigned int gz,
-                unsigned int bx, unsigned int by, unsigned int bz,
-                unsigned int shared_mem, cudaStream_t stream,
-                void** extra) {
+  void LaunchEx(
+      unsigned int gx,
+      unsigned int gy,
+      unsigned int gz,
+      unsigned int bx,
+      unsigned int by,
+      unsigned int bz,
+      unsigned int shared_mem,
+      cudaStream_t stream,
+      void** extra) {
     CAFFE_ENFORCE(
         module_loaded_, "Cannot call Launch before a module is loaded.");
     CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
@@ -115,6 +126,6 @@ inline std::string GetUniqueName() {
   return ss.str();
 }

-} // namepsace caffe2
+} // namespace caffe2

-#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_
+#endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_
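A minimal, self-contained sketch (not part of this commit) of how the NVRTC_CHECK pattern reformatted above is typically driven: create a program, compile it, and check every nvrtcResult. LOG(FATAL) is replaced by fprintf/abort so the sketch has no caffe2 dependency, and the compute_35 flag mirrors the source; newer CUDA toolkits may reject it.

#include <cstdio>
#include <cstdlib>
#include <nvrtc.h>

#define NVRTC_CHECK(condition)                                       \
  do {                                                               \
    nvrtcResult result = condition;                                  \
    if (result != NVRTC_SUCCESS) {                                   \
      fprintf(stderr, "Error at: %s:%d: %s\n", __FILE__, __LINE__,   \
              nvrtcGetErrorString(result));                          \
      abort();                                                       \
    }                                                                \
  } while (0)

int main() {
  // Compile a trivial kernel at runtime, as CudaRTCFunction does for its
  // generated source strings.
  const char* src = "extern \"C\" __global__ void noop() {}";
  nvrtcProgram prog;
  NVRTC_CHECK(nvrtcCreateProgram(&prog, src, nullptr, 0, nullptr, nullptr));
  const char* opts[] = {"--gpu-architecture=compute_35", "--use_fast_math"};
  NVRTC_CHECK(nvrtcCompileProgram(prog, 2, opts));
  size_t ptx_size;
  NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
  printf("compiled to %zu bytes of PTX\n", ptx_size);
  NVRTC_CHECK(nvrtcDestroyProgram(&prog));
  return 0;
}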
========================================================================
@@ -5,8 +5,7 @@

 namespace caffe2 {
 namespace {
-class ElementwiseRTCFunction
-    : public CudaRTCFunction<ElementwiseRTCFunction> {
+class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
  public:
  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

@@ -22,22 +21,21 @@ class ElementwiseRTCFunction
  string name_;
 };

-template<>
+template <>
 string ElementwiseRTCFunction::GetSource(
-    int input_size, int output_size,
+    int input_size,
+    int output_size,
     const string command_string) {
   std::stringstream ss;
-  ss << "extern \"C\" __global__ void " << name_ <<
-      "(const size_t nthreads, \n";
+  ss << "extern \"C\" __global__ void " << name_
+     << "(const size_t nthreads, \n";
   // Insert the parameter list.
   int remain_params = input_size + output_size;
   for (int i = 0; i < input_size; ++i) {
-    ss << "const float* in" << i
-       << ((remain_params--) ? ", \n" : "");
+    ss << "const float* in" << i << ((remain_params--) ? ", \n" : "");
   }
   for (int i = 0; i < output_size; ++i) {
-    ss << "float* out" << i
-       << ((remain_params--) ? ", \n" : "");
+    ss << "float* out" << i << ((remain_params--) ? ", \n" : "");
   }
   ss << ") {\n"
         "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
@@ -46,7 +44,7 @@ string ElementwiseRTCFunction::GetSource(
      << "}\n}";
   return ss.str();
 }
-}  // namespace
+} // namespace

 /**
  * A GPU operator that can generate limited elementwise operations.
@@ -75,17 +73,17 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
  public:
  ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws) {
-    const string src = OperatorBase::GetSingleArgument<string>(
-        "rtc_src", "");
+    const string src = OperatorBase::GetSingleArgument<string>("rtc_src", "");
     CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
     func_.Compile(InputSize(), OutputSize(), src);
   }
   ~ElementwiseRTCOp() override {}

   bool RunOnDevice() override {
-    static_assert(sizeof(void*) == sizeof(size_t),
-                  "The argbuffer relies on the assumption that void* and "
-                  "size_t have the same size.");
+    static_assert(
+        sizeof(void*) == sizeof(size_t),
+        "The argbuffer relies on the assumption that void* and "
+        "size_t have the same size.");
     vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
     size_t* argBuffer = argBuffer_vec.data();
     CAFFE_ENFORCE(
@@ -102,10 +100,11 @@ class ElementwiseRTCOp final : public Operator<CUDAContext> {
     }
     size_t argBufferSize = sizeof(argBuffer);
     void* config[] = {
-        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-        CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
-        CU_LAUNCH_PARAM_END
-    };
+        CU_LAUNCH_PARAM_BUFFER_POINTER,
+        argBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,
+        &argBufferSize,
+        CU_LAUNCH_PARAM_END};
     func_.LaunchEx(
         CAFFE_GET_BLOCKS(Input(0).numel()),
         1,
@@ -127,4 +126,4 @@ namespace {
 REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
 }

-}  // namespace caffe2
+} // namespace caffe2
========================================================================
@@ -2,14 +2,14 @@

 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
-#include "caffe2/operators/pool_op.h"
 #include "caffe2/cuda_rtc/common_rtc.h"
+#include "caffe2/operators/pool_op.h"

 namespace caffe2 {
 namespace {
 class AveragePool {};
 class MaxPool {};
-}  // namespace
+} // namespace

 namespace {

@@ -98,7 +98,6 @@ __global__ void %s(
 }
 )";

-
 class MaxPoolRTCFunction : public CudaRTCFunction<MaxPoolRTCFunction> {
  public:
  MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
@@ -132,7 +131,6 @@ class MaxPoolGradientRTCFunction
  string name_;
 };

-
 template <>
 string MaxPoolRTCFunction::GetSource(
     const int output_size,
@@ -149,9 +147,22 @@ string MaxPoolRTCFunction::GetSource(
     const int pad_l) {
   char buffer[65536];
   int nbytes = snprintf(
-      buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
-      channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
-      stride_h, stride_w, pad_t, pad_l);
+      buffer,
+      65536,
+      kMaxPoolForwardNCHWSource,
+      name_.c_str(),
+      output_size,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_t,
+      pad_l);
   DCHECK_GE(nbytes, 0);
   DCHECK_LT(nbytes, 65536);
   return string(buffer);
@@ -174,16 +185,29 @@ string MaxPoolGradientRTCFunction::GetSource(
     const int pad_l) {
   char buffer[65536];
   int nbytes = snprintf(
-      buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
-      num, channels, height, width, pooled_height, pooled_width, kernel_h,
-      kernel_w, stride_h, stride_w, pad_t, pad_l);
+      buffer,
+      65536,
+      kMaxPoolBackwardNCHWSource,
+      name_.c_str(),
+      output_size,
+      num,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_t,
+      pad_l);
   DCHECK_GE(nbytes, 0);
   DCHECK_LT(nbytes, 65536);
   return string(buffer);
 }

-}  // namespace
+} // namespace

 class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
  public:
@@ -196,7 +220,8 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {

   bool RunOnDeviceWithOrderNCHW() override {
     auto& X = Input(0);
-    auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
+    auto output_sizes =
+        ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
     auto* Y = Output(0, output_sizes, at::dtype<float>());

     if (input_dims_ != X.sizes()) {
@@ -307,7 +332,9 @@ class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {

 namespace {
 REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC, MaxPoolRTCOp);
-REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPoolGradient, NVRTC,
-                                   MaxPoolGradientRTCOp);
-}  // namespace
-}  // namespace caffe2
+REGISTER_CUDA_OPERATOR_WITH_ENGINE(
+    MaxPoolGradient,
+    NVRTC,
+    MaxPoolGradientRTCOp);
+} // namespace
+} // namespace caffe2
========================================================================
@@ -6,4 +6,4 @@ REGISTER_CPU_OPERATOR(CreateDB, CreateDBOp<CPUContext>);
 OPERATOR_SCHEMA(CreateDB).NumInputs(0).NumOutputs(1);

 NO_GRADIENT(CreateDB);
-}  // namespace caffe2
+} // namespace caffe2
========================================================================
@@ -1,6 +1,6 @@
 #include "caffe2/core/db.h"
-#include "caffe2/core/logging.h"
 #include "caffe2/core/flags.h"
+#include "caffe2/core/logging.h"
 #include "leveldb/db.h"
 #include "leveldb/write_batch.h"

@@ -19,13 +19,27 @@ class LevelDBCursor : public Cursor {
     SeekToFirst();
   }
   ~LevelDBCursor() override {}
-  void Seek(const string& key) override { iter_->Seek(key); }
-  bool SupportsSeek() override { return true; }
-  void SeekToFirst() override { iter_->SeekToFirst(); }
-  void Next() override { iter_->Next(); }
-  string key() override { return iter_->key().ToString(); }
-  string value() override { return iter_->value().ToString(); }
-  bool Valid() override { return iter_->Valid(); }
+  void Seek(const string& key) override {
+    iter_->Seek(key);
+  }
+  bool SupportsSeek() override {
+    return true;
+  }
+  void SeekToFirst() override {
+    iter_->SeekToFirst();
+  }
+  void Next() override {
+    iter_->Next();
+  }
+  string key() override {
+    return iter_->key().ToString();
+  }
+  string value() override {
+    return iter_->value().ToString();
+  }
+  bool Valid() override {
+    return iter_->Valid();
+  }

  private:
  std::unique_ptr<leveldb::Iterator> iter_;
@@ -47,8 +61,7 @@ class LevelDBTransaction : public Transaction {
     leveldb::Status status = db_->Write(leveldb::WriteOptions(), batch_.get());
     batch_.reset(new leveldb::WriteBatch());
     CAFFE_ENFORCE(
-        status.ok(),
-        "Failed to write batch to leveldb. ", status.ToString());
+        status.ok(), "Failed to write batch to leveldb. ", status.ToString());
   }

  private:
@@ -71,12 +84,17 @@ class LevelDB : public DB {
     leveldb::Status status = leveldb::DB::Open(options, source, &db_temp);
     CAFFE_ENFORCE(
         status.ok(),
-        "Failed to open leveldb ", source, ". ", status.ToString());
+        "Failed to open leveldb ",
+        source,
+        ". ",
+        status.ToString());
     db_.reset(db_temp);
     VLOG(1) << "Opened leveldb " << source;
   }

-  void Close() override { db_.reset(); }
+  void Close() override {
+    db_.reset();
+  }
   unique_ptr<Cursor> NewCursor() override {
     return make_unique<LevelDBCursor>(db_.get());
   }
@@ -92,5 +110,5 @@ REGISTER_CAFFE2_DB(LevelDB, LevelDB);
 // For lazy-minded, one can also call with lower-case name.
 REGISTER_CAFFE2_DB(leveldb, LevelDB);

-}  // namespace db
-}  // namespace caffe2
+} // namespace db
+} // namespace caffe2
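A hedged sketch (not from this commit) of the Cursor interface that the LevelDB wrapper above, and the LMDB wrapper below, both implement: callers drive every backend with the same SeekToFirst/Valid/Next scan loop. InMemoryCursor is a hypothetical stand-in so the loop compiles without a database.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Cursor { // shape of caffe2::db::Cursor, reduced to the scan methods
  virtual ~Cursor() {}
  virtual void SeekToFirst() = 0;
  virtual void Next() = 0;
  virtual std::string key() = 0;
  virtual std::string value() = 0;
  virtual bool Valid() = 0;
};

class InMemoryCursor : public Cursor {
 public:
  explicit InMemoryCursor(std::vector<std::pair<std::string, std::string>> kv)
      : kv_(std::move(kv)), iter_(0) {}
  void SeekToFirst() override { iter_ = 0; }
  void Next() override { ++iter_; }
  std::string key() override { return kv_[iter_].first; }
  std::string value() override { return kv_[iter_].second; }
  bool Valid() override { return iter_ < kv_.size(); }

 private:
  std::vector<std::pair<std::string, std::string>> kv_;
  size_t iter_;
};

int main() {
  InMemoryCursor cursor({{"k0", "v0"}, {"k1", "v1"}});
  // The canonical full scan used against any caffe2::db cursor.
  for (cursor.SeekToFirst(); cursor.Valid(); cursor.Next()) {
    std::cout << cursor.key() << " -> " << cursor.value() << "\n";
  }
  return 0;
}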
========================================================================
@@ -1,4 +1,4 @@
-#include "lmdb.h"  // NOLINT
+#include "lmdb.h" // NOLINT

 #if defined(_MSC_VER)
 #include <direct.h>
@@ -14,7 +14,7 @@
 namespace caffe2 {
 namespace db {

-constexpr size_t LMDB_MAP_SIZE = 1099511627776;  // 1 TB
+constexpr size_t LMDB_MAP_SIZE = 1099511627776; // 1 TB

 inline void MDB_CHECK(int mdb_status) {
   CAFFE_ENFORCE_EQ(mdb_status, MDB_SUCCESS, mdb_strerror(mdb_status));
@@ -22,8 +22,7 @@ inline void MDB_CHECK(int mdb_status) {

 class LMDBCursor : public Cursor {
  public:
-  explicit LMDBCursor(MDB_env* mdb_env)
-      : mdb_env_(mdb_env), valid_(false) {
+  explicit LMDBCursor(MDB_env* mdb_env) : mdb_env_(mdb_env), valid_(false) {
     MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
     MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
     MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
@@ -43,8 +42,8 @@ class LMDBCursor : public Cursor {
     // a key of 16k size should be enough? I am not sure though.
     mdb_key_.mv_size = key.size();
     mdb_key_.mv_data = const_cast<char*>(key.c_str());
-    int mdb_status = mdb_cursor_get(
-        mdb_cursor_, &mdb_key_, &mdb_value_, MDB_SET_RANGE);
+    int mdb_status =
+        mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_SET_RANGE);
     if (mdb_status == MDB_NOTFOUND) {
       valid_ = false;
     } else {
@@ -53,22 +52,30 @@ class LMDBCursor : public Cursor {
     }
   }

-  bool SupportsSeek() override { return true; }
+  bool SupportsSeek() override {
+    return true;
+  }

-  void SeekToFirst() override { SeekLMDB(MDB_FIRST); }
+  void SeekToFirst() override {
+    SeekLMDB(MDB_FIRST);
+  }

-  void Next() override { SeekLMDB(MDB_NEXT); }
+  void Next() override {
+    SeekLMDB(MDB_NEXT);
+  }

   string key() override {
     return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
   }

   string value() override {
-    return string(static_cast<const char*>(mdb_value_.mv_data),
-                  mdb_value_.mv_size);
+    return string(
+        static_cast<const char*>(mdb_value_.mv_data), mdb_value_.mv_size);
   }

-  bool Valid() override { return valid_; }
+  bool Valid() override {
+    return valid_;
+  }

  private:
  void SeekLMDB(MDB_cursor_op op) {
@@ -91,8 +98,7 @@ class LMDBCursor : public Cursor {

 class LMDBTransaction final : public Transaction {
  public:
-  explicit LMDBTransaction(MDB_env* mdb_env)
-      : mdb_env_(mdb_env) {
+  explicit LMDBTransaction(MDB_env* mdb_env) : mdb_env_(mdb_env) {
     MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn_));
     MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
   }
@@ -171,5 +177,5 @@ void LMDBTransaction::Put(const string& key, string&& value) {
 REGISTER_CAFFE2_DB(LMDB, LMDB);
 REGISTER_CAFFE2_DB(lmdb, LMDB);

-}  // namespace db
-}  // namespace caffe2
+} // namespace db
+} // namespace caffe2
========================================================================
@@ -1,16 +1,15 @@
 #include <unordered_set>

 #include "caffe2/core/db.h"
-#include "caffe2/utils/proto_utils.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/utils/proto_utils.h"

 namespace caffe2 {
 namespace db {

 class ProtoDBCursor : public Cursor {
  public:
-  explicit ProtoDBCursor(const TensorProtos* proto)
-      : proto_(proto), iter_(0) {}
+  explicit ProtoDBCursor(const TensorProtos* proto) : proto_(proto), iter_(0) {}
   // NOLINTNEXTLINE(modernize-use-equals-default)
   ~ProtoDBCursor() override {}

@@ -18,14 +17,22 @@ class ProtoDBCursor : public Cursor {
     CAFFE_THROW("ProtoDB is not designed to support seeking.");
   }

-  void SeekToFirst() override { iter_ = 0; }
-  void Next() override { ++iter_; }
-  string key() override { return proto_->protos(iter_).name(); }
-  string value() override {
-    return
-        SerializeAsString_EnforceCheck(proto_->protos(iter_), "ProtoDBCursor");
+  void SeekToFirst() override {
+    iter_ = 0;
+  }
+  void Next() override {
+    ++iter_;
+  }
+  string key() override {
+    return proto_->protos(iter_).name();
+  }
+  string value() override {
+    return SerializeAsString_EnforceCheck(
+        proto_->protos(iter_), "ProtoDBCursor");
+  }
+  bool Valid() override {
+    return iter_ < proto_->protos_size();
   }
-  bool Valid() override { return iter_ < proto_->protos_size(); }

  private:
  const TensorProtos* proto_;
@@ -108,5 +115,5 @@ REGISTER_CAFFE2_DB(ProtoDB, ProtoDB);
 // For lazy-minded, one can also call with lower-case name.
 REGISTER_CAFFE2_DB(protodb, ProtoDB);

-}  // namespace db
-}  // namespace caffe2
+} // namespace db
+} // namespace caffe2
========================================================================
@@ -1,11 +1,11 @@
 #include <atomic>
 #include <condition_variable>
 #include <mutex>
-#include <thread>  // NOLINT
+#include <thread> // NOLINT

 #include "caffe2/core/db.h"
-#include "caffe2/utils/zmq_helper.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/utils/zmq_helper.h"

 namespace caffe2 {
 namespace db {

@@ -13,12 +13,13 @@ namespace db {
 class ZmqDBCursor : public Cursor {
  public:
  explicit ZmqDBCursor(const string& source)
-      : source_(source), socket_(ZMQ_PULL),
-        prefetched_(false), finalize_(false) {
+      : source_(source),
+        socket_(ZMQ_PULL),
+        prefetched_(false),
+        finalize_(false) {
     socket_.Connect(source_);
     // Start prefetching thread.
-    prefetch_thread_.reset(
-        new std::thread([this] { this->Prefetch(); }));
+    prefetch_thread_.reset(new std::thread([this] { this->Prefetch(); }));
     // obtain the first value.
     Next();
   }
@@ -35,27 +36,35 @@ class ZmqDBCursor : public Cursor {
   void Seek(const string& /*key*/) override { /* do nothing */
   }

-  void SeekToFirst() override { /* do nothing */ }
+  void SeekToFirst() override { /* do nothing */
+  }

   void Next() override {
     std::unique_lock<std::mutex> lock(prefetch_access_mutex_);
-    while (!prefetched_) consumer_.wait(lock);
+    while (!prefetched_)
+      consumer_.wait(lock);
     key_ = prefetch_key_;
     value_ = prefetch_value_;
     prefetched_ = false;
     producer_.notify_one();
   }

-  string key() override { return key_; }
-  string value() override { return value_; }
-  bool Valid() override { return true; }
+  string key() override {
+    return key_;
+  }
+  string value() override {
+    return value_;
+  }
+  bool Valid() override {
+    return true;
+  }

  private:

  void Prefetch() {
    while (!finalize_) {
      std::unique_lock<std::mutex> lock(prefetch_access_mutex_);
-      while (prefetched_) producer_.wait(lock);
+      while (prefetched_)
+        producer_.wait(lock);
      if (finalize_) {
        return;
      }
@@ -86,8 +95,7 @@ class ZmqDBCursor : public Cursor {

 class ZmqDB : public DB {
  public:
-  ZmqDB(const string& source, Mode mode)
-      : DB(source, mode), source_(source) {
+  ZmqDB(const string& source, Mode mode) : DB(source, mode), source_(source) {
     CAFFE_ENFORCE(mode == READ, "ZeroMQ DB only supports read mode.");
   }

@@ -101,7 +109,7 @@ class ZmqDB : public DB {

   unique_ptr<Transaction> NewTransaction() override {
     CAFFE_THROW("ZeroMQ DB does not support writing with a transaction.");
-    return nullptr;  // dummy placeholder to suppress old compiler warnings.
+    return nullptr; // dummy placeholder to suppress old compiler warnings.
   }

  private:
@@ -112,5 +120,5 @@ REGISTER_CAFFE2_DB(ZmqDB, ZmqDB);
 // For lazy-minded, one can also call with lower-case name.
 REGISTER_CAFFE2_DB(zmqdb, ZmqDB);

-}  // namespace db
-}  // namespace caffe2
+} // namespace db
+} // namespace caffe2
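A hedged, runnable sketch (not from this commit) of the single-slot producer/consumer handoff that ZmqDBCursor::Next() and Prefetch() above implement with prefetched_, producer_, and consumer_. A plain thread stands in for the ZeroMQ socket so the pattern runs on its own.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>

int main() {
  std::mutex m;
  std::condition_variable producer, consumer;
  bool prefetched = false;
  std::string slot;

  std::thread prefetcher([&] { // mirrors ZmqDBCursor::Prefetch()
    for (int i = 0; i < 3; ++i) {
      std::unique_lock<std::mutex> lock(m);
      while (prefetched) // wait until the consumer drained the slot
        producer.wait(lock);
      slot = "value-" + std::to_string(i);
      prefetched = true;
      consumer.notify_one();
    }
  });

  for (int i = 0; i < 3; ++i) { // mirrors ZmqDBCursor::Next()
    std::unique_lock<std::mutex> lock(m);
    while (!prefetched)
      consumer.wait(lock);
    std::cout << slot << "\n";
    prefetched = false;
    producer.notify_one();
  }
  prefetcher.join();
  return 0;
}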
========================================================================
@@ -181,4 +181,4 @@ void FileStoreHandler::wait(
     std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
 }
-}
+} // namespace caffe2
========================================================================
@@ -9,19 +9,19 @@ class TORCH_API FileStoreHandler : public StoreHandler {
  explicit FileStoreHandler(const std::string& path, const std::string& prefix);
  virtual ~FileStoreHandler();

-  void set(const std::string& name, const std::string& data) override;
+  void set(const std::string& name, const std::string& data) override;

  virtual std::string get(
      const std::string& name,
      const std::chrono::milliseconds& timeout = kDefaultTimeout) override;

-  int64_t add(const std::string& name, int64_t value) override;
+  int64_t add(const std::string& name, int64_t value) override;

-  bool deleteKey(const std::string& key) override;
+  bool deleteKey(const std::string& key) override;

-  int64_t getNumKeys() override;
+  int64_t getNumKeys() override;

-  bool check(const std::vector<std::string>& names) override;
+  bool check(const std::vector<std::string>& names) override;

  virtual void wait(
      const std::vector<std::string>& names,
========================================================================
@@ -15,19 +15,19 @@ class TORCH_API RedisStoreHandler : public StoreHandler {
  explicit RedisStoreHandler(std::string& host, int port, std::string& prefix);
  virtual ~RedisStoreHandler();

-  void set(const std::string& name, const std::string& data) override;
+  void set(const std::string& name, const std::string& data) override;

  virtual std::string get(
      const std::string& name,
      const std::chrono::milliseconds& timeout = kDefaultTimeout) override;

-  int64_t add(const std::string& name, int64_t value) override;
+  int64_t add(const std::string& name, int64_t value) override;

-  int64_t getNumKeys() override;
+  int64_t getNumKeys() override;

-  bool deleteKey(const std::string& key) override;
+  bool deleteKey(const std::string& key) override;

-  bool check(const std::vector<std::string>& names) override;
+  bool check(const std::vector<std::string>& names) override;

  virtual void wait(
      const std::vector<std::string>& names,
========================================================================
@@ -67,8 +67,7 @@ class TORCH_API StoreHandler {
 /*
  * The backing store is no longer available. It may have been deleted.
  */
-struct TORCH_API StoreHandlerNotAvailableException
-    : public std::runtime_error {
+struct TORCH_API StoreHandlerNotAvailableException : public std::runtime_error {
  explicit StoreHandlerNotAvailableException(const std::string& msg)
      : std::runtime_error(msg) {}
 };
========================================================================
@@ -124,4 +124,4 @@ either as an input blob with blob names or as an argument.
     .Arg("blob_names", "names of the blobs to wait for (optional)")
     .Input(0, "handler", "unique_ptr<StoreHandler>")
     .Input(1, "names", "names of the blobs to wait for (optional)");
-}
+} // namespace caffe2
========================================================================
@@ -52,4 +52,4 @@ class StoreWaitOp final : public Operator<CPUContext> {

 INPUT_TAGS(HANDLER);
 };
-}
+} // namespace caffe2
========================================================================
@@ -19,8 +19,9 @@
 namespace caffe2 {

 REGISTER_CPU_OPERATOR(FC_Decomp, FullyConnectedOpDecomp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(FCGradient_Decomp,
-                      FullyConnectedDecompGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    FCGradient_Decomp,
+    FullyConnectedDecompGradientOp<float, CPUContext>);

 OPERATOR_SCHEMA(FC_Decomp).NumInputs(4).NumOutputs(1);
 OPERATOR_SCHEMA(FCGradient_Decomp).NumInputs(4).NumOutputs(3, 4);
@@ -31,10 +32,11 @@ class GetFCDecompGradient : public GradientMakerBase {
     CAFFE_ENFORCE_EQ(def_.input_size(), 4);
     // TODO(wyiming): Check whether it is right? Let's move fast first.
     return SingleGradientDef(
-        "FCGradient_Decomp", "",
+        "FCGradient_Decomp",
+        "",
         vector<string>{I(0), I(1), I(2), GO(0)},
         vector<string>{GI(1), GI(2), GI(3), GI(0)});
   }
 };
 REGISTER_GRADIENT(FC_Decomp, GetFCDecompGradient);
-}  // namespace caffe2
+} // namespace caffe2
========================================================================
@@ -30,7 +30,7 @@ namespace caffe2 {
  * W(N * K) = U(N * middle) * trans(V(K * middle))
  * */
 // This is Caffe's InnerProductOp, with a name that fits its purpose better.
-template <typename T, class Context, class Engine=DefaultEngine>
+template <typename T, class Context, class Engine = DefaultEngine>
 class FullyConnectedOpDecomp final : public Operator<Context> {
  public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
@@ -44,15 +44,15 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
     const auto& V = Input(2);
     const auto& b = Input(3);

-    //auto* buffer_ptr = Output(1);
+    // auto* buffer_ptr = Output(1);
     // Size M * middle;
-    //auto& multi_buffer_ = *buffer_ptr;
+    // auto& multi_buffer_ = *buffer_ptr;
     CAFFE_ENFORCE_GE(X.dim(), 1);
     CAFFE_ENFORCE_GE(U.dim(), 2);
     CAFFE_ENFORCE_GE(V.dim(), 2);
     if (X.dim() > 2 || U.dim() > 2 || V.dim() > 2) {
       VLOG(1) << "Using legacy support for arbitrary input and weight "
-                 "dimensions.";
+              "dimensions.";
     }
     CAFFE_ENFORCE_EQ(b.dim(), 1);
     // batch size
@@ -79,25 +79,51 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
     T* multi_buffer_data = multi_buffer_.template mutable_data<T>();
     // X * V * tans(U)
     math::Gemm<T, Context, Engine>(
-        CblasNoTrans, CblasNoTrans, M, middle, K, 1, X.template data<T>(),
-        V.template data<T>(), 0, multi_buffer_data,
+        CblasNoTrans,
+        CblasNoTrans,
+        M,
+        middle,
+        K,
+        1,
+        X.template data<T>(),
+        V.template data<T>(),
+        0,
+        multi_buffer_data,
         &context_);
     math::Gemm<T, Context, Engine>(
-        CblasNoTrans, CblasTrans, M, N, middle, 1, multi_buffer_data,
-        U.template data<T>(), 0, Y->template mutable_data<T>(),
+        CblasNoTrans,
+        CblasTrans,
+        M,
+        N,
+        middle,
+        1,
+        multi_buffer_data,
+        U.template data<T>(),
+        0,
+        Y->template mutable_data<T>(),
         &context_);
     // Add bias term
     if (bias_multiplier_.numel() != M) {
       // If the helper bias multiplier is not M, reshape and fill it with one.
       bias_multiplier_.Resize(M);
       math::Set<T, Context>(
-          M, static_cast<T>(1), bias_multiplier_.template mutable_data<T>(),
+          M,
+          static_cast<T>(1),
+          bias_multiplier_.template mutable_data<T>(),
           &context_);
     }
     math::Gemm<T, Context, Engine>(
-        CblasNoTrans, CblasNoTrans, M, N, 1, 1,
-        bias_multiplier_.template data<T>(), b.template data<T>(), 1,
-        Y->template mutable_data<T>(), &context_);
+        CblasNoTrans,
+        CblasNoTrans,
+        M,
+        N,
+        1,
+        1,
+        bias_multiplier_.template data<T>(),
+        b.template data<T>(),
+        1,
+        Y->template mutable_data<T>(),
+        &context_);
     return true;
   }

@@ -106,7 +132,7 @@ class FullyConnectedOpDecomp final : public Operator<Context> {
   Tensor multi_buffer_{Context::GetDeviceType()};
 };

-template <typename T, class Context, class Engine=DefaultEngine>
+template <typename T, class Context, class Engine = DefaultEngine>
 class FullyConnectedDecompGradientOp : public Operator<Context> {
  public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
@@ -148,41 +174,75 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
     du_buffer_.Resize(N, middle);
     T* du_buffer_data = du_buffer_.template mutable_data<T>();
     math::Gemm<T, Context, Engine>(
-        CblasNoTrans, CblasNoTrans, M, middle, K, 1,
-        X.template data<T>(), V.template data<T>(),
-        0, du_buffer_data,
+        CblasNoTrans,
+        CblasNoTrans,
+        M,
+        middle,
+        K,
+        1,
+        X.template data<T>(),
+        V.template data<T>(),
+        0,
+        du_buffer_data,
         &context_);
     math::Gemm<T, Context, Engine>(
-        CblasTrans, CblasNoTrans, N, middle, M, 1,
-        dY.template data<T>(), du_buffer_data,
-        0, dU->template mutable_data<T>(),
+        CblasTrans,
+        CblasNoTrans,
+        N,
+        middle,
+        M,
+        1,
+        dY.template data<T>(),
+        du_buffer_data,
+        0,
+        dU->template mutable_data<T>(),
         &context_);
     // Compute dV
     // first compute dY * U
     dv_buffer_.Resize(M, middle);
     T* dv_buffer_data = dv_buffer_.template mutable_data<T>();
     math::Gemm<T, Context, Engine>(
-        CblasNoTrans, CblasNoTrans, M, middle, N, 1,
-        dY.template data<T>(), U.template data<T>(),
-        0, dv_buffer_data,
+        CblasNoTrans,
+        CblasNoTrans,
+        M,
+        middle,
+        N,
+        1,
+        dY.template data<T>(),
+        U.template data<T>(),
+        0,
+        dv_buffer_data,
         &context_);
     math::Gemm<T, Context, Engine>(
-        CblasTrans, CblasNoTrans, K, middle, M, 1,
-        dY.template data<T>(), du_buffer_data,
-        0, dV->template mutable_data<T>(),
+        CblasTrans,
+        CblasNoTrans,
+        K,
+        middle,
+        M,
+        1,
+        dY.template data<T>(),
+        du_buffer_data,
+        0,
+        dV->template mutable_data<T>(),
         &context_);
     if (bias_multiplier_.numel() != M) {
       // If the helper bias multiplier is not M, reshape and fill it with one.
       bias_multiplier_.Resize(M);
       math::Set<T, Context>(
-          M, static_cast<T>(1),
+          M,
+          static_cast<T>(1),
           bias_multiplier_.template mutable_data<T>(),
           &context_);
     }
     // Compute dB
     math::Gemv<T, Context>(
-        CblasTrans, M, N, 1, dY.template data<T>(),
-        bias_multiplier_.template data<T>(), 0,
+        CblasTrans,
+        M,
+        N,
+        1,
+        dY.template data<T>(),
+        bias_multiplier_.template data<T>(),
+        0,
         db->template mutable_data<T>(),
         &context_);
     // Compute dX if necessary.
@@ -191,14 +251,28 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
       dx_buffer_.Resize(M, middle);
       T* dx_buffer_data = dx_buffer_.template mutable_data<T>();
       math::Gemm<T, Context, Engine>(
-          CblasNoTrans, CblasNoTrans, M, middle, N, 1,
-          dY.template data<T>(), U.template data<T>(),
-          0, dx_buffer_data,
+          CblasNoTrans,
+          CblasNoTrans,
+          M,
+          middle,
+          N,
+          1,
+          dY.template data<T>(),
+          U.template data<T>(),
+          0,
+          dx_buffer_data,
           &context_);
       math::Gemm<T, Context, Engine>(
-          CblasNoTrans, CblasTrans, M, K, middle, 1,
-          dx_buffer_data, V.template data<T>(),
-          0, dX->template mutable_data<T>(),
+          CblasNoTrans,
+          CblasTrans,
+          M,
+          K,
+          middle,
+          1,
+          dx_buffer_data,
+          V.template data<T>(),
+          0,
+          dX->template mutable_data<T>(),
           &context_);
     }

@@ -212,6 +286,6 @@ class FullyConnectedDecompGradientOp : public Operator<Context> {
   Tensor dx_buffer_{Context::GetDeviceType()};
 };

-}  // namespace caffe2
+} // namespace caffe2

-#endif  // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
+#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
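A hedged sketch (not from this commit) of the two-GEMM forward pass that FullyConnectedOpDecomp above performs with math::Gemm, written as naive loops. With W (N x K) factored as U (N x middle) times the transpose of V (K x middle), Y = X * V * trans(U) costs M*middle*(K + N) multiply-adds instead of M*K*N for the dense layer. The tensor values below are made up for illustration.

#include <cstdio>
#include <vector>

int main() {
  const int M = 2, K = 4, N = 3, middle = 2;
  std::vector<float> X(M * K, 1.0f), V(K * middle, 0.5f), U(N * middle, 2.0f);
  std::vector<float> buf(M * middle, 0.0f), Y(M * N, 0.0f);
  // buf = X * V  (M x middle), the multi_buffer_ GEMM above
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < middle; ++j)
      for (int k = 0; k < K; ++k)
        buf[i * middle + j] += X[i * K + k] * V[k * middle + j];
  // Y = buf * trans(U)  (M x N), matching the CblasNoTrans/CblasTrans GEMM
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < middle; ++k)
        Y[i * N + j] += buf[i * middle + k] * U[j * middle + k];
  printf("Y[0][0] = %f\n", Y[0][0]); // 4 * 0.5 = 2 per middle dim, so 8
  return 0;
}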
========================================================================
@@ -20,7 +20,8 @@
 namespace caffe2 {

 REGISTER_CUDA_OPERATOR(FC_Decomp, FullyConnectedOpDecomp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(FCGradient_Decomp,
-                       FullyConnectedDecompGradientOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(
+    FCGradient_Decomp,
+    FullyConnectedDecompGradientOp<float, CUDAContext>);

-}  // namespace caffe2
+} // namespace caffe2
========================================================================
@@ -20,25 +20,29 @@ namespace caffe2 {
 namespace {

 REGISTER_CPU_OPERATOR(FC_Prune, FullyConnectedOpPrune<float, CPUContext>);
-REGISTER_CPU_OPERATOR(FCGradient_Prune,
-                      FullyConnectedPruneGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(
+    FCGradient_Prune,
+    FullyConnectedPruneGradientOp<float, CPUContext>);
 /* 8 Inputs:
  * X W Mask bias Ag_dw Mask_seq thres comp_lb
  * */
 OPERATOR_SCHEMA(FC_Prune).NumInputs(8).NumOutputs(1, 2);
-OPERATOR_SCHEMA(FCGradient_Prune).NumInputs(8).NumOutputs(6, 7)
-    .AllowInplace({{1, 2}, {2, 3}, {4, 4}, {5, 5}});
+OPERATOR_SCHEMA(FCGradient_Prune)
+    .NumInputs(8)
+    .NumOutputs(6, 7)
+    .AllowInplace({{1, 2}, {2, 3}, {4, 4}, {5, 5}});

 class GetFCPruneGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
     CAFFE_ENFORCE_EQ(def_.input_size(), 8);
     return SingleGradientDef(
-        "FCGradient_Prune", "",
+        "FCGradient_Prune",
+        "",
         vector<string>{I(0), I(1), I(2), GO(0), I(4), I(5), I(6), I(7)},
         vector<string>{GI(1), GI(3), I(1), I(2), I(4), I(5), GI(0)});
   }
 };
 REGISTER_GRADIENT(FC_Prune, GetFCPruneGradient);
-}  // namespace
-}  // namespace caffe2
+} // namespace
+} // namespace caffe2
|
|||
|
|
@ -23,337 +23,384 @@
|
|||
|
||||
namespace caffe2 {
|
||||
|
||||
namespace {
|
||||
namespace {
|
||||
|
||||
template<int N>
|
||||
using Shape = std::array<int, N>;
|
||||
template <int N>
|
||||
using Shape = std::array<int, N>;
|
||||
|
||||
template<int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
}
|
||||
template <int N>
|
||||
const std::vector<int64_t>& shape(Shape<N> vs) {
|
||||
static thread_local std::vector<int64_t> cache;
|
||||
cache.resize(vs.size());
|
||||
for (auto i = 0; i < vs.size(); ++i) {
|
||||
cache[i] = vs[i];
|
||||
}
|
||||
return cache;
|
||||
}
|
||||
|
||||
inline const std::vector<int64_t>& shape(int i) {
|
||||
return shape<1>(Shape<1>({i}));
|
||||
inline const std::vector<int64_t>& shape(int i) {
|
||||
return shape<1>(Shape<1>({i}));
|
||||
}
|
||||
|
||||
inline const std::vector<int64_t>& shape(int i, int j) {
|
||||
return shape<2>(Shape<2>({i, j}));
|
||||
}
|
||||
|
||||
template <typename T, class Context>
|
||||
void MaskMatrix(const T* mask, T* mat, int M, int N);
|
||||
|
||||
template <typename T, class Context>
|
||||
void MaskMatrix_Inc(T* mask_seq, T* mat, int M, int N, int seq_len, T target);
|
||||
|
||||
template <typename T, class Context>
|
||||
void AggrDW(T* ag_dw, const T* dw, int N, int K, Context* context);
|
||||
|
||||
template <typename T>
|
||||
int MatrixCompare_LT(const T* mat, float thres, T* mask_seq, int M, int N);
|
||||
|
||||
// TODO(wyiming): write an incremental Mask
|
||||
// Incremental Mask: only give the new mask positions;
|
||||
// Assuming that weights masked will not be mask again;
|
||||
// The incremental mask can also be used to update mask matrix;
|
||||
// But this will include template for bool and float;
|
||||
template <>
|
||||
void MaskMatrix<float, CPUContext>(
|
||||
const float* mask,
|
||||
float* mat,
|
||||
int M,
|
||||
int N) {
|
||||
int offset = 0;
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
mat[offset] = mask[offset] ? mat[offset] : 0;
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline const std::vector<int64_t>& shape(int i, int j) {
|
||||
return shape<2>(Shape<2>({i, j}));
|
||||
template <>
|
||||
void MaskMatrix_Inc<float, CPUContext>(
|
||||
float* mask_seq,
|
||||
float* mat,
|
||||
int /*M*/,
|
||||
int /*N*/,
|
||||
int seq_len,
|
||||
float target) {
|
||||
for (int i = 0; i < seq_len; ++i) {
|
||||
// assume that the mask_seq is smaller than size
|
||||
// Although it seems that random access gets bad performance,
|
||||
// we make sure that seq is in order;
|
||||
mat[static_cast<int>(mask_seq[i])] = target;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void AggrDW<float, CPUContext>(
|
||||
float* ag_dw,
|
||||
const float* dw,
|
||||
int N,
|
||||
int K,
|
||||
CPUContext* context) {
|
||||
math::Add<float, CPUContext>(N * K, dw, ag_dw, ag_dw, context);
|
||||
}
|
||||
|
||||
template <>
|
||||
int MatrixCompare_LT<float>(
|
||||
const float* mat,
|
||||
float thres,
|
||||
float* mask_seq,
|
||||
int M,
|
||||
int N) {
|
||||
int seq_len = 0;
|
||||
int offset = 0;
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
if (mat[offset] != 0 && (mat[offset] < thres && mat[offset] > -thres)) {
|
||||
mask_seq[seq_len++] = static_cast<float>(offset);
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
return seq_len;
|
||||
}
|
||||
|
||||
template <typename T, class Context>
|
||||
void MaskMatrix(const T* mask, T* mat,
|
||||
int M, int N);
|
||||
} // namespace
|
||||
|
||||
template <typename T, class Context>
|
||||
void MaskMatrix_Inc(T* mask_seq, T* mat,
|
||||
int M, int N, int seq_len, T target);
|
||||
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
|
||||
template <typename T, class Context, class Engine = DefaultEngine>
|
||||
class FullyConnectedOpPrune final : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
FullyConnectedOpPrune(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
~FullyConnectedOpPrune() {}
|
||||
|
||||
template <typename T, class Context>
|
||||
void AggrDW(T* ag_dw, const T* dw, int N, int K, Context* context);
|
||||
|
||||
template <typename T>
|
||||
int MatrixCompare_LT(const T* mat, float thres,
|
||||
T* mask_seq, int M, int N);
|
||||
|
||||
// TODO(wyiming): write an incremental Mask
|
||||
// Incremental Mask: only give the new mask positions;
|
||||
// Assuming that weights masked will not be mask again;
|
||||
// The incremental mask can also be used to update mask matrix;
|
||||
// But this will include template for bool and float;
|
||||
template <>
|
||||
void MaskMatrix<float, CPUContext>(
|
||||
const float* mask, float* mat, int M, int N) {
|
||||
int offset = 0;
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
mat[offset] = mask[offset]? mat[offset] : 0;
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void MaskMatrix_Inc<float, CPUContext>(
|
||||
float* mask_seq,
|
||||
float* mat,
|
||||
int /*M*/,
|
||||
int /*N*/,
|
||||
int seq_len,
|
||||
float target) {
|
||||
for (int i = 0; i < seq_len; ++i) {
|
||||
// assume that the mask_seq is smaller than size
|
||||
// Although it seems that random access gets bad performance,
|
||||
// we make sure that seq is in order;
|
||||
mat[static_cast<int>(mask_seq[i])] = target;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void AggrDW<float, CPUContext>(
|
||||
float* ag_dw, const float* dw,
|
||||
int N, int K, CPUContext* context) {
|
||||
math::Add<float, CPUContext>(N*K, dw, ag_dw, ag_dw, context);
|
||||
}
|
||||
|
||||
template <>
|
||||
int MatrixCompare_LT<float>(
|
||||
const float* mat, float thres,
|
||||
float* mask_seq, int M, int N) {
|
||||
int seq_len = 0;
|
||||
int offset = 0;
|
||||
for (int i = 0 ; i < M; ++i) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
if (mat[offset] != 0 &&
|
||||
(mat[offset] < thres && mat[offset] > -thres)) {
|
||||
mask_seq[seq_len++] = static_cast<float>(offset);
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
return seq_len;
|
||||
}
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
const auto& Mask = Input(2);
|
||||
const auto& b = Input(3);
|
||||
|
||||
CAFFE_ENFORCE_GE(X.dim(), 1);
|
||||
CAFFE_ENFORCE_GE(W.dim(), 2);
|
||||
if (X.dim() > 2 || W.dim() > 2) {
|
||||
VLOG(1) << "Using legacy support for arbitrary input and weight "
|
||||
"dimensions.";
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(b.dim(), 1);
|
||||
// batch size
|
||||
int M = X.dim() > 1 ? X.dim32(0) : 1;
|
||||
// Feature dimension
|
||||
int K = X.numel() / M;
|
||||
// number of outputs.
|
||||
int N = W.dim32(0);
|
||||
CAFFE_ENFORCE_EQ(K, W.numel() / W.dim32(0));
|
||||
CAFFE_ENFORCE_EQ(N, b.dim32(0));
|
||||
std::vector<int64_t> dims;
|
||||
if (X.dim() > 1) {
|
||||
dims = {M, N};
|
||||
} else {
|
||||
dims = {N};
|
||||
}
|
||||
auto* Y = Output(0, dims, at::dtype<T>());
|
||||
// W * x
|
||||
math::Gemm<T, Context, Engine>(
|
||||
CblasNoTrans,
|
||||
CblasTrans,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
1,
|
||||
X.template data<T>(),
|
||||
W.template data<T>(),
|
||||
0,
|
||||
Y->template mutable_data<T>(),
|
||||
&context_);
|
||||
// Add bias term
|
||||
if (bias_multiplier_.numel() != M) {
|
||||
// If the helper bias multiplier is not M,
|
||||
// reshape and fill it with one.
|
||||
bias_multiplier_.Resize(M);
|
||||
math::Set<T, Context>(
|
||||
M,
|
||||
static_cast<T>(1),
|
||||
bias_multiplier_.template mutable_data<T>(),
|
||||
&context_);
|
||||
}
|
||||
math::Gemm<T, Context, Engine>(
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
M,
|
||||
N,
|
||||
1,
|
||||
1,
|
||||
bias_multiplier_.template data<T>(),
|
||||
b.template data<T>(),
|
||||
1,
|
||||
Y->template mutable_data<T>(),
|
||||
&context_);
|
||||
if (OutputSize() == 2) {
|
||||
auto* Comp_rate = Output(1, vector<int64_t>(), at::dtype<T>());
|
||||
T* comp_data = Comp_rate->template mutable_data<T>();
|
||||
math::Sum<T, Context>(
|
||||
Mask.numel(), Mask.template data<T>(), comp_data, &context_);
|
||||
math::Scale<float, T, Context>(
|
||||
1,
|
||||
static_cast<T>(1.) / Mask.numel(),
|
||||
comp_data,
|
||||
comp_data,
|
||||
&context_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// This is Caffe's InnerProductOp, with a name that fits its purpose better.
|
||||
template <typename T, class Context, class Engine=DefaultEngine>
|
||||
class FullyConnectedOpPrune final : public Operator<Context> {
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
FullyConnectedOpPrune(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {}
|
||||
~FullyConnectedOpPrune() {}
|
||||
protected:
|
||||
Tensor bias_multiplier_{Context::GetDeviceType()};
|
||||
};
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
const auto& Mask = Input(2);
|
||||
const auto& b = Input(3);
|
||||
template <typename T, class Context, class Engine = DefaultEngine>
|
||||
class FullyConnectedPruneGradientOp : public Operator<Context> {
|
||||
public:
|
||||
int iter_offset;
|
||||
|
||||
CAFFE_ENFORCE_GE(X.dim(), 1);
|
||||
CAFFE_ENFORCE_GE(W.dim(), 2);
|
||||
if (X.dim() > 2 || W.dim() > 2) {
|
||||
VLOG(1) << "Using legacy support for arbitrary input and weight "
|
||||
"dimensions.";
|
||||
}
|
||||
CAFFE_ENFORCE_EQ(b.dim(), 1);
|
||||
// batch size
|
||||
int M = X.dim() > 1 ? X.dim32(0) : 1;
|
||||
// Feature dimension
|
||||
int K = X.numel() / M;
|
||||
// number of outputs.
|
||||
int N = W.dim32(0);
|
||||
CAFFE_ENFORCE_EQ(K, W.numel() / W.dim32(0));
|
||||
CAFFE_ENFORCE_EQ(N, b.dim32(0));
|
||||
std::vector<int64_t> dims;
|
||||
if (X.dim() > 1) {
|
||||
dims = {M, N};
|
||||
} else {
|
||||
dims = {N};
|
||||
}
|
||||
auto* Y = Output(0, dims, at::dtype<T>());
|
||||
// W * x
|
||||
math::Gemm<T, Context, Engine>(
|
||||
CblasNoTrans, CblasTrans, M, N, K, 1, X.template data<T>(),
|
||||
W.template data<T>(), 0, Y->template mutable_data<T>(),
|
||||
&context_);
|
||||
// Add bias term
|
||||
if (bias_multiplier_.numel() != M) {
|
||||
// If the helper bias multiplier is not M,
|
||||
// reshape and fill it with one.
|
||||
bias_multiplier_.Resize(M);
|
||||
math::Set<T, Context>(
|
||||
M, static_cast<T>(1),
|
||||
bias_multiplier_.template mutable_data<T>(),
|
||||
&context_);
|
||||
}
|
||||
math::Gemm<T, Context, Engine>(
|
||||
CblasNoTrans, CblasNoTrans, M, N, 1, 1,
|
||||
bias_multiplier_.template data<T>(), b.template data<T>(), 1,
|
||||
Y->template mutable_data<T>(), &context_);
|
||||
if (OutputSize() == 2){
|
||||
auto* Comp_rate = Output(1, vector<int64_t>(), at::dtype<T>());
|
||||
T* comp_data = Comp_rate->template mutable_data<T>();
|
||||
math::Sum<T, Context>(
|
||||
Mask.numel(), Mask.template data<T>(), comp_data, &context_);
|
||||
math::Scale<float, T, Context>(
|
||||
1,
|
||||
static_cast<T>(1.) / Mask.numel(),
|
||||
comp_data,
|
||||
comp_data,
|
||||
&context_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
FullyConnectedPruneGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) {
|
||||
iter_offset = 0;
|
||||
}
|
||||
~FullyConnectedPruneGradientOp() {}
|
||||
|
||||
protected:
|
||||
Tensor bias_multiplier_{Context::GetDeviceType()};
|
||||
};
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
// const auto& W = Input(1);
|
||||
auto* W_ptr = Output(2);
|
||||
auto& W = *W_ptr;
|
||||
// const auto& Mask = Input(2);
|
||||
auto* Mask_ptr = Output(3);
|
||||
auto& Mask = *Mask_ptr;
|
||||
const auto& dY = Input(3);
|
||||
// const auto& Ag_dW = Input(4);
|
||||
auto* Ag_dW_ptr = Output(4);
|
||||
auto& Ag_dW = *Ag_dW_ptr;
|
||||
// it is also the Input(5)
|
||||
|
||||
template <typename T, class Context, class Engine=DefaultEngine>
|
||||
class FullyConnectedPruneGradientOp : public Operator<Context> {
|
||||
public:
|
||||
int iter_offset;
|
||||
public:
|
||||
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
||||
FullyConnectedPruneGradientOp
|
||||
(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<Context>(operator_def, ws) { iter_offset = 0; }
|
||||
~FullyConnectedPruneGradientOp() {}
|
||||
// how about get threshold
|
||||
auto& thres = Input(6);
|
||||
// TODO(wyiming): check comp_lb is a float
|
||||
auto& comp_lb = Input(7);
|
||||
DCHECK_GE(X.dim(), 1);
|
||||
DCHECK_GE(W.dim(), 2);
|
||||
DCHECK_LE(dY.dim(), 2);
|
||||
// batch size
|
||||
int M = X.dim() > 1 ? X.dim32(0) : 1;
|
||||
// Feature dimension
|
||||
int K = X.numel() / M;
|
||||
// number of outputs.
|
||||
int N = W.dim32(0);
|
||||
// TODO(wyiming): add this window_size to workspace?
|
||||
int window_size = 100;
|
||||
// TODO(wyiming): this threshold should be
|
||||
// based on distribution of the layer weight
|
||||
float thr = 0.01;
|
||||
DCHECK_EQ(Mask.dim32(0), W.dim32(0));
|
||||
DCHECK_EQ(Mask.dim32(1), W.dim32(1));
|
||||
DCHECK_EQ(Ag_dW.dim32(0), W.dim32(0));
|
||||
DCHECK_EQ(Ag_dW.dim32(1), W.dim32(1));
|
||||
DCHECK_EQ(K, W.numel() / W.dim32(0));
|
||||
if (dY.dim() > 1) {
|
||||
DCHECK_EQ(M, dY.dim32(0));
|
||||
DCHECK_EQ(N, dY.dim32(1));
|
||||
} else {
|
||||
DCHECK_EQ(X.dim(), 1);
|
||||
DCHECK_EQ(N, dY.numel());
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const auto& X = Input(0);
|
||||
//const auto& W = Input(1);
|
||||
auto* W_ptr = Output(2);
|
||||
auto& W = *W_ptr;
|
||||
//const auto& Mask = Input(2);
|
||||
auto* Mask_ptr = Output(3);
|
||||
auto& Mask = *Mask_ptr;
|
||||
const auto& dY = Input(3);
|
||||
//const auto& Ag_dW = Input(4);
|
||||
auto* Ag_dW_ptr = Output(4);
|
||||
auto& Ag_dW = *Ag_dW_ptr;
|
||||
// it is also the Input(5)
|
||||
auto* dW = Output(0, W.sizes(), at::dtype<T>());
|
||||
auto* db = Output(1, {N}, at::dtype<T>());
|
||||
|
||||
// how about get threshold
|
||||
auto& thres = Input(6);
|
||||
//TODO(wyiming): check comp_lb is a float
|
||||
auto& comp_lb = Input(7);
|
||||
DCHECK_GE(X.dim(), 1);
|
||||
DCHECK_GE(W.dim(), 2);
|
||||
DCHECK_LE(dY.dim(), 2);
|
||||
// batch size
|
||||
int M = X.dim() > 1 ? X.dim32(0) : 1;
|
||||
// Feature dimension
|
||||
int K = X.numel() / M;
|
||||
// number of outputs.
|
||||
int N = W.dim32(0);
|
||||
// TODO(wyiming): add this window_size to workspace?
|
||||
int window_size = 100;
|
||||
// TODO(wyiming): this threshold should be
|
||||
// based on distribution of the layer weight
|
||||
float thr = 0.01;
|
||||
DCHECK_EQ(Mask.dim32(0), W.dim32(0));
|
||||
DCHECK_EQ(Mask.dim32(1), W.dim32(1));
|
||||
DCHECK_EQ(Ag_dW.dim32(0), W.dim32(0));
|
||||
DCHECK_EQ(Ag_dW.dim32(1), W.dim32(1));
|
||||
DCHECK_EQ(K, W.numel() / W.dim32(0));
|
||||
if (dY.dim() > 1) {
|
||||
DCHECK_EQ(M, dY.dim32(0));
|
||||
DCHECK_EQ(N, dY.dim32(1));
|
||||
} else {
|
||||
DCHECK_EQ(X.dim(), 1);
|
||||
DCHECK_EQ(N, dY.numel());
|
||||
}
|
||||
// Compute dW
|
||||
math::Gemm<T, Context, Engine>(
|
||||
CblasTrans,
|
||||
CblasNoTrans,
|
||||
N,
|
||||
K,
|
||||
M,
|
||||
1,
|
||||
dY.template data<T>(),
|
||||
X.template data<T>(),
|
||||
0,
|
||||
dW->template mutable_data<T>(),
|
||||
&context_);
|
||||
|
||||
auto* dW = Output(0, W.sizes(), at::dtype<T>());
|
||||
auto* db = Output(1, {N}, at::dtype<T>());
|
||||
comp_r_buf_.Resize(vector<int64_t>());
|
||||
T* comp_data = comp_r_buf_.template mutable_data<T>();
|
||||
math::Sum<T, Context>(
|
||||
Mask.numel(), Mask.template data<T>(), comp_data, &context_);
|
||||
math::Scale<float, T, Context>(
|
||||
1, static_cast<T>(1.) / Mask.numel(), comp_data, comp_data, &context_);
|
||||
// update W size window
|
||||
// Notice here we need to maintain state in OP.
|
||||
// This is new in Caffe2.
|
||||
// And this is something we might need to discuss in the future.
|
||||
// at most mask half of the matrix at time
|
||||
// 1. mask dw with previous mask
|
||||
MaskMatrix<T, Context>(
|
||||
Mask.template mutable_data<T>(), dW->template mutable_data<T>(), N, K);
|
||||
if (*comp_data > *(comp_lb.template data<T>())) {
|
||||
iter_offset++;
|
||||
if (iter_offset % window_size == 0) {
|
||||
// TODO(wyiming):do the prune here;
|
||||
sum_buffer_.ResizeLike(W);
|
||||
math::Add<T, Context>(
|
||||
W.numel(),
|
||||
W.template mutable_data<T>(),
|
||||
Ag_dW.template mutable_data<T>(),
|
||||
sum_buffer_.template mutable_data<T>(),
|
||||
&context_);
|
||||
auto* mask_seq_auto = Output(5, W.sizes(), at::dtype<T>());
|
||||
T* mask_seq = mask_seq_auto->template mutable_data<T>();
|
||||
math::Set<T, Context>(
|
||||
N * K,
|
||||
static_cast<T>(0),
|
||||
mask_seq_auto->template mutable_data<T>(),
|
||||
&context_);
|
||||
// 2. find dw below thres but not eq 0
|
||||
int seq_len = MatrixCompare_LT<T>(
|
||||
Ag_dW_ptr->template mutable_data<T>(),
|
||||
*thres.template data<T>(),
|
||||
mask_seq,
|
||||
N,
|
||||
K);
|
// 3. use the mask_seq to update W and dw
MaskMatrix_Inc<T, Context>(
mask_seq, dW->template mutable_data<T>(), N, K, seq_len, 0);
MaskMatrix_Inc<T, Context>(
mask_seq, W.template mutable_data<T>(), N, K, seq_len, 0);
MaskMatrix_Inc<T, Context>(
mask_seq, Mask.template mutable_data<T>(), N, K, seq_len, 0);
math::Set<T, Context>(
N * K,
static_cast<T>(0),
Ag_dW.template mutable_data<T>(),
&context_);
} else {
// add dW to Aggregate dW.
AggrDW<T, Context>(
Ag_dW.template mutable_data<T>(),
dW->template mutable_data<T>(),
N,
K,
&context_);
}
}
if (bias_multiplier_.numel() != M) {
// If the helper bias multiplier is not M,
// reshape and fill it with one.
bias_multiplier_.Resize(M);
math::Set<T, Context>(
M,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
// Compute dB
math::Gemv<T, Context>(
CblasTrans,
M,
N,
1,
dY.template data<T>(),
bias_multiplier_.template data<T>(),
0,
db->template mutable_data<T>(),
&context_);
// Compute dX if necessary.
if (OutputSize() == 7) {
auto* dX = Output(6, X.sizes(), at::dtype<T>());
math::Gemm<T, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
M,
K,
N,
1,
dY.template data<T>(),
W.template data<T>(),
0,
dX->template mutable_data<T>(),
&context_);
}

// Compute dW
math::Gemm<T, Context, Engine>(
CblasTrans, CblasNoTrans, N, K, M, 1,
dY.template data<T>(), X.template data<T>(),
0, dW->template mutable_data<T>(),
&context_);
return true;
}

comp_r_buf_.Resize(vector<int64_t>());
T* comp_data = comp_r_buf_.template mutable_data<T>();
math::Sum<T, Context>(
Mask.numel(), Mask.template data<T>(), comp_data, &context_);
math::Scale<float, T, Context>(
1,
static_cast<T>(1.) / Mask.numel(),
comp_data,
comp_data,
&context_);
// update W size window
// Notice here we need to maintain state in OP.
// This is new in Caffe2.
// And this is something we might need to discuss in the future.
// at most mask half of the matrix at time
// 1. mask dw with previous mask
MaskMatrix<T, Context>(Mask.template mutable_data<T>(),
dW->template mutable_data<T>(), N, K);
if(*comp_data > *(comp_lb.template data<T>())){
iter_offset++;
if (iter_offset % window_size == 0) {
// TODO(wyiming):do the prune here;
sum_buffer_.ResizeLike(W);
math::Add<T, Context>(
W.numel(),
W.template mutable_data<T>(),
Ag_dW.template mutable_data<T>(),
sum_buffer_.template mutable_data<T>(),
&context_);
auto* mask_seq_auto = Output(5, W.sizes(), at::dtype<T>());
T* mask_seq = mask_seq_auto->template mutable_data<T>();
math::Set<T, Context>(N*K, static_cast<T>(0),
mask_seq_auto->template mutable_data<T>(), &context_);
// 2. find dw below thres but not eq 0
int seq_len = MatrixCompare_LT<T>(
Ag_dW_ptr->template mutable_data<T>(),
*thres.template data<T>(), mask_seq, N, K);
// 3. use the mask_seq to update W and dw
MaskMatrix_Inc<T, Context>(mask_seq,
dW->template mutable_data<T>(),
N, K, seq_len, 0);
MaskMatrix_Inc<T, Context>(mask_seq,
W.template mutable_data<T>(),
N, K, seq_len, 0);
MaskMatrix_Inc<T, Context>(mask_seq,
Mask.template mutable_data<T>(),
N, K, seq_len, 0);
math::Set<T, Context>(N*K, static_cast<T>(0),
Ag_dW.template mutable_data<T>(),
&context_);
} else {
// add dW to Aggregate dW.
AggrDW<T, Context>(
Ag_dW.template mutable_data<T>(),
dW->template mutable_data<T>(),
N, K, &context_);
}
}
if (bias_multiplier_.numel() != M) {
// If the helper bias multiplier is not M,
// reshape and fill it with one.
bias_multiplier_.Resize(M);
math::Set<T, Context>(
M, static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
// Compute dB
math::Gemv<T, Context>(
CblasTrans, M, N, 1, dY.template data<T>(),
bias_multiplier_.template data<T>(), 0,
db->template mutable_data<T>(),
&context_);
// Compute dX if necessary.
if (OutputSize() == 7) {
auto* dX = Output(6, X.sizes(), at::dtype<T>());
math::Gemm<T, Context, Engine>(
CblasNoTrans, CblasNoTrans, M, K, N, 1,
dY.template data<T>(), W.template data<T>(),
0, dX->template mutable_data<T>(),
&context_);
}

return true;
}

protected:
Tensor bias_multiplier_{Context::GetDeviceType()};
Tensor sum_buffer_{Context::GetDeviceType()};
Tensor comp_r_buf_{Context::GetDeviceType()};
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
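For orientation before the next file: the pruning path above works on a schedule. Each backward pass masks dW with the current Mask and accumulates it into Ag_dW; while the fraction of surviving weights (the mean of Mask) is still above comp_lb, every window_size-th iteration finds the aggregated gradients below thres, zeroes the matching entries of W, dW, and Mask, and resets Ag_dW. A minimal standalone sketch of that idea, with plain arrays and hypothetical names in place of the operator's tensors and MaskMatrix/AggrDW helpers:

#include <cmath>
#include <cstddef>

// Accumulate this iteration's gradient into the running aggregate
// (the AggrDW step in the else-branch above).
void aggregate_gradient(float* agg_dw, const float* dw, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    agg_dw[i] += dw[i];
  }
}

// Every window_size iterations: zero out weights whose aggregated gradient
// stayed below the threshold, record them in the mask so they remain pruned,
// and reset the aggregate (steps 1-3 in the comments above).
void prune_below_threshold(
    float* w,
    float* dw,
    float* agg_dw,
    float* mask,
    std::size_t n,
    float thres) {
  for (std::size_t i = 0; i < n; ++i) {
    if (mask[i] != 0.f && std::fabs(agg_dw[i]) < thres) {
      mask[i] = 0.f;
      w[i] = 0.f;
      dw[i] = 0.f;
    }
    agg_dw[i] = 0.f; // start a new aggregation window
  }
}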
@@ -22,5 +22,5 @@ namespace {
REGISTER_CPU_OPERATOR(FC_Sparse, FullyConnectedOp_SPARSE<float, CPUContext>);

OPERATOR_SCHEMA(FC_Sparse).NumInputs(5).NumOutputs(1);
} // namespace
} // namespace caffe2

@@ -22,16 +22,16 @@
#include "caffe2/utils/math.h"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

namespace caffe2 {

namespace {

template<int N>
template <int N>
using Shape = std::array<int, N>;

template<int N>
template <int N>
const std::vector<int64_t>& shape(Shape<N> vs) {
static thread_local std::vector<int64_t> cache;
cache.resize(vs.size());

@@ -50,10 +50,18 @@ inline const std::vector<int64_t>& shape(int i, int j) {
}

template <typename T, class Context>
void Sparse_mm(const T* acsr, const int* ia, const int* ja,
int m, int k, int n, const T* b, T* c, Context* context);
void Sparse_mm(
const T* acsr,
const int* ia,
const int* ja,
int m,
int k,
int n,
const T* b,
T* c,
Context* context);

template<typename T, class Context>
template <typename T, class Context>
void trans_mat(const T* o, T* t, int m, int n, Context* context);

template <>

@@ -63,9 +71,9 @@ void trans_mat<float, CPUContext>(
int m,
int n,
CPUContext* /*context*/) {
for(int i = 0; i < m; ++i){
for(int j = 0; j < n; ++j){
t[j*m+i]=o[i*n+j];
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
t[j * m + i] = o[i * n + j];
}
}
}

@@ -83,22 +91,35 @@ void Sparse_mm<float, CPUContext>(
const float* b,
float* c,
CPUContext* /*context*/) {

#ifdef CAFFE2_USE_MKL

float alpha = 1.0, beta = 0.;
mkl_scsrmm("N", &m, &n, &k, &alpha, "GLNC",
acsr, ja, ia, ia+1, b, &n, &beta, c, &n);
mkl_scsrmm(
"N",
&m,
&n,
&k,
&alpha,
"GLNC",
acsr,
ja,
ia,
ia + 1,
b,
&n,
&beta,
c,
&n);

#else
throw std::runtime_error("Not compiled with MKL");
#endif
}

}
} // namespace
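The MKL call above computes c = acsr * b for an m-by-k CSR matrix against a k-by-n dense matrix; the "GLNC" descriptor requests a general matrix with zero-based ia/ja indexing. For readers without MKL, a plain reference loop with the same convention (a sketch, not the code path the operator takes):

// c[m x n] = a[m x k] * b[k x n], with a in zero-based CSR form:
// ia[i]..ia[i+1] delimit row i's entries, ja holds their column indices.
void sparse_mm_ref(
    const float* acsr,
    const int* ia,
    const int* ja,
    int m,
    int n,
    const float* b,
    float* c) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      c[i * n + j] = 0.f;
    }
    for (int p = ia[i]; p < ia[i + 1]; ++p) {
      const float v = acsr[p];
      const int col = ja[p]; // column index in [0, k)
      for (int j = 0; j < n; ++j) {
        c[i * n + j] += v * b[col * n + j];
      }
    }
  }
}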

// This is Caffe's InnerProductOp, with a name that fits its purpose better.
template <typename T, class Context, class Engine=DefaultEngine>
template <typename T, class Context, class Engine = DefaultEngine>
class FullyConnectedOp_SPARSE final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;

@@ -122,27 +143,43 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
// Feature dimension
int M = Xt.numel() / K;
// number of outputs.
int N = iw.dim32(0)-1;
int N = iw.dim32(0) - 1;
CAFFE_ENFORCE_EQ(N, b.dim32(0));
auto* Yt = Output(0, shape(N, M), at::dtype<T>());

// Y' = W * X';
Sparse_mm<T, Context>(
Wcsr.template data<T>(), iw.template data<int>(),
jw.template data<int>(), N, K, M, Xt.template data<T>(),
Yt->template mutable_data<T>(), &context_);
Wcsr.template data<T>(),
iw.template data<int>(),
jw.template data<int>(),
N,
K,
M,
Xt.template data<T>(),
Yt->template mutable_data<T>(),
&context_);
// Add bias term
if (bias_multiplier_.numel() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(shape(M));
math::Set<T, Context>(
M, static_cast<T>(1), bias_multiplier_.template mutable_data<T>(),
M,
static_cast<T>(1),
bias_multiplier_.template mutable_data<T>(),
&context_);
}
math::Gemm<T, Context, Engine>(
CblasNoTrans, CblasNoTrans, N, M, 1, 1,
b.template data<T>(), bias_multiplier_.template data<T>(), 1,
Yt->template mutable_data<T>(), &context_);
CblasNoTrans,
CblasNoTrans,
N,
M,
1,
1,
b.template data<T>(),
bias_multiplier_.template data<T>(),
1,
Yt->template mutable_data<T>(),
&context_);
return true;
}


@@ -150,7 +187,6 @@ class FullyConnectedOp_SPARSE final : public Operator<Context> {
Tensor bias_multiplier_{Context::GetDeviceType()};
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_FULLY_CONNECTED_OP_H_
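A note on the two BLAS tricks this operator family leans on. The bias Gemm above with shapes (N, M, 1) and beta = 1 is a rank-one update, Yt += b * ones^T, using the all-ones bias_multiplier as the second factor; the dB Gemv in the gradient code earlier is the transpose of the same trick, db = dY^T * ones, i.e. a column-sum of dY. Both identities spelled out as plain loops (assumed shapes: Yt is N x M, b has length N, dY is M x N):

// Yt(N x M) += b(N) * ones(M)^T, what Gemm(NoTrans, NoTrans, N, M, 1, ...) does.
void add_bias(float* yt, const float* b, int n_rows, int m_cols) {
  for (int i = 0; i < n_rows; ++i) {
    for (int j = 0; j < m_cols; ++j) {
      yt[i * m_cols + j] += b[i];
    }
  }
}

// db(N) = dY(M x N)^T * ones(M), what Gemv(CblasTrans, M, N, ...) computes:
// each bias gradient is the column-sum of dY.
void bias_grad(float* db, const float* dy, int m_rows, int n_cols) {
  for (int j = 0; j < n_cols; ++j) {
    db[j] = 0.f;
    for (int i = 0; i < m_rows; ++i) {
      db[j] += dy[i * n_cols + j];
    }
  }
}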
@@ -46,33 +46,39 @@ randomly mapped from the weight vector, provided the input
)DOC")
.Input(0, "scalars", "Values of the non-zero entries of the sparse data.")
.Input(1, "indices", "Indices to the non-zero valued features.")
.Input(2, "segment_ids",
.Input(
2,
"segment_ids",
"Segment IDs corresponding to the non-zero entries.")
.Input(3, "weight", "Weight vector")
.Input(4, "alpha",
.Input(
4,
"alpha",
"Optional coefficients for linear combination of hashed weights.")
.Output(0, "output",
.Output(
0,
"output",
"Output tensor with the first dimension equal to the number "
"of segments.")
.Arg("num_outputs", "Number of outputs")
.Arg("num_segments", "Number of segments");

OPERATOR_SCHEMA(FunHashGradient)
.NumInputs(5, 6)
.NumOutputs(1, 2);
OPERATOR_SCHEMA(FunHashGradient).NumInputs(5, 6).NumOutputs(1, 2);

class GetFunHashGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
if (def_.input_size() == 4) {
return SingleGradientDef(
"FunHashGradient", "",
"FunHashGradient",
"",
vector<string>{GO(0), I(0), I(1), I(2), I(3)},
vector<string>{GI(3)});
}
// def_.input_size() == 5
return SingleGradientDef(
"FunHashGradient", "",
"FunHashGradient",
"",
vector<string>{GO(0), I(0), I(1), I(2), I(3), I(4)},
vector<string>{GI(3), GI(4)});
}

@@ -70,7 +70,10 @@ class TTContractionOp final : public Operator<Context> {
math::Gemm<T, Context, Engine>(
CblasTrans,
CblasNoTrans,
M_, N_, K_, 1,
M_,
N_,
K_,
1,
A_data,
B_data + B_index,
0,

@@ -126,7 +129,10 @@ class TTContractionGradientOp final : public Operator<Context> {
math::Gemm<T, Context, Engine>(
CblasNoTrans,
CblasTrans,
K_, M_, N_, 1,
K_,
M_,
N_,
1,
B_data + B_index,
G_ptr,
B_index == 0 ? 0 : 1,

@@ -140,7 +146,10 @@ class TTContractionGradientOp final : public Operator<Context> {
math::Gemm<T, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
K_, N_, M_, 1,
K_,
N_,
M_,
1,
A_data,
G_ptr,
0,

@@ -175,4 +175,4 @@ void MPISetupPeers(
<< MPICommSize(GlobalMPIComm());
}

} // namespace caffe2

@@ -1,9 +1,9 @@
#include "caffe2/core/init.h"
#include <gtest/gtest.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
#include <gtest/gtest.h>

C10_DEFINE_string(
caffe_test_root,

@@ -47,8 +47,7 @@ const char kBcastNet[] = R"NET(

TEST(MPITest, TestMPIBroadcast) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kBcastNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kBcastNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -108,8 +107,7 @@ const char kReduceNet[] = R"NET(

TEST(MPITest, TestMPIReduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kReduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kReduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -174,8 +172,7 @@ const char kMPIAllgatherNet[] = R"NET(

TEST(MPITest, TestMPIAllgather) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kMPIAllgatherNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kMPIAllgatherNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -237,8 +234,7 @@ const char kMPIAllreduceNet[] = R"NET(

TEST(MPITest, TestMPIAllreduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kMPIAllreduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kMPIAllreduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -299,8 +295,7 @@ const char kInPlaceMPIAllreduceNet[] = R"NET(

TEST(MPITest, TestInPlaceMPIAllreduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kInPlaceMPIAllreduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kInPlaceMPIAllreduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -323,10 +318,9 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
}
}

} // namespace caffe2


GTEST_API_ int main(int argc, char **argv) {
GTEST_API_ int main(int argc, char** argv) {
int mpi_ret;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret);
testing::InitGoogleTest(&argc, argv);

@@ -2,23 +2,14 @@

namespace caffe2 {

OPERATOR_SCHEMA(MPICreateCommonWorld)
.NumInputs(0)
.NumOutputs(1);
OPERATOR_SCHEMA(MPICreateCommonWorld).NumInputs(0).NumOutputs(1);
OPERATOR_SCHEMA(MPIBroadcast)
.NumInputs(2)
.NumOutputs(1)
.EnforceInplace({{1, 0}});
OPERATOR_SCHEMA(MPIReduce)
.NumInputs(2)
.NumOutputs(1);
OPERATOR_SCHEMA(MPIAllgather)
.NumInputs(2)
.NumOutputs(1);
OPERATOR_SCHEMA(MPIAllreduce)
.NumInputs(2)
.NumOutputs(1)
.AllowInplace({{1, 0}});
.NumInputs(2)
.NumOutputs(1)
.EnforceInplace({{1, 0}});
OPERATOR_SCHEMA(MPIReduce).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(MPIAllgather).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(MPIAllreduce).NumInputs(2).NumOutputs(1).AllowInplace({{1, 0}});
OPERATOR_SCHEMA(MPISendTensor);
OPERATOR_SCHEMA(MPIReceiveTensor);
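Collapsing the chained schema calls onto one line is purely cosmetic, but the two inplace annotations that survive the reformat are worth distinguishing: AllowInplace({{1, 0}}) permits input 1 and output 0 to share a buffer, while EnforceInplace({{1, 0}}) requires them to, which is why MPIBroadcast, an inherently in-place operation, uses the stricter form. A hypothetical illustration (not an operator from this diff):

// OPERATOR_SCHEMA(MyScale)
//     .NumInputs(1)
//     .NumOutputs(1)
//     .AllowInplace({{0, 0}}); // output 0 may alias input 0
// OPERATOR_SCHEMA(MyInPlaceOnly)
//     .NumInputs(1)
//     .NumOutputs(1)
//     .EnforceInplace({{0, 0}}); // output 0 must alias input 0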
@@ -30,4 +21,4 @@ REGISTER_CPU_OPERATOR(MPIAllreduce, MPIAllreduceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(MPISendTensor, MPISendTensorOp<CPUContext>);
REGISTER_CPU_OPERATOR(MPIReceiveTensor, MPIReceiveTensorOp<CPUContext>);

} // namespace caffe2

@@ -1,8 +1,7 @@
#include "caffe2/mpi/mpi_ops.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/mpi/mpi_ops.h"
#include "caffe2/operators/operator_fallback_gpu.h"


namespace caffe2 {

// Here is a bunch of MPI macro definitions that allow us to see if the MPI

@@ -61,26 +60,16 @@ REGISTER_CUDA_OPERATOR(MPISendTensor, MPISendTensorOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(MPIReceiveTensor, MPIReceiveTensorOp<CUDAContext>);
#else
REGISTER_CUDA_OPERATOR(MPIBroadcast, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(
MPIReduce,
GPUFallbackOp);
REGISTER_CUDA_OPERATOR(
MPIAllgather,
GPUFallbackOp);
REGISTER_CUDA_OPERATOR(
MPISendTensor,
GPUFallbackOp);
REGISTER_CUDA_OPERATOR(
MPIReceiveTensor,
GPUFallbackOpEx<SkipIndices<1, 2>>);
REGISTER_CUDA_OPERATOR(MPIReduce, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPIAllgather, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPISendTensor, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPIReceiveTensor, GPUFallbackOpEx<SkipIndices<1, 2>>);
#endif

#if CAFFE2_HAS_CUDA_MPI_ALLREDUCE
REGISTER_CUDA_OPERATOR(MPIAllreduce, MPIAllreduceOp<float, CUDAContext>);
#else
REGISTER_CUDA_OPERATOR(
MPIAllreduce,
GPUFallbackOp);
REGISTER_CUDA_OPERATOR(MPIAllreduce, GPUFallbackOp);
#endif

} // namespace caffe2

@@ -1,8 +1,8 @@
#include <gtest/gtest.h>
#include "caffe2/core/init.h"
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/mpi/mpi_common.h"
#include <gtest/gtest.h>

C10_DEFINE_string(
caffe_test_root,

@@ -43,8 +43,7 @@ const char kBcastNet[] = R"NET(

TEST(MPITest, TestMPIBroadcast) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kBcastNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kBcastNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -101,8 +100,7 @@ const char kReduceNet[] = R"NET(

TEST(MPITest, TestMPIReduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kReduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kReduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -163,8 +161,7 @@ const char kMPIAllgatherNet[] = R"NET(

TEST(MPITest, TestMPIAllgather) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kMPIAllgatherNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kMPIAllgatherNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -221,8 +218,7 @@ const char kMPIAllreduceNet[] = R"NET(

TEST(MPITest, TestMPIAllreduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kMPIAllreduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kMPIAllreduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -278,8 +274,7 @@ const char kInPlaceMPIAllreduceNet[] = R"NET(

TEST(MPITest, TestInPlaceMPIAllreduce) {
NetDef net_def;
CHECK(TextFormat::ParseFromString(
string(kInPlaceMPIAllreduceNet), &net_def));
CHECK(TextFormat::ParseFromString(string(kInPlaceMPIAllreduceNet), &net_def));
// Let's set the network's constant fill value to be the mpi rank.
auto* arg = net_def.mutable_op(1)->mutable_arg(1);
CAFFE_ENFORCE_EQ(arg->name(), "value");

@@ -301,10 +296,9 @@ TEST(MPITest, TestInPlaceMPIAllreduce) {
}
}

} // namespace caffe2


GTEST_API_ int main(int argc, char **argv) {
GTEST_API_ int main(int argc, char** argv) {
int mpi_ret;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_ret);
testing::InitGoogleTest(&argc, argv);
@@ -95,8 +95,8 @@ class TORCH_API ProfileOperatorObserver final
};

class TORCH_API ProfileObserver final : public OperatorAttachingNetObserver<
ProfileOperatorObserver,
ProfileObserver> {
public:
explicit ProfileObserver(NetBase* subject)
: OperatorAttachingNetObserver<ProfileOperatorObserver, ProfileObserver>(

@@ -27,10 +27,9 @@ class TORCH_API RunCountOperatorObserver final
RunCountNetObserver* netObserver_;
};

class TORCH_API RunCountNetObserver final
: public OperatorAttachingNetObserver<
RunCountOperatorObserver,
RunCountNetObserver> {
class TORCH_API RunCountNetObserver final : public OperatorAttachingNetObserver<
RunCountOperatorObserver,
RunCountNetObserver> {
public:
explicit RunCountNetObserver(NetBase* subject_)
: OperatorAttachingNetObserver<

@@ -28,9 +28,8 @@ class TORCH_API TimeCounter {
int iterations_ = 0;
};

class TORCH_API TimeOperatorObserver final
: public TimeCounter,
public ObserverBase<OperatorBase> {
class TORCH_API TimeOperatorObserver final : public TimeCounter,
public ObserverBase<OperatorBase> {
public:
explicit TimeOperatorObserver(OperatorBase* subject) = delete;
explicit TimeOperatorObserver(
@@ -1,6 +1,6 @@
#include "caffe2/onnx/backend.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/onnx/backend.h"
#include "caffe2/onnx/device.h"
#include "caffe2/onnx/helper.h"
#include "caffe2/utils/map_utils.h"

@@ -81,7 +81,9 @@ U LookUpWithDefault(
}
}

void UpdateNames(std::shared_ptr<DummyName> dummy, const caffe2::OperatorDef& op) {
void UpdateNames(
std::shared_ptr<DummyName> dummy,
const caffe2::OperatorDef& op) {
for (const auto& n : op.input()) {
dummy->AddName(n);
}

@@ -198,8 +200,8 @@ OnnxAttributes::get(const std::string& key) const {
}

template <>
::google::protobuf::RepeatedField<float>
OnnxAttributes::get(const std::string& key) const {
::google::protobuf::RepeatedField<float> OnnxAttributes::get(
const std::string& key) const {
::google::protobuf::RepeatedField<float> value;
const auto it = onnx_attrs_.find(key);
if (it != onnx_attrs_.end()) {

@@ -305,11 +307,12 @@ const std::
Caffe2Backend::get_per_op_renamed_attrs() const {
const static std::
unordered_map<std::string, std::unordered_map<std::string, std::string>>
kPerOpRenamedAttrs = {{"Squeeze", {{"axes", "dims"}}},
{"Unsqueeze", {{"axes", "dims"}}},
{"Transpose", {{"perm", "axes"}}},
{"ConvTranspose", {{"output_padding", "adjs"}}},
{"Selu", {{"gamma", "scale"}}}};
kPerOpRenamedAttrs = {
{"Squeeze", {{"axes", "dims"}}},
{"Unsqueeze", {{"axes", "dims"}}},
{"Transpose", {{"perm", "axes"}}},
{"ConvTranspose", {{"output_padding", "adjs"}}},
{"Selu", {{"gamma", "scale"}}}};

return kPerOpRenamedAttrs;
}

@@ -462,7 +465,8 @@ Caffe2Ops Caffe2Backend::CreateConstantOfShape(
auto* c2_op = ret.ops.Add();
const auto* value = onnx_node->attributes.get<const TensorProto*>("value");
if (value) {
BuildTensorFillingOp(c2_op, *value, onnx_node->node.output(0), onnx_node->node.input(0));
BuildTensorFillingOp(
c2_op, *value, onnx_node->node.output(0), onnx_node->node.input(0));
} else {
c2_op->set_type("ConstantFill");
c2_op->add_input(onnx_node->node.input(0));

@@ -604,7 +608,8 @@ Caffe2Ops Caffe2Backend::CreateNonZeroOp(
auto ret = CommonOnnxNodeToCaffe2Ops(&new_node, ctx);

auto* c2_transpose = ret.ops.Add();
BuildOperator(c2_transpose, "Transpose", {nonzero_output}, {onnx_node->node.output(0)});
BuildOperator(
c2_transpose, "Transpose", {nonzero_output}, {onnx_node->node.output(0)});
return ret;
}


@@ -612,7 +617,8 @@ Caffe2Ops Caffe2Backend::CreateMultinomialOp(
OnnxNode* onnx_node,
const ConversionContext& ctx) {
// Fallback to ATen.
// ATen::Multinomial takes probabilities as input, ONNX Multinomial expects input to be log probabilities.
// ATen::Multinomial takes probabilities as input, ONNX Multinomial expects
// input to be log probabilities.
Caffe2Ops ret;
auto c2_exp_output = dummy_->NewDummyName();
auto* c2_exp = ret.ops.Add();

@@ -631,10 +637,10 @@ Caffe2Ops Caffe2Backend::CreateMultinomialOp(
c2_arg_num.set_name("num_samples");
c2_arg_num.set_i(onnx_attributes.get<int64_t>("sample_size"));

// ONNX Multinomial has attribute dtype in {int64, int32}, which specifies output datatype.
// ATen::Multinomial output dtype is always int64.
// ONNX Multinomial has attribute dtype in {int64, int32}, which specifies
// output datatype. ATen::Multinomial output dtype is always int64.
auto onnx_dtype =
onnx_attributes.get<int64_t>("dtype", TensorProto::UNDEFINED);
if (onnx_dtype == ::ONNX_NAMESPACE::TensorProto::INT64) {
BuildOperator(
c2_multinomial,

@@ -655,9 +661,16 @@ Caffe2Ops Caffe2Backend::CreateMultinomialOp(
caffe2::Argument to;
to.set_name("to");
to.set_i(caffe2::TensorProto::INT32);
BuildOperator(c2_cast, "Cast", {c2_multinomial_output}, {onnx_node->node.output(0)}, {to});
BuildOperator(
c2_cast,
"Cast",
{c2_multinomial_output},
{onnx_node->node.output(0)},
{to});
} else {
CAFFE_THROW("ONNX does not support dtype other than int32/int64 in Multinomial, but get ", onnx_dtype);
CAFFE_THROW(
"ONNX does not support dtype other than int32/int64 in Multinomial, but get ",
onnx_dtype);
}

return ret;

@@ -750,9 +763,8 @@ Caffe2Ops Caffe2Backend::CreateGemm(
auto trans_a = onnx_node->attributes.get<int64_t>("transA", 0L);
auto trans_b = onnx_node->attributes.get<int64_t>("transB", 0L);
// Support broadcast by default when opset_version > 6.
auto broadcast =
onnx_node->attributes.get<int64_t>("broadcast",
(ctx.opset_version() > 6) ? 1L : 0L);
auto broadcast = onnx_node->attributes.get<int64_t>(
"broadcast", (ctx.opset_version() > 6) ? 1L : 0L);

// If the c's shape information is available and c is a 1d tensor(except
// c is a scalar), use FC aggressively.

@@ -796,7 +808,8 @@ Caffe2Ops Caffe2Backend::CreateGemm(
if (trans_b) {
BuildOperator(c2_op, "FC", {input_a, input_b, input_c}, {output});
} else {
BuildOperator(c2_op, "FCTransposed", {input_a, input_b, input_c}, {output});
BuildOperator(
c2_op, "FCTransposed", {input_a, input_b, input_c}, {output});
}
} else {
auto ab = dummy_->NewDummyName();

@@ -1057,14 +1070,15 @@ Caffe2Ops Caffe2Backend::CreateSlice(
// the behavior of Caffe2's slice operator not matching that of ONNX's slice
// 2) Fully expand the index tensor out to the rank of the data tensor.
// pseudocode: indices_full = zeros(rank); indices_full[axes] = indices.int()
std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node,
Caffe2Ops& ret,
std::string indices_tensor,
std::string axes_tensor,
std::string rank_tensor,
std::string zero_tensor,
std::string one_tensor,
int default_value) {
std::string Caffe2Backend::PreprocessSliceIndexTensor(
OnnxNode* onnx_node,
Caffe2Ops& ret,
std::string indices_tensor,
std::string axes_tensor,
std::string rank_tensor,
std::string zero_tensor,
std::string one_tensor,
int default_value) {
auto indices_tensor_full = dummy_->NewDummyName();

{

@@ -1078,8 +1092,12 @@ std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node,
input_as_shape.set_name("input_as_shape");
input_as_shape.set_i(1);
auto c2_op = ret.ops.Add();
BuildOperator(c2_op, "ConstantFill", {rank_tensor}, {indices_tensor_full},
{value, dtype, input_as_shape});
BuildOperator(
c2_op,
"ConstantFill",
{rank_tensor},
{indices_tensor_full},
{value, dtype, input_as_shape});
}

// Subtract 1 from each element of the indices tensor that is negative

@@ -1089,7 +1107,8 @@ std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node,
broadcast.set_name("broadcast");
broadcast.set_i(1);
auto c2_op = ret.ops.Add();
BuildOperator(c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast});
BuildOperator(
c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast});
}

auto sub_one_tensor = dummy_->NewDummyName();

@@ -1098,18 +1117,30 @@ std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node,
broadcast.set_name("broadcast");
broadcast.set_i(1);
auto c2_op = ret.ops.Add();
BuildOperator(c2_op, "Sub", {indices_tensor, one_tensor}, {sub_one_tensor}, {broadcast});
BuildOperator(
c2_op,
"Sub",
{indices_tensor, one_tensor},
{sub_one_tensor},
{broadcast});
}

auto indices_tensor_adjusted = dummy_->NewDummyName();
auto c2_op = ret.ops.Add();
BuildOperator(c2_op, "Conditional", {lt_tensor, sub_one_tensor, indices_tensor}, {indices_tensor_adjusted}, {});
BuildOperator(
c2_op,
"Conditional",
{lt_tensor, sub_one_tensor, indices_tensor},
{indices_tensor_adjusted},
{});

// Fill in values specified from the partially-specified ONNX indices tensor
c2_op = ret.ops.Add();
BuildOperator(c2_op, "ScatterAssign",
{indices_tensor_full, axes_tensor, indices_tensor_adjusted},
{indices_tensor_full});
BuildOperator(
c2_op,
"ScatterAssign",
{indices_tensor_full, axes_tensor, indices_tensor_adjusted},
{indices_tensor_full});

return indices_tensor_full;
}
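The pseudocode comment above (indices_full = zeros(rank); indices_full[axes] = indices.int()) is what this chain of ConstantFill, LT, Sub, Conditional, and ScatterAssign ops builds at graph level. The same transformation as a host-side sketch, assuming in-memory vectors in place of graph tensors:

#include <cstdint>
#include <vector>

// Expand a partially specified ONNX slice index list to the full rank of
// the data tensor. Unspecified axes keep default_value; negative indices
// are shifted down by one (the LT/Sub/Conditional trio above) to match
// Caffe2's slice convention.
std::vector<int64_t> expand_slice_indices(
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& axes,
    int64_t rank,
    int64_t default_value) {
  std::vector<int64_t> full(rank, default_value);
  for (std::size_t i = 0; i < axes.size(); ++i) {
    int64_t idx = indices[i];
    if (idx < 0) {
      idx -= 1; // what the Sub/Conditional pair does for negative entries
    }
    full[axes[i]] = idx; // the ScatterAssign step
  }
  return full;
}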
@@ -1164,31 +1195,32 @@ Caffe2Ops Caffe2Backend::CreateDynamicSlice(
shape.add_ints(1);
auto c2_op = ret.ops.Add();
auto name = dummy_->NewDummyName();
BuildOperator(c2_op, "ConstantFill", {}, {name},
{value, dtype, shape});
BuildOperator(c2_op, "ConstantFill", {}, {name}, {value, dtype, shape});
return name;
};

auto zero_tensor = define_integer_constant(0);
auto one_tensor = define_integer_constant(1);

auto starts_tensor_full = PreprocessSliceIndexTensor(onnx_node,
ret,
onnx_node->node.input(1), // starts
axes_tensor,
rank_tensor,
zero_tensor,
one_tensor,
0);
auto starts_tensor_full = PreprocessSliceIndexTensor(
onnx_node,
ret,
onnx_node->node.input(1), // starts
axes_tensor,
rank_tensor,
zero_tensor,
one_tensor,
0);

auto ends_tensor_full = PreprocessSliceIndexTensor(onnx_node,
ret,
onnx_node->node.input(2), // ends
axes_tensor,
rank_tensor,
zero_tensor,
one_tensor,
-1);
auto ends_tensor_full = PreprocessSliceIndexTensor(
onnx_node,
ret,
onnx_node->node.input(2), // ends
axes_tensor,
rank_tensor,
zero_tensor,
one_tensor,
-1);

// attach the original op at the end
c2_op = ret.ops.Add();

@@ -1219,7 +1251,8 @@ Caffe2Ops Caffe2Backend::CreateBatchNormalization(
attr->set_i(1);
}

if (attributes.HasAttribute("spatial") && attributes.get<int64_t>("spatial") == 1) {
if (attributes.HasAttribute("spatial") &&
attributes.get<int64_t>("spatial") == 1) {
attributes.remove("spatial");
}


@@ -1263,10 +1296,12 @@ Caffe2Ops Caffe2Backend::CreateUpsample(
attributes.remove("mode");

if (ctx.opset_version() >= 7 && ctx.opset_version() < 9) {
const auto& scales = attributes.get<::google::protobuf::RepeatedField<float>>("scales");
const auto& scales =
attributes.get<::google::protobuf::RepeatedField<float>>("scales");
if (scales.size() != 4) {
CAFFE_THROW("The scales argument should have size 4");
} else if (!AlmostEqual(scales.Get(0), 1) || !AlmostEqual(scales.Get(1), 1)) {
} else if (
!AlmostEqual(scales.Get(0), 1) || !AlmostEqual(scales.Get(1), 1)) {
CAFFE_THROW("The first two elements in the scales argument must be 1");
}
attributes.remove("scales");

@@ -1332,14 +1367,14 @@ Caffe2Ops Caffe2Backend::CreateLRN(
auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
const auto& attributes = onnx_node->attributes;
if (!attributes.HasAttribute("alpha")) {
auto* arg = c2_op.ops.Mutable(0)->add_arg();
arg->set_name("alpha");
arg->set_f(1e-4);
}
if (!attributes.HasAttribute("beta")) {
auto* arg = c2_op.ops.Mutable(0)->add_arg();
arg->set_name("beta");
arg->set_f(0.75);
}
return c2_op;
}

@@ -1347,8 +1382,8 @@ Caffe2Ops Caffe2Backend::CreateLRN(
//==============================================
// Rest of the member functions for Caffe2Backend
//==============================================
std::unordered_set<std::string>
Caffe2Backend::AllNamesInGraph(const GraphProto &graph) {
std::unordered_set<std::string> Caffe2Backend::AllNamesInGraph(
const GraphProto& graph) {
std::unordered_set<std::string> names;

for (const auto& input : graph.input()) {

@@ -1406,8 +1441,7 @@ Caffe2Ops Caffe2Backend::CommonOnnxNodeToCaffe2Ops(
c2_op->set_type(
caffe2::get_default(get_renamed_operators(), onnx_op_type, onnx_op_type));
if (!IsOperator(c2_op->type())) {
CAFFE_THROW(
"Don't know how to translate op ", onnx_op_type);
CAFFE_THROW("Don't know how to translate op ", onnx_op_type);
}

auto mapper = [&, this](const std::string& k) {

@@ -1446,7 +1480,7 @@ void Caffe2Backend::CheckOpSchemaArguments(
const caffe2::OpSchema& schema,
const caffe2::OperatorDef& op) {
const auto& schema_args = schema.args();
if (schema_args.size() > 0){
if (schema_args.size() > 0) {
std::vector<std::string> argnames;
std::transform(
schema_args.begin(),

@@ -1460,12 +1494,16 @@ void Caffe2Backend::CheckOpSchemaArguments(
"Don't know how to map unexpected argument ",
arg.name(),
" (from operator ",
op.type(), ")");
op.type(),
")");
}
}
} else {
// A number of C2 operators do not declare proper arguments. Let's log the error
VLOG(2) << "Operator " << op.type() << " does not declare arguments in its schema. Please file a Caffe2 issue.";
// A number of C2 operators do not declare proper arguments. Let's log the
// error
VLOG(2)
<< "Operator " << op.type()
<< " does not declare arguments in its schema. Please file a Caffe2 issue.";
}
}


@@ -1482,12 +1520,14 @@ Caffe2Ops Caffe2Backend::OnnxNodeToCaffe2Ops(
res = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx);
}

for (const auto& result_op: res.ops){
for (const auto& result_op : res.ops) {
const auto* schema = OpSchemaRegistry::Schema(result_op.type());
if (schema) {
CheckOpSchemaArguments(*schema, result_op);
} else {
CAFFE_THROW("Caffe2 has no such operator, could not find schema for ", result_op.type());
CAFFE_THROW(
"Caffe2 has no such operator, could not find schema for ",
result_op.type());
}
}
return res;

@@ -1660,23 +1700,23 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
}

template <typename T>
void ConvertIntegralValueToCaffe2(caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
void ConvertIntegralValueToCaffe2(
caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
c2_op->set_type(
onnx_tensor.data_type() == TensorProto::BOOL ? "GivenTensorBoolFill"
: "GivenTensorIntFill");
::google::protobuf::RepeatedField<T> tmp;
const ::google::protobuf::RepeatedField<T>* src =
&tmp;
const ::google::protobuf::RepeatedField<T>* src = &tmp;
bool converted = TryConvertingTensorRawValues<T>(onnx_tensor, &tmp);
if (converted) {
for (const auto i : *src) {
c2_values->add_ints(i);
}
} else {
const ::google::protobuf::RepeatedField<::google::protobuf::int32> *int32_src = \
&onnx_tensor.int32_data();
const ::google::protobuf::RepeatedField<::google::protobuf::int32>*
int32_src = &onnx_tensor.int32_data();
for (const auto i : *int32_src) {
c2_values->add_ints(i);
}

@@ -1684,9 +1724,10 @@ void ConvertIntegralValueToCaffe2(caffe2::OperatorDef* c2_op,
}

template <>
void ConvertIntegralValueToCaffe2<::google::protobuf::int64>(caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
void ConvertIntegralValueToCaffe2<::google::protobuf::int64>(
caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
c2_op->set_type("GivenTensorInt64Fill");
auto* ints = c2_values->mutable_ints();
if (!TryConvertingTensorRawValues<::google::protobuf::int64>(

@@ -1696,9 +1737,10 @@ void ConvertIntegralValueToCaffe2<::google::protobuf::int64>(caffe2::OperatorDef
}

template <>
void ConvertIntegralValueToCaffe2<::google::protobuf::uint64>(caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
void ConvertIntegralValueToCaffe2<::google::protobuf::uint64>(
caffe2::OperatorDef* c2_op,
caffe2::Argument* c2_values,
const TensorProto& onnx_tensor) {
c2_op->set_type("GivenTensorInt64Fill");
::google::protobuf::RepeatedField<::google::protobuf::uint64> tmp;
const ::google::protobuf::RepeatedField<::google::protobuf::uint64>* src =

@@ -1747,22 +1789,30 @@ void Caffe2Backend::BuildTensorFillingOp(
c2_values->add_floats(i);
}
} else if (onnx_tensor.data_type() == TensorProto::INT64) {
ConvertIntegralValueToCaffe2<::google::protobuf::int64>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::int64>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::UINT32) {
ConvertIntegralValueToCaffe2<::google::protobuf::uint64>(c2_op, c2_values, onnx_tensor);
// NOLINTNEXTLINE(bugprone-branch-clone)
ConvertIntegralValueToCaffe2<::google::protobuf::uint64>(
c2_op, c2_values, onnx_tensor);
// NOLINTNEXTLINE(bugprone-branch-clone)
} else if (onnx_tensor.data_type() == TensorProto::BOOL) {
ConvertIntegralValueToCaffe2<::google::protobuf::int8>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::int8>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::UINT8) {
ConvertIntegralValueToCaffe2<::google::protobuf::uint8>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::uint8>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::INT8) {
ConvertIntegralValueToCaffe2<::google::protobuf::int8>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::int8>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::UINT16) {
ConvertIntegralValueToCaffe2<::google::protobuf::uint16>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::uint16>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::INT16) {
ConvertIntegralValueToCaffe2<::google::protobuf::int16>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::int16>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::INT32) {
ConvertIntegralValueToCaffe2<::google::protobuf::int32>(c2_op, c2_values, onnx_tensor);
ConvertIntegralValueToCaffe2<::google::protobuf::int32>(
c2_op, c2_values, onnx_tensor);
} else if (onnx_tensor.data_type() == TensorProto::STRING) {
c2_op->set_type("GivenTensorStringFill");
auto* strings = c2_values->mutable_strings();
@@ -1,9 +1,10 @@
#include "caffe2/core/common.h"
#include "caffe2/onnx/backend_rep.h"
#include "caffe2/core/common.h"

#include <iostream>

namespace caffe2 { namespace onnx {
namespace caffe2 {
namespace onnx {

void Caffe2BackendRep::CheckInit() {
if (!predictor_) {

@@ -28,4 +29,5 @@ void Caffe2BackendRep::RunMap(
(*predictor_)(inputs, outputs);
}

}}
} // namespace onnx
} // namespace caffe2

@@ -3,15 +3,16 @@
#include <cstdlib>
#include <unordered_map>

namespace caffe2 { namespace onnx {
namespace caffe2 {
namespace onnx {
static const std::unordered_map<std::string, DeviceType> kDeviceMap = {
{"CPU", DeviceType::CPU},
{"CUDA", DeviceType::CUDA}
};
{"CPU", DeviceType::CPU},
{"CUDA", DeviceType::CUDA}};

Device::Device(const std::string &spec) {
Device::Device(const std::string& spec) {
auto pos = spec.find_first_of(':');
type = kDeviceMap.at(spec.substr(0, pos - 1));
device_id = atoi(spec.substr(pos + 1).c_str());
}
}}
} // namespace onnx
} // namespace caffe2

@@ -3,7 +3,8 @@
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"

namespace caffe2 { namespace onnx {
namespace caffe2 {
namespace onnx {

std::string DummyName::NewDummyName() {
while (true) {

@@ -16,7 +17,7 @@ std::string DummyName::NewDummyName() {
}
}

void DummyName::Reset(const std::unordered_set<std::string> &used_names) {
void DummyName::Reset(const std::unordered_set<std::string>& used_names) {
used_names_ = used_names;
counter_ = 0;
}

@@ -44,15 +45,16 @@ NodeProto MakeNode(
node.set_name(name);
}
node.set_op_type(type);
for (const auto& input: inputs) {
for (const auto& input : inputs) {
node.add_input(input);
}
for (const auto& output: outputs) {
for (const auto& output : outputs) {
node.add_output(output);
}
for (const auto& attr: attributes) {
for (const auto& attr : attributes) {
node.add_attribute()->CopyFrom(attr);
}
return node;
}
}}
} // namespace onnx
} // namespace caffe2
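MakeNode above is a small convenience for assembling an ONNX NodeProto: it sets the optional node name, the op type, and copies the inputs, outputs, and attributes in order. A hypothetical call, with the argument order inferred from the body shown rather than taken from the header, so treat it as a sketch:

// NodeProto relu = MakeNode("Relu", {"X"}, {"Y"}, {}, "my_relu");
// relu.op_type() == "Relu", relu.input(0) == "X", relu.output(0) == "Y"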
@@ -469,11 +469,12 @@ const std::
OnnxExporter::get_per_op_renamed_attrs() const {
const static std::
unordered_map<std::string, std::unordered_map<std::string, std::string>>
kPerOpRenamedAttrs = {{"Squeeze", {{"dims", "axes"}}},
{"Unsqueeze", {{"dims", "axes"}}},
{"Transpose", {{"axes", "perm"}}},
{"ConvTranspose", {{"adjs", "output_padding"}}},
{"Selu", {{"scale", "gamma"}}}};
kPerOpRenamedAttrs = {
{"Squeeze", {{"dims", "axes"}}},
{"Unsqueeze", {{"dims", "axes"}}},
{"Transpose", {{"axes", "perm"}}},
{"ConvTranspose", {{"adjs", "output_padding"}}},
{"Selu", {{"scale", "gamma"}}}};

return kPerOpRenamedAttrs;
}

@@ -556,11 +557,12 @@ bool OnnxExporter::IsBlockListed(const caffe2::Argument& arg) {
const static std::unordered_map<std::string, std::unordered_set<std::string>>
kBlockListString = {{"order", {"NCHW"}}};
const static std::unordered_map<std::string, std::unordered_set<int64_t>>
kBlockListInt = {{"cudnn_exhaustive_search", {0, 1}},
{"use_cudnn", {0, 1}},
{"exhaustive_search", {0, 1}},
{"is_test", {0, 1}},
{"broadcast", {0, 1}}};
kBlockListInt = {
{"cudnn_exhaustive_search", {0, 1}},
{"use_cudnn", {0, 1}},
{"exhaustive_search", {0, 1}},
{"is_test", {0, 1}},
{"broadcast", {0, 1}}};

if (arg.has_i()) {
const auto it = kBlockListInt.find(arg.name());

@@ -133,8 +133,8 @@ void adagrad_update_prefetch(

// Version with prefetching for embeddings and
// momentum using fp16
decltype(
adagrad_fp16_update_prefetch__base) adagrad_fp16_update_prefetch__avx2_fma;
decltype(adagrad_fp16_update_prefetch__base)
adagrad_fp16_update_prefetch__avx2_fma;
void adagrad_fp16_update_prefetch(
int N,
const at::Half* w,

@@ -105,12 +105,12 @@ In foo.cc, do:
#endif // CAFFE2_PERF_WITH_AVX2

#ifdef CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...) \
{ \
static const bool isDo = cpuinfo_initialize() && cpuinfo_has_x86_avx(); \
if (isDo) { \
return funcname##__avx(__VA_ARGS__); \
} \
}
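These macros implement one level of runtime CPU dispatch: the static bool probes the CPU exactly once via cpuinfo, and when the feature is present the wrapper returns early into the suffixed kernel; otherwise control falls through to the next AVX*_DO level or the __base implementation. A sketch of the intended use for a hypothetical kernel foo:

// void foo__base(int n, const float* x); // always compiled
// void foo__avx(int n, const float* x);  // built with AVX enabled
//
// void foo(int n, const float* x) {
//   AVX_DO(foo, n, x); // returns foo__avx(n, x) when AVX is available
//   foo__base(n, x);   // generic fallback
// }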
#define AVX_F16C_DO(funcname, ...) \
{ \

@@ -1,7 +1,9 @@
#pragma once

// Apple clang was fixed in 8.1
#if defined(__apple_build_version__) && ((__clang_major__ < 8) || ((__clang_major__ == 8) && (__clang_minor__ < 1)))
#if defined(__apple_build_version__) && \
((__clang_major__ < 8) || \
((__clang_major__ == 8) && (__clang_minor__ < 1)))
#define CAFFE2_INTERNAL_APPLE_NEED_FIX 1
#endif


@@ -10,7 +12,8 @@
#define CAFFE2_INTERNAL_CLANG_NEED_FIX 1
#endif

#if defined(CAFFE2_INTERNAL_APPLE_NEED_FIX) || defined(CAFFE2_INTERNAL_CLANG_NEED_FIX)
#if defined(CAFFE2_INTERNAL_APPLE_NEED_FIX) || \
defined(CAFFE2_INTERNAL_CLANG_NEED_FIX)

#include <c10/util/Half.h>
#include <emmintrin.h>

@@ -19,8 +22,7 @@
// https://reviews.llvm.org/D16177
static __inline float
__attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_cvtsh_ss(unsigned short a)
{
_cvtsh_ss(unsigned short a) {
__v8hi v = {(short)a, 0, 0, 0, 0, 0, 0, 0};
__v4sf r = __builtin_ia32_vcvtph2ps(v);
return r[0];

@@ -28,7 +30,7 @@ _cvtsh_ss(unsigned short a)

static __inline unsigned short
__attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_cvtss_sh(float a, int imm8) {
unsigned short ret;
*reinterpret_cast<at::Half*>(&ret) = a;
return ret;

@@ -72,6 +72,7 @@ static bool EmbeddingLookupGenericSlow(
return current == index_size;
}

// clang-format off
// Proxy back to generic implementation
#define EMBEDDING_SPECIALIZATION( \
IndexType, InTypeName, InType, OutType, IS_WEIGHT_POSITIONAL) \

@@ -204,6 +205,7 @@ static bool EmbeddingLookupGenericSlow(
"Your input seems to be incorrect: the sum of lengths values should be " \
"the size of the indices tensor, but it appears not."); \
}
// clang-format on

EMBEDDING_SPECIALIZATION(int32_t, float, float, float, false);
EMBEDDING_SPECIALIZATION(int64_t, float, float, float, false);

@@ -75,6 +75,7 @@ static bool EmbeddingLookupGenericSlowIdx(
return current == index_size;
}

// clang-format off
// Proxy back to generic implementation
#define EMBEDDING_IDX_SPECIALIZATION( \
IndexType, InTypeName, InType, OutType, IS_WEIGHT_POSITIONAL) \

@@ -207,6 +208,7 @@ static bool EmbeddingLookupGenericSlowIdx(
"Your input seems to be incorrect: the sum of lengths values should be " \
"the size of the indices tensor, but it appears not."); \
}
// clang-format on

EMBEDDING_IDX_SPECIALIZATION(int32_t, float, float, float, false);
EMBEDDING_IDX_SPECIALIZATION(int64_t, float, float, float, false);

@@ -78,6 +78,7 @@ static bool Fused8BitRowwiseEmbeddingLookupGenericSlow(
return current == index_size;
}

// clang-format off
// Proxy back to generic implementation
#define FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION(IndexType, OutType) \
bool \

@@ -201,6 +202,7 @@ static bool Fused8BitRowwiseEmbeddingLookupGenericSlow(
"Your input seems to be incorrect: the sum of lengths values should be " \
"the size of the indices tensor, but it appears not."); \
}
// clang-format on

FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION(int32_t, float);
FUSED_8BIT_ROWWISE_EMBEDDING_SPECIALIZATION(int64_t, float);

@@ -80,6 +80,7 @@ static bool Fused8BitRowwiseEmbeddingLookupGenericSlowIdx(
return current == index_size;
}

// clang-format off
// Proxy back to generic implementation
#define FUSED_8BIT_ROWWISE_EMBEDDING_IDX_SPECIALIZATION(IndexType, OutType) \
bool \

@@ -203,6 +204,7 @@ static bool Fused8BitRowwiseEmbeddingLookupGenericSlowIdx(
"Your input seems to be incorrect: the sum of lengths values should be " \
"the size of the indices tensor, but it appears not."); \
}
// clang-format on

FUSED_8BIT_ROWWISE_EMBEDDING_IDX_SPECIALIZATION(int32_t, float);
FUSED_8BIT_ROWWISE_EMBEDDING_IDX_SPECIALIZATION(int64_t, float);

@@ -1,7 +1,7 @@
#pragma once
#include <string.h>
#include <cmath>
#include <cstdint>
#include <string.h>
#include "caffe2/utils/conversions.h"

#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)

@@ -123,7 +123,7 @@ class ThreadLocalPtrImpl {
template <typename T>
class ThreadLocalPtr {
public:
auto* operator-> () {
auto* operator->() {
return get();
}


@@ -135,7 +135,7 @@ class ThreadLocalPtr {
return impl_.get<T>();
}

auto* operator-> () const {
auto* operator->() const {
return get();
}


@@ -63,7 +63,7 @@ DataRandomFiller::DataRandomFiller(

// load op inputs and outputs
std::unordered_set<std::string> output_names;
for (auto i: c10::irange(run_net.op_size())) {
for (auto i : c10::irange(run_net.op_size())) {
const auto& op = run_net.op(i);
const auto& op_dims = input_dims[i];
const auto& op_types = input_types[i];

@@ -78,7 +78,7 @@ DataRandomFiller::DataRandomFiller(
" inputs; while the input type size is " +
c10::to_string(op_types.size()));

for (auto j: c10::irange(op.input_size())) {
for (auto j : c10::irange(op.input_size())) {
inputs_[op.input(j)] =
std::make_pair(get_tensor_filler(op, j, op_dims), op_types[j]);
}

@@ -99,19 +99,19 @@ DataRandomFiller::DataRandomFiller(
}
}

for (auto j: c10::irange(op.output_size())) {
for (auto j : c10::irange(op.output_size())) {
output_names.emplace(op.output(j));
}
}
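The only change in these loops is the spacing around the colon; c10::irange(n) itself yields the half-open integer range [0, n), so each range-for reads the same as a classic counted loop:

// for (auto i : c10::irange(run_net.op_size())) { ... }
// is equivalent to:
// for (int i = 0; i < run_net.op_size(); ++i) { ... }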
|
||||
|
||||
// load parameters
|
||||
std::unordered_set<std::string> parameters;
|
||||
for (auto i: c10::irange(run_net.arg_size())) {
|
||||
for (auto i : c10::irange(run_net.arg_size())) {
|
||||
const auto& arg = run_net.arg(i);
|
||||
// TODO: replace "PredictorParameters" with the constant in OSS bbp
|
||||
if (arg.has_name() && arg.name() == "PredictorParameters") {
|
||||
parameters.reserve(arg.strings_size());
|
||||
for (auto j: c10::irange(arg.strings_size())) {
|
||||
for (auto j : c10::irange(arg.strings_size())) {
|
||||
parameters.emplace(arg.strings(j));
|
||||
}
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -44,8 +44,8 @@ Predictor::Predictor(
|
|||
|
||||
Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) {
|
||||
const auto& initialized_vec = config_.ws->Blobs();
|
||||
const std::unordered_set<std::string> initialized{initialized_vec.begin(),
|
||||
initialized_vec.end()};
|
||||
const std::unordered_set<std::string> initialized{
|
||||
initialized_vec.begin(), initialized_vec.end()};
|
||||
for (const auto& name : config_.predict_net->external_input()) {
|
||||
if (!initialized.count(name)) {
|
||||
auto* blob = config_.ws->CreateBlob(name);
|
||||
|
|
@ -70,7 +70,7 @@ bool Predictor::operator()(const TensorList& inputs, TensorList* outputs) {
|
|||
return false;
|
||||
}
|
||||
outputs->clear();
|
||||
for (auto i: c10::irange(config_.predict_net->external_output_size())) {
|
||||
for (auto i : c10::irange(config_.predict_net->external_output_size())) {
|
||||
outputs->emplace_back(
|
||||
getTensor(config_.ws.get(), config_.predict_net->external_output(i))
|
||||
.UnsafeSharedInstance());
|
||||
|
|
@ -104,7 +104,7 @@ bool Predictor::operator()(const TensorMap& inputs, TensorList* outputs) {
|
|||
return false;
|
||||
}
|
||||
outputs->clear();
|
||||
for (auto i: c10::irange(config_.predict_net->external_output_size())) {
|
||||
for (auto i : c10::irange(config_.predict_net->external_output_size())) {
|
||||
outputs->push_back(
|
||||
getTensor(config_.ws.get(), config_.predict_net->external_output(i))
|
||||
.UnsafeSharedInstance());
|
||||
|
|
|
|||
|
|
@ -41,7 +41,8 @@ struct TORCH_API PredictorConfig {
|
|||
std::shared_ptr<Workspace> ws;
|
||||
};
|
||||
|
||||
TORCH_API Workspace makeWorkspace(std::shared_ptr<PredictorParameters> parameters);
|
||||
TORCH_API Workspace
|
||||
makeWorkspace(std::shared_ptr<PredictorParameters> parameters);
|
||||
|
||||
TORCH_API PredictorConfig makePredictorConfig(
|
||||
const MetaNetDef& net,
|
||||
|
|
|
|||
|
|
@ -159,7 +159,7 @@ MetaNetDef parseMetaNetDef(const std::string& value) {
|
|||
value);
|
||||
return def;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
class PredictorTest : public testing::Test {
|
||||
public:
|
||||
|
|
|
|||
|
|
@ -9,9 +9,7 @@
|
|||
namespace caffe2 {
|
||||
namespace predictor_utils {
|
||||
|
||||
TORCH_API const NetDef& getNet(
|
||||
const MetaNetDef& def,
|
||||
const std::string& name) {
|
||||
TORCH_API const NetDef& getNet(const MetaNetDef& def, const std::string& name) {
|
||||
for (const auto& n : def.nets()) {
|
||||
if (n.key() == name) {
|
||||
return n.value();
|
||||
|
|
|
|||
|
|
@ -21,26 +21,26 @@ message TensorProto {
    UNDEFINED = 0;

    // Basic types
    FLOAT = 1;  // float
    INT32 = 2;  // int
    BYTE = 3;  // byte, when deserialized, is going to be restored as uint8
    STRING = 4;  // string
    FLOAT = 1; // float
    INT32 = 2; // int
    BYTE = 3; // byte, when deserialized, is going to be restored as uint8
    STRING = 4; // string

    // Less-commonly used data types
    BOOL = 5;  // bool
    UINT8 = 6;  // uint8_t
    INT8 = 7;  // int8_t
    UINT16 = 8;  // uint16_t
    INT16 = 9;  // int16_t
    INT64 = 10;  // int64_t
    FLOAT16 = 12;  // at::Half
    DOUBLE = 13;  // double
    BOOL = 5; // bool
    UINT8 = 6; // uint8_t
    INT8 = 7; // int8_t
    UINT16 = 8; // uint16_t
    INT16 = 9; // int16_t
    INT64 = 10; // int64_t
    FLOAT16 = 12; // at::Half
    DOUBLE = 13; // double

    ZERO_COLLISION_HASH = 14;  // zero-collision hash state
    REBATCHING_BUFFER= 15; // rebatching buffer
    ZERO_COLLISION_HASH = 14; // zero-collision hash state
    REBATCHING_BUFFER = 15; // rebatching buffer
  }
  // The type of the deserialized tensor data
  optional DataType data_type = 2 [default = FLOAT];
  optional DataType data_type = 2 [ default = FLOAT ];

  // The format of the serialized data.
  enum SerializationFormat {

@ -58,10 +58,10 @@ message TensorProto {
  // new messages that have a SerializationFormat value that we don't
  // understand. If we stored this as an enum then protobuf would deserialize
  // both of these cases the same way.
  optional uint32 data_format = 15 [default = 0];
  optional uint32 data_format = 15 [ default = 0 ];

  // For float
  repeated float float_data = 3 [packed = true];
  repeated float float_data = 3 [ packed = true ];
  // For int32, uint8, int8, uint16, int16, bool, and float16
  // Note about float16: in storage we will basically convert float16 byte-wise
  // to unsigned short and then store them in the int32_data field.
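The comments above explain why data_format is a plain uint32: an enum field
would fold values this reader does not recognize back to the default, so a
newer format could not be told apart from no format at all. A minimal sketch
of the round trip, assuming the generated caffe2 proto headers:

    #include <string>

    #include "caffe2/proto/caffe2.pb.h"

    bool UnknownFormatRoundTrips() {
      caffe2::TensorProto proto;
      proto.set_data_format(999); // a format this reader does not understand
      std::string wire;
      proto.SerializeToString(&wire);

      caffe2::TensorProto parsed;
      parsed.ParseFromString(wire);
      // Because the field is a plain uint32, the value survives unchanged;
      // an unknown enum value would have read back as the default instead.
      return parsed.data_format() == 999;
    }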
@ -69,15 +69,15 @@ message TensorProto {
  // larger serialized data than necessary, as protobuf's varint encoding
  // scheme requires 2 bytes to represent int8 and uint8 values that have the
  // MSB set.
  repeated int32 int32_data = 4 [packed = true];
  repeated int32 int32_data = 4 [ packed = true ];
  // For bytes
  optional bytes byte_data = 5;
  // For strings
  repeated bytes string_data = 6;
  // For double
  repeated double double_data = 9 [packed = true];
  repeated double double_data = 9 [ packed = true ];
  // For int64
  repeated int64 int64_data = 10 [packed = true];
  repeated int64 int64_data = 10 [ packed = true ];
  // store the raw data, contents are serialized as little-endian
  optional bytes raw_data = 13;
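Two details from the comments above, in a minimal sketch: float16 values are
stored byte-wise as unsigned shorts inside int32_data, and a varint spends a
second byte on any uint8 value with the MSB set, since each byte carries only
seven payload bits:

    #include <cstdint>
    #include <vector>

    // Pack 16-bit half-precision bit patterns into int32_data entries.
    std::vector<int32_t> PackHalfBits(const std::vector<uint16_t>& half_bits) {
      return std::vector<int32_t>(half_bits.begin(), half_bits.end());
    }

    // Bytes needed to varint-encode one value: 7 payload bits per byte.
    int VarintBytes(uint64_t v) {
      int n = 1;
      while (v >= 0x80) {
        v >>= 7;
        ++n;
      }
      return n; // VarintBytes(0x7f) == 1, VarintBytes(0x80) == 2
    }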
@ -107,9 +107,9 @@ message QTensorProto {
  required double scale = 3;
  required double bias = 4;
  required bool is_signed = 5;
  repeated int32 data = 6 [packed = true];
  repeated int32 data = 6 [ packed = true ];
  optional string name = 7;
  optional TensorProto.DataType data_type = 8 [default = INT32];
  optional TensorProto.DataType data_type = 8 [ default = INT32 ];

  // Multi-group quantization params
  repeated double scales = 9;

@ -120,7 +120,7 @@ message QTensorProto {
  optional int32 axis = 11;

  // It should be true if it is a multi-group quantization proto
  optional bool is_multiparam = 12 [default = false];
  optional bool is_multiparam = 12 [ default = false ];
}

// TensorProtos stores multiple TensorProto objects in one single proto. This

@ -132,9 +132,9 @@ message TensorProtos {

message TensorShape {
  repeated int64 dims = 1;
  optional TensorProto.DataType data_type = 2 [default = FLOAT];
  optional TensorProto.DataType data_type = 2 [ default = FLOAT ];
  repeated int32 unknown_dims = 3;
  optional bool unknown_shape = 4 [default = false];
  optional bool unknown_shape = 4 [ default = false ];
  optional string name = 5;
}

@ -150,8 +150,8 @@ message TensorShapes {
message TensorBoundShape {
  optional TensorShape shape = 1;
  enum DimType {
    UNKNOWN = 0;  // unknown
    CONSTANT = 1;  // constant
    UNKNOWN = 0; // unknown
    CONSTANT = 1; // constant
    // batch, corresponding dimension is batch_size
    BATCH = 2;
    // batch_of_feature_max,
@ -164,9 +164,8 @@ message TensorBoundShape {
    FEATURE_MAX = 5;
    // feature_max_default, corresponding shape is default_feature_length
    FEATURE_MAX_DEFAULT = 6;

  }
  repeated DimType dim_type = 2;  // dim_type.size() == shape.dims.size()
  repeated DimType dim_type = 2; // dim_type.size() == shape.dims.size()
  optional string name = 3;
  // a flag to indicate whether the shape is final and cannot be changed
  // eg: input/output of in-place ops
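The trailing comment above pins down an invariant worth making explicit:
dim_type is a parallel array over shape.dims, one DimType per dimension. With
the standard generated proto accessors it can be checked directly (a sketch,
assuming the caffe2 proto headers):

    #include "caffe2/proto/caffe2.pb.h"

    bool BoundShapeIsConsistent(const caffe2::TensorBoundShape& bound) {
      return bound.dim_type_size() == bound.shape().dims_size();
    }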
@ -211,17 +210,17 @@ message Argument {
  // line in the DeviceTypeName() function in caffe2/utils/proto_utils.cc
  // and update c10/core/DeviceType.h
  enum DeviceTypeProto {
    PROTO_CPU = 0;  // In default, we will use CPU.
    PROTO_CUDA = 1;  // CUDA.
    PROTO_MKLDNN = 2;  // Reserved for explicit MKLDNN
    PROTO_OPENGL = 3;  // OpenGL
    PROTO_OPENCL = 4;  // OpenCL
    PROTO_IDEEP = 5;  // IDEEP.
    PROTO_HIP = 6;  // AMD HIP
    PROTO_FPGA = 7;  // FPGA
    PROTO_ORT = 8;  // ONNX Runtime
    PROTO_XLA = 9;  // XLA / TPU
    PROTO_MLC = 10;  // ML Compute
    PROTO_CPU = 0; // In default, we will use CPU.
    PROTO_CUDA = 1; // CUDA.
    PROTO_MKLDNN = 2; // Reserved for explicit MKLDNN
    PROTO_OPENGL = 3; // OpenGL
    PROTO_OPENCL = 4; // OpenCL
    PROTO_IDEEP = 5; // IDEEP.
    PROTO_HIP = 6; // AMD HIP
    PROTO_FPGA = 7; // FPGA
    PROTO_ORT = 8; // ONNX Runtime
    PROTO_XLA = 9; // XLA / TPU
    PROTO_MLC = 10; // ML Compute
    // Change the following number if you add more devices in the code.
    PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 11;
  }

@ -270,7 +269,6 @@ message OperatorDef {
  // type.
  optional string engine = 7;


  // Additional 'fake' inputs used for expressing control dependencies
  // in the operator graph. This can be used to ensure that an
  // operator does not run until another operator is ready, for e.g.

@ -281,7 +279,7 @@ message OperatorDef {

  // is_gradient_op argument is only used as a hint in shape inference
  // and has no runtime significance
  optional bool is_gradient_op = 9 [default = false];
  optional bool is_gradient_op = 9 [ default = false ];

  // debug information associated with the construction of the operator.
  // This is an optional string with no assumed characteristics as

@ -387,7 +385,6 @@ message NetDef {
  repeated PartitionInfo partition_info = 9;
}


// ExecutionStep is actually a sort-of-hacky way we simulate iteration right
// now.
message ExecutionStep {

@ -415,7 +412,7 @@ message ExecutionStep {
  // Criteria network specifies a single output (TensorCPU<bool>) of
  // size (1), is run on every iteration by the executor, and
  // execution terminates when the output[0] is `false`.
  optional string criteria_network = 5 [deprecated=true];
  optional string criteria_network = 5 [ deprecated = true ];

  // DEPRECATED. Use `run_every_ms`.
  optional string report_net = 7;
@ -440,7 +437,8 @@ message ExecutionStep {
  // 2) the first substep decide which of the rest of the steps should be run.
  // 3) external control
  //
  // ** It is the user's responsibility to not to put this blob in race conditions.
  // ** It is the user's responsibility to not to put this blob in race
  // conditions.
  // ** For example when setting this blob in concurrent substeps
  optional string should_stop_blob = 9;
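The should_stop_blob mechanism above is cooperative cancellation: the runner
re-reads a shared boolean after each iteration, and any substep may set it. A
stripped-down sketch of that control flow (plain C++ with illustrative names,
not the Caffe2 executor itself):

    #include <atomic>

    std::atomic<bool> should_stop{false}; // stands in for the stop blob

    void RunLoop() {
      while (!should_stop.load()) {
        // Run one iteration of the substeps. Any substep may request
        // termination by setting the flag: should_stop.store(true);
      }
    }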
@ -14,11 +14,11 @@ message CaffeDatum {
  // Optionally, the datum could also hold float data.
  repeated float float_data = 6;
  // If true data contains an encoded image that need to be decoded
  optional bool encoded = 7 [default = false];
  optional bool encoded = 7 [ default = false ];
}

enum LegacyPadding {
  NOTSET = 0;  // Do not use old-stype padding strategies.
  NOTSET = 0; // Do not use old-stype padding strategies.

  // VALID and SAME are two strategies adopted in Google DistBelief: it forces
  // the input shape as follows. For SAME, the output is:

@ -76,8 +76,7 @@ inline TORCH_API DeviceTypeProto TypeToProto(const DeviceType& t) {
  }
}

inline TORCH_API caffe2::DeviceOption DeviceToOption(
    const at::Device& device) {
inline TORCH_API caffe2::DeviceOption DeviceToOption(const at::Device& device) {
  caffe2::DeviceOption option;
  auto type = device.type();
  option.set_device_type(TypeToProto(type));

@ -31,5 +31,6 @@ message PredictorConsts {
  // Shape info blob name
  optional string SHAPE_INFO_BLOB = 13 [ default = "SHAPE_INFO_BLOB" ];
  // Sequential blob reader name
  optional string DEFERRED_BLOB_READER = 14 [ default = "__DEFERRED_BLOB_READER__" ];
  optional string DEFERRED_BLOB_READER = 14
      [ default = "__DEFERRED_BLOB_READER__" ];
}

@ -22,7 +22,7 @@ message TwoNumberStatsProto {
// a node outputs to the blob.
message BlobProfile {
  // Name of the blob (corresponds to OperatorDef.output).
  optional string name = 1;  // required
  optional string name = 1; // required

  // Profiling statistics.
  optional TwoNumberStatsProto bytes_used = 3;

@ -45,7 +45,6 @@ message ProfDAGProto {

  // The extra_info from the operator device option.
  repeated string extra_info = 7;

}

// Operator profiling information.

@ -87,9 +87,7 @@ message LibDef {
  optional RecordRef torchscript_arena = 1;
}

enum ProtoVersion {
  PROTO_VERSION_NEWEST = 0x0000000000000006;
}
enum ProtoVersion { PROTO_VERSION_NEWEST = 0x0000000000000006; }

message ModelDef {
  // numbers of fields that have been removed. Do not reuse them!

@ -26,8 +26,8 @@
#define DLPACK_DLL
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
@ -160,15 +160,15 @@ typedef struct DLManagedTensor {
  /*! \brief the context of the original host framework of DLManagedTensor in
   *  which DLManagedTensor is used in the framework. It can also be NULL.
   */
  void * manager_ctx;
  void* manager_ctx;
  /*! \brief Destructor signature void (*)(void*) - this should be called
   *  to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
   *  if there is no way for the caller to provide a reasonable destructor.
   *  The destructors deletes the argument self as well.
   */
  void (*deleter)(struct DLManagedTensor * self);
  void (*deleter)(struct DLManagedTensor* self);
} DLManagedTensor;
#ifdef __cplusplus
}  // DLPACK_EXTERN_C
} // DLPACK_EXTERN_C
#endif
#endif  // DLPACK_DLPACK_H_
#endif // DLPACK_DLPACK_H_
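The deleter documented above is the consumer side of the DLPack ownership
contract: whoever borrows the tensor calls the deleter exactly once when done,
and the deleter also frees the DLManagedTensor struct itself. A minimal
sketch, assuming this dlpack.h is on the include path:

    #include "dlpack.h"

    void ReleaseDLManagedTensor(DLManagedTensor* t) {
      if (t != nullptr && t->deleter != nullptr) {
        t->deleter(t); // destructs manager_ctx and deletes t as well
      }
    }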
@ -53,7 +53,7 @@ void addObjectMethods(pybind11::module& m);
Workspace* GetCurrentWorkspace();

// Get workspace by name. Returns nullptr if none exists by name.
Workspace* GetWorkspaceByName(const std::string &name);
Workspace* GetWorkspaceByName(const std::string& name);

class C10_EXPORT BlobFetcherBase {
 public:

@ -350,10 +350,10 @@ class PythonOpBase : public Operator<Context> {
      auto kwargs = builder_call[2].cast<py::dict>();
      auto built_func = func(*args, **kwargs);
      CAFFE_ENFORCE(built_func);
      built_func_.reset(
          new Func{built_func,
                   OperatorBase::template GetSingleArgument<bool>(
                       "pass_workspace", false)});
      built_func_.reset(new Func{
          built_func,
          OperatorBase::template GetSingleArgument<bool>(
              "pass_workspace", false)});
    } catch (const py::error_already_set& e) {
      LOG(ERROR) << "Python exception encountered while creating PythonOp: "
                 << e.what();
@ -65,7 +65,7 @@ class DLPackWrapper {
    managed_tensor.dl_tensor = dlTensor;
    // C2 Tensor memory is managed by C2
    managed_tensor.manager_ctx = nullptr;
    managed_tensor.deleter= [](DLManagedTensor*) {};
    managed_tensor.deleter = [](DLManagedTensor*) {};

    return py::reinterpret_steal<py::object>(
        PyCapsule_New(&managed_tensor, "dltensor", nullptr));
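This is the producer side of the same contract: because Caffe2 keeps owning
the tensor memory here, it exports a no-op deleter, whereas a producer that
hands off ownership would free its context inside the deleter. A sketch of the
borrowing case (hypothetical function name, assumes dlpack.h):

    #include "dlpack.h"

    DLManagedTensor MakeBorrowedView(DLTensor view) {
      DLManagedTensor managed;
      managed.dl_tensor = view;
      managed.manager_ctx = nullptr;              // nothing to hand over
      managed.deleter = [](DLManagedTensor*) {};  // producer keeps ownership
      return managed;
    }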
@ -13,11 +13,10 @@
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN
#include <c10/cuda/CUDAGuard.h>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/python/pybind_state_registry.h"
#include <c10/cuda/CUDAGuard.h>


#ifdef CAFFE2_USE_TRT
#include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"

@ -27,9 +26,7 @@ namespace caffe2 {
namespace python {

REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(
    PythonGradient,
    GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp);

REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp);
REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp);

@ -43,11 +40,14 @@ void addCUDAGlobalMethods(py::module& m) {
  m.def("get_cuda_version", &CudaVersion);
#ifdef CAFFE2_USE_CUDNN
  m.def("get_cudnn_version", &cudnnCompiledVersion);
  m.attr("cudnn_convolution_fwd_algo_count") = py::int_((int) CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
  m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
  m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
  m.attr("cudnn_convolution_fwd_algo_count") =
      py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
  m.attr("cudnn_convolution_bwd_data_algo_count") =
      py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT);
  m.attr("cudnn_convolution_bwd_filter_algo_count") =
      py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT);
#else
  m.def("get_cudnn_version", [](){ return static_cast<size_t>(0);});
  m.def("get_cudnn_version", []() { return static_cast<size_t>(0); });
  m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0);
  m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0);
  m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0);

@ -5,19 +5,17 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <c10/hip/HIPGuard.h>
#include "caffe2/core/hip/common_miopen.h"
#include "caffe2/core/hip/context_gpu.h"
#include "caffe2/operators/hip/operator_fallback_gpu.h"
#include "caffe2/python/pybind_state_registry.h"
#include <c10/hip/HIPGuard.h>

namespace caffe2 {
namespace python {

REGISTER_HIP_OPERATOR(Python, GPUFallbackOp);
REGISTER_HIP_OPERATOR(
    PythonGradient,
    GPUFallbackOp);
REGISTER_HIP_OPERATOR(PythonGradient, GPUFallbackOp);

REGISTER_HIP_OPERATOR(PythonDLPack, GPUFallbackOp);
REGISTER_HIP_OPERATOR(PythonDLPackGradient, GPUFallbackOp);

@ -9,8 +9,8 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "caffe2/ideep/operators/operator_fallback_ideep.h"
#include <caffe2/ideep/ideep_utils.h>
#include "caffe2/ideep/operators/operator_fallback_ideep.h"

namespace caffe2 {
namespace python {

@ -26,39 +26,39 @@ REGISTER_BLOB_FETCHER((TypeMeta::Id<itensor>()), IDeepFetcher);
REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder);

class IDeepFetcher : public BlobFetcherBase {
  TypeMeta type_transform(const itensor &atensor) {
  TypeMeta type_transform(const itensor& atensor) {
    switch (atensor.get_data_type()) {
    case itensor::data_type::f32:
      return TypeMeta::Make<float>();
    case itensor::data_type::s32:
      return TypeMeta::Make<int>();
    case itensor::data_type::s8:
      return TypeMeta::Make<int8_t>();
    case itensor::data_type::u8:
      return TypeMeta::Make<uint8_t>();
    default:
      // Should we throw exception?
      return TypeMeta();
      case itensor::data_type::f32:
        return TypeMeta::Make<float>();
      case itensor::data_type::s32:
        return TypeMeta::Make<int>();
      case itensor::data_type::s8:
        return TypeMeta::Make<int8_t>();
      case itensor::data_type::u8:
        return TypeMeta::Make<uint8_t>();
      default:
        // Should we throw exception?
        return TypeMeta();
    }
  }

public:
  pybind11::object Fetch(const Blob &blob) override {
 public:
  pybind11::object Fetch(const Blob& blob) override {
    try {
      return FetchTensor(blob.Get<itensor>(), true).obj;
    } catch (ideep::error &e) {
    } catch (ideep::error& e) {
      LOG(ERROR) << "IDEEP error: " << e.message;
      throw;
    }
  }

  FetchedBlob FetchTensor(const itensor &atensor, bool force_copy) {
  FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) {
#ifdef USE_NUMPY
    FetchedBlob result;
    CAFFE_ENFORCE((atensor.ndims() != 0) &&
                  (atensor.get_nelems() == 0 ||
                   atensor.get_data_handle() != nullptr),
                  "Trying to fetch uninitialized tensor");
    CAFFE_ENFORCE(
        (atensor.ndims() != 0) &&
            (atensor.get_nelems() == 0 || atensor.get_data_handle() != nullptr),
        "Trying to fetch uninitialized tensor");
    // NOTE: Only support float so far.
    const int numpy_type = NPY_FLOAT;
    CAFFE_ENFORCE(
@ -70,12 +70,12 @@ public:

    result.copied = force_copy || atensor.need_reorder();
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    void *outPtr;
    void* outPtr;
    if (result.copied) {
      result.obj = py::reinterpret_steal<py::object>(
          PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type));
      outPtr = static_cast<void *>(
          PyArray_DATA(reinterpret_cast<PyArrayObject *>(result.obj.ptr())));
      outPtr = static_cast<void*>(
          PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.obj.ptr())));
    } else {
      outPtr = atensor.get_data_handle();
      result.obj = py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(

@ -111,13 +111,13 @@ class IDeepFeeder : public BlobFeederBase {
    return itensor::data_type::undef;
  }

public:
 public:
  void FeedTensor(
      const DeviceOption &option,
      PyArrayObject *original_array,
      itensor *tensor) {
      const DeviceOption& option,
      PyArrayObject* original_array,
      itensor* tensor) {
#ifdef USE_NUMPY
    PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array);
    PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
    auto g = MakeGuard([&]() { Py_XDECREF(array); });
    const auto npy_type = PyArray_TYPE(array);
    const TypeMeta meta = NumpyTypeToCaffe(npy_type);

@ -125,10 +125,11 @@ public:
        meta,
        ScalarType::Undefined,
        "This numpy data type is not supported: ",
        PyArray_TYPE(array), ".");
        PyArray_TYPE(array),
        ".");

    int ndim = PyArray_NDIM(array);
    npy_intp *npy_dims = PyArray_DIMS(array);
    npy_intp* npy_dims = PyArray_DIMS(array);

    itensor::dims adims;
    for (int i = 0; i < ndim; i++) {

@ -145,15 +146,14 @@ public:
    if (tensor->get_dims() != adims || type != tensor->get_data_type()) {
      tensor->resize(adims, type);
    }
    tensor->feed_from(adims, type,
                      static_cast<void *>(PyArray_DATA(array)));
    tensor->feed_from(adims, type, static_cast<void*>(PyArray_DATA(array)));
  }
#else
    CAFFE_THROW("Caffe2 was compiled without NumPy support.");
#endif // USE_NUMPY
  }

  bool ZeroDim(PyArrayObject *array) {
  bool ZeroDim(PyArrayObject* array) {
#ifdef USE_NUMPY
    int ndim = PyArray_NDIM(array);
    return ndim == 0;

@ -169,15 +169,15 @@ public:
      bool in_place) override {
#ifdef USE_NUMPY
    try {
      PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array);
      PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
      auto g = MakeGuard([&]() { Py_XDECREF(array); });

      const auto npy_type = PyArray_TYPE(array);
      const TypeMeta meta = NumpyTypeToCaffe(npy_type);

      // TODO: if necessary, use dispatcher.
      if ((in_place && blob->IsType<itensor>())
          || (meta.Match<float>() && !ZeroDim(original_array))) {
      if ((in_place && blob->IsType<itensor>()) ||
          (meta.Match<float>() && !ZeroDim(original_array))) {
        FeedTensor(option, original_array, blob->GetMutable<itensor>());
      } else {
        DeviceOption cpu_option(option);

@ -191,10 +191,10 @@ public:
            true);
      } else {
        blob->Reset<Tensor>(new Tensor(
          cpu_tensor_feeder.FeedTensor(cpu_option, original_array)));
            cpu_tensor_feeder.FeedTensor(cpu_option, original_array)));
      }
    }
    } catch (ideep::error &e) {
    } catch (ideep::error& e) {
      LOG(ERROR) << "IDEEP error: " << e.message;
      throw;
    }
@ -526,12 +526,12 @@ void addNomnigraphMethods(pybind11::module& m) {
          "operator_def",
          [](Caffe2Annotation& annot) {
            auto opDef = py::module::import("caffe2.proto.caffe2_pb2")
                           .attr("OperatorDef");
                             .attr("OperatorDef");
            // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
            auto proto = annot.getOperatorDef();
            std::string serialized_proto;
            proto.SerializeToString(&serialized_proto);
            auto py_op_def= opDef();
            auto py_op_def = opDef();
            py_op_def.attr("ParseFromString")(py::bytes(serialized_proto));
            return py_op_def;
          },

@ -22,7 +22,7 @@ const std::string& GetStringFromBlob(Blob* blob) {
    CAFFE_THROW("Unsupported Blob type");
  }
}
}
} // namespace

class BlobsQueueDBCursor : public Cursor {
 public:

@ -96,4 +96,4 @@ NO_GRADIENT(SafeEnqueueBlobs);
NO_GRADIENT(SafeDequeueBlobs);
NO_GRADIENT(WeightedSampleDequeueBlobs);

}
} // namespace caffe2

@ -49,7 +49,8 @@ class EnqueueBlobsOp final : public Operator<Context> {
    CAFFE_ENFORCE(InputSize() > 1);
    auto queue = Operator<Context>::Inputs()[0]
                     ->template Get<std::shared_ptr<BlobsQueue>>();
    CAFFE_ENFORCE(queue && static_cast<size_t>(OutputSize()) == queue->getNumBlobs());
    CAFFE_ENFORCE(
        queue && static_cast<size_t>(OutputSize()) == queue->getNumBlobs());
    return queue->blockingWrite(this->Outputs());
  }

@ -70,7 +71,8 @@ class DequeueBlobsOp final : public Operator<Context> {
    CAFFE_ENFORCE(InputSize() == 1);
    auto queue =
        OperatorBase::Inputs()[0]->template Get<std::shared_ptr<BlobsQueue>>();
    CAFFE_ENFORCE(queue && static_cast<size_t>(OutputSize()) == queue->getNumBlobs());
    CAFFE_ENFORCE(
        queue && static_cast<size_t>(OutputSize()) == queue->getNumBlobs());
    return queue->blockingRead(this->Outputs(), timeout_secs_);
  }

@ -141,7 +143,7 @@ class SafeDequeueBlobsOp final : public Operator<Context> {
    if (blobs_.size() != size) {
      blobs_.resize(size);
      blobPtrs_.resize(size);
      for (auto col: c10::irange(size)) {
      for (auto col : c10::irange(size)) {
        blobPtrs_.at(col) = &blobs_.at(col);
      }
    }

@ -152,7 +154,7 @@ class SafeDequeueBlobsOp final : public Operator<Context> {
      // if we read at least one record, status is still true
      return i > 0;
    }
    for (auto col: c10::irange(size)) {
    for (auto col : c10::irange(size)) {
      auto* out = this->Output(col);
      const auto& in = blobPtrs_.at(col)->template Get<Tensor>();
      if (i == 0) {

@ -231,7 +233,7 @@ class WeightedSampleDequeueBlobsOp final : public Operator<Context> {
    float sum = accumulate(weights.begin(), weights.end(), 0.0f);
    CAFFE_ENFORCE(sum > 0.0f, "Sum of weights must be positive");
    cumProbs_.resize(weights.size());
    for (auto i: c10::irange(weights.size())) {
    for (auto i : c10::irange(weights.size())) {
      cumProbs_[i] = weights[i] / sum;
      CAFFE_ENFORCE_GE(
          cumProbs_[i], 0.0f, "Each probability must be non-negative");
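The loop above normalizes the weights so they sum to 1; a weighted draw then
reduces to searching a cumulative distribution with a uniform sample. A
self-contained sketch of that technique (illustrative names, not the op's
actual code path):

    #include <algorithm>
    #include <cstddef>
    #include <random>
    #include <vector>

    size_t WeightedIndex(const std::vector<float>& weights, std::mt19937& rng) {
      float sum = 0.0f;
      for (float w : weights) {
        sum += w;
      }
      std::vector<float> cum;
      cum.reserve(weights.size());
      float running = 0.0f;
      for (float w : weights) {
        running += w / sum; // normalized share, as with cumProbs_ above
        cum.push_back(running);
      }
      std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
      return std::lower_bound(cum.begin(), cum.end(), uniform(rng)) -
          cum.begin();
    }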
@ -1,5 +1,5 @@
#include "caffe2/utils/math.h"
#include "caffe2/queue/queue_ops.h"
#include "caffe2/utils/math.h"

#include "caffe2/core/context_gpu.h"

@ -13,4 +13,4 @@ REGISTER_CUDA_OPERATOR(CloseBlobsQueue, CloseBlobsQueueOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SafeEnqueueBlobs, SafeEnqueueBlobsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SafeDequeueBlobs, SafeDequeueBlobsOp<CUDAContext>);

}
} // namespace caffe2

@ -202,7 +202,7 @@ bool RebatchingQueue::enqueue(

  do {
    queue_[head_++ % capacity()] = std::move(splittedInputs[idx++]);
  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  } while (canWrite() && idx < splittedInputs.size());
}
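The enqueue loop writes into a fixed-capacity ring buffer: the head index only
grows, and every access reduces it modulo the capacity. A stripped-down sketch
of that indexing scheme (a hypothetical class, not RebatchingQueue itself):

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    template <typename T>
    class Ring {
     public:
      explicit Ring(size_t capacity) : buf_(capacity) {}
      bool CanWrite() const { return head_ - tail_ < buf_.size(); }
      bool CanRead() const { return tail_ < head_; }
      void Push(T v) { buf_[head_++ % buf_.size()] = std::move(v); }
      T Pop() { return std::move(buf_[tail_++ % buf_.size()]); }

     private:
      std::vector<T> buf_;
      uint64_t head_ = 0; // monotonically increasing write index
      uint64_t tail_ = 0; // monotonically increasing read index
    };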
@ -234,4 +234,4 @@ void RebatchingQueue::close() {
  cvEmpty_.notify_all();
  cvOverflow_.notify_all();
}
} // caffe2
} // namespace caffe2

@ -65,4 +65,4 @@ class RebatchingQueue {

  std::vector<std::vector<TensorCPU>> queue_;
};
} // caffe2
} // namespace caffe2

@ -68,5 +68,5 @@ tensor per component.
    .Arg(
        "num_elements",
        "Number of elements to dequeue. By default we dequeue one element.");
}
}
} // namespace
} // namespace caffe2

@ -80,4 +80,4 @@ class CloseRebatchingQueueOp : public Operator<CPUContext> {
    return true;
  }
};
} // caffe2
} // namespace caffe2

@ -167,9 +167,8 @@ void runDepthwise3x3Conv(
      // fast-path, all accesses in-bounds
      if (C10_LIKELY(
              ih >= 0 && iw >= 0 && ih + 3 < args.in_rows &&
              iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
              2 * otw + 1 < args.out_cols
          )) {
              iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
              2 * otw + 1 < args.out_cols)) {
        float32x4x4_t input_tile;
        for (int row = 0; row < 4; ++row) {
          input_tile.val[row] =

@ -140,8 +140,7 @@ void compare(

          // For small values / small difference, the relative error
          // can be huge but the absolute error will be small
          EXPECT_TRUE(
              relErr <= maxRelErr || absErr <= absErrForRelErrFailure)
          EXPECT_TRUE(relErr <= maxRelErr || absErr <= absErrForRelErrFailure)
              << v1 << " " << v2 << " (rel err " << relErr << ") "
              << "(" << n << " " << c << " " << h << " " << w << ") "
              << "running N " << N << " inputC " << inputC << " H " << H
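The comment in that assertion captures the standard float-comparison
compromise: near zero the relative error blows up, so a value passes if either
the relative or the absolute error is within bounds. As a self-contained
sketch of the same idea:

    #include <algorithm>
    #include <cmath>

    bool NearlyEqual(float a, float b, float maxRelErr, float maxAbsErr) {
      const float absErr = std::fabs(a - b);
      const float relErr = absErr / std::max(std::fabs(a), std::fabs(b));
      // If a == b == 0, relErr is NaN and the absolute test decides.
      return relErr <= maxRelErr || absErr <= maxAbsErr;
    }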
@ -3,7 +3,6 @@

#include "caffe2/core/common.h"


#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"

@ -131,10 +130,9 @@ NNPACKConvOp::getConvolutionTransformStrategy() const {
  return nnp_convolution_transform_strategy_compute;
}

nnp_activation
NNPACKConvOp::getActivationType() const {
  auto activation = OperatorBase::GetSingleArgument<std::string>(
      "activation", "identity");
nnp_activation NNPACKConvOp::getActivationType() const {
  auto activation =
      OperatorBase::GetSingleArgument<std::string>("activation", "identity");
  if (activation == "identity") {
    return nnp_activation_identity;
  } else if (activation == "Relu") {

@ -180,19 +178,23 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
    biasData = dummyBias_.data();
  }

  const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                               .height = static_cast<size_t>(X.dim32(2))};
  const nnp_size input_size = {
      .width = static_cast<size_t>(X.dim32(3)),
      .height = static_cast<size_t>(X.dim32(2))};
  // filter is MCHW
  const nnp_size kernel_size = {.width = static_cast<size_t>(filter.dim32(3)),
                                .height = static_cast<size_t>(filter.dim32(2))};
  const nnp_size kernel_size = {
      .width = static_cast<size_t>(filter.dim32(3)),
      .height = static_cast<size_t>(filter.dim32(2))};
  // pad is tblr
  const nnp_padding padding = {.top = static_cast<size_t>(pad_t()),
                               .right = static_cast<size_t>(pad_r()),
                               .bottom = static_cast<size_t>(pad_b()),
                               .left = static_cast<size_t>(pad_l())};
  const nnp_padding padding = {
      .top = static_cast<size_t>(pad_t()),
      .right = static_cast<size_t>(pad_r()),
      .bottom = static_cast<size_t>(pad_b()),
      .left = static_cast<size_t>(pad_l())};

  const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                     .height = static_cast<size_t>(stride_h())};
  const nnp_size output_subsample = {
      .width = static_cast<size_t>(stride_w()),
      .height = static_cast<size_t>(stride_h())};
  initNNPACK();

#if !defined(USE_INTERNAL_PTHREADPOOL_IMPL)
@ -21,10 +21,11 @@ namespace {
uint8_t* GetMutableData(int type_index, TensorCPU* tensor) {
  // see COMP_DATA_TYPE_MAPPER in mutils.py for the mapping
  static const std::map<int, std::function<uint8_t*(TensorCPU * tensor)>>
      gTypeMapper = {REGISTER_TYPE(TensorProto::UINT8, uint8_t),
                     REGISTER_TYPE(TensorProto::UINT16, uint16_t),
                     REGISTER_TYPE(TensorProto::INT32, int32_t),
                     REGISTER_TYPE(TensorProto::FLOAT, float)};
      gTypeMapper = {
          REGISTER_TYPE(TensorProto::UINT8, uint8_t),
          REGISTER_TYPE(TensorProto::UINT16, uint16_t),
          REGISTER_TYPE(TensorProto::INT32, int32_t),
          REGISTER_TYPE(TensorProto::FLOAT, float)};

  CAFFE_ENFORCE_EQ(
      gTypeMapper.count(type_index),

@ -38,7 +38,7 @@ C10_DEFINE_string(
    "gen/",
    "The root of the caffe test folder.");

GTEST_API_ int main(int argc, char **argv) {
GTEST_API_ int main(int argc, char** argv) {
  // std::cout << "Running main() from gtest_main.cc\n";
  testing::InitGoogleTest(&argc, argv);
  caffe2::GlobalInit(&argc, &argv);

@ -115,4 +115,4 @@ TEST(CommonSubexpressionEliminationTest, TestFromExternal) {

} // namespace

} // namespace Caffe2
} // namespace caffe2

@ -46,4 +46,4 @@ TEST(ConvToNNPackTest, TestSimple) {

} // namespace

} // namespace Caffe2
} // namespace caffe2

@ -263,4 +263,4 @@ bool PatternNetTransform::ReplaceRule(
  return true;
}

} // namespace Caffe2
} // namespace caffe2

@ -530,4 +530,4 @@ TEST(PatternNetTransformTest, TestMultiInputOutputTransform) {

} // namespace

} // namespace Caffe2
} // namespace caffe2

@ -1,9 +1,9 @@
#include <caffe2/video/video_decoder.h>
#include <assert.h>
#include <caffe2/core/logging.h>
#include <caffe2/video/video_decoder.h>
#include <array>
#include <mutex>
#include <random>
#include <array>

namespace caffe2 {

@ -606,7 +606,9 @@ bool VideoInputOp<Context>::GetImageAndLabelsFromDBValue(
    img = scaled_img;
  } else {
    cv::cvtColor(
        scaled_img, img, (channels_rgb_ == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR);
        scaled_img,
        img,
        (channels_rgb_ == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR);
  }

  cv::Mat rgb_img;

@ -1,5 +1,5 @@
#include <caffe2/video/video_io.h>
#include <caffe2/core/logging.h>
#include <caffe2/video/video_io.h>
#include <algorithm>
#include <random>
#include <string>