#pragma once

#include <ATen/CPUFunctions.h>
#include <ATen/NativeFunctions.h>
#include <torch/torch.h>

struct DeepAndWide : torch::nn::Module {
  DeepAndWide(int num_features = 50) {
    mu_ = register_parameter("mu_", torch::randn({1, num_features}));
    sigma_ = register_parameter("sigma_", torch::randn({1, num_features}));
    fc_w_ = register_parameter("fc_w_", torch::randn({1, num_features + 1}));
    fc_b_ = register_parameter("fc_b_", torch::randn({1}));
  }

  torch::Tensor forward(
      torch::Tensor ad_emb_packed,
      torch::Tensor user_emb,
      torch::Tensor wide) {
    auto wide_offset = wide + mu_;
    auto wide_normalized = wide_offset * sigma_;
    auto wide_noNaN = wide_normalized;
    // Placeholder for ReplaceNaN
    auto wide_preproc = torch::clamp(wide_noNaN, -10.0, 10.0);

    auto user_emb_t = torch::transpose(user_emb, 1, 2);
    auto dp_unflatten = torch::bmm(ad_emb_packed, user_emb_t);
    auto dp = torch::flatten(dp_unflatten, 1);
    auto input = torch::cat({dp, wide_preproc}, 1);
    auto fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_);
    auto pred = torch::sigmoid(fc1);
    return pred;
  }
  torch::Tensor mu_, sigma_, fc_w_, fc_b_;
};

// Implementation using native functions and pre-allocated tensors.
// It could be used as a "speed of light" for static runtime.
struct DeepAndWideFast : torch::nn::Module {
  DeepAndWideFast(int num_features = 50) {
    mu_ = register_parameter("mu_", torch::randn({1, num_features}));
    sigma_ = register_parameter("sigma_", torch::randn({1, num_features}));
    fc_w_ = register_parameter("fc_w_", torch::randn({1, num_features + 1}));
    fc_b_ = register_parameter("fc_b_", torch::randn({1}));
    allocated = false;
    prealloc_tensors = {};
  }

  torch::Tensor forward(
      torch::Tensor ad_emb_packed,
      torch::Tensor user_emb,
      torch::Tensor wide) {
    torch::NoGradGuard no_grad;
    if (!allocated) {
      auto wide_offset = at::add(wide, mu_);
      auto wide_normalized = at::mul(wide_offset, sigma_);
      // Placeholder for ReplaceNaN
      auto wide_preproc = at::cpu::clamp(wide_normalized, -10.0, 10.0);

      auto user_emb_t = at::native::transpose(user_emb, 1, 2);
      auto dp_unflatten = at::cpu::bmm(ad_emb_packed, user_emb_t);
      // auto dp = at::native::flatten(dp_unflatten, 1);
      auto dp = dp_unflatten.view({dp_unflatten.size(0), 1});
      auto input = at::cpu::cat({dp, wide_preproc}, 1);

      // fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_);
      fc_w_t_ = torch::t(fc_w_);
      auto fc1 = torch::addmm(fc_b_, input, fc_w_t_);

      auto pred = at::cpu::sigmoid(fc1);

      prealloc_tensors = {
          wide_offset,
          wide_normalized,
          wide_preproc,
          user_emb_t,
          dp_unflatten,
          dp,
          input,
          fc1,
          pred};
      allocated = true;

      return pred;
    } else {
      // Potential optimization: add and mul could be fused together (e.g. with
      // Eigen).
      at::add_out(prealloc_tensors[0], wide, mu_);
      at::mul_out(prealloc_tensors[1], prealloc_tensors[0], sigma_);

      at::native::clip_out(
          prealloc_tensors[1], -10.0, 10.0, prealloc_tensors[2]);

      // Potential optimization: original tensor could be pre-transposed.
      // prealloc_tensors[3] = at::native::transpose(user_emb, 1, 2);
      if (prealloc_tensors[3].data_ptr() != user_emb.data_ptr()) {
        auto sizes = user_emb.sizes();
        auto strides = user_emb.strides();
        prealloc_tensors[3].set_(
            user_emb.storage(),
            0,
            {sizes[0], sizes[2], sizes[1]},
            {strides[0], strides[2], strides[1]});
      }

      // Potential optimization: call MKLDNN directly.
      at::cpu::bmm_out(ad_emb_packed, prealloc_tensors[3], prealloc_tensors[4]);

      if (prealloc_tensors[5].data_ptr() != prealloc_tensors[4].data_ptr()) {
        // in unlikely case that the input tensor changed we need to
        // reinitialize the view
        prealloc_tensors[5] =
            prealloc_tensors[4].view({prealloc_tensors[4].size(0), 1});
      }

      // Potential optimization: we can replace cat with carefully constructed
      // tensor views on the output that are passed to the _out ops above.
      at::cpu::cat_outf(
          {prealloc_tensors[5], prealloc_tensors[2]}, 1, prealloc_tensors[6]);
      at::cpu::addmm_out(
          prealloc_tensors[7], fc_b_, prealloc_tensors[6], fc_w_t_, 1, 1);
      at::cpu::sigmoid_out(prealloc_tensors[7], prealloc_tensors[8]);

      return prealloc_tensors[8];
    }
  }
  torch::Tensor mu_, sigma_, fc_w_, fc_b_, fc_w_t_;
  std::vector<torch::Tensor> prealloc_tensors;
  bool allocated = false;
};

torch::jit::Module getDeepAndWideSciptModel(int num_features = 50);

torch::jit::Module getTrivialScriptModel();

torch::jit::Module getLeakyReLUScriptModel();

torch::jit::Module getLeakyReLUConstScriptModel();

torch::jit::Module getLongScriptModel();

torch::jit::Module getSignedLog1pModel();