#if defined(USE_CUDA)
#include <gtest/gtest.h>

#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/codegen.h>
#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
#include <torch/csrc/jit/codegen/cuda/interface.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/mutator.h>
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>

// fuser and IR parser
#include <torch/csrc/jit/codegen/cuda/parser.h>
#include <torch/csrc/jit/ir/irparser.h>

#include "test_gpu_validator.h"

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>
#include <iostream>

// Tests go in torch::jit
namespace torch {
namespace jit {

using namespace torch::jit::fuser::cuda;
using namespace at::indexing;

namespace {

// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) {
  return TensorViewBuilder()
      .ndims(ndims)
      .dtype(dtype)
      .contiguity(std::vector<bool>(ndims, true))
      .build();
}

// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
  return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
}

// Make a non-contiguous tensor of compile-time known sizes
TensorView* makeConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float) {
  return TensorViewBuilder().shape(shape).dtype(dtype).build();
}

void checkIntValue(
    ExpressionEvaluator& evaluator,
    Val* val,
    Int::ScalarType expected_value) {
  TORCH_CHECK(val->isAnInt());
  const auto actual_value = evaluator.evaluate(val);
  TORCH_CHECK(actual_value.has_value());
  TORCH_CHECK(actual_value.value() == expected_value);
}

void checkIntValue(
    kir::ExpressionEvaluator& evaluator,
    const kir::Val* val,
    kir::Int::ScalarType expected_value) {
  const auto actual_value = evaluator.evaluate(val);
  TORCH_CHECK(actual_value.has_value());
  TORCH_CHECK(actual_value.value() == expected_value);
}

bool isPredicated(TensorView* tv, GpuLower& gpulw) {
  auto parent_scope = gpulw.lowerValue(tv)->definition()->parentScope();
  if (parent_scope->isA<kir::IfThenElse>()) {
    return !parent_scope->predicate()->value()->isConst();
  }
  return true;
}

} // namespace

// 1. Test cases are void() functions.
// 2. They start with the prefix `test`
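// For orientation, the helpers above are typically combined with the
// FusionExecutor as follows (a minimal sketch for illustration only, not one
// of the tests below; the names are arbitrary):
//
//   Fusion fusion;
//   FusionGuard fg(&fusion);                 // make `fusion` the active IR
//   TensorView* in = makeSymbolicTensor(2);  // 2D input, sizes unknown
//   fusion.addInput(in);
//   TensorView* out = add(in, new Double(1.0));
//   fusion.addOutput(out);
//
//   FusionExecutor fe;
//   fe.compileFusion(&fusion);               // lower to kernel IR and compile
//   // auto outputs = fe.runFusion({some_cuda_tensor});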
// A few smoke tests for IrGraphGenerator
// (These tests exercise IrGraphGenerator through a non-trivial IR,
//  to make sure that it runs w/o crashing. The actual output is not
//  validated)
TEST(NVFuserTest, IrGraphGenerator_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Make sure we can handle empty IRs
  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
                   &fusion, IrGraphGenerator::DetailLevel::Basic)
                   .empty());

  // Construct an interesting IR
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv2 = add(tv0, new Double(3.141));
  TensorView* tv3 = broadcast(tv0, {false, true, false, true});
  TensorView* tv4 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv3);
  TensorView* tv5 = clamp(tv4, new Double(0.f), new Double(1.f));
  TensorView* tv6 = add(tv2, tv2);

  // Another checkpoint before adding outputs
  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
                   &fusion, IrGraphGenerator::DetailLevel::Explicit)
                   .empty());

  fusion.addOutput(tv6);

  tv4->axis(2)->parallelize(ParallelType::BIDy);
  tv6->merge(0);
  tv6->split(0, 4);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv5->reorder({{-1, 0}});
  tv2->computeAt(tv6, 1);

  // Another checkpoint with more node types
  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
                   &fusion, IrGraphGenerator::DetailLevel::ComputeOnly)
                   .empty());

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  // Final IR graph
  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
                   &fusion, IrGraphGenerator::DetailLevel::Verbose)
                   .empty());
}

TEST(NVFuserTest, FusionDispatch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Double* f = new Double{2.f};
  std::stringstream ss1, ss2, ss3;
  ss1 << f;
  ss2 << static_cast<Val*>(f);
  ss3 << static_cast<Statement*>(f);
  TORCH_CHECK(
      ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0,
      "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*.");
}

// Evaluate basic scalar operations with constant values
TEST(NVFuserTest, FusionExprEvalConstants_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  ExpressionEvaluator evaluator(&fusion);

  auto* a = new Int(7);
  auto* b = new Int(3);

  checkIntValue(evaluator, neg(a), -7);
  checkIntValue(evaluator, add(a, b), 10);
  checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8);
  checkIntValue(evaluator, mod(a, b), 1);
  checkIntValue(evaluator, ceilDiv(a, b), 3);
}

// Evaluate basic scalar operations with bound values
TEST(NVFuserTest, FusionExprEvalBindings_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  ExpressionEvaluator evaluator(&fusion);

  auto* a = new Int();
  auto* b = new Int();
  auto* c = add(a, b);
  auto* d = neg(ceilDiv(c, b));
  auto* e = new Int(0);

  // trying to evaluate before binding should give empty results
  TORCH_CHECK(!evaluator.evaluate(a).has_value());
  TORCH_CHECK(!evaluator.evaluate(d).has_value());

  evaluator.bind(a, 7);
  evaluator.bind(b, 3);

  // can't bind to the results of expressions
  ASSERT_ANY_THROW(evaluator.bind(c, 100));

  // can't bind to concrete values
  ASSERT_ANY_THROW(evaluator.bind(e, 100));

  checkIntValue(evaluator, c, 10);
  checkIntValue(evaluator, sub(a, b), 4);
  checkIntValue(evaluator, mod(a, b), 1);
  checkIntValue(evaluator, ceilDiv(a, b), 3);
  checkIntValue(evaluator, d, -4);

  // Reset evaluation context
  evaluator = ExpressionEvaluator(&fusion);

  evaluator.bind(a, 2);
  evaluator.bind(b, 5);

  checkIntValue(evaluator, c, 7);
  checkIntValue(evaluator, sub(a, b), -3);
  checkIntValue(evaluator, mod(a, b), 2);
  checkIntValue(evaluator, ceilDiv(a, b), 1);
  checkIntValue(evaluator, d, -2);
}
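// Worked arithmetic behind the magic numbers above and below: in
// FusionExprEvalConstants, neg(mul(sub(7, 3), div(7, 3))) = -(4 * 2) = -8 and
// ceilDiv(7, 3) = 3. In the next test, tv3->split(0, 4) turns the extent-6
// root axis into an outer axis of extent ceilDiv(6, 4) = 2 and an inner axis
// of extent 4, which is exactly what the extent checks expect for the 6x128
// inputs.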
// Evaluate expressions in a simple IR
TEST(NVFuserTest, FusionExprEvalBasic_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Create a non-trivial IR
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);

  fusion.addInput(tv0);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  fusion.addOutput(tv3);

  tv3->split(0, 4);

  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::Unroll);
  tv3->axis(1)->parallelize(ParallelType::Unroll);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  // 1. Create an evaluator
  ExpressionEvaluator evaluator(&fusion);

  // 2. Bind values
  //
  // IMPORTANT:
  // a. The bindings are only as stable as the Vals are in the fusion graph
  // b. You must use the original (rootDomain) extents
  //    (ex. `tv0->getRootDomain()[0]->extent()`
  //     instead of `tv0->axis(0)->extent()`)
  //
  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);

  // 3. Evaluate and check result values
  TORCH_CHECK(tv2->domain()->nDims() == 3);
  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);

  TORCH_CHECK(tv3->domain()->nDims() == 3);
  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);
}
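// Worked arithmetic for the extent checks in the next test, given root
// extents {129, 127}:
//   tv5: reorder({{-1, 0}}) then merge(0) -> one axis of 127 * 129 = 16383
//   tv6: split(0, 5) -> outer ceilDiv(129, 5) = 26, inner 5, plus the
//        untouched second root axis of 127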
// Evaluate expressions in a more complex IR
TEST(NVFuserTest, FusionExprEvalComplex_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(-1.0));
  TensorView* tv2 = add(tv0, new Double(3.0));
  TensorView* tv3 = mul(tv0, new Double(2.0));
  TensorView* tv4 = add(tv2, tv1);
  TensorView* tv5 = add(tv4, tv3);
  TensorView* tv6 = add(tv0, tv3);

  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  tv5->reorder({{-1, 0}});
  tv6->split(0, 5);
  tv5->merge(0);

  // 1. Create an evaluator
  ExpressionEvaluator evaluator(&fusion);

  // 2. Bind values
  evaluator.bind(tv0->getRootDomain()[0]->extent(), 129);
  evaluator.bind(tv0->getRootDomain()[1]->extent(), 127);

  // Evaluate and check extent values
  TORCH_CHECK(tv0->domain()->nDims() == 2);
  checkIntValue(evaluator, tv0->axis(0)->extent(), 129);
  checkIntValue(evaluator, tv0->axis(1)->extent(), 127);

  TORCH_CHECK(tv3->domain()->nDims() == 2);
  checkIntValue(evaluator, tv3->axis(0)->extent(), 129);
  checkIntValue(evaluator, tv3->axis(1)->extent(), 127);

  TORCH_CHECK(tv4->domain()->nDims() == 2);
  checkIntValue(evaluator, tv4->axis(0)->extent(), 129);
  checkIntValue(evaluator, tv4->axis(1)->extent(), 127);

  TORCH_CHECK(tv5->domain()->nDims() == 1);
  checkIntValue(evaluator, tv5->axis(0)->extent(), 16383);

  TORCH_CHECK(tv6->domain()->nDims() == 3);
  checkIntValue(evaluator, tv6->axis(0)->extent(), 26);
  checkIntValue(evaluator, tv6->axis(1)->extent(), 5);
  checkIntValue(evaluator, tv6->axis(2)->extent(), 127);
}
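// The next test evaluates expressions after GpuLower has run. The derived
// scalars bid_x/tid_x below wrap the extents of the BIDx- and
// TIDx-parallelized axes, which is essentially how launch parameters are
// derived from a scheduled fusion (a sketch, not the executor's actual code):
//
//   ExpressionEvaluator evaluator(&fusion);
//   // ... bind all root-domain extents ...
//   auto gdimx = evaluator.evaluate(tv3->axis(0)->extent());  // grid dim
//   auto bdimx = evaluator.evaluate(tv3->axis(-1)->extent()); // block dim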
// Evaluate expressions post lowering
TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Create a non-trivial IR
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);

  fusion.addInput(tv0);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  fusion.addOutput(tv3);

  tv3->split(0, 4);

  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::Unroll);
  tv3->axis(1)->parallelize(ParallelType::Unroll);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  auto* bid_x = add(tv3->axis(0)->extent(), new Int(0));
  auto* tid_x = add(tv3->axis(-1)->extent(), new Int(0));

  // Lower
  GpuLower gpulw(&fusion);

  // 1. Create an evaluation context
  ExpressionEvaluator evaluator(&fusion);

  // 2. Bind values
  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);

  // 3. Evaluate and check result values
  TORCH_CHECK(tv2->domain()->nDims() == 3);
  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);

  TORCH_CHECK(tv3->domain()->nDims() == 3);
  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);

  checkIntValue(evaluator, bid_x, 2);
  checkIntValue(evaluator, tid_x, 128);
}

// Kernel IR: Evaluate basic scalar operations with constant values
TEST(NVFuserTest, KernelExprEvalConstants_CUDA) {
  kir::Kernel kernel;
  kir::IrBuilder ir_builder(&kernel);

  auto a = ir_builder.create<kir::Int>(7);
  auto b = ir_builder.create<kir::Int>(3);
  auto c = ir_builder.subExpr(a, b);
  auto d = ir_builder.divExpr(a, b);
  auto e = ir_builder.mulExpr(c, d);

  kir::ExpressionEvaluator evaluator;

  checkIntValue(evaluator, ir_builder.negExpr(a), -7);
  checkIntValue(evaluator, ir_builder.addExpr(a, b), 10);
  checkIntValue(evaluator, ir_builder.negExpr(e), -8);
  checkIntValue(evaluator, ir_builder.modExpr(a, b), 1);
  checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3);
}

// Kernel IR: Evaluate basic scalar operations with bound values
TEST(NVFuserTest, KernelExprEvalBindings_CUDA) {
  kir::Kernel kernel;
  kir::IrBuilder ir_builder(&kernel);

  kir::ExpressionEvaluator evaluator;

  auto a = ir_builder.create<kir::Int>(c10::nullopt);
  auto b = ir_builder.create<kir::Int>(c10::nullopt);
  auto c = ir_builder.addExpr(a, b);
  auto d = ir_builder.negExpr(ir_builder.ceilDivExpr(c, b));
  auto e = ir_builder.create<kir::Int>(0);

  // trying to evaluate before binding should give empty results
  TORCH_CHECK(!evaluator.evaluate(a).has_value());
  TORCH_CHECK(!evaluator.evaluate(d).has_value());

  evaluator.bind(a, 7);
  evaluator.bind(b, 3);

  // can't bind to the results of expressions
  ASSERT_ANY_THROW(evaluator.bind(c, 100));

  // can't bind to concrete values
  ASSERT_ANY_THROW(evaluator.bind(e, 100));

  checkIntValue(evaluator, c, 10);
  checkIntValue(evaluator, ir_builder.subExpr(a, b), 4);
  checkIntValue(evaluator, ir_builder.modExpr(a, b), 1);
  checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3);
  checkIntValue(evaluator, d, -4);

  // Reset the evaluation context
  evaluator = kir::ExpressionEvaluator();

  evaluator.bind(a, 2);
  evaluator.bind(b, 5);

  checkIntValue(evaluator, c, 7);
  checkIntValue(evaluator, ir_builder.subExpr(a, b), -3);
  checkIntValue(evaluator, ir_builder.modExpr(a, b), 2);
  checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 1);
  checkIntValue(evaluator, d, -2);
}
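// The next three tests exercise Fusion's container semantics: clear() empties
// the IR in place, copy construction/assignment deep-clones the IR graph
// (statements are cloned, not aliased), and move operations transfer
// ownership. A minimal sketch of the expectations they encode:
//
//   Fusion a;                  // ... build some IR under a FusionGuard ...
//   Fusion b = a;              // deep copy: printing a and b matches
//   Fusion c = std::move(a);   // c owns the IR; a is valid but unspecified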
TEST(NVFuserTest, FusionClear_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1. Create a dummy IR
  {
    TensorView* tv0 = makeSymbolicTensor(2);
    TensorView* tv1 = makeSymbolicTensor(2);

    fusion.addInput(tv0);
    fusion.addInput(tv1);

    TensorView* tv2 = add(tv1, new Double(2.0));
    TensorView* tv3 = add(tv0, tv2);

    fusion.addOutput(tv3);

    tv3->split(0, 4);
    tv0->computeAt(tv3, 1);
    tv1->computeAt(tv3, 1);

    tv3->axis(0)->parallelize(ParallelType::BIDx);
    tv2->axis(1)->parallelize(ParallelType::Unroll);
    tv3->axis(-1)->parallelize(ParallelType::TIDx);
  }

  // 2. Clear the IR
  fusion.clear();

  TORCH_CHECK(fusion.unordered_exprs().empty());
  TORCH_CHECK(fusion.vals().empty());

  TORCH_CHECK(fusion.inputs().empty());
  TORCH_CHECK(fusion.outputs().empty());

  TORCH_CHECK(!fusion.hasReduction());

  // 3. Rebuild the IR
  {
    TensorView* tv0 = makeSymbolicTensor(3);
    TensorView* tv1 = makeSymbolicTensor(3);
    TensorView* tv2 = add(tv1, new Double(2.0));
    TensorView* tv3 = add(tv0, tv2);

    fusion.addInput(tv0);
    fusion.addInput(tv1);
    fusion.addOutput(tv3);

    // tv3 [i0, i1, i2]
    tv3->reorder({{0, 2}, {2, 0}});
    // tv3 [i2, i1, i0]
    tv3->split(-1, 4);
    // tv3 [i2, i1, i0outer, i0inner{4}]
    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
    // tv3 [i0outer, i0inner{4}, i1, i2]
    tv0->computeAt(tv3, -1);
    tv1->computeAt(tv3, -1);
    tv3->axis(1)->parallelize(ParallelType::BIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor input1 = at::randn({16, 8, 8}, options);
  at::Tensor input2 = at::randn_like(input1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({input1, input2});

  at::Tensor tv2_ref = input2 + 2.0;
  at::Tensor output_ref = input1 + tv2_ref;

  TORCH_CHECK(output_ref.equal(outputs[0]));
}

TEST(NVFuserTest, FusionCopy_CUDA) {
  Fusion original_fusion;

  // Create the test IR
  {
    FusionGuard fg(&original_fusion);

    auto tv0 = makeSymbolicTensor(3);
    auto tv1 = makeSymbolicTensor(3);
    auto tv2 = add(tv1, new Double(2.0));
    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);

    original_fusion.addInput(tv0);
    original_fusion.addInput(tv1);
    original_fusion.addOutput(tv3);

    tv3->reorder({{0, 2}, {2, 0}});
    tv3->split(-1, 4);
    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
    tv0->computeAt(tv3, -1);
    tv1->computeAt(tv3, -1);
    tv3->axis(0)->parallelize(ParallelType::BIDx);
    tv3->axis(-1)->parallelize(ParallelType::TIDx);
  }

  // Test copy before lowering
  Fusion clone = original_fusion;

  // Compare IR dumps
  std::stringstream original_ir;
  std::stringstream clone_ir;
  original_ir << original_fusion;
  clone_ir << clone;
  ASSERT_EQ(original_ir.str(), clone_ir.str());

  // Lower original fusion
  std::string original_kernel;
  {
    // TODO(kir): remove this guard once we implement the cuda codegen visitor
    FusionGuard fg(&original_fusion);
    original_kernel =
        codegen::generateCudaKernel(GpuLower(&original_fusion).kernel());
  }

  // Make sure the "before lowering" clone was not mutated
  // while lowering the original fusion IR
  std::stringstream before_lowering_ir;
  before_lowering_ir << clone;
  ASSERT_EQ(original_ir.str(), before_lowering_ir.str());

  // Test copy after lowering (including assignment operator)
  Fusion before_lowering = clone;
  clone = original_fusion;

  // Compare IR dumps
  std::stringstream original_lowered_ir;
  std::stringstream clone_lowered_ir;
  original_lowered_ir << original_fusion;
  clone_lowered_ir << clone;
  ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str());

  // Lower the "before lowering" and compare kernels
  std::string clone_kernel;
  {
    // TODO(kir): remove this guard once we implement the cuda codegen visitor
    FusionGuard fg(&before_lowering);
    clone_kernel =
        codegen::generateCudaKernel(GpuLower(&before_lowering).kernel());
  }
  ASSERT_EQ(original_kernel, clone_kernel);
}

TEST(NVFuserTest, FusionMove_CUDA) {
  Fusion fusion;

  // Create the test IR
  {
    FusionGuard fg(&fusion);

    auto tv0 = makeSymbolicTensor(3);
    auto tv1 = makeSymbolicTensor(3);
    auto tv2 = add(tv1, new Double(2.0));
    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);

    fusion.addInput(tv0);
    fusion.addInput(tv1);
    fusion.addOutput(tv3);

    tv3->reorder({{0, 2}, {2, 0}});
    tv3->split(-1, 4);
    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
    tv0->computeAt(tv3, -1);
    tv1->computeAt(tv3, -1);
    tv3->axis(0)->parallelize(ParallelType::BIDx);
    tv3->axis(-1)->parallelize(ParallelType::TIDx);
  }

  std::stringstream original_ir;
  original_ir << fusion;

  // Test move before lowering
  Fusion another_fusion = std::move(fusion);

  // Check that the original fusion is "empty"
  //
  // IMPORTANT: these checks assume knowledge of the internal
  //   implementation of the move operations. General uses
  //   should only assume that the moved-from object is in
  //   a valid, but unspecified state. This is similar to the
  //   standard library containers:
  //   https://en.cppreference.com/w/cpp/utility/move
  //
  TORCH_CHECK(fusion.unordered_exprs().empty());
  TORCH_CHECK(fusion.vals().empty());
  TORCH_CHECK(fusion.inputs().empty());
  TORCH_CHECK(fusion.outputs().empty());

  // clear() has no pre-conditions so it's valid to call on a moved-from object
  fusion.clear();

  // Compare IR dumps
  std::stringstream another_ir;
  another_ir << another_fusion;
  ASSERT_EQ(original_ir.str(), another_ir.str());

  // Lower the fusion IR
  GpuLower lower(&another_fusion);

  std::stringstream lowered_ir;
  lowered_ir << another_fusion;

  // Test move assignment after lowering
  fusion = std::move(another_fusion);

  // Compare IR dumps
  std::stringstream moved_lowered_ir;
  moved_lowered_ir << fusion;
  ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str());
}

TEST(NVFuserTest, FusionSimpleArith_CUDA) {
  std::stringstream ss1, ss2;

  Fusion fusion;
  FusionGuard fg(&fusion);

  Double* d1 = new Double(1.f);
  Double* d2 = new Double{2.f};
  Double* d3 = new Double();

  // Disrupt the fusion to make sure guard works well
  {
    Fusion fusion2;
    FusionGuard fg(&fusion2);

    Double* d1 = new Double(1.f);
    Double* d2 = new Double(2.f);
    add(d1, d2);
    ss2 << fusion2;
  }

  new BinaryOp(BinaryOpType::Add, d3, d1, d2);
  ss1 << fusion;

  TORCH_CHECK(
      ss1.str().compare(ss2.str()) == 0,
      "Error where explicit add nodes don't match implicit add nodes.");
}

TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Double* d4 = new Double{4.f};
  Int* i1 = new Int{3};
  auto d5 = add(d4, i1);

  TORCH_CHECK(d5->getDataType() == DataType::Double);
}

TEST(NVFuserTest, FusionRegister_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  Double* v1 = new Double{1.f};
  Double* v2 = new Double{2.f};
  Val* v3 = binaryOp(BinaryOpType::Add, v1, v2);
  Val* v4 = binaryOp(BinaryOpType::Add, v1, v2);
  TORCH_CHECK(v1->name() + 1 == v2->name());
  TORCH_CHECK(v2->name() + 1 == v3->name());
  TORCH_CHECK(v3->name() + 1 == v4->name());
  TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name());
}

// dummy expr with 2 outputs only for toposort test.
struct DummyExpr : public Expr {
  ~DummyExpr() = default;
  DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs)
      : Expr(ExprType::UnaryOp) // Not terribly safe...
  {
    addOutput(_outlhs);
    addOutput(_outrhs);
    addInput(_lhs);
    addInput(_rhs);
    this->name_ = FusionGuard::getCurFusion()->registerExpr(this);
  }
  DummyExpr(const DummyExpr& other) = delete;
  DummyExpr& operator=(const DummyExpr& other) = delete;
  DummyExpr(DummyExpr&& other) = delete;
  DummyExpr& operator=(DummyExpr&& other) = delete;
};

TEST(NVFuserTest, FusionTopoSort_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // e0: v3, v2 = dummy(v1, v0)
  // e1: v4     =   add(v3, v2)
  // e2: v5     =   add(v2, v4)
  // e3: v6     =   add(v5, v5)
  Double* v0 = new Double{1.f};
  Double* v1 = new Double{2.f};
  Double* v2 = new Double();
  Double* v3 = new Double();
  Double* v4 = new Double();
  Double* v5 = new Double();
  Double* v6 = new Double();

  std::vector<Val*> inputs = {v0, v1};
  for (auto val : inputs) {
    fusion.addInput(val);
  }

  Expr* e0 = new DummyExpr(v3, v2, v1, v0);
  Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2);
  Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4);
  Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5);

  fusion.addOutput(v2);
  fusion.addOutput(v3);
  auto exprs = fusion.exprs();
  TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1");
  TORCH_CHECK(exprs[0] == e0);

  fusion.addOutput(v5);
  exprs = fusion.exprs();
  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
  TORCH_CHECK(exprs[0] == e0);
  TORCH_CHECK(exprs[1] == e1);
  TORCH_CHECK(exprs[2] == e2);

  fusion.addOutput(v4);
  exprs = fusion.exprs();
  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
  TORCH_CHECK(exprs[0] == e0);
  TORCH_CHECK(exprs[1] == e1);
  TORCH_CHECK(exprs[2] == e2);

  fusion.addOutput(v6);
  exprs = fusion.exprs();
  TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4");
  TORCH_CHECK(exprs[0] == e0);
  TORCH_CHECK(exprs[1] == e1);
  TORCH_CHECK(exprs[2] == e2);
  TORCH_CHECK(exprs[3] == e3);

  TORCH_CHECK(v2->definition()->name() == 0);
  TORCH_CHECK(v3->definition()->name() == 0);
  TORCH_CHECK(v4->definition()->name() == 1);
  TORCH_CHECK(v5->definition()->name() == 2);
  TORCH_CHECK(v6->definition()->name() == 3);
}

TEST(NVFuserTest, FusionTensor_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  Fusion fusion;
  FusionGuard fg(&fusion);

  {
    auto tensor = at::randn({2, 3, 4, 5}, options);
    auto tensor_type = TensorType::create(tensor);
    auto fuser_tensor = new TensorView(tensor_type);
    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
    TORCH_CHECK(fuser_tensor->domain() != nullptr);
    for (int i = 0; i < static_cast<int>(fuser_tensor->nDims()); i++) {
      // size-1 dimensions are marked as broadcast
      TORCH_CHECK(
          fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1));
      // check contiguity information;
      TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]);
    }
  }

  // TensorType::create fills stride_properties, which helps us to mark
  // IterDomain properly
  // Note: implementation could change, depending on how much we want to invest
  // in our home-brew contiguity coalescing. For now let's make sure that we
  // properly test what we are using.
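  // Worked example for the sliced case below: a contiguous {4, 4, 4} tensor
  // has strides {16, 4, 1}; tensor.slice(1, 0, -1, 2) keeps sizes {4, 2, 4}
  // with strides {16, 8, 1}. Dimension 1 can no longer be coalesced with its
  // neighbor (stride 8 != size(2) * stride(2) = 4), so only dimensions 0 and
  // 2 are marked contiguous (stride 16 == 2 * 8 holds for dimension 0).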
  {
    auto tensor = at::randn({4, 4, 4}, options);
    auto sliced_tensor = tensor.slice(1, 0, -1, 2);

    auto tensor_type = TensorType::create(sliced_tensor);
    auto fuser_tensor = new TensorView(tensor_type);
    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
    TORCH_CHECK(fuser_tensor->domain() != nullptr);
    for (int i = 0; i < static_cast<int>(fuser_tensor->nDims()); i++) {
      // size-1 dimensions are marked as broadcast
      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
    }
    TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]);
    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
  }

  {
    auto tensor = at::randn({2, 3, 4, 5}, options);
    auto permuted_tensor = tensor.permute({0, 3, 1, 2});
    auto tensor_type = TensorType::create(permuted_tensor);
    auto fuser_tensor = new TensorView(tensor_type);
    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
    TORCH_CHECK(fuser_tensor->domain() != nullptr);
    for (int i = 0; i < static_cast<int>(fuser_tensor->nDims()); i++) {
      // size-1 dimensions are marked as broadcast
      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
    }
    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]);
    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]);
  }
}

TEST(NVFuserTest, FusionFilterVals_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  auto tv1 = makeSymbolicTensor(1);
  auto scalar0 = new Double(0);
  auto scalar1 = new Int(0);
  auto scalar2 = new Int(1);

  const std::vector<Val*> vals = {tv0, scalar0, tv1, scalar1, scalar2};

  std::vector<TensorView*> tvs(
      ir_utils::filterByType<TensorView>(vals).begin(),
      ir_utils::filterByType<TensorView>(vals).end());
  TORCH_CHECK(tvs.size() == 2);
  TORCH_CHECK(tvs[0] == tv0);
  TORCH_CHECK(tvs[1] == tv1);

  std::vector<Double*> floats(
      ir_utils::filterByType<Double>(vals).begin(),
      ir_utils::filterByType<Double>(vals).end());
  TORCH_CHECK(floats.size() == 1);
  TORCH_CHECK(floats[0] == scalar0);

  std::vector<Int*> ints(
      ir_utils::filterByType<Int>(vals).begin(),
      ir_utils::filterByType<Int>(vals).end());
  TORCH_CHECK(ints.size() == 2);
  TORCH_CHECK(ints[0] == scalar1);
  TORCH_CHECK(ints[1] == scalar2);

  TORCH_CHECK(
      ir_utils::filterByType<Expr>(vals).begin() ==
          ir_utils::filterByType<Expr>(vals).end(),
      "Not expecting any results");
}

TEST(NVFuserTest, FusionTVSplit_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv = makeSymbolicTensor(3);

  tv = tv->split(2, 2);
  TORCH_CHECK(tv->nDims() == 4);
  Expr* outer = tv->axis(2)->extent()->definition();

  TORCH_CHECK(
      outer->getExprType().value() == ExprType::BinaryOp &&
      static_cast<BinaryOp*>(outer)->getBinaryOpType() ==
          BinaryOpType::CeilDiv &&
      static_cast<BinaryOp*>(outer)->lhs()->sameAs(
          tv->getRootDomain()[2]->extent()) &&
      static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs())
          ->sameAs(new Int(2)));

  IterDomain* inner = static_cast<IterDomain*>(tv->axis(3));
  TORCH_CHECK(
      inner->extent()->isScalar() &&
      static_cast<Int*>(inner->extent())->isConst() &&
      static_cast<Int*>(inner->extent())->value().value() == 2);
}

TEST(NVFuserTest, FusionTVMerge_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv = makeSymbolicTensor(3);

  tv = tv->merge(1);
  Expr* axisOp = tv->axis(1)->extent()->definition();

  TORCH_CHECK(
      tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp &&
      static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul &&
      static_cast<BinaryOp*>(axisOp)->lhs() ==
          tv->getRootDomain()[1]->extent() &&
      static_cast<BinaryOp*>(axisOp)->rhs() ==
          tv->getRootDomain()[2]->extent());
}

TEST(NVFuserTest, FusionTVReorder_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::unordered_map<int, int> shift_right{{-1, 0}};
  std::unordered_map<int, int> shift_left{{0, -1}};
  std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}};
  std::unordered_map<int, int> swap{{0, 2}, {2, 0}};

  auto tv = makeSymbolicTensor(3);
  std::vector<IterDomain*> ref;
  ref = std::vector<IterDomain*>(
      tv->domain()->domain().begin(), tv->domain()->domain().end());

  tv->reorder(shift_left);
  for (int i = 0; i < (int)tv->nDims(); i++)
    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));

  tv = makeSymbolicTensor(3);
  ref = std::vector<IterDomain*>(
      tv->domain()->domain().begin(), tv->domain()->domain().end());

  tv->reorder(shift_left);
  for (int i = 0; i < (int)tv->nDims(); i++)
    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));

  tv = makeSymbolicTensor(3);
  ref = std::vector<IterDomain*>(
      tv->domain()->domain().begin(), tv->domain()->domain().end());

  tv->reorder(shift_right);
  TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0)));
  for (int i = 1; i < (int)tv->nDims(); i++)
    TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i)));

  tv = makeSymbolicTensor(3);
  ref = std::vector<IterDomain*>(
      tv->domain()->domain().begin(), tv->domain()->domain().end());

  tv->reorder(swap);
  TORCH_CHECK(ref[0]->sameAs(tv->axis(2)));
  TORCH_CHECK(ref[2]->sameAs(tv->axis(0)));
  TORCH_CHECK(ref[1]->sameAs(tv->axis(1)));
}

TEST(NVFuserTest, FusionEquality_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Double* fval1 = new Double();
  Double* fval1_copy = fval1;
  Double* fval2 = new Double();
  Double* fone = new Double(1.0);
  TORCH_CHECK(fval1->sameAs(fval1_copy));
  TORCH_CHECK(!fval1->sameAs(fval2));
  TORCH_CHECK(!fone->sameAs(fval1));
  TORCH_CHECK(fone->sameAs(new Double(1.0)));

  Int* ival1 = new Int();
  Int* ival1_copy = ival1;
  Int* ival2 = new Int();
  Int* ione = new Int(1);
  TORCH_CHECK(ival1->sameAs(ival1_copy));
  TORCH_CHECK(!ival1->sameAs(ival2));
  TORCH_CHECK(!ione->sameAs(ival1));
  TORCH_CHECK(ione->sameAs(new Int(1)));

  BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1);
  BinaryOp* add1_copy =
      new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1);
  BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Double(), fval1, ival1);

  UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Double(), fval1);
  UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Double(), fval2);
  UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Double(), fval1);

  TORCH_CHECK(add1->sameAs(add1_copy));
  TORCH_CHECK(!add1->sameAs(sub1));

  TORCH_CHECK(neg1->sameAs(neg1_copy));
  TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1));
  TORCH_CHECK(!neg1->sameAs(neg2));
}

TEST(NVFuserTest, FusionDependency_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Double* d0 = new Double(0.f);
  Double* d1 = new Double(1.f);
  auto d2 = add(d0, d1);

  auto d3 = add(d2, d2);

  Double* d4 = new Double(4.f);
  Double* d5 = new Double(5.f);
  auto d6 = add(d4, d5);

  Double* d7 = new Double(7.f);
  Double* d8 = new Double(8.f);
  auto d9 = add(d7, d8);

  auto d10 = add(d6, d9);

  auto d11 = add(d3, d10);

  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6));
  TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10));

  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4));
  TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8));

  auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11);
  TORCH_CHECK(dep_chain.back() == d11);
  dep_chain.pop_back();
  TORCH_CHECK(dep_chain.back() == d3);
  dep_chain.pop_back();
  TORCH_CHECK(dep_chain.back() == d2);
  dep_chain.pop_back();

  dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11);
  TORCH_CHECK(dep_chain.back() == d11);
  dep_chain.pop_back();
  TORCH_CHECK(dep_chain.back() == d10);
  dep_chain.pop_back();

  dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11);
  TORCH_CHECK(dep_chain.back() == d11);
  dep_chain.pop_back();
  TORCH_CHECK(dep_chain.back() == d10);
  dep_chain.pop_back();
  TORCH_CHECK(dep_chain.back() == d6);
  dep_chain.pop_back();

  dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2);
  TORCH_CHECK(dep_chain.empty());
}

TEST(NVFuserTest, FusionParser_CUDA) {
  // This test may not pass if using a custom block sync as there may
  // be additional calls. Skip the test as it's not specifically
  // relevant with block synchronization.
  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
    return;
  }
  auto g = std::make_shared<Graph>();
  const auto graph0_string = R"IR(
    graph(%0 : Float(2, strides=[1]),
          %1 : Float(2, strides=[1])):
      %c0 : Float(2, strides=[1]) = aten::mul(%0, %1)
      %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0)
      return (%d0))IR";
  parseIR(graph0_string, g.get());

  // strides are not yet supported in the irparser.
  for (auto val : g->block()->inputs()) {
    if (val->isCompleteTensor())
      val->setType(val->type()->castRaw<TensorType>()->contiguous());
  }
  for (auto node : g->block()->nodes()) {
    for (auto val : node->outputs()) {
      if (val->isCompleteTensor())
        val->setType(val->type()->castRaw<TensorType>()->contiguous());
    }
  }

  auto fusion = parseJitIR(g);
  FusionGuard fg(fusion.get());
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // Avoid vectorization here as those kernels can't be lowered twice at the
  // moment
  at::Tensor input1 = at::randn({16}, options);
  at::Tensor input2 = at::randn({16}, options);
  auto lparams = schedulePointwise(fusion.get(), {input1, input2});

  // CONSIDER:
  // 1. this can be moved to a dedicated "golden" file
  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
  const std::string expected_kernel = R"(
__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3) {
  if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + (1 - 1)) * 1) + (1 - 1)) * 128) + ((nvfuser_index_t)threadIdx.x)) < T0.size[0])) {
    constexpr nvfuser_index_t ki169 = 0;
    float T5[1];
    constexpr nvfuser_index_t ki203 = 0;
    T5[ki203] = 0;
    constexpr nvfuser_index_t ki194 = 0;
    T5[ki194] = T1[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki169) * 1) + ki194) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)];
    float T4[1];
    constexpr nvfuser_index_t ki209 = 0;
    T4[ki209] = 0;
    constexpr nvfuser_index_t ki189 = 0;
    T4[ki189] = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki169) * 1) + ki189) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)];
    float T6[1];
    constexpr nvfuser_index_t ki178 = 0;
    float T2[1];
    T2[0] = T4[ki178] * T5[ki178];
    T6[ki178] = T2[0] * T4[ki178];
    constexpr nvfuser_index_t ki171 = 0;
    T3[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki169) * 1) + ki171) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] = T6[ki171];
  }
}
)";

  const std::string actual_kernel =
      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());

  if (expected_kernel.size() != actual_kernel.size() ||
      expected_kernel.compare(actual_kernel) != 0) {
    std::cerr
        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
        << " \n ========= EXPECTED ========= \n"
        << expected_kernel << "\n========= ACTUAL ========== \n"
        << actual_kernel << "\n=================" << std::endl;
    auto it = std::mismatch(
        expected_kernel.begin(),
        expected_kernel.end(),
        actual_kernel.begin(),
        actual_kernel.end());
    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
              << ", expected: " << expected_mismatched_snippet << std::endl;
    TORCH_CHECK(false);
  }

  FusionExecutor fe;
  fe.compileFusion(fusion.get());
  auto outputs = fe.runFusion({input1, input2}, lparams);
  at::Tensor output_ref = input1 * input2 * input1;
  TORCH_CHECK(output_ref.equal(outputs[0]));
}

TEST(NVFuserTest, FusionForLoop_CUDA) {
// TODO(kir): re-enable this test
//  due to the current "GpuLower guard" approach, we can only create
//  kernel IR during GpuLower::lower()
#if 0
  Fusion fusion;
  FusionGuard fg(&fusion);

  const auto TV0 = new TensorView(
      new TensorDomain({new IterDomain(new Int(0), new Int(16))}),
      DataType::Float);
  const auto TV1 = new TensorView(
      new TensorDomain({new IterDomain(new Int(0), new Int(16))}),
      DataType::Float);

  fusion.addInput(TV0);
  fusion.addInput(TV1);

  auto ID0 = new kir::IterDomain(new IterDomain(new Int(0), new Int(8)));

  TensorView* TV2 = add(TV0, TV1);
  BinaryOp* op = static_cast<BinaryOp*>(TV2->definition());
  fusion.addOutput(TV2);

  auto fl = new kir::ForLoop(new kir::Int(c10::nullopt), ID0, {op});

  std::stringstream result;
  std::stringstream ref;
  result << fl;
  ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}";

  if (result.str().compare(ref.str()) != 0) {
    std::stringstream err_msg;
    err_msg << "ForLoop printing has changed or something has gone wrong. "
" << result.str() << "\n does not match reference: " << ref.str() << std::endl; TORCH_CHECK(false, err_msg.str()); } #endif } TEST(NVFuserTest, FusionOuterSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); TensorView* tv1 = add(tv0, new Double(2.0)); TensorView* tv2 = add(tv1, new Double(3.0)); fusion.addOutput(tv2); //[I0, I1, I2] tv2->split(-1, 4, false); //[I0, I1, I2o{4}, I2i] tv2->merge(0); tv2->merge(0); //[I0*I1*I2o{4}, I2i] tv2->split(0, 2); //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i] tv2->reorder({{0, 1}, {1, 0}}); // I0*I1*I2o{4}i{2}, [I0*I1*I2o{4}o, I2i] tv0->computeAt(tv2, -1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor output = at::empty({2, 6, 32}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({}, {output}); at::Tensor output_ref = at::zeros_like(output, options); output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; TORCH_CHECK(output_ref.equal(output)); } TEST(NVFuserTest, FusionCodeGen_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); TensorView* tv1 = add(tv0, new Double(2.0)); TensorView* tv2 = add(tv1, new Double(3.0)); fusion.addOutput(tv2); //[I0, I1, I2] tv2 = tv2->split(0, 4); //[I0o, I0i{4}, I1, I2] tv2 = tv2->merge(1); //[I0o, I0i{4}*I1, I2] tv2 = tv2->split(-1, 2); //[I0o, I0i{4}*I1, I2o, I2i{2}] tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); //[I0i{4}*I1, I0o, I2i{2}, I2o] tv0->computeAt(tv2, -1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor output = at::empty({16, 8, 8}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({}, {output}); at::Tensor output_ref = at::zeros_like(output, options); output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; TORCH_CHECK(output_ref.equal(output)); } TEST(NVFuserTest, FusionCodeGen2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); TensorView* tv1 = makeSymbolicTensor(3); TensorView* tv2 = add(tv1, new Double(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv3); //[I0, I1, I2] tv3->reorder({{0, 2}, {2, 0}}); //[I2, I1, I0] tv3->split(-1, 4); //[I2, I1, I0o, I0i{4}] tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); // I0o, I0i{4}, I1, I2] tv0->computeAt(tv3, -1); tv1->computeAt(tv3, -1); tv3->axis(0)->parallelize(ParallelType::BIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; TORCH_CHECK(output_ref.equal(outputs[0])); } TEST(NVFuserTest, FusionSimplePWise_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // dimensionality of the problem int nDims = 3; // Set up your input tensor views TensorView* tv0 = makeContigTensor(nDims); TensorView* tv1 = makeContigTensor(nDims); // Register your inputs fusion.addInput(tv0); fusion.addInput(tv1); // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView TensorView* tv2 = add(tv1, new Double(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs fusion.addOutput(tv3); // Do transformations, remember, 
TEST(NVFuserTest, FusionSimplePWise_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // dimensionality of the problem
  int nDims = 3;

  // Set up your input tensor views
  TensorView* tv0 = makeContigTensor(nDims);
  TensorView* tv1 = makeContigTensor(nDims);

  // Register your inputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  // Do math with it, it returns a `Val*` but can be static_casted back to
  // TensorView
  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  // Register your outputs
  fusion.addOutput(tv3);

  // Do transformations; remember, transformations are applied from outputs
  // to inputs. This doesn't have to be in this order
  tv3->merge(1);
  tv3->merge(0);

  // Split by n_threads
  tv3->split(0, 128);
  tv3->split(0, 4);

  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
  tv0->computeAt(tv3, -1);
  tv1->computeAt(tv3, -1);

  // Parallelize TV3
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(-2)->parallelize(ParallelType::Unroll);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor input1 = at::randn({64, 2, 128}, options);
  at::Tensor input2 = at::rand_like(input1);
  at::Tensor output = at::empty_like(input1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({input1, input2}, {output});

  at::Tensor tv2_ref = input2 + 2.0;
  at::Tensor output_ref = input1 + tv2_ref;

  TORCH_CHECK(output_ref.equal(output));
}

TEST(NVFuserTest, FusionExecKernel_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);

  // Register your inputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  // Do math with it, it returns a `Val*` but can be static_casted back to
  // TensorView
  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  // Register your outputs
  fusion.addOutput(tv3);

  tv3->merge(0);
  tv3->split(0, 128);
  tv3->split(0, 4);

  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  // Parallelize TV3
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::Unroll);
  tv3->axis(1)->parallelize(ParallelType::Unroll);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor input1 = at::ones({1, 128}, options);
  at::Tensor input2 = at::ones_like(input1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({input1, input2});

  at::Tensor check = at::full({1, 128}, 4, options);
  TORCH_CHECK(outputs[0].equal(check));
}

int ceilDiv_(int a, int b) {
  return (a + b - 1) / b;
}
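// The "advanced computeAt" cases below assert on two related properties:
//   - getComputeAtPosition(): how many of the tensor's own leading axes are
//     positioned inside its consumer's loop nest.
//   - getMaxProducerPosition(): how deep producers have been inlined into
//     this tensor's loop nest.
// For example, after tv0->computeAt(tv7, 1) in Case 1, the intermediate
// tensors report a compute-at position of 1 (they live inside tv7's
// outermost loop), while the terminal outputs keep position 0 but a
// max-producer position of 1.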
TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) {
  // Case 1
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 + 3
  // tv4 = tv1 * 2
  // tv5 = tv3 + tv2
  // tv6 = tv5 + tv4
  // tv7 = tv1 + tv4
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = add(tv1, new Double(3.0));
  TensorView* tv4 = mul(tv1, new Double(2.0));
  TensorView* tv5 = add(tv3, tv2);
  TensorView* tv6 = add(tv5, tv4);
  TensorView* tv7 = add(tv1, tv4);

  fusion.addOutput(tv6);
  fusion.addOutput(tv7);

  // Let's set up to actually run
  tv7->merge(0);
  tv7->split(0, 128);
  tv7->split(0, 4);

  tv7->axis(0)->parallelize(ParallelType::BIDx);

  tv0->computeAt(tv7, 1);

  GpuLower gpulw(&fusion);

  // The compute-at ("this") position of the terminal tensors should be zero.
  TORCH_CHECK(
      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
      tv7->getMaxProducerPosition() == 1);
  TORCH_CHECK(
      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
      tv6->getMaxProducerPosition() == 1);

  // The position of every other tensor should be 1.
  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
    TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0)));
  }

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({129, 127}, options);

  auto t1 = aten_input.mul({0.5});
  auto t2 = t1.mul({-1.0});
  auto t3 = t1.add({3.0});
  auto t4 = t1.mul({2.0});
  auto t5 = t3.add(t2);
  auto t6 = t5.add(t4);
  auto t7 = t1.add(t4);

  std::vector<at::Tensor> aten_outputs = {t6, t7};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) {
  // Case 2
  // tv1 = tv0 * -1
  // tv2 = tv0 + 3
  // tv3 = tv0 * 2
  // tv4 = tv2 + tv1
  // tv5 = tv4 + tv3
  // tv6 = tv5 + tv3
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(-1.0));
  TensorView* tv2 = add(tv0, new Double(3.0));
  TensorView* tv3 = mul(tv0, new Double(2.0));
  TensorView* tv4 = add(tv2, tv1);
  TensorView* tv5 = add(tv4, tv3);
  TensorView* tv6 = add(tv5, tv3);

  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  // Let's set up to actually run
  tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);

  tv6->axis(0)->parallelize(ParallelType::BIDx);

  tv0->computeAt(tv6, 1);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({129, 127}, options);

  auto t1 = input.mul({-1.0});
  auto t2 = input.add({3.0});
  auto t3 = input.mul({2.0});
  auto t4 = t2.add(t1);
  auto t5 = t4.add(t3);
  auto t6 = t5.add(t3);

  std::vector<at::Tensor> aten_outputs = {t5, t6};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) {
  // Case 3
  // T2 = T1 * 0.979361
  // T3 = T2 * T0
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);

  TensorView* tv2 = mul(tv1, new Double(.979361));
  TensorView* tv3 = mul(tv2, tv0);

  fusion.addOutput(tv3);

  // Let's set up to actually run
  while (tv3->nDims() > 1)
    tv3->merge(0);
  tv3->split(0, 128);
  tv3->split(0, 4);

  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t1.mul({0.979361});
  auto aten_output = t2.mul(t0);
  std::vector<IValue> aten_inputs = {t0, t1};
  at::Tensor cg_output = at::empty_like(t0, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) {
  // Case 4
  // T4 = T2 - T3
  // T5 = T1 + T4
  // T6 = T5 - T0
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);

  TensorView* tv2 = makeSymbolicTensor(4);
  fusion.addInput(tv2);

  TensorView* tv3 = makeSymbolicTensor(4);
  fusion.addInput(tv3);

  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);

  fusion.addOutput(tv6);

  // Let's set up to actually run
  while (tv6->nDims() > 1)
    tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);

  tv0->computeAt(tv6, 1);
  tv1->computeAt(tv6, 1);
  tv2->computeAt(tv6, 1);
  tv3->computeAt(tv6, 1);

  tv6->axis(0)->parallelize(ParallelType::BIDx);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);
  at::Tensor t2 = at::rand_like(t0, options);
  at::Tensor t3 = at::rand_like(t0, options);

  auto t4 = t2.sub(t3);
  auto t5 = t1.add(t4);
  auto aten_output = t5.sub(t0);

  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) {
  // Case 5
  // tv2 = tv0 + 2.0
  // tv3 = tv1 * tv2
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv0, new Double(2.0));
  TensorView* tv3 = mul(tv1, tv2);
  fusion.addOutput(tv3);

  tv3->merge(0);
  tv3->split(-1, 8);
  tv3->split(-1, 4);

  tv2->computeAt(tv3, 1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t0.add(2.0);
  auto aten_output = t1.mul(t2);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv0, new Double(2.0));
  TensorView* tv3 = mul(tv1, tv2);
  fusion.addOutput(tv3);

  tv2->merge(0);
  tv2->split(-1, 8);
  tv2->split(-1, 4);
  tv3->merge(0);
  tv3->split(-1, 8);

  tv2->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t0.add(2.0);
  auto aten_output = t1.mul(t2);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, new Double(1.0));

  auto tv2 = makeSymbolicTensor(1);
  fusion.addInput(tv2);

  auto tv3 = add(tv2, new Double(3.0));

  auto tv4 = add(tv1, tv3);
  fusion.addOutput(tv4);

  auto tv5 = broadcast(tv1, {false, true});

  auto tv6 = makeSymbolicTensor(2);
  fusion.addInput(tv6);

  auto tv7 = mul(tv5, tv6);

  fusion.addOutput(tv7);

  tv7->split(1, 2);
  tv7->merge(0);
  tv7->split(0, 4);
  tv7->split(0, 128);

  tv7->axis(0)->parallelize(ParallelType::BIDx);
  tv7->axis(1)->parallelize(ParallelType::TIDx);

  tv0->computeAt(tv7, 1);
  auto tv5_domain = tv5->domain()->domain();

  // These computeAt transformations should not affect the TV5 domain
  tv0->computeAt(tv4, -1);
  tv2->computeAt(tv4, -1);

  auto tv5_domain_current = tv5->domain()->domain();
  TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain");

  FusionExecutor fe;
  fe.compileFusion(&fusion);

  const int numel_x = 100;
  const int numel_y = 200;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({numel_x}, options);
  auto t2 = at::randn({numel_x}, options);
  auto t6 = at::randn({numel_x, numel_y}, options);

  auto t1 = t0.add(1.0);
  auto t3 = t2.add(3.0);
  auto t4 = t1.add(t3);
  auto t5 = t1.unsqueeze(1);
  auto t7 = t5.mul(t6);

  std::vector<IValue> aten_inputs = {t0, t2, t6};
  std::vector<at::Tensor> aten_outputs = {t4, t7};

  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, new Double(1.0));

  auto tv2 = makeSymbolicTensor(1);
  fusion.addInput(tv2);

  auto tv3 = add(tv2, new Double(3.0));

  auto tv4 = add(tv1, tv3);
  fusion.addOutput(tv4);

  auto tv5 = broadcast(tv1, {false, true});

  auto tv6 = makeSymbolicTensor(2);
  fusion.addInput(tv6);

  auto tv7 = mul(tv5, tv6);

  fusion.addOutput(tv7);

  tv7->split(1, 2);
  tv7->merge(0);
  tv7->split(0, 128, false);
  tv7->split(0, 4, false);

  tv7->axis(0)->parallelize(ParallelType::BIDx);
  tv7->axis(1)->parallelize(ParallelType::TIDx);

  // Reverse computeAt structure from previous test
  tv0->computeAt(tv4, -1);
  tv2->computeAt(tv4, -1);
  tv0->computeAt(tv7, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);

  const int numel_x = 100;
  const int numel_y = 200;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({numel_x}, options);
  auto t2 = at::randn({numel_x}, options);
  auto t6 = at::randn({numel_x, numel_y}, options);

  auto t1 = t0.add(1.0);
  auto t3 = t2.add(3.0);
  auto t4 = t1.add(t3);
  auto t5 = t1.unsqueeze(1);
  auto t7 = t5.mul(t6);

  std::vector<IValue> aten_inputs = {t0, t2, t6};
  std::vector<at::Tensor> aten_outputs = {t4, t7};

  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) {
  // Case 1
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 + 3
  // tv4 = tv1 * 2
  // tv5 = tv3 + tv2
  // tv6 = tv5 + tv4
  // tv7 = tv1 + tv4
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = add(tv1, new Double(3.0));
  TensorView* tv4 = mul(tv1, new Double(2.0));
  TensorView* tv5 = add(tv3, tv2);
  TensorView* tv6 = add(tv5, tv4);
  TensorView* tv7 = add(tv1, tv4);

  fusion.addOutput(tv6);
  fusion.addOutput(tv7);

  // Let's set up to actually run
  tv0->merge(0);
  tv0->split(0, 128);
  tv0->split(0, 4);

  tv0->axis(0)->parallelize(ParallelType::BIDx);

  tv0->computeWith(tv7, 1);

  GpuLower gpulw(&fusion);

  // The compute-at ("this") position of the terminal tensors should be zero.
  TORCH_CHECK(
      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
      tv7->getMaxProducerPosition() == 1);
  TORCH_CHECK(
      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
      tv6->getMaxProducerPosition() == 1);

  // The position of every other tensor should be 1.
  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
    TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0)));
  }

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({129, 127}, options);

  auto t1 = aten_input.mul({0.5});
  auto t2 = t1.mul({-1.0});
  auto t3 = t1.add({3.0});
  auto t4 = t1.mul({2.0});
  auto t5 = t3.add(t2);
  auto t6 = t5.add(t4);
  auto t7 = t1.add(t4);

  std::vector<at::Tensor> aten_outputs = {t6, t7};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) {
  // Case 2
  // tv1 = tv0 * -1
  // tv2 = tv0 + 3
  // tv3 = tv0 * 2
  // tv4 = tv2 + tv1
  // tv5 = tv4 + tv3
  // tv6 = tv5 + tv3
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(-1.0));
  TensorView* tv2 = add(tv0, new Double(3.0));
  TensorView* tv3 = mul(tv0, new Double(2.0));
  TensorView* tv4 = add(tv2, tv1);
  TensorView* tv5 = add(tv4, tv3);
  TensorView* tv6 = add(tv5, tv3);

  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  // Let's set up to actually run
  tv0->merge(0);
  tv0->split(0, 128);
  tv0->split(0, 4);

  tv0->axis(0)->parallelize(ParallelType::BIDx);

  tv0->computeWith(tv6, 1);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({129, 127}, options);

  auto t1 = input.mul({-1.0});
  auto t2 = input.add({3.0});
  auto t3 = input.mul({2.0});
  auto t4 = t2.add(t1);
  auto t5 = t4.add(t3);
  auto t6 = t5.add(t3);

  std::vector<at::Tensor> aten_outputs = {t5, t6};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) {
  // Case 3
  // T2 = T1 * 0.979361
  // T3 = T2 * T0
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);

  TensorView* tv2 = mul(tv1, new Double(.979361));
  TensorView* tv3 = mul(tv2, tv0);
  fusion.addOutput(tv3);

  // Let's set up to actually run
  while (tv0->nDims() > 1)
    tv0->merge(0);
  tv0->split(0, 128);
  tv0->split(0, 4);

  while (tv1->nDims() > 1)
    tv1->merge(0);
  tv1->split(0, 128);
  tv1->split(0, 4);

  tv0->computeWith(tv3, 1);
  tv1->computeWith(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t1.mul({0.979361});
  auto aten_output = t2.mul(t0);

  std::vector<IValue> aten_inputs = {t0, t1};
  at::Tensor cg_output = at::empty_like(t0, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) {
  // Case 4
  // T4 = T2 - T3
  // T5 = T1 + T4
  // T6 = T5 - T0
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);

  TensorView* tv2 = makeSymbolicTensor(4);
  fusion.addInput(tv2);

  TensorView* tv3 = makeSymbolicTensor(4);
  fusion.addInput(tv3);

  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);

  fusion.addOutput(tv6);

  std::vector<TensorView*> tvs = {tv0, tv1, tv2};
  for (auto tv : tvs) {
    // Let's set up to actually run
    while (tv->nDims() > 1) {
      tv->merge(0);
    }
    tv->split(0, 128);
    tv->split(0, 4);
    tv->computeWith(tv6, 1);
  }

  tv6->axis(0)->parallelize(ParallelType::BIDx);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);
  at::Tensor t2 = at::rand_like(t0, options);
  at::Tensor t3 = at::rand_like(t0, options);

  auto t4 = t2.sub(t3);
  auto t5 = t1.add(t4);
  auto aten_output = t5.sub(t0);

  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) {
  // Case 5
  // tv2 = tv0 + 2.0
  // tv3 = tv1 * tv2
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv0, new Double(2.0));
  TensorView* tv3 = mul(tv1, tv2);
  fusion.addOutput(tv3);

  tv2->merge(0);
  tv2->split(-1, 8);
  tv2->split(-1, 4);

  tv2->computeWith(tv3, 1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t0.add(2.0);
  auto aten_output = t1.mul(t2);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
      {aten_output},
      __LINE__,
      __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeWith6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  TensorView* tv2 = add(tv0, new Double(2.0));
  TensorView* tv3 = mul(tv1, tv2);
  fusion.addOutput(tv3);

  tv2->merge(0);
  tv2->split(-1, 8);
  tv2->split(-1, 4);
  tv3->merge(0);
  tv3->split(-1, 8);
  tv2->computeWith(tv3, 1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t0.add(2.0);
  auto aten_output = t1.mul(t2);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) {
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 * -2
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = mul(tv1, new Double(-2.0));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  // This computeAt will affect tv2 as well, even though tv2 is not in
  // the data-flow path between tv1 and tv3. The reason is that tv1 is
  // now computed at tv3, so tv2 must also be computed at the same
  // location. Overall, what will happen is basically we merge
  // expressions of all tensors and compute them in a single loop
  // nest.
  TensorView* computeAtTarget = tv3;
  computeAtTarget->split(0, 128);
  tv1->computeAt(computeAtTarget, 1);

  TensorView* affected_tensors[] = {tv1, tv2, tv3};
  for (auto tv : affected_tensors) {
    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
  }

  GpuLower gpulw(&fusion);

  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
  TORCH_CHECK(
      tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1);
  TORCH_CHECK(
      tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1);

  // Note that tv2 is also computed at tv3.
  for (auto tv : {tv1, tv2}) {
    TORCH_CHECK(
        gpulw.caLoopMap().areMapped(tv->axis(0), computeAtTarget->axis(0)));
  }

  TORCH_CHECK(tv3->getComputeAtPosition() == 0);

  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
  for (auto tv : affected_tensors) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({1000}, options);

  auto t1 = aten_input * 0.5;
  auto t2 = t1 * -1.0;
  auto t3 = t1 * -2.0;

  std::vector<at::Tensor> aten_outputs = {t2, t3};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

// Similar to ComputeAtMultiConsumers, but with a common consumer.
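// Before the full test below, a minimal IR-only sketch (an illustrative
// addition, not validated against ATen) of the common-consumer rule: with
// two consumers of tv1 joined by a common consumer, computing tv1 at one
// branch pulls the other branch along, and the propagation stops at the
// common consumer, which stays at position 0.
TEST(NVFuserTest, FusionComputeAtCommonConsumerSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0)); // first consumer of tv1
  TensorView* tv3 = mul(tv1, new Double(-2.0)); // second consumer of tv1
  TensorView* tv4 = add(tv2, tv3); // common consumer of tv2 and tv3
  fusion.addOutput(tv4);
  tv3->split(0, 128);
  tv1->computeAt(tv3, 1);
  // tv1 is inlined into the split loop; tv4, the common consumer, stays at
  // the top level (cf. the position checks in the tests around this one).
  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
  TORCH_CHECK(tv4->getComputeAtPosition() == 0);
}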
TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) {
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 * -2
  // tv4 = tv2 + tv3
  // tv5 = tv4 * 5
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = mul(tv1, new Double(-2.0));
  TensorView* tv4 = add(tv2, tv3);
  TensorView* tv5 = mul(tv4, new Double(5.0));
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  // Computing tv1 at tv3. This will affect tv2 as discussed in
  // ComputeAtMultiConsumers. Additionally, in this case, notice that tv4 is
  // the common consumer of tv2 and tv3, so they are computed at
  // tv4. The indirect propagation of the computeAt should stop at the
  // common consumer, and no further change should occur. More
  // specifically, the computeAt position of tv4 and tv5 should be zero.
  TensorView* computeAtTarget = tv3;
  computeAtTarget->split(0, 128);
  tv1->computeAt(computeAtTarget, 1);

  TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4};
  for (auto tv : affected_tensors) {
    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
  }

  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
  TORCH_CHECK(tv4->getComputeAtPosition() == 0);
  TORCH_CHECK(tv5->getComputeAtPosition() == 0);

  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);

  for (auto tv : affected_tensors) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  // Transform tv5 to make it look like the rest
  tv5->split(0, 128);
  tv5->axis(1)->parallelize(ParallelType::TIDx);
  tv5->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({1000}, options);

  auto t1 = aten_input * 0.5;
  auto t2 = t1 * -1.0;
  auto t3 = t1 * -2.0;
  auto t4 = t2 + t3;
  auto t5 = t4 * 5.0;

  std::vector<at::Tensor> aten_outputs = {t3, t4, t5};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) {
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv2 * -1
  // tv4 = tv1 + 4
  // tv5 = tv3 + tv4
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = mul(tv2, new Double(-1.0));
  TensorView* tv4 = add(tv1, new Double(4.0));
  TensorView* tv5 = add(tv3, tv4);

  fusion.addOutput(tv5);

  TensorView* computeAtTarget = tv3;

  computeAtTarget->merge(0);
  computeAtTarget->split(0, 128);
  computeAtTarget->split(0, 4);

  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);

  // This computeAt will affect all tensors including tv3, tv4 and
  // tv5, even though it appears to impact only tv1 and tv2. The
  // reason is that tv1 is now computed at tv3, so tv4 must also be
  // computed at the same location. Similarly, the consumer of tv4,
  // tv5, must also be computed at the same location. Overall, what
  // will happen is basically we merge expressions of all tensors and
  // compute them in a single loop nest.
  // Internally, this will be realized by making all tensors, except for
  // those in the path between tv1 and tv3, computed at tv5, which we call
  // the common consumer.
  tv1->computeAt(computeAtTarget, 1);

  // All tensors should have the same dimensionality as the target
  for (Val* val : fusion.vals()) {
    if (fusion.hasInput(val) ||
        val->getValType().value() != ValType::TensorView) {
      continue;
    }
    TensorView* tv = val->as<TensorView>();
    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
    if (tv == tv5) {
      TORCH_CHECK(tv->getComputeAtPosition() == 0);
    } else {
      TORCH_CHECK(tv->getComputeAtPosition() == 1);
    }
  }

  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
    if (!fusion.hasInput(tv)) {
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({129, 127}, options);

  auto t1 = aten_input.mul({0.5});
  auto t2 = t1.mul({-1.0});
  auto t3 = t2.mul({-1.0});
  auto t4 = t1.add({4.0});
  auto aten_output = t3 + t4;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

// Similar to the above common consumer test but adds an additional
// tensor that has no common consumer with the other tensors.
TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) {
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv2 * -1
  // tv4 = tv1 + 4
  // tv5 = tv3 + tv4
  // tv6 = tv1 + 6
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = mul(tv2, new Double(-1.0));
  TensorView* tv4 = add(tv1, new Double(4.0));
  TensorView* tv5 = add(tv3, tv4);
  TensorView* tv6 = add(tv1, new Double(6.0));

  fusion.addOutput(tv5);
  fusion.addOutput(tv6);

  TensorView* computeAtTarget = tv3;

  computeAtTarget->merge(0);
  computeAtTarget->split(0, 128);
  computeAtTarget->split(0, 4);

  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);

  // This will have the same impact on the tensors except for tv5 and
  // tv6. tv6 does not have any common consumer with the computeAt
  // target, but since it uses tv1, it must be also computed at the
  // same location as the other impacted tensors. We can either make
  // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5
  // should be computed at tv6 just because the current implementation
  // orders the computeAt relationship based on the order in which
  // tensors are specified as outputs.
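  // For reference, the expected outcome of the computeAt below (mirroring
  // the checks that follow): tv1 through tv4 end up at position 1 inside
  // the merged-and-split BIDx loop, while tv5 and tv6 both stay at
  // position 0 with a max producer position of 1.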
  tv1->computeAt(computeAtTarget, 1);

  // All tensors should have the same dimensionality as the target
  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
    if (fusion.hasInput(tv)) {
      continue;
    }
    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
    if (tv == tv5 || tv == tv6) {
      TORCH_CHECK(tv->getComputeAtPosition() == 0);
      TORCH_CHECK(tv->getMaxProducerPosition() == 1);
    } else {
      TORCH_CHECK(tv->getComputeAtPosition() == 1);
    }
  }

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = val->as<TensorView>();
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({129, 127}, options);

  auto t1 = aten_input.mul({0.5});
  auto t2 = t1.mul({-1.0});
  auto t3 = t2.mul({-1.0});
  auto t4 = t1.add({4.0});
  auto t5 = t3 + t4;
  auto t6 = t1.add({6.0});

  std::vector<at::Tensor> aten_outputs = {t5, t6};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);

  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

// Similar to ComputeAtCommonConsumer1 but with an additional tensor
// that does not have data dependency with the consumer.
TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) {
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 * -2
  // tv4 = tv2 + tv3
  // tv5 = tv4 * 5
  // tv6 = tv1 * 6
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = mul(tv1, new Double(-2.0));
  TensorView* tv4 = add(tv2, tv3);
  TensorView* tv5 = mul(tv4, new Double(5.0));
  // Notice that tv6 is not a consumer of tv4.
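  // tv6 shares only tv1 as a producer, so the computeAt below still
  // transforms it like the other tensors, but, as the checks later verify,
  // it keeps a compute-at position of 0 since no common consumer ties it
  // into a deeper loop.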
TensorView* tv6 = mul(tv1, new Double(6.0)); fusion.addOutput(tv3); fusion.addOutput(tv4); fusion.addOutput(tv5); fusion.addOutput(tv6); TensorView* computeAtTarget = tv3; computeAtTarget->split(0, 128); tv1->computeAt(computeAtTarget, 1); TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6}; for (auto tv : affected_tensors) { TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); if (tv == tv6 || tv == tv5) { TORCH_CHECK(tv->getComputeAtPosition() == 0); } else { TORCH_CHECK(tv->getComputeAtPosition() == 1); } } computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); for (auto tv : affected_tensors) { tv->axis(-1)->parallelize(ParallelType::TIDx); } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({1000}, options); auto t1 = aten_input * 0.5; auto t2 = t1 * -1.0; auto t3 = t1 * -2.0; auto t4 = t2 + t3; auto t5 = t4 * 5.0; auto t6 = t1 * 6.0; std::vector aten_outputs = {t3, t4, t5, t6}; std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options), at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } namespace { void checkConcretized( TensorView* v0, int a0, TensorView* v1, int a1, bool should_concretize) { if (should_concretize) { TORCH_CHECK( IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); } else { TORCH_CHECK( !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); } } } // namespace TEST(NVFuserTest, FusionBCastConcretizeBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // tv0: [I I] TensorView* tv0 = makeSymbolicTensor(2); // tv1: [I I I] TensorView* tv1 = makeSymbolicTensor(3); fusion.addInput(tv0); fusion.addInput(tv1); // tv2*: [B I I] auto tv2_0 = broadcast(tv0, {true, false, false}); auto tv2_1 = broadcast(tv0, {true, false, false}); auto tv2 = add(tv2_0, tv2_1); // tv3: [I I I] auto tv3 = add(tv2, tv1); fusion.addOutput(tv3); checkConcretized(tv2, 0, tv1, 0, true); checkConcretized(tv2_0, 0, tv1, 0, true); checkConcretized(tv2_1, 0, tv1, 0, true); checkConcretized(tv2_0, 1, tv1, 0, false); checkConcretized(tv2_0, 0, tv1, 1, false); } TEST(NVFuserTest, FusionBCastConcretizeRfactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // both tv0 and tv1 = [I, I] TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(2); //[B,I,I] auto tv2 = broadcast(tv1, {true, false, false}); //[B,I,R] auto tv3 = sum(tv2, {2}); auto tv5 = add(tv3, tv1); fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); // scheduling: //[B,I,R0,R1=128], root = [B,I,R] tv3->split(2, 128); // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] auto tv4 = tv3->rFactor({3}); checkConcretized(tv2, 0, tv5, 0, true); checkConcretized(tv4, 0, tv5, 0, true); checkConcretized(tv3, 0, tv5, 0, true); } namespace { void checkIdMapped( ComputeAtRootDomainMap& root_map, TensorView* v0, IterDomain* id0, TensorView* v1, IterDomain* id1, bool should_map) { if (should_map) { TORCH_CHECK( root_map.canMap(v0->domain(), id0, v1->domain(), id1), "Should be mappable: ", id0, " of ", v0, " and ", id1, " of ", v1); } else { TORCH_CHECK( !root_map.canMap(v0->domain(), id0, v1->domain(), id1), "Should not be mappable: ", id0, " of ", v0, " and ", id1, " of ", v1); } } void checkIdMapped( TensorView* v0, const std::vector& root0, const std::vector should_map0, TensorView* v1, const 
std::vector& root1, const std::vector should_map1) { ComputeAtRootDomainMap map; map.build(); TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size()); TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size()); size_t idx0 = 0; for (size_t i = 0; i < root0.size(); ++i) { size_t idx1 = 0; for (size_t j = 0; j < root1.size(); ++j) { if (should_map0[i] && should_map1[j] && idx0 == idx1) { checkIdMapped(map, v0, root0[i], v1, root1[j], true); } else { checkIdMapped(map, v0, root0[i], v1, root1[j], false); } if (should_map1[j]) ++idx1; } if (should_map0[i]) ++idx0; } } void checkIdMapped( TensorView* v0, const std::vector& root0, TensorView* v1, const std::vector& root1) { checkIdMapped( v0, root0, std::vector(root0.size(), true), v1, root1, std::vector(root1.size(), true)); } } // namespace TEST(NVFuserTest, FusionRootMappingBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv3 = broadcast(tv0, {true, false, false}); auto tv4 = broadcast(tv1, {false, true, false}); auto tv5 = add(tv3, tv4); fusion.addOutput(tv5); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv4, tv4->getRootDomain(), {false, true, true}); checkIdMapped( tv1, tv1->getRootDomain(), {true, true}, tv4, tv4->getRootDomain(), {true, false, true}); checkIdMapped( tv0, tv0->getRootDomain(), {false, true}, tv1, tv1->getRootDomain(), {false, true}); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv5, tv5->getRootDomain(), {false, true, true}); checkIdMapped( tv1, tv1->getRootDomain(), {true, true}, tv5, tv5->getRootDomain(), {true, false, true}); checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain()); checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain()); checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain()); } TEST(NVFuserTest, FusionRootMappingRfactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // [I,I] TensorView* tv0 = makeSymbolicTensor(2); // [I,I,I] TensorView* tv1 = makeSymbolicTensor(3); //[I,I,R] auto tv2 = sum(tv1, {2}); auto tv3 = add(tv2, tv0); fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv3); // scheduling: //[B,I,R0,R1=128], root = [B,I,R] tv2->split(2, 128); // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] auto tv4 = tv2->rFactor({3}); checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain()); checkIdMapped( tv4, tv4->getRFactorDomain(), {true, true, true, false}, tv2, tv2->getRootDomain(), {true, true, true}); checkIdMapped( tv1, tv1->getRootDomain(), {true, true, false}, tv2, tv2->getRootDomain(), {true, true, false}); checkIdMapped( tv1, tv1->getRootDomain(), {true, true, false}, tv3, tv3->getRootDomain(), {true, true}); checkIdMapped( tv2, tv2->getRootDomain(), {true, true, false}, tv3, tv3->getRootDomain(), {true, true}); checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv1, tv1->getRootDomain(), {true, true, false}); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv2, tv2->getRootDomain(), {true, true, false}); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv4, tv4->getRFactorDomain(), {true, true, false, false}); checkIdMapped( tv0, tv0->getRootDomain(), {true, true}, tv4, tv4->getRootDomain(), {true, true, false}); } TEST(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); auto tv1 = sum(tv0, {1}); auto tv2 = 
broadcast(tv1, {false, true});
  fusion.addOutput(tv2);

  // The second dimension cannot be mapped as it would require recomputation.
  checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain());
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
}

TEST(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  auto tv1 = sum(tv0, {1});
  auto tv2 = broadcast(tv1, {false, true});
  auto tv3 = add(tv0, tv2);
  fusion.addOutput(tv3);
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv1, tv1->getRootDomain(), {true, false});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
}

TEST(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  auto tv1 = sum(tv0, {1});
  auto tv2 = broadcast(tv1, {false, true});
  fusion.addOutput(tv2);
  tv1->split(-1, 4);
  auto tv3 = tv1->rFactor({-2});
  checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain());
  checkIdMapped(tv3, tv3->getMaybeRFactorDomain(), {true, false, true}, tv1, tv1->getRootDomain(), {true, true});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
}

TEST(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  auto tv1 = sum(tv0, {1});
  auto tv2 = broadcast(tv1, {false, true});
  auto tv3 = add(tv0, tv2);
  fusion.addOutput(tv3);
  tv1->split(-1, 4);
  auto tv4 = tv1->rFactor({-2});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv4, tv4->getRootDomain(), {true, false});
  checkIdMapped(tv4, tv4->getMaybeRFactorDomain(), {true, false, true}, tv1, tv1->getRootDomain(), {true, true});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
}

// Reproducer of issue #749
TEST(NVFuserTest, FusionRootMappingReductionDependency5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = add(tv0, tv3);
  auto tv5 = add(tv4, tv1);
  fusion.addOutput(tv5);
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv1, tv1->getRootDomain(), {true, false});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv2, tv2->getRootDomain(), {true, false}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv3, tv3->getRootDomain(), {true, true}, tv4, tv4->getRootDomain(), {true, true});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv4, tv4->getRootDomain(), {true, false});
  checkIdMapped(tv4, tv4->getRootDomain(), {true, true}, tv5, tv5->getRootDomain(), {true, true});
}

// Similar to RootMappingReductionDependency5 but with rFactor
TEST(NVFuserTest, FusionRootMappingReductionDependency6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 =
      makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {false, true});
  auto tv4 = add(tv0, tv3);
  auto tv5 = add(tv4, tv1);
  fusion.addOutput(tv5);
  tv2->split(1, 4);
  auto tv6 = tv2->rFactor({-1});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv1, tv1->getRootDomain(), {true, false});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv6, tv6->getRootDomain(), {true, false});
  checkIdMapped(tv6, tv6->getMaybeRFactorDomain(), {true, true, false}, tv2, tv2->getRootDomain(), {true, true});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv2, tv2->getRootDomain(), {true, false}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv3, tv3->getRootDomain(), {true, true}, tv4, tv4->getRootDomain(), {true, true});
  checkIdMapped(tv0, tv0->getRootDomain(), {true, false}, tv4, tv4->getRootDomain(), {true, false});
  checkIdMapped(tv4, tv4->getRootDomain(), {true, true}, tv5, tv5->getRootDomain(), {true, true});
}

TEST(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = broadcast(tv0, {true, false});
  auto tv3 = add(tv1, tv2);
  fusion.addOutput(tv3);

  // tv0 cannot be mapped with the consumers as it would mean its only
  // domain would be mapped to both the first and second domains of
  // the two consumers, thus computing tv0 at both corresponding loops.
  checkIdMapped(tv0, tv0->getRootDomain(), {false}, tv1, tv1->getRootDomain(), {false, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {false}, tv2, tv2->getRootDomain(), {false, false});
  checkIdMapped(tv1, tv1->getRootDomain(), tv3, tv3->getRootDomain());
  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
  checkIdMapped(tv0, tv0->getRootDomain(), {false}, tv3, tv3->getRootDomain(), {false, false});
}

TEST(NVFuserTest, FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = broadcast(tv0, {true, false});
  fusion.addOutput(tv1);
  fusion.addOutput(tv2);

  // If there is no common consumer, there is no recomputation constraint.
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv1, tv1->getRootDomain(), {true, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv2, tv2->getRootDomain(), {false, true});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {false, true});
}

TEST(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv2);
  auto tv3 = broadcast(tv0, {false, true});
  auto tv4 = add(tv1, tv3);
  fusion.addOutput(tv4);
  auto tv5 = add(tv2, tv3);
  fusion.addOutput(tv5);

  // A broadcast domain can be shared by consumers of different sizes. In
  // this test, the broadcast domain of tv3 has two consumers, tv4 and tv5,
  // which may have different sizes. Since the two consumers may not have
  // the same size, it is not possible to map those domains.
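  // Consequently, only the first domains are expected to map in each pair
  // checked below; every second domain, whether the broadcast itself or a
  // domain reached through it, must be unmappable.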
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv1, tv1->getRootDomain(), {true, false});
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv2, tv2->getRootDomain(), {true, false});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, false}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv2, tv2->getRootDomain(), {true, false}, tv3, tv3->getRootDomain(), {true, false});
  checkIdMapped(tv3, tv3->getRootDomain(), {true, false}, tv4, tv4->getRootDomain(), {true, false});
  checkIdMapped(tv3, tv3->getRootDomain(), {true, false}, tv5, tv5->getRootDomain(), {true, false});
  checkIdMapped(tv4, tv4->getRootDomain(), {true, false}, tv5, tv5->getRootDomain(), {true, false});
}

TEST(NVFuserTest, FusionRootMappingBroadcast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1); // tv0[I0]
  fusion.addInput(tv0);
  auto tv1 = broadcast(tv0, {true, false}); // tv1[B1, I0]
  auto tv2 = broadcast(tv1, {true, false, false}); // tv2[B2, B1, I0]
  fusion.addOutput(tv2);

  // In this case, tv1 and tv2 have one and two broadcast domains,
  // respectively. It is the second broadcast domain of tv2 that is mapped
  // to the broadcast of tv1.
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv1, tv1->getRootDomain(), {false, true});
  checkIdMapped(tv1, tv1->getRootDomain(), {true, true}, tv2, tv2->getRootDomain(), {false, true, true}); // Not {true, false, true}
  checkIdMapped(tv0, tv0->getRootDomain(), {true}, tv2, tv2->getRootDomain(), {false, false, true});
}

// Reproducer of issue #723
TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  auto tv2 = broadcast(tv0, {true, false});
  auto tv3 = sum(tv2, {0});
  auto tv4 = add(tv2, tv1);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);

  ComputeAtRootDomainMap map;
  map.build();
  checkIdMapped(map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true);
  checkIdMapped(map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true);

  tv2->computeAt(tv4, -1);

  const int x = 11;
  const int y = 12;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({x}, options);
  at::Tensor t1 = at::randn({y, x}, options);
  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion(aten_inputs);

  auto t3 = t0;
  auto t4 = t0.unsqueeze(0).expand({y, x}) + t1;

  testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = broadcast(tv1, {true, false});
  auto tv3 = broadcast(tv1, {false, true});
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);

  // computeAt should fail as there is no valid root mapping.
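  // tv1's single root domain would have to map to both the first axis of
  // tv4 (through tv2) and the second axis (through tv3), as in the
  // MultipleBroadcast case above, so there is no valid mapping.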
  ASSERT_ANY_THROW(tv1->computeAt(tv4, 1));
}

TEST(NVFuserTest, FusionScalarInputs_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);

  Double* d0 = new Double();
  fusion.addInput(d0);
  Double* d1 = new Double();
  fusion.addInput(d1);
  Double* d2 = new Double();
  fusion.addInput(d2);
  Double* d3 = new Double();
  fusion.addInput(d3);
  Val* d4 = mul(d0, d1);
  Val* d5 = sub(d2, d3);

  TensorView* tv2 = sub(tv1, d4);
  TensorView* tv3 = add(tv0, d5);
  TensorView* tv4 = mul(tv3, tv2);
  fusion.addOutput(tv4);

  // Let's set up to actually run
  while (tv4->nDims() > 1)
    tv4->merge(0);
  tv4->split(0, 128);
  tv4->split(0, 4);

  tv0->computeAt(tv4, 1);
  tv1->computeAt(tv4, 1);

  tv4->axis(0)->parallelize(ParallelType::BIDx);

  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) &&
        val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  // d4 = d0 * d1
  // d5 = d2 - d3
  // t2 = t1 - d4
  // t3 = t0 + d5
  // t4 = t3 * t2
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  float fl0 = 0.1;
  float fl1 = -0.2;
  float fl2 = 0.3;
  float fl3 = -0.4;
  float fl4 = fl0 * fl1;
  float fl5 = fl2 - fl3;

  at::Tensor t0 = at::randn({129, 127}, options);
  at::Tensor t1 = at::rand_like(t0, options);

  auto t2 = t1.sub(fl4);
  auto t3 = t0.add(fl5);
  auto aten_output = t3.mul(t2);

  at::Tensor cg_output = at::empty_like(t0, options);

  at::Scalar test(fl0);

  std::vector<IValue> aten_inputs = {
      t0,
      t1,
      at::Scalar(fl0),
      at::Scalar(fl1),
      at::Scalar(fl2),
      at::Scalar(fl3)};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionLoopUnroll_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(3);
  TensorView* tv1 = makeSymbolicTensor(3);

  // Register your inputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  // Do math with it; it returns a `Val*` but can be static_cast back to a
  // TensorView
  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  // Register your outputs
  fusion.addOutput(tv3);

  int block_size = 16;

  tv3->merge(0, 1);
  tv3->merge(0, 1);
  tv3->split(0, block_size);
  tv3->split(0, 4);

  // For all inputs, computeAt the output inline; temporaries should be
  // squeezed between them
  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  // Parallelize
  tv2->axis(1)->parallelize(ParallelType::Unroll);
  tv3->axis(1)->parallelize(ParallelType::Unroll);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(0)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input0 = at::randn({129, 13, 3}, options);
  at::Tensor input1 = at::randn({129, 13, 3}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({input0, input1});

  TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0))));
}

/*
 * Helper function for single op testing that generates a codegen operand
 */
Val* gen_jit_operand(std::pair<ValType, DataType> desc) {
  if (desc.first == ValType::TensorView) {
    return makeSymbolicTensor(2, desc.second);
  } else if (desc.first == ValType::Scalar) {
    if (desc.second == DataType::Float) {
      return new Double();
    } else if (desc.second == DataType::Double) {
      return new Double();
    } else
if (desc.second == DataType::Int) { return new Int(); } else { TORCH_CHECK(false, "Not currently supported type: ", desc.first); } } else { TORCH_CHECK(false, "Not currently supported type: ", desc.first); } return nullptr; } /* * Helper function for single op testing that generates an ATen operand */ IValue gen_aten_operand( std::pair desc, int blocks, int threads, bool rand) { if (desc.first == ValType::TensorView) { if (desc.second == DataType::Double || desc.second == DataType::Float || desc.second == DataType::Half) { auto options = at::TensorOptions() .dtype(data_type_to_aten(desc.second)) .device(at::kCUDA, 0); if (rand) { return IValue(at::rand({blocks, threads}, options)); } else { return IValue(at::empty({blocks, threads}, options)); } } else if (desc.second == DataType::Int || desc.second == DataType::Int32) { auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong; if (rand) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype)); } else { auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); return IValue(at::empty({blocks, threads}, options)); } } else if (desc.second == DataType::Bool) { if (rand) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); return IValue( at::rand({blocks, threads}, options).round().to(at::kBool)); } else { auto options = at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); return IValue(at::empty({blocks, threads}, options)); } } else { TORCH_CHECK(false, "Not currently supported type: ", desc.second) } } else if (desc.first == ValType::Scalar) { // IValue scalars can only be double int64 or bool if (desc.second == DataType::Double || desc.second == DataType::Float || desc.second == DataType::Half) { return IValue(at::Scalar(1.f)); } else if (desc.second == DataType::Int) { return IValue(at::Scalar(1)); } else { TORCH_CHECK(false, "Not currently supported type: ", desc.first); } } else { TORCH_CHECK(false, "Not currently supported type: ", desc.first); } return nullptr; } /* * Templatized Helper Function To generate single Op comparison between the * JIT codegen for Cuda and the ATen Library. */ using OutputPair = std::pair; template < typename AtenFunc, typename JitFunc, typename InputTuple, size_t... 
NumInputs> void test_op( int blocks, int threads, std::string op_str, AtenFunc af, JitFunc jf, OutputPair op, InputTuple it, std::index_sequence) { Fusion fusion; FusionGuard fg(&fusion); // Generate Input JIT function Inputs and add them as Inputs to the Fusion // Graph std::array jit_inputs = { gen_jit_operand(std::get(it))...}; std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) { fusion.addInput(v); }); TensorView* out = static_cast(jf(std::get(jit_inputs)...)); fusion.addOutput(out); std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) { if (v->getValType() == ValType::TensorView) static_cast(v)->computeAt(out, -1); }); out->axis(0)->parallelize(ParallelType::BIDx); out->axis(-1)->parallelize(ParallelType::TIDx); std::array aten_inputs = {gen_aten_operand( std::get(it), blocks, threads, /*rand*/ true)...}; const at::ArrayRef aten_inputs_ivalues(aten_inputs); at::Tensor cg_output = gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {cg_output}; cudaDeviceSynchronize(); if (fusion.isStochastic()) at::manual_seed(0); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); if (fusion.isStochastic()) at::manual_seed(0); at::Tensor aten_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; std::string op_msg = "Operation " + op_str; testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__, op_msg); } /* * Templatized Helper Function that uses variadic templates to * process a variable length Input Tuple of different Operand Type. */ template void test_op( int blocks, int threads, std::string op_str, AtenFunc af, JitFunc jf, OutputPair op, InputTuple it) { static constexpr auto size = std::tuple_size::value; test_op( blocks, threads, op_str, af, jf, op, it, std::make_index_sequence{}); } TEST(NVFuserTest, FusionUnaryOps_CUDA) { using OpTuple = std::tuple; // [Note: explicit tuple type for uniform initialization list] // Tuple type must be explicitly specified for each uniform initialization // list within the vector to make this code compatible with some old env // which we still need to support. eg. gcc 5.4 + cuda 9.2. 
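// As a hypothetical illustration of the failure mode noted above, an
// implicit nested-brace form such as
//   std::vector<OpTuple> ops{{at::abs, UnaryOpType::Abs, "abs"}};
// may fail to deduce the element type on those older toolchains, whereas
// the explicit OpTuple{...} spelling used below compiles everywhere.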
std::vector ops{ OpTuple{at::abs, UnaryOpType::Abs, "abs"}, OpTuple{at::acos, UnaryOpType::Acos, "acos"}, OpTuple{at::asin, UnaryOpType::Asin, "asin"}, OpTuple{at::atan, UnaryOpType::Atan, "atan"}, // There does not appear to be an appropriate ATen function for atanh // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh" }, OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"}, OpTuple{at::cos, UnaryOpType::Cos, "cos"}, OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"}, OpTuple{at::erf, UnaryOpType::Erf, "erf"}, OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"}, OpTuple{at::exp, UnaryOpType::Exp, "exp"}, OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"}, OpTuple{at::floor, UnaryOpType::Floor, "floor"}, OpTuple{at::frac, UnaryOpType::Frac, "frac"}, // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"}, OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"}, OpTuple{at::log, UnaryOpType::Log, "log"}, OpTuple{at::log10, UnaryOpType::Log10, "log10"}, OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"}, OpTuple{at::log2, UnaryOpType::Log2, "log2"}, OpTuple{at::neg, UnaryOpType::Neg, "neg"}, OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"}, OpTuple{at::relu, UnaryOpType::Relu, "relu"}, OpTuple{at::round, UnaryOpType::Round, "round"}, OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"}, OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"}, OpTuple{at::sin, UnaryOpType::Sin, "sin"}, OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"}, OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"}, OpTuple{at::tan, UnaryOpType::Tan, "tan"}, OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"}, OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"}}; std::vector dtypes = {DataType::Float, DataType::Double}; for (auto dtype : dtypes) { std::for_each(ops.begin(), ops.end(), [&](OpTuple& op) { test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ std::get<2>(op), /*Aten Func */ [&op](std::array& vals) { return std::get<0>(op)(vals[0].toTensor()); }, /*JIT Func */ [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple(std::make_pair(ValType::TensorView, dtype))); }); test_op( /*blocks*/ 128, /*threads*/ 64, /*name*/ "rand_like", /*Aten Func */ [](std::array& vals) { return at::rand_like(vals[0].toTensor()); }, /*JIT Func */ [](Val* in1) -> Val* { return unaryOp(UnaryOpType::RandLike, in1); }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple(std::make_pair(ValType::TensorView, dtype))); } dtypes = {DataType::Int, DataType::Int32, DataType::Bool}; for (auto dtype : dtypes) { test_op( /*blocks*/ 128, /*threads*/ 64, /*name*/ "bitwise_not", /*Aten Func */ [](std::array& vals) { return at::bitwise_not(vals[0].toTensor()); }, /*JIT Func */ [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple(std::make_pair(ValType::TensorView, dtype))); } } TEST(NVFuserTest, FusionBinaryOps_CUDA) { using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); using OpTuple = std::tuple; // see [Note: explicit tuple type for uniform initialization list] std::vector logic_ops{ OpTuple{at::eq, BinaryOpType::Eq, "eq"}, OpTuple{at::ge, BinaryOpType::GE, "ge"}, OpTuple{at::gt, BinaryOpType::GT, "gt"}, OpTuple{at::le, BinaryOpType::LE, "le"}, OpTuple{at::lt, BinaryOpType::LT, "lt"}, OpTuple{at::ne, BinaryOpType::NE, "ne"}}; std::vector dtypes = {DataType::Double, DataType::Float}; for (auto dtype : dtypes) { std::for_each(logic_ops.begin(), 
logic_ops.end(), [&](OpTuple& op) { test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ std::get<2>(op), /*Aten Func */ [&op](std::array& vals) { return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); }, /*JIT Func */ [&op](Val* in1, Val* in2) -> Val* { return binaryOp(std::get<1>(op), in1, in2); }, /*Output */ std::make_pair(ValType::TensorView, DataType::Bool), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype))); }); // see [Note: explicit tuple type for uniform initialization list] std::vector math_ops{ OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"}, OpTuple{at::div, BinaryOpType::Div, "div"}, OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"}, OpTuple{at::max, BinaryOpType::Max, "max"}, OpTuple{at::min, BinaryOpType::Min, "min"}, OpTuple{at::mul, BinaryOpType::Mul, "mul"}, OpTuple{at::pow, BinaryOpType::Pow, "pow"}, // NOTE: Remainder does not match the Aten impl exactly // despite using an identical function. OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}, }; std::for_each(math_ops.begin(), math_ops.end(), [&](OpTuple& op) { test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ std::get<2>(op), /*Aten Func */ [&op](std::array& vals) { return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); }, /*JIT Func */ [&op](Val* in1, Val* in2) -> Val* { return binaryOp(std::get<1>(op), in1, in2); }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype))); }); test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "add_alpha", /*Aten Func */ [](std::array& vals) { return at::add( vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); }, /*JIT Func */ static_cast(&add_alpha), /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::Scalar, dtype))); test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "sub_alpha", /*Aten Func */ [](std::array& vals) { return at::sub( vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); }, /*JIT Func */ static_cast(&sub_alpha), /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::Scalar, dtype))); } } TEST(NVFuserTest, FusionTernaryOps_CUDA) { std::vector dtypes = {DataType::Double, DataType::Float}; for (auto dtype : dtypes) { test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "clamp", /*Aten Func */ [](std::array& vals) { return at::clamp(vals[0].toTensor(), 0.f, 1.f); }, /*JIT Func */ [&](Val* in1) -> Val* { if (dtype == DataType::Float) { return clamp(in1, new Double(0.f), new Double(1.f)); } else { return clamp(in1, new Double(0.f), new Double(1.f)); } }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple(std::make_pair(ValType::TensorView, dtype))); test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "threshold", /*Aten Func */ [](std::array& vals) { return at::threshold(vals[0].toTensor(), 0.f, 1.f); }, /*JIT Func */ [&](Val* in1) -> Val* { if (dtype == DataType::Float) { return threshold(in1, new Double(0.f), new Double(1.f)); } else { return threshold(in1, new Double(0.f), new Double(1.f)); } }, /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ 
std::make_tuple(std::make_pair(ValType::TensorView, dtype))); test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "where", /*Aten Func */ [](std::array& vals) { return at::where( vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); }, /*JIT Func */ static_cast(&where), /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, DataType::Bool), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype))); } } TEST(NVFuserTest, FusionCompoundOps_CUDA) { std::vector dtypes = {DataType::Double, DataType::Float}; for (auto dtype : dtypes) { test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "lerp", /*Aten Func */ [](std::array& vals) { return at::lerp( vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); }, /*JIT Func */ static_cast(&lerp), /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype))); test_op( /*blocks*/ 640, /*threads*/ 64, /*name*/ "addcmul", /*Aten Func */ [](std::array& vals) { return at::addcmul( vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor(), vals[3].toScalar()); }, /*JIT Func */ static_cast(&addcmul), /*Output */ std::make_pair(ValType::TensorView, dtype), /*Inputs Tuple*/ std::make_tuple( std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::TensorView, dtype), std::make_pair(ValType::Scalar, dtype))); } } TEST(NVFuserTest, FusionCastOps_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2, DataType::Half); TensorView* intrm1 = castOp(DataType::Float, tv0); TensorView* out = castOp(DataType::Half, intrm1); fusion.addInput(tv0); fusion.addOutput(out); tv0->computeAt(out, -1); out->axis(0)->parallelize(ParallelType::BIDx); out->axis(-1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor input1 = at::randn({1, 4}, options); at::Tensor ref_output = at::empty_like(input1); std::array inputs = {input1}; const at::ArrayRef input_ivalues(inputs); FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(input_ivalues); ref_output = at::_cast_Half(at::_cast_Double(input1)); TORCH_CHECK( outputs[0].equal(ref_output), "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", "\nABS MAX DIFF: ", outputs[0].sub(ref_output).abs().max(), "\n"); } // Start off simple, block on the outer dim // block stride + thread all reduce + unrolling on inner dim TEST(NVFuserTest, FusionReduction1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); tv1->split(1, 128); // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] tv1->split(1, 4); // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] TensorView* tv2 = tv1->rFactor({1}); // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] // tv1[I0, R1oi{4}, R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] TensorView* tv3 = tv1->rFactor({1}); // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] // tv3[I0, R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] // tv1[I0, R1i{128}] = tv3[I0, R1oi{4}, Ir1i{128}] // Incrementally, 
can print in between for debugging tv0->computeAt(tv2, 1); tv2->computeAt(tv3, 1); tv3->computeAt(tv1, 1); // Re do it all at once, because why not. tv0->computeAt(tv1, 1); tv2->axis(2)->parallelize(ParallelType::Unroll); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 65000; int numel_y = 1025; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); // switches to try some different scenarios. maybe we should iterate on all // permutations. bool bind_bidx = true; bool bind_tidx = true; bool bind_tidy = true; bool bind_unroll = true; int numel_x = 1025; // Cannot exceed block dim max size / tidy int numel_y = 129; int tidx = 16; int tidy = 8; int unroll_factor = 4; tv1->split(1, tidx); // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1] tv1->split(1, unroll_factor); // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1] tv1->split(0, tidy); TensorView* tv2 = tv1->rFactor({-3}); // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] // tv1[I0o, I0i{tidy}, R1oi{unroll}, R1i{tidx}] TensorView* tv3 = tv1->rFactor({-2}); // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] // tv3[I0, R1oi{unroll}, Ir1i{tidx}] // tv1[I0o, I0i{tidy}, R1i{tidx}] tv0->computeAt(tv1, -2); if (bind_unroll) tv2->axis(-2)->parallelize(ParallelType::Unroll); if (bind_bidx) tv1->axis(0)->parallelize(ParallelType::BIDx); if (bind_tidy) tv1->axis(1)->parallelize(ParallelType::TIDy); if (bind_tidx) { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReduction3_CUDA) { // What if Z participates in the reduction with X? 
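  // Sketch of the intent: the reduction axis is split twice so that a block
  // reduces cooperatively across both TIDx and TIDz, i.e. the Z thread
  // dimension participates in the same reduction as X (see the
  // parallelization below).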
Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); int numel_x = 1025; // Cannot exceed block dim max size / tidy int numel_y = 129; int tidx = 16; int tidz = 8; tv1->split(1, tidz); // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1] tv1->split(1, tidx); // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1] TensorView* tv2 = tv1->rFactor({-3}); // tv2[I0, >R1oo<, Ir1oi{tidx}, Ir1i{tidz}] // tv1[I0o, R1oi{tidx}, R1i{tidz}] tv0->computeAt(tv1, -3); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(-2)->parallelize(ParallelType::TIDx); tv1->axis(-1)->parallelize(ParallelType::TIDz); tv2->axis(-2)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDz); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({aten_input}, {cg_output}); auto aten_output = aten_input.to(at::kDouble).sum({1}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(2); TensorView* tv2 = add(tv0, tv1); // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] fusion.addInput(tv0); fusion.addInput(tv1); TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); // tv3[I0, R1] = tv2[I0, I1] TensorView* tv4 = makeSymbolicTensor(1); fusion.addInput(tv4); // tv5[I0] = tv3[I0, R1] * tv4[I0] TensorView* tv5 = mul(tv3, tv4); fusion.addOutput(tv5); int tidx = 16; // RFactor the reduction tv3->split(1, tidx); // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1] TensorView* tv6 = tv3->rFactor({-2}); // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1] // tv3[I0, R1i{tidx}] = tv3[I0, I1] tv2->computeAt(tv6, 2); // Compute at inline with tv5 (only 1D) tv6->computeAt(tv3, 1); tv3->computeAt(tv5, 1); tv5->axis(0)->parallelize(ParallelType::BIDx); // Intermediate tensors only need this, but doesn't hurt to do on inputs // tv0, 1, 4 tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv6->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 1025; int numel_y = 129; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); auto aten_output = t3.mul(t4); testValidate( &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(3); fusion.addInput(tv0); TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); int bidy = 2; int tidy = 4; int tidx = 5; int dim1 = 11; tv1->split(-2, tidy); TensorView* tv2 = tv1->rFactor({-3}); tv0->computeAt(tv1, 1); tv1->axis(0)->parallelize(ParallelType::BIDy); for (auto* val : fusion.vals()) { if 
(!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { val->as()->axis(-1)->parallelize(ParallelType::TIDx); } } tv2->axis(-2)->parallelize(ParallelType::TIDy); tv1->axis(-2)->parallelize(ParallelType::TIDy); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({bidy, dim1, tidx}, options); at::Tensor cg_output = at::empty({bidy, tidx}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int bdimx = 64; const int bdimy = 8; // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(3); fusion.addInput(tv0); // tv1[I0, R1, R2] = tv0[I0, I1, I2] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); fusion.addOutput(tv1); TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); tv1->split(2, bdimx); // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] tv1->split(1, bdimy); // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2] TensorView* tv2 = tv1->rFactor({3}); // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] // tv1[I0, R1o, R1i{8}, R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] TensorView* tv3 = tv1->rFactor({1}); // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] // tv3[I0, R1o, I1i{8}, I2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] // tv1[I0, R1i{8}, R2i{128}] = tv3[I0, R1o, I1i{8}, I2i{128}] tv3->computeAt(tv1, 1); tv2->computeAt(tv3, 2); tv1->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(0)->parallelize(ParallelType::BIDx); tv3->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-2)->parallelize(ParallelType::TIDy); tv3->axis(-2)->parallelize(ParallelType::TIDy); tv2->axis(-3)->parallelize(ParallelType::TIDy); int numel_x = 650; int numel_y = 1000; int numel_z = 4; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); TensorView* tv1 = max(tv0, {0}); TensorView* tv2 = sum(tv0, {0}); fusion.addOutput(tv1); fusion.addOutput(tv2); int numel_x = 4; int numel_y = 2; tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::TIDx); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input}); std::vector aten_outputs = { std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)}; testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); } TEST(NVFuserTest, FusionMultiGridReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); 
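  // tv1 is a grid reduction across BIDx whose result feeds a second
  // reduction parallelized on BIDy; chaining grid reductions this way is
  // expected to be rejected at compile time, which the ASSERT_ANY_THROW
  // below verifies.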
fusion.addInput(tv0); auto tv1 = sum(tv0, {0}); auto tv2 = sum(tv1, {0}); fusion.addOutput(tv2); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::BIDy); tv2->axis(0)->parallelize(ParallelType::BIDy); FusionExecutor fe; ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionReductionTFT_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); int numel_x = 1025; int numel_y = 129; int tidx = 16; int tidy = 8; int tidz = 8; tv1->split(1, tidx); // tv1[I0, R1o, R1i{tidx}] tv1->split(1, tidz); // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}] tv1->split(0, tidy); // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}] TensorView* tv2 = tv1->rFactor({2}); // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}] // tv1[I0o, I0i, R1Oi{tidz}, R1R1i{tidx}] tv2->computeAt(tv1, 2); tv1->axis(1)->parallelize(ParallelType::TIDy); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-2)->parallelize(ParallelType::TIDz); tv2->axis(-2)->parallelize(ParallelType::TIDz); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { // based off FusionReduction4 Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(2); TensorView* tv2 = add(tv0, tv1); // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] fusion.addInput(tv0); fusion.addInput(tv1); TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); // tv3[I0, R1] = tv2[I0, I1] TensorView* tv4 = makeSymbolicTensor(1); fusion.addInput(tv4); // tv5[I0] = tv3[I0, R1] * tv4[I0] TensorView* tv5 = mul(tv3, tv4); fusion.addOutput(tv5); // RFactor the reduction tv3->split(1, 16, false); // tv3[I0, R1o{16}, R1i{tidx}] = tv2[I0, I1] TensorView* tv6 = tv3->rFactor({-2}); // tv6[I0, R1o{16}, iR1i{tidx}] = tv2[I0, I1] // tv3[I0, R1i{tidx}] = tv3[I0, I1] tv2->computeAt(tv6, 2); // Compute at inline with tv5 (only 1D) tv6->computeAt(tv3, 1); tv3->computeAt(tv5, 1); tv5->axis(0)->parallelize(ParallelType::BIDx); // Intermediate tensors only need this, but doesn't hurt to do on inputs // tv0, 1, 4 tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv6->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 1025; int numel_y = 129; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); auto aten_output = t3.mul(t4); testValidate( &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBranches_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views 
TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(2); TensorView* tv2 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); fusion.addInput(tv2); auto tv3 = add(tv0, new Double(1.0)); auto tv4 = add(tv3, tv1); auto tv5 = add(tv3, tv2); auto tv6 = add(tv4, tv5); fusion.addOutput(tv6); constexpr int x = 63, y = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x, y}, options); at::Tensor t1 = at::randn({x, y}, options); at::Tensor t2 = at::randn({x, y}, options); FusionExecutor fe; tv6->merge(0); tv6->split(0, 128); tv6->split(0, 4); tv6->axis(0)->parallelize(ParallelType::BIDx); tv0->computeAt(tv6, 1); tv1->computeAt(tv6, 1); tv2->computeAt(tv6, 1); tv3->axis(-2)->parallelize(ParallelType::Unroll); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(-2)->parallelize(ParallelType::Unroll); tv4->axis(-1)->parallelize(ParallelType::TIDx); tv5->axis(-2)->parallelize(ParallelType::Unroll); tv5->axis(-1)->parallelize(ParallelType::TIDx); tv6->axis(-1)->parallelize(ParallelType::TIDx); std::vector aten_inputs = {t0, t1, t2}; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto t3 = t0.add(1.0); auto t4 = t3.add(t1); auto t5 = t3.add(t2); auto aten_output = t4.add(t5); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSimpleBCast1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); TensorView* tv1 = add(tv0, new Double(1.5)); TensorView* tv2 = makeSymbolicTensor(2); fusion.addInput(tv2); TensorView* tv3 = makeSymbolicTensor(2); fusion.addInput(tv3); TensorView* tv4 = sub(tv2, tv3); TensorView* tv5 = broadcast(tv1, {false, false, true}); TensorView* tv6 = broadcast(tv4, {true, false, false}); TensorView* tv7 = add(tv5, tv6); fusion.addOutput(tv7); tv7->split(-1, 4); tv7->split(0, 8); tv0->computeAt(tv7, -1); tv2->computeAt(tv7, -1); tv7->axis(0)->parallelize(ParallelType::BIDx); tv7->axis(-1)->parallelize(ParallelType::TIDx); constexpr int x = 63, y = 33, z = 15; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x, y}, options); at::Tensor t1 = t0.add(1.5); at::Tensor t2 = at::randn({y, z}, options); at::Tensor t3 = at::randn({y, z}, options); at::Tensor t4 = t2.sub(t3); at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z}); at::Tensor t6 = t4.expand({x, y, z}); at::Tensor aten_output = t5.add(t6); std::vector aten_inputs = {t0, t2, t3}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); TensorView* tv2 = add(tv0, tv1); TensorView* tv3 = broadcast(tv2, {false, false, true}); TensorView* tv4 = makeSymbolicTensor(2); fusion.addInput(tv4); TensorView* tv5 = sub(tv4, new Double(0.1)); TensorView* tv6 = broadcast(tv5, {true, false, false}); TensorView* tv7 = add(tv3, tv6); fusion.addOutput(tv7); tv7->merge(0, 1); tv0->computeAt(tv7, -1); tv4->computeAt(tv7, -1); tv7->axis(0)->parallelize(ParallelType::BIDx); tv7->axis(-1)->parallelize(ParallelType::TIDx); constexpr int x = 63, y = 33, z = 15; auto options = 
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor t0 = at::randn({x, y}, options);
  at::Tensor t1 = at::randn({x, y}, options);
  at::Tensor t2 = t0.add(t1);
  at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z});

  at::Tensor t4 = at::randn({y, z}, options);
  at::Tensor t5 = t4.sub(0.1);
  at::Tensor t6 = t5.expand({x, y, z});

  at::Tensor aten_output = t3.add(t6);

  at::Tensor cg_output = at::empty({x, y, z}, options);

  std::vector<IValue> aten_inputs = {t0, t1, t4};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSimpleBCast3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  std::vector<IterDomain*> dom;
  dom.push_back(new IterDomain(new Int(0), new Int()));
  dom.push_back(new IterDomain(
      new Int(0),
      new Int(1),
      ParallelType::Serial,
      IterType::BroadcastWithStride));

  // tv0[I1, B{1}]
  TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float);
  fusion.addInput(tv0);

  // tv2[I0, I1, I2]
  TensorView* tv2 = makeSymbolicTensor(3);
  fusion.addInput(tv2);

  TensorView* tv3 = add(tv0, tv2);

  fusion.addOutput(tv3);

  tv3->merge(0);
  tv3->merge(0);

  tv0->computeAt(tv3, -1);
  tv2->computeAt(tv3, -1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);

  constexpr int x = 2, y = 3, z = 4;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, 1}, options);
  at::Tensor t2 = at::randn({x, y, z}, options);
  auto aten_output = t0.add(t2);

  std::vector<IValue> aten_inputs = {t0, t2};
  at::Tensor cg_output = at::empty({x, y, z}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSimpleBCast4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  std::vector<IterDomain*> dom;
  dom.push_back(new IterDomain(
      new Int(0),
      new Int(1),
      ParallelType::Serial,
      IterType::BroadcastWithStride));
  dom.push_back(new IterDomain(new Int(0), new Int()));
  TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float);

  TensorView* tv1 = makeSymbolicTensor(3);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  TensorView* tv3 = add(tv0, tv1);

  tv3->merge(0);
  tv3->merge(0);
  tv3->split(0, 128);
  tv3->split(0, 4);

  fusion.addOutput(tv3);

  tv0->computeAt(tv3, -1);
  tv1->computeAt(tv3, -1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-2)->parallelize(ParallelType::Unroll);

  constexpr int x = 63, y = 33, z = 15;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({1, z}, options);
  at::Tensor t1 = at::randn({x, y, z}, options);

  auto aten_output = t0.add(t1);

  at::Tensor cg_output = at::empty({x, y, z}, options);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSimpleBCast5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  constexpr int m = 2, k = 3, n = 4;

  auto zero = new Int(0);
  auto M = new IterDomain(zero, new Int(m));
  auto K = new IterDomain(zero, new Int(k));
  auto N = new IterDomain(zero, new Int(n));

  // Set up your input tensor views
  TensorView* tv0 =
      new TensorView(new TensorDomain({M, K}, {true, true}), DataType::Float);
  // Note: IterDomain must not be reused, so K needs to be cloned.
  TensorView* tv1 = new TensorView(
      new TensorDomain({K->clone(), N}, {true, true}), DataType::Float);

  fusion.addInput(tv0);
  fusion.addInput(tv1);

  TensorView* tv2 = broadcast(tv0, {false, false, true});
  TensorView* tv3 = broadcast(tv1, {true, false, false});

  TensorView* tv4 = add(tv2, tv3);

  fusion.addOutput(tv4);

  tv4->merge(0);
  tv4->merge(0);

  tv0->computeAt(tv4, -1);
  tv1->computeAt(tv4, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({m, k}, options);
  at::Tensor t1 = at::randn({k, n}, options);

  auto t2 = t0.unsqueeze(-1).expand({m, k, n});
  auto t3 = t1.expand({m, k, n});
  auto aten_output = t2.add(t3);

  at::Tensor cg_output = at::empty({m, k, n}, options);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComplexBCast1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 2, y = 3, z = 4;

  auto tv0 = makeConcreteTensor({y});
  auto tv1 = div(tv0, new Double(2.0));
  auto tv2 = broadcast(tv1, {false, true});
  auto tv3 = makeConcreteTensor({y, z});
  auto tv4 = mul(tv2, tv3);
  auto tv5 = broadcast(tv4, {true, false, false});
  auto tv6 = makeConcreteTensor({x, y, z});
  auto tv7 = add(tv5, tv6);

  // tv0[    i1    ] = input
  // tv1[    i1    ] = tv0/2.0
  // tv2[    i1, b2] = bcast(tv1)
  // tv3[    i1, i2] = input
  // tv4[    i1, i2] = tv2 * tv3
  // tv5[b0, i1, i2] = bcast(tv4)
  // tv6[i0, i1, i2] = input
  // tv7[i0, i1, i2] = tv5 + tv6

  // tv4 = bcast(tv1) * tv3
  // tv7 = bcast(tv4) + tv6

  fusion.addInput(tv0);
  fusion.addInput(tv3);
  fusion.addInput(tv6);

  fusion.addOutput(tv7);

  tv7->merge(0);
  tv7->merge(0);
  tv0->computeAt(tv7, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y}, options);
  at::Tensor t3 = at::randn({y, z}, options);
  at::Tensor t6 = at::randn({x, y, z}, options);

  auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3;
  auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6;

  std::vector<IValue> aten_inputs = {t0, t3, t6};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComplexBCast2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int x = 2, y = 3, z = 4;

  auto tv0 = makeConcreteTensor({y, z});
  auto tv1 = div(tv0, new Double(2.0));
  auto tv2 = sum(tv1, {1});
  auto tv3 = broadcast(tv2, {true, false});
  auto tv4 = makeConcreteTensor({x, y});
  auto tv5 = add(tv3, tv4);

  // tv0[    i1, i2] = input
  // tv1[    i1, i2] = tv0/2.0
  // tv2[    i1    ] = sum(tv1, 1)
  // tv3[b0, i1    ] = bcast(tv2)
  // tv4[i0, i1    ] = input
  // tv5[i0, i1    ] = tv3 + tv4

  // tv2 = sum(tv0/2.0, 1)
  // tv5 = bcast(tv2) + tv4

  fusion.addInput(tv0);
  fusion.addInput(tv4);

  fusion.addOutput(tv5);

  tv5->merge(0);
  tv0->computeAt(tv5, -1);
  tv1->computeAt(tv2, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t4 = at::randn({x, y}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0, t4});

  auto t1 = t0.div(2.0);
  auto t2 = t1.to(at::kDouble).sum(1);
  auto t3 = t2.unsqueeze(0).expand({x, y});
  auto aten_output = t3.add(t4);

  testValidate(
      &fusion, cg_outputs, {t0, t4}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  int w = 3, x = 4, y = 7, z = 8;
  auto options
= at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(4); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(1.0)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1); fusion.addOutput(tv4); tv4->merge(0); tv4->merge(0); tv4->merge(0); tv4->split(0, 128); tv4->split(0, 4); tv2->computeAt(tv4, 1); tv4->axis(0)->parallelize(ParallelType::BIDx); tv4->axis(1)->parallelize(ParallelType::Unroll); tv4->axis(2)->parallelize(ParallelType::TIDx); tv3->axis(1)->parallelize(ParallelType::Unroll); tv3->axis(2)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); FusionExecutor fe; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); auto t3 = t0.add(1.0); auto aten_output = t3.add(t1); std::vector aten_inputs = {t0, t1}; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(4); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(1.0)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1); fusion.addOutput(tv4); tv4->merge(-2); tv4->merge(-2); tv4->merge(-2); tv4->split(0, 128); tv4->split(0, 4); tv2->computeAt(tv4, 1); tv4->axis(0)->parallelize(ParallelType::BIDx); tv4->axis(1)->parallelize(ParallelType::Unroll); tv4->axis(2)->parallelize(ParallelType::TIDx); tv3->axis(1)->parallelize(ParallelType::Unroll); tv3->axis(2)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); FusionExecutor fe; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); auto t3 = t0.add(1.0); auto aten_output = t3.add(t1); std::vector aten_inputs = {t0, t1}; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(4); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(1.0)); auto tv3 = add(tv2, tv1); fusion.addOutput(tv3); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); auto t2 = t0.add(1.0); auto aten_output = t2.add(t1); std::vector aten_inputs = {t0, t1}; auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeConcreteTensor({10, 20}); fusion.addInput(tv0); TensorView* tv1 = makeConcreteTensor({10, 10, 20}); fusion.addInput(tv1); TensorView* tv2 = add(tv0, new Double(1)); TensorView* tv3 = broadcast(tv2, 
      {true, false, false});
  TensorView* tv4 = add(tv3, tv1);
  fusion.addOutput(tv4);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({10, 20}, options);
  at::Tensor t1 = at::randn({10, 10, 20}, options);

  auto t2 = t0.add(1.0);
  auto aten_output = t2.add(t1);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(3);
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv0, new Double(1));
  TensorView* tv3 = broadcast(tv2, {true, false, true});
  TensorView* tv4 = add(tv3, tv1);
  fusion.addOutput(tv4);

  tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3);
  tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3);

  tv0->computeAt(tv4, 1);
  tv1->computeAt(tv4, 1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({7}, options);
  at::Tensor t1 = at::randn({5, 7, 11}, options);

  auto t2 = t0.add(1.0);
  auto aten_output = t2.unsqueeze(-1).add(t1);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> tensor0_shape{7, 4, 7};
  std::vector<int64_t> tensor1_shape{4, 7};

  TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size());
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size());
  fusion.addInput(tv1);

  TensorView* tv2 = add(tv0, tv1);
  TensorView* tv3 = sum(tv2, {0, 1});
  fusion.addOutput(tv3);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input0 = at::randn(tensor0_shape, options);
  at::Tensor input1 = at::randn(tensor1_shape, options);

  std::vector<int64_t> reduction_axes{0, 1};
  auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs =
      fe.runFusion({input0, input1}, reduction_params.value().lparams);

  auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes);

  testValidate(
      &fusion,
      cg_outputs,
      {input0, input1},
      {aten_output},
      __LINE__,
      __FILE__,
      "",
      reduction_params.value().lparams);
}

TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) {
  // Covers the same issue as test 6, but does not depend on the reduction
  // heuristics exercised there, which may change.
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv2);
  auto tv3 = add(tv1, tv2);
  auto tv4 = sum(tv3, {0, 1});
  fusion.addOutput(tv4);

  tv4->merge(0, 1);
  tv4->split(0, 128);
  tv4->split(0, 4);

  auto tv5 = tv4->rFactor({0, 1});

  tv5->computeAt(tv4, -1);
  tv0->computeAt(tv5, -1);

  tv4->axis(0)->parallelize(ParallelType::TIDx);

  FusionExecutor fe;
  fe.compileFusion(&fusion);

  const int numel_x = 100;
  const int numel_y = 200;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto at_t0 = at::randn({numel_x}, options);
  auto at_t1 = at::randn({numel_x, numel_y}, options);

  auto cg_outputs = fe.runFusion({at_t0, at_t1});

  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
                         .to(at::kDouble)
                         .sum();

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) {
  // Same as 7 but with outer splits instead of inner
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv2);
  auto tv3 = add(tv1, tv2);
  auto tv4 = sum(tv3, {0, 1});
  fusion.addOutput(tv4);

  tv4->merge(0, 1);
  tv4->split(0, 128, false);
  tv4->split(0, 4, false);

  auto tv5 = tv4->rFactor({0, 1});

  tv5->computeAt(tv4, -1);
  tv0->computeAt(tv5, -1);

  tv4->axis(0)->parallelize(ParallelType::TIDx);

  FusionExecutor fe;
  fe.compileFusion(&fusion);

  const int numel_x = 100;
  const int numel_y = 200;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto at_t0 = at::randn({numel_x}, options);
  auto at_t1 = at::randn({numel_x, numel_y}, options);

  auto cg_outputs = fe.runFusion({at_t0, at_t1});

  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
                         .to(at::kDouble)
                         .sum();

  testValidate(
      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) {
  // Unlike 7 and 8, the broadcast intermediate is also a fusion output, and
  // the fusion is scheduled with schedulePointwise
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = broadcast(tv0, {false, true});

  auto tv2 = mul(tv1, new Double(2));
  fusion.addOutput(tv2);

  auto tv3 = makeSymbolicTensor(3);
  fusion.addInput(tv3);

  auto tv4 = add(tv3, tv2);
  fusion.addOutput(tv4);

  const int numel_x = 200;
  const int numel_y = 300;
  const int numel_z = 400;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto at_t0 = at::randn({numel_y}, options);
  auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options);
  std::vector<IValue> aten_inputs = {at_t0, at_t3};

  auto lparams = schedulePointwise(&fusion, aten_inputs);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);

  auto at_t1 = at_t0.unsqueeze(-1);
  auto at_t2 = at_t1.mul(2.0);
  auto at_t4 = at_t3.add(at_t2);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeContigTensor(2);
  TensorView* tv1 = makeContigTensor(2);

  // Register your inputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  // Do math with it; it returns a `Val*` but can be static_casted back to
  // TensorView
  TensorView* tv2 = add(tv1, new Double(2.0));
  TensorView* tv3 = add(tv0, tv2);

  // Register your outputs
fusion.addOutput(tv3); auto tv0_cache = tv0->cache_after(); auto tv1_cache = tv1->cache_after(); std::vector tvs = {tv0_cache, tv1_cache, tv2, tv3}; for (auto tv : tvs) { tv->split(1, 2, false); tv->split(1, 1); tv->split(-1, 4); // [I0, 2, 1, I1/2/4, 4] tv->reorder({{1, 2}, {2, 3}, {3, 1}}); tv->axis(0)->parallelize(ParallelType::BIDx); tv->axis(1)->parallelize(ParallelType::TIDx); } // For all inputs, computeAt the output inline, temporaries should be squeezed // between them tv0->computeAt(tv3, 1); tv1->computeAt(tv3, 1); tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize); tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({64, 128}, options); at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; TORCH_CHECK(output_ref.equal(output)); } TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto tv0 = makeSymbolicTensor(4); auto tv1 = makeSymbolicTensor(1); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv1, new Double(1.0)); auto tv3 = broadcast(tv2, {true, false, true, true}); auto tv4 = add(tv3, tv0); fusion.addOutput(tv4); tv4->merge(0); tv4->merge(1); tv4->split(1, 32); tv4->split(0, 1); tv4->reorder({{2, 1}}); tv2->computeAt(tv4, 3); tv2->setMemoryType(MemoryType::Global); tv4->axis(0)->parallelize(ParallelType::BIDx); tv4->axis(1)->parallelize(ParallelType::BIDy); tv4->axis(2)->parallelize(ParallelType::Unswitch); tv4->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); FusionExecutor fe; at::Tensor t0 = at::randn({w, x, y, z}, options); at::Tensor t1 = at::randn({x}, options); auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1); auto aten_output = t3.add(t0); std::vector aten_inputs = {t0, t1}; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } // Intended to stress the lowering of our code generator TEST(NVFuserTest, FusionAdvancedLowering1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeConcreteTensor({9, 5}); fusion.addInput(tv0); TensorView* tv1 = add(tv0, new Double(1)); TensorView* tv2 = add(tv1, new Double(2)); TensorView* tv3 = add(tv1, new Double(3)); TensorView* tv4 = sum(tv3, {1}); fusion.addOutput(tv2); fusion.addOutput(tv4); tv4->split(1, 4); auto tv5 = tv4->rFactor({2}); tv1->computeAt(tv5, 2); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(1); at::Tensor aten_input = at::randn({9, 5}, options); auto t1 = aten_input.add(1.0); auto t2 = t1.add(2.0); auto t3 = t1.add(3.0); auto t4 = t3.sum(1); std::vector aten_outputs = {t2, t4}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Progressively broadcast tensors TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); TensorView* tv2 = makeSymbolicTensor(3); 
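  // The three inputs form a 1D -> 2D -> 3D chain below: tv0 + 1 is broadcast
  // to rank 2 to join tv1, and that sum then joins the 3D tv2, so every root
  // axis of the output traces back through a broadcast at some point.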
  fusion.addInput(tv2);

  TensorView* tv3 = add(tv0, new Double(1));
  TensorView* tv4 = broadcast(tv3, {false, true});
  TensorView* tv5 = add(tv4, tv1);
  TensorView* tv6 = add(tv5, tv2);

  fusion.addOutput(tv6);

  // Split inner dimension
  tv6->split(1, 4);
  // Merge middle dims with outer dimensions
  tv6->merge(2);
  tv6->merge(0);

  // tv6[I0*I1o, I1i*I2]

  // Compute everything inline
  tv0->computeAt(tv6, -1);

  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  int x = 13, y = 9, z = 5;
  at::Tensor t0 = at::randn({y}, options);
  at::Tensor t1 = at::randn({y, z}, options);
  at::Tensor t2 = at::randn({x, y, z}, options);

  auto t3 = t0.add(1.0);
  auto t4 = t3.unsqueeze(-1);
  auto t5 = t4.add(t1);
  auto t6 = t5.add(t2);

  std::vector<IValue> aten_inputs = {t0, t1, t2};
  std::vector<at::Tensor> aten_outputs = {t6};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

// TODO: Complete test
TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeConcreteTensor({1, -1});
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  // [b0, i1]
  auto tv2 = add(tv0, new Double(2.0));
  // [i0, i1]
  auto tv3 = add(tv1, new Double(3.0));
  // [b0, i1]
  auto tv4 = add(tv2, new Double(4.0));
  // [i0, i1]
  auto tv5 = add(tv2, tv3);

  fusion.addOutput(tv4);
  fusion.addOutput(tv5);

  tv0->computeAt(tv4, -1);

  tv3->setMemoryType(MemoryType::Global);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  int x = 13, y = 9;
  at::Tensor t0 = at::randn({1, y}, options);
  at::Tensor t1 = at::randn({x, y}, options);

  auto t4 = t0 + 2 + 4;
  auto t5 = t0 + 2 + t1 + 3;

  std::vector<IValue> aten_inputs = {t0, t1};
  std::vector<at::Tensor> aten_outputs = {t4, t5};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

// This exercises indexing with broadcast root axes. Non-broadcast
// axes need to be preferred when propagating index exprs to root
// axes. See, e.g., Index::getConsumerIndex_impl.
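// A sketch of the shape flow exercised below: tv0[i0] is broadcast to
// [i0, b1] and then to [i0, b1, b2] before being added to a 3D input. Once
// merge(1)->merge(0) folds everything into one axis, index expressions have
// to be propagated back through the merges using the non-broadcast extents,
// since the broadcast extents are 1 and would collapse the index.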
TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = broadcast(tv0, {false, true}); auto tv2 = broadcast(tv1, {false, false, true}); auto tv3 = makeSymbolicTensor(3); fusion.addInput(tv3); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); tv4->merge(1)->merge(0); tv4->split(0, 8); tv0->computeAt(tv4, 1); FusionExecutor fe; fe.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 10; const int by = 20; const int bz = 30; at::Tensor t0 = at::randn({bx}, options); at::Tensor t3 = at::randn({bx, by, bz}, options); std::vector aten_inputs = {t0, t3}; auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionAdvancedLowering5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeConcreteTensor({5, 4, 3}); fusion.addInput(tv0); TensorView* tv1 = makeConcreteTensor({5, 3}); fusion.addInput(tv1); auto tv2 = broadcast(tv1, {false, true, false}); auto tv3 = add(tv0, tv2); fusion.addOutput(tv3); tv2->merge(0); tv1->computeAt(tv2, 1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(1); at::Tensor t0 = at::randn({5, 4, 3}, options); at::Tensor t1 = at::randn({5, 3}, options); auto t2 = t1.unsqueeze(1); auto t3 = t0 + t2; std::vector aten_inputs = {t0, t1}; std::vector aten_outputs = {t3}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } // Test a simple Gemm but also play around with fusion executor features TEST(NVFuserTest, FusionSimpleGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); // M, K TensorView* tv1 = makeSymbolicTensor(2); // K, N fusion.addInput(tv0); fusion.addInput(tv1); TensorView* tv2 = broadcast(tv0, {false, false, true}); // tv2[I0, I1, B] = tv0[I0, I1] TensorView* tv3 = broadcast(tv1, {true, false, false}); // tv3[B, I1, I2] = tv1[I1, I2] // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] TensorView* tv4 = mul(tv2, tv3); // tv5[I0, R1, I2] = tv4[I0, I1, I2] TensorView* tv5 = sum(tv4, {1}); fusion.addOutput(tv5); tv5->split(1, 32); // tv5[I0, R1o, R1i{32}, I2] auto tv6 = tv5->rFactor({1}); // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] tv5->split(0, 4); tv5->split(-1, 4); // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] tv0->computeAt(tv5, -1); tv1->computeAt(tv5, -1); // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] //--> (line symbolizes compute at location) // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv0->computeAt(tv6, -1); tv1->computeAt(tv6, -1); // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::TIDz); tv5->axis(-2)->parallelize(ParallelType::BIDy); tv5->axis(-1)->parallelize(ParallelType::TIDy); tv5->axis(2)->parallelize(ParallelType::TIDx); 
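  // tv6 holds the rFactored partial sums; giving its matching axis the same
  // TIDx binding keeps each thread's partial result aligned with the lane
  // that tv5's final cross-thread reduction reads.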
  tv6->axis(2)->parallelize(ParallelType::TIDx);

  constexpr int M = 65, K = 33, N = 17;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // Let's specify a few bounds in launch params to make sure it works
  fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));

  // Make sure bad launch params throw
  // TODO: Re-enable once we have parallelization validation in.
  // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));

  // Don't specify any launch params
  auto cg_outputs = fe.runFusion({t0, t1});

  auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble));

  testValidate(
      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
}

// Softmax with a 1D tensor. Parallelized only with a single thread block.
TEST(NVFuserTest, FusionSoftmax1D_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int tidx = 128;
  const int dimx = 1000;

  // Set up your input tensor views
  TensorView* input_tv0 = makeSymbolicTensor(1);
  fusion.addInput(input_tv0);

  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true});

  // Replicate exp_tv1 as exp_tv1_copy because exp_tv1 is going to be
  // computed at sum_exp_rf_tv5.
  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);

  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);

  fusion.addOutput(output_tv4);

  bcast_sum_tv3->split(0, tidx);

  sum_exp_tv2->split(-1, tidx);
  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});

  output_tv4->split(-1, tidx);

  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
  exp_tv1_copy->computeAt(output_tv4, -1);

  TensorView* tensors_to_parallelize[] = {
      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};

  for (auto tv : tensors_to_parallelize) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({dimx}, options);
  at::Tensor cg_output = at::empty({dimx}, options);
  at::Tensor t3_output = at::empty_like(cg_output, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({t0}, {cg_output});

  auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false);

  testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__);
}

// Softmax with a 1D tensor with input normalization.
TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int tidx = 128;
  const int dimx = 1000;

  // Set up your input tensor views
  TensorView* input_tv0 = makeSymbolicTensor(1);
  fusion.addInput(input_tv0);

  // Normalize with the max value before computing exp.
  TensorView* max_val_tv1 =
      reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0);
  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true});
  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true});

  // Replicate sub_tv3 as sub_tv3_copy because exp_tv4 is going to be
  // computed at sum_exp_rf_tv9.
  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);

  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);

  fusion.addOutput(output_tv7);

  bcast_max_tv2->split(0, tidx);
  bcast_sum_tv6->split(0, tidx);

  max_val_tv1->split(-1, tidx);
  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});

  sum_exp_tv5->split(-1, tidx);
  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});

  output_tv7->split(-1, tidx);

  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
  sub_tv3_copy->computeAt(output_tv7, -1);

  TensorView* tensors_to_parallelize[] = {
      max_val_tv1,
      bcast_max_tv2,
      sum_exp_tv5,
      bcast_sum_tv6,
      output_tv7,
      max_val_rf_tv8,
      sum_exp_rf_tv9};

  for (auto tv : tensors_to_parallelize) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({dimx}, options);
  at::Tensor t3_output = at::empty({dimx}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

// Softmax with a 3D tensor, where the inner-most 3rd dimension is
// normalized. Parallelized with multiple thread blocks.
TEST(NVFuserTest, FusionSoftmax3D_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int tidx = 32;
  const int dimx = 32;
  const int dimy = 16;
  const int dimz = 130;

  // Set up your input tensor views
  TensorView* input_tv0 = makeSymbolicTensor(3);
  fusion.addInput(input_tv0);

  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});

  // Replicate exp_tv1 as exp_tv1_copy because exp_tv1 is going to be
  // computed at sum_exp_rf_tv5.
  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);

  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);

  fusion.addOutput(output_tv4);

  bcast_sum_tv3->split(-1, tidx);

  sum_exp_tv2->split(-1, tidx);
  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});

  output_tv4->split(-1, tidx);

  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
  exp_tv1_copy->computeAt(output_tv4, -1);

  TensorView* tensors_to_parallelize[] = {
      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};

  for (auto tv : tensors_to_parallelize) {
    tv->axis(0)->parallelize(ParallelType::BIDx);
    tv->axis(1)->parallelize(ParallelType::BIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({input}, {cg_output});

  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);

  testValidate(
      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
}

// Softmax with a 3D tensor with input normalization.
TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int tidx = 32;
  const int dimx = 32;
  const int dimy = 16;
  const int dimz = 130;

  // Set up your input tensor views
  TensorView* input_tv0 = makeSymbolicTensor(3);
  fusion.addInput(input_tv0);

  // Normalize with the max value before computing exp.
  TensorView* max_val_tv1 =
      reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0);
  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true});
  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true});

  // Replicate sub_tv3 as sub_tv3_copy because exp_tv4 is going to be
  // computed at sum_exp_rf_tv9.
  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);

  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);

  fusion.addOutput(output_tv7);

  bcast_max_tv2->split(-1, tidx);
  bcast_sum_tv6->split(-1, tidx);

  max_val_tv1->split(-1, tidx);
  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});

  sum_exp_tv5->split(-1, tidx);
  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});

  output_tv7->split(-1, tidx);

  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
  sub_tv3_copy->computeAt(output_tv7, -1);

  TensorView* tensors_to_parallelize[] = {
      max_val_tv1,
      bcast_max_tv2,
      sum_exp_tv5,
      bcast_sum_tv6,
      output_tv7,
      max_val_rf_tv8,
      sum_exp_rf_tv9};

  for (auto tv : tensors_to_parallelize) {
    tv->axis(0)->parallelize(ParallelType::BIDx);
    tv->axis(1)->parallelize(ParallelType::BIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
  at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  auto tv2 = broadcast(tv1, {false, true});

  auto tv3 = add(tv0, new Double(1.0));
  auto tv4 = mul(tv2, tv3);

  auto tv5 = sum(tv4, {1});
  auto tv6 = broadcast(tv5, {false, true});

  auto tv7 = sub(tv6, tv4);
  fusion.addOutput(tv7);

  tv1->computeAt(tv7, 1);
  ASSERT_ANY_THROW(tv1->computeAt(tv7, -1));
}

// Similar to FusionReduction but uses grid reduction
TEST(NVFuserTest, FusionGridReduction1_CUDA) {
  const int gdimx = 32;
  const int bdimx = 128;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[I0, R1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion.");

  tv1->split(1, bdimx);
  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
  tv1->split(1, gdimx);
  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
  // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]

  // Incrementally, can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  // Re do it all at once, because why not.
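  // (the single computeAt below propagates through tv2 along the use chain,
  // so it subsumes the two incremental calls above)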
  tv0->computeAt(tv1, 1);

  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);
  tv2->axis(2)->parallelize(ParallelType::BIDx);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  // reduced shape for OOM on upstream CI
  int numel_x = 1000;
  int numel_y = 65000;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);
  at::Tensor cg_output = at::empty({numel_x}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({input}, {cg_output});

  auto aten_output = input.to(at::kDouble).sum({1});

  testValidate(
      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
}

// Same test as the above but uses BIDy and TIDx for reduction
TEST(NVFuserTest, FusionGridReduction2_CUDA) {
  const int gdimy = 32;
  const int bdimx = 128;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[I0, R1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion.");

  tv1->split(1, bdimx);
  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
  tv1->split(1, gdimy);
  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
  // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]

  // Incrementally, can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  // Re do it all at once, because why not.
  tv0->computeAt(tv1, 1);

  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::BIDy);
  tv2->axis(2)->parallelize(ParallelType::BIDy);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  // reduced shape for OOM on upstream CI
  int numel_x = 1000;
  int numel_y = 65000;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = input.to(at::kDouble).sum({1});

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

// Same test but uses BIDy and BIDz for reduction. No TID used.
TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) {
  // Grid reductions without any thread parallelism are serial reductions;
  // keep these numbers low so our error isn't too high compared to normal
  // CUDA reductions
  const int gdimz = 15;
  const int gdimy = 9;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[I0, R1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion.");

  tv1->split(1, gdimy);
  // tv1[I0, R1o, R1i{9}] = tv0[I0, I1]
  tv1->split(1, gdimz);
  // tv1[I0, R1oo, R1oi{15}, R1i{9}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{15}, Ir1i{9}] = tv0[I0, I1]
  // tv1[I0, R1oi{15}, R1i{9}] = tv2[I0, R1oo, Ir1oi{15}, Ir1i{9}]

  // Incrementally, can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  // Re do it all at once, because why not.
  tv0->computeAt(tv1, 1);

  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::BIDz);
  tv2->axis(2)->parallelize(ParallelType::BIDz);

  tv1->axis(-1)->parallelize(ParallelType::BIDy);
  tv2->axis(-1)->parallelize(ParallelType::BIDy);

  int numel_x = 100;
  int numel_y = 6500;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);
  at::Tensor cg_output = at::empty({numel_x}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({input}, {cg_output});

  auto aten_output = input.to(at::kDouble).sum({1});

  testValidate(
      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
}

// Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0
TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) {
  // Grid reductions without any thread parallelism are serial reductions;
  // keep these numbers low so our error isn't too high compared to normal
  // CUDA reductions
  const int gdimz = 15;
  const int gdimy = 9;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[R0, I1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {0}, new Double(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion.");

  tv1->split(0, gdimy);
  // tv1[R0o, R0i{9}, I1] = tv0[I0, I1]
  tv1->split(0, gdimz);
  // tv1[R0oo, R0oi{15}, R0i{9}, I1] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({0});
  // tv2[R0oo, I0oi{15}, I0i{9}, I1] = tv0[I0, I1]
  // tv1[R0oi{15}, R0i{9}, I1] = tv2[R0oo, I0oi{15}, I0i{9}, I1]

  // Note that computeAt isn't going to make anything better as there
  // is no dynamically sized dimension.

  // Map parallelism as [Serial, BIDz, BIDy, BIDx]
  tv1->axis(-1)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::BIDx);
  tv1->axis(-2)->parallelize(ParallelType::BIDy);
  tv2->axis(-2)->parallelize(ParallelType::BIDy);
  tv1->axis(-3)->parallelize(ParallelType::BIDz);
  tv2->axis(-3)->parallelize(ParallelType::BIDz);

  int numel_x = 6500;
  int numel_y = 100;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});

  auto aten_output = input.to(at::kDouble).sum({0});

  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}

// This is similar to the FusionReduction, but swaps BIDx and TIDx
TEST(NVFuserTest, FusionGridReduction4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int bdimx = 128;
  const int gdimx = 1024;

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[I0, R1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion.");

  tv1->split(1, gdimx);
  // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1]
  tv1->split(1, 4);
  // tv1[I0, R1oo, R1oi{4}, R1i{1024}] = tv0[I0, I1]

  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
  // tv1[I0, R1oi{4}, R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]

  TensorView* tv3 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
  // tv3[I0, R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]
  // tv1[I0, R1i{1024}] = tv3[I0, R1oi{4}, Ir1i{1024}]

  // Incrementally, can print in between for debugging
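  // (e.g. by dumping the IR with fusion.printMath() after each call below;
  // the particular printing helper is incidental here)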
tv0->computeAt(tv2, 1); tv2->computeAt(tv3, 1); tv3->computeAt(tv1, 1); // Re do it all at once, because why not. tv0->computeAt(tv1, 1); tv2->axis(2)->parallelize(ParallelType::Unroll); tv1->axis(0)->parallelize(ParallelType::TIDx); tv1->axis(-1)->parallelize(ParallelType::BIDx); tv2->axis(-1)->parallelize(ParallelType::BIDx); tv3->axis(-1)->parallelize(ParallelType::BIDx); int numel_x = bdimx; int numel_y = 65000; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } // Grid reduction with 2D thread blocks but only TIDx and BIDx are // mapped to a reduction dim TEST(NVFuserTest, FusionGridReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int bdimx = 64; const int bdimy = 16; const int gdimx = 4; // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); fusion.addOutput(tv1); TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); tv1->split(1, bdimx); // tv1[I0, R1o, R1i{64}] = tv0[I0, I1] tv1->split(1, gdimx); // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1] TensorView* tv2 = tv1->rFactor({1}); // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1] // tv1[I0, R1oi{4}, R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] tv0->computeAt(tv1, 1); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-2)->parallelize(ParallelType::BIDx); tv2->axis(-2)->parallelize(ParallelType::BIDx); tv1->axis(0)->parallelize(ParallelType::TIDy); int numel_x = bdimy; int numel_y = 6500; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } // Similar to FusionGridReduction1 but with 3D tensors TEST(NVFuserTest, FusionGridReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(3); fusion.addInput(tv0); // tv1[I0, R1, R2] = tv0[I0, I1, I2] TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); fusion.addOutput(tv1); TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); // Splitting for TID tv1->split(2, 128); // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] // Splitting for BID tv1->split(1, 128); // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2] TensorView* tv2 = tv1->rFactor({3}); // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] // tv1[I0, R1o, R1i{128}, R2i{128}] TensorView* tv3 = tv1->rFactor({1}); // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] // tv3[I0, R1o, I1i{128}, I2i{128}] // tv1[I0, R1i{128}, R2i{128}] tv3->computeAt(tv1, 1); tv2->computeAt(tv3, 3); tv1->axis(0)->parallelize(ParallelType::BIDy); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-2)->parallelize(ParallelType::BIDx); tv2->axis(-3)->parallelize(ParallelType::BIDx); 
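  // BIDx lands on the same I1i{128} root axis in tv1, tv2, and tv3; its
  // position differs (-2 vs -3) only because tv2 still carries its R2o axis.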
tv3->axis(-2)->parallelize(ParallelType::BIDx); int numel_x = 6500; int numel_y = 200; int numel_z = numel_y; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } // See issue #1049 TEST(NVFuserTest, FusionGridReduction7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = sum(tv0, {0}); fusion.addOutput(tv1); tv1->split(0, 1000); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::BIDy); const int numel_x = 1; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto out = fe.runFusion({input}); auto aten_output = input.sum({0}); testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { int bid_x = 3; int tid_x = 2; int red_dim = 0; Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); TensorView* tv1 = reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); fusion.addOutput(tv1); tv1->split(-1, tid_x); tv1->axis(-2)->parallelize(ParallelType::BIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({16, bid_x * tid_x}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({red_dim}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSplitBCast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* input_tv0 = makeSymbolicTensor(3); TensorView* input_tv1 = makeSymbolicTensor(3); fusion.addInput(input_tv0); fusion.addInput(input_tv1); TensorView* sum_tv2 = reductionOp(BinaryOpType::Add, {2}, new Double(0), input_tv0); TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true}); TensorView* output_tv4 = div(input_tv1, bcast_tv3); sum_tv2->split(-1, 32); TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2}); bcast_tv3->split(-1, 32); output_tv4->split(-1, 32); sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx); sum_tv2->axis(0)->parallelize(ParallelType::BIDx); bcast_tv3->axis(0)->parallelize(ParallelType::BIDx); output_tv4->axis(0)->parallelize(ParallelType::BIDx); sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy); sum_tv2->axis(1)->parallelize(ParallelType::BIDy); bcast_tv3->axis(1)->parallelize(ParallelType::BIDy); output_tv4->axis(1)->parallelize(ParallelType::BIDy); sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx); sum_tv2->axis(-1)->parallelize(ParallelType::TIDx); bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx); output_tv4->axis(-1)->parallelize(ParallelType::TIDx); fusion.addOutput(output_tv4); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({32, 32, 128}, options); at::Tensor t1 = at::randn({32, 32, 128}, options); at::Tensor cg_output = at::empty({32, 32, 128}, 
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({t0, t1}, {cg_output});
}

TEST(NVFuserTest, FusionBCastInnerDim_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // reduce then broadcast
  auto tv1 = sum(tv0, {0});
  auto tv2 = broadcast(tv1, {false, true});

  TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast());
}

TEST(NVFuserTest, FusionBCastReduce_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);

  auto tv1 = broadcast(tv0, {true, false, false});
  auto tv2 = sum(tv1, {1});
  TORCH_CHECK(
      tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() &&
      !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction());
}

// Multiple consumer reduction with computeAt
// https://github.com/csarofeen/pytorch/issues/110
TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = unaryOp(UnaryOpType::Exp, tv0);
  auto tv2 = reductionOp(BinaryOpType::Max, {-1}, new Double(0), tv1);
  auto tv3 = reductionOp(BinaryOpType::Min, {-1}, new Double(0), tv1);
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort);

  TORCH_CHECK(tv1->getComputeAtPosition() == 2);
}

TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) {
  for (int i = 0; i < 2; ++i) {
    Fusion fusion;
    FusionGuard fg(&fusion);

    // Set up your input tensor views
    TensorView* tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);

    auto tv1 = add(tv0, new Double(1));
    auto tv2 = add(tv0, new Double(1));
    TensorView* tv3 = add(tv1, tv2);
    // Set outputs tv2 or tv1 and then tv3
    if (i == 0) {
      fusion.addOutput(tv2);
    } else {
      fusion.addOutput(tv1);
    }
    fusion.addOutput(tv3);

    if (i == 0) {
      tv1->computeAt(tv3, -1);
    } else {
      tv2->computeAt(tv3, -1);
    }

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor aten_input = at::randn({100}, options);
    std::vector<at::Tensor> aten_outputs = {
        aten_input + 1, (aten_input + 1) * 2};

    FusionExecutor fe;
    fe.compileFusion(&fusion);
    auto cg_outputs = fe.runFusion({aten_input});

    testValidate(
        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
  }
}

TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, new Double(1));
  auto tv2 = add(tv0, new Double(1));
  TensorView* tv3 = add(tv1, tv2);
  fusion.addOutput(tv3);

  tv3->split(-1, 32);

  tv1->computeAt(tv3, -1);
  tv2->computeAt(tv3, -2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100, 100}, options);
  auto aten_output = (aten_input + 1) * 2;

  at::Tensor cg_output = at::empty_like(aten_input, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionComputeAtExprOrder3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const size_t dimx = 13;
  const size_t dimy = 15;

  TensorView* tv0 = makeConcreteTensor({dimx, dimy});
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv2, new Double(3));
  TensorView* tv4 = add(tv3, new Double(4));
  TensorView* tv5 = mul(tv2, tv4);
  fusion.addOutput(tv5);

  tv1->computeAt(tv2, 2);
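  // computeAt(consumer, pos) inlines the producer's computation into the
  // consumer's loop nest at position pos; e.g. tv1->computeAt(tv2, 2) above
  // produces tv1 inside the loops of tv2's first two axes.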
  tv3->computeAt(tv4, 1);
  tv4->computeAt(tv5, 2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({dimx, dimy}, options);
  auto t1 = aten_input.add(1.);
  auto t2 = t1.add(2.);
  auto t3 = t2.add(3.);
  auto t4 = t3.add(4.);
  auto aten_output = t2.mul(t4);

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {0});
  auto tv2 = add(tv1, new Double(1));
  fusion.addOutput(tv2);
  TORCH_CHECK(tv2->nDims() == 0);
  tv1->computeAt(tv2, 0);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  auto aten_output = aten_input.to(at::kDouble).sum() + 1;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(0);
  fusion.addInput(tv0);

  auto tv1 = broadcast(tv0, {true, true});
  TORCH_CHECK(tv1->nDims() == 2);

  TensorView* tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv2);

  auto tv3 = add(tv1, tv2);
  auto tv4 = sum(tv3, {0, 1});
  fusion.addOutput(tv4);

  tv3->computeAt(tv4, -1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({}, options);
  at::Tensor t1 = at::randn({10, 10}, options);

  auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1)
                         .to(at::kDouble)
                         .sum();

  std::vector<IValue> aten_inputs = {t0, t1};
  at::Tensor cg_output = at::empty({}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_output});

  testValidate(
      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionZeroDimReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int bdimx = 32;
  const int gdimx = 32;

  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {0});
  fusion.addOutput(tv1);

  tv1->split(0, bdimx);
  tv1->split(0, gdimx);
  auto tv2 = tv1->rFactor({0});

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(-2)->parallelize(ParallelType::BIDx);
  tv2->axis(-2)->parallelize(ParallelType::BIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({1000}, options);
  auto aten_output = aten_input.to(at::kDouble).sum();
  at::Tensor cg_output = at::empty({}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, {cg_output});

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const int tidx = 128;

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {1});
  auto tv2 = broadcast(tv1, {false, true});

  tv1->split(1, tidx);
  auto tv3 = tv1->rFactor({-2});

  TensorView* tv4 = makeSymbolicTensor(2);
  fusion.addInput(tv4);

  auto tv5 = add(tv2, tv4);
  fusion.addOutput(tv5);
  tv5->split(1, tidx);
  tv3->computeAt(tv5, 1);

  tv2->split(1, tidx);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
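  // tv1, tv2, tv3, and tv5 all bind their innermost axis to TIDx so the
  // reduction, its rFactor stage, and the broadcast consumer agree on the
  // same per-block thread layout.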
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(0)->parallelize(ParallelType::BIDx);

  int x = 63, y = 200;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({x, y}, options);
  at::Tensor t4 = at::randn({x, y}, options);

  auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y});
  auto aten_output = t3.add(t4);

  std::vector<IValue> aten_inputs = {t0, t4};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0, t4});

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionOutputBroadcast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeConcreteTensor({2, 3});
  fusion.addInput(tv0);

  TensorView* tv1 = broadcast(tv0, {true, false, true, false, true});
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn({2, 3}, options);
  auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionReductionKeepDimBasic_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6});
  fusion.addInput(tv0);

  TensorView* tv1 = sum(tv0, {0, 2, 4}, /*keep_dim=*/true);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options);
  auto aten_output =
      aten_input.to(at::kDouble).sum({0, 2, 4}, /*keepdim=*/true);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x});
  fusion.addInput(tv0);

  TensorView* tv1 = reductionOp(
      BinaryOpType::Add, {red_dim}, new Double(0), tv0, /*keep_dim=*/true);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
  auto aten_output =
      aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());

  FusionExecutor fe;
  fe.compileFusion(&fusion);

  auto lparams = reduction_params.value().lparams;
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionSumTo_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> tensor_shape{2, 3, 4, 5, 6};
  std::vector<int64_t> sum_to_shape{1, 5, 6};

  std::vector<int64_t> tensor_shape_ref{2, 3, 4, 5, 6};
  std::vector<int64_t> sum_to_shape_ref{1, 5, 6};

  std::vector<Int*> sum_to_symb;
  std::transform(
      sum_to_shape.begin(),
      sum_to_shape.end(),
      std::back_inserter(sum_to_symb),
      [](int s) -> Int* { return new Int(s); });

  TensorView* tv0 = makeConcreteTensor(tensor_shape);
  fusion.addInput(tv0);
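  // sum_to reduces tv0 down to sum_to_shape: leading dimensions absent from
  // the target are summed away, and size-1 target dimensions are summed
  // with keep_dim, mirroring at::sum_to (validated against it below).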
  TensorView* tv1 = sum_to(tv0, sum_to_symb);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn(tensor_shape_ref, options);
  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  TORCH_CHECK(
      cg_outputs[0].dim() == sum_to_shape.size(),
      "sum_to not keeping the final dimension");

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSumToNoop_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  std::vector<int64_t> tensor_shape{4, 5, 6};
  std::vector<int64_t> sum_to_shape{4, 5, 6};

  std::vector<int64_t> tensor_shape_ref{4, 5, 6};
  std::vector<int64_t> sum_to_shape_ref{4, 5, 6};

  std::vector<Int*> sum_to_symb;
  std::transform(
      sum_to_shape.begin(),
      sum_to_shape.end(),
      std::back_inserter(sum_to_symb),
      [](int s) -> Int* { return new Int(s); });

  TensorView* tv0 = makeConcreteTensor(tensor_shape);
  fusion.addInput(tv0);

  TensorView* tv1 = sum_to(tv0, sum_to_symb);

  // Dummy operator to avoid tv0 both input and output
  TensorView* tv2 = add(tv1, new Double(0));
  fusion.addOutput(tv2);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn(tensor_shape_ref, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});
  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);

  TORCH_CHECK(
      cg_outputs[0].dim() == sum_to_shape.size(),
      "sum_to not keeping the final dimension");

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionReductionScheduler_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
  auto aten_output = aten_input.to(at::kDouble).sum({red_dim});

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());

  auto lparams = reduction_params.value().lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

// Simple reduction parallelized on a symbolic size.
TEST(NVFuserTest, FusionSymbolicReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // tv1[I0, R1] = tv0[I0, I1]
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  // Interface should just be a direct split with a Parallel type. We can
  // include the parallelize call if we do this.
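  // NamedScalar::getParallelDim(ParallelType::TIDx) is the symbolic
  // blockDim.x, so the split below takes its factor from the launch
  // configuration at run time rather than from a compile-time constant.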
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
  TensorView* tv2 = tv1->rFactor({1});
  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] = tv0[I0, I1]
  // tv1[I0, R1oi{4}, R1i{BIDx}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}]

  // Incrementally, can print in between for debugging
  tv0->computeAt(tv2, 1);
  tv2->computeAt(tv1, 1);

  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);

  int numel_x = 65000;
  int numel_y = 1025;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
  auto aten_output = aten_input.to(at::kDouble).sum({1});

  // How many threads to use for the block reduction
  int runtime_threadIdx_dim = 128;

  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) {
  const std::vector<int> red_dims = {0, 2};
  // Copy is because CodeGen requires int and PyTorch requires int64_t
  // for a vector of reduction dimensions
  const std::vector<int64_t> red_dims64 = {0, 2};
  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};
  const std::vector<int64_t> tensor_dims_out = {10, 20};

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn(tensor_dims_in, options);
  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);
  at::Tensor cg_output = at::empty(tensor_dims_out, options);

  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, {cg_output}, lparams);

  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) {
  const std::vector<int> red_dims = {1, 3};
  // Copy is because CodeGen requires int and PyTorch requires int64_t
  // for a vector of reduction dimensions
  const std::vector<int64_t> red_dims64 = {1, 3};
  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
  fusion.addInput(tv0);

  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0);
  fusion.addOutput(tv1);

  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  at::Tensor aten_input = at::randn(tensor_dims_in, options);
  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);

  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) {
  std::vector<DataType> dtypes = {
      DataType::Double, DataType::Float, DataType::Half};
  std::vector<int> red_dims;

  // Tried to cut down the number of iterations with just
  // doing every other power of 2.
  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
    red_dims.push_back(i);
  }

  for (auto dtype : dtypes) {
    at::ScalarType aten_dtype = data_type_to_aten(dtype);
    for (auto& rdim : red_dims) {
      Fusion fusion;
      FusionGuard fg(&fusion);

      bool is_fp16 = dtype == DataType::Half;

      TensorView* tv0 = makeSymbolicTensor(1, dtype);
      fusion.addInput(tv0);

      TensorView* tv0_cast = tv0;
      if (is_fp16) {
        tv0_cast = castOp(DataType::Float, tv0);
      }

      TensorView* tv1 = sum(tv0_cast, {0});

      TensorView* tv1_cast = tv1;
      if (is_fp16) {
        tv1_cast = castOp(DataType::Half, tv1);
      }

      fusion.addOutput(tv1_cast);

      auto options =
          at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);

      at::Tensor aten_input = at::randn({rdim}, options);
      auto aten_output = aten_input.to(at::kDouble).sum({0});

      auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
      TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!");
      scheduleReduction(&fusion, reduction_params.value());
      auto lparams = reduction_params.value().lparams;

      FusionExecutor fe;
      fe.compileFusion(&fusion);
      auto cg_outputs = fe.runFusion({aten_input}, lparams);

      testValidate(
          &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__,
          __FILE__, "", lparams);
    }
  }
}

TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) {
  std::vector<DataType> dtypes = {
      DataType::Double, DataType::Float, DataType::Half};
  std::vector<int> red_axis = {1, 0};
  std::vector<int> output_dims = {160, 320};
  std::vector<int> red_dims;

  // Tried to cut down the number of iterations with just
  // doing every other power of 2.
  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
    red_dims.push_back(i);
  }

  for (auto dtype : dtypes) {
    at::ScalarType aten_dtype = data_type_to_aten(dtype);
    for (auto& axis : red_axis) {
      for (auto& odim : output_dims) {
        for (auto& rdim : red_dims) {
          Fusion fusion;
          FusionGuard fg(&fusion);

          bool is_fp16 = dtype == DataType::Half;

          TensorView* tv0 = makeSymbolicTensor(2, dtype);
          fusion.addInput(tv0);

          TensorView* tv0_cast = tv0;
          if (is_fp16) {
            tv0_cast = castOp(DataType::Float, tv0);
          }

          TensorView* tv1 = sum(tv0_cast, {axis});

          TensorView* tv1_cast = tv1;
          if (is_fp16) {
            tv1_cast = castOp(DataType::Half, tv1);
          }

          fusion.addOutput(tv1_cast);

          auto options =
              at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);

          at::Tensor aten_input =
              (axis ? at::randn({odim, rdim}, options)
                    : at::randn({rdim, odim}, options));
          auto reduction_params =
              getReductionHeuristics(&fusion, {aten_input});
          TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!");
          scheduleReduction(&fusion, reduction_params.value());
          auto lparams = reduction_params.value().lparams;

          FusionExecutor fe;
          fe.compileFusion(&fusion);
          auto cg_outputs = fe.runFusion({aten_input}, lparams);
          auto aten_output = aten_input.to(at::kDouble).sum({axis});

          testValidate(
              &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__,
              __FILE__, "", lparams);
        }
      }
    }
  }
}

TEST(NVFuserTest, FusionCacheBefore_CUDA) {
  // TVM Cache Write
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = add(tv0, new Double(1.0));
  TensorView* tv2 = mul(tv1, new Double(3.0));
  fusion.addInput(tv0);
  fusion.addOutput(tv2);

  // Before: TV2 = TV1 * 3
  // After:  TV3 = TV1 * 3;
  //         TV2 = TV3;
  TensorView* tv3 = tv2->cache_before();

  constexpr int BSX = 32;
  tv2->split(-1, BSX);
  tv0->computeAt(tv2, -1);

  // Thread and Block binding
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 32, N = 750;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({M, N}, options);
  at::Tensor aten_output = (aten_input + 1.0) * 3.0;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionCacheAfter_CUDA) {
  // TVM Cache Read
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = add(tv0, new Double(1.0));
  TensorView* tv2 = mul(tv1, new Double(3.0));
  fusion.addInput(tv0);
  fusion.addOutput(tv2);

  // Before: TV1 = TV0 + 1
  // After:  TV3 = TV0;
  //         TV1 = TV3 + 1
  TensorView* tv3 = tv0->cache_after();

  constexpr int BSX = 32;
  tv2->split(-1, BSX);
  tv0->computeAt(tv2, -1);

  // Thread and Block binding
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 32, N = 457;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({M, N}, options);
  at::Tensor aten_output = (aten_input + 1.0) * 3.0;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionCacheFork_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = add(tv0, new Double(1.0));
  TensorView* tv2 = mul(tv1, new Double(3.0));
  fusion.addInput(tv0);
  fusion.addOutput(tv1);
  fusion.addOutput(tv2);

  // Before: TV1 = TV0 + 1
  //         TV2 = TV1 * 3
  // Output: TV1, TV2

  // After:  TV1 = TV0 + 1
  //         TV3 = TV1
  //         TV2 = TV1 * 3
  // Output: TV3, TV2

  // cache_fork !!does not!! automatically apply ComputeAt to the cache
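  // cache_fork copies tv1 into a new tensor and swaps that copy in as the
  // fusion output, so tv1 itself stays an intermediate that can be inlined.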
  auto tv3 = tv1->cache_fork();

  constexpr int BSX = 32;
  tv2->split(-1, BSX);
  tv0->computeAt(tv2, -1);

  // Thread and Block binding
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 32, N = 457;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({M, N}, options);
  at::Tensor aten_output1 = aten_input + 1.0;
  at::Tensor aten_output2 = aten_output1 * 3.0;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output1, aten_output2},
      __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionCacheIndirect_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = makeSymbolicTensor(2);
  TensorView* tv3 = makeSymbolicTensor(2);
  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  fusion.addInput(tv3);
  fusion.addOutput(tv6);
  // t6 = ((t1 + (t2 - t3)) - t0)

  tv5->cache_after();
  tv5->cache_before();

  // cache_after on inputs placed before schedule
  constexpr int BSX = 32;
  tv6->split(-1, BSX);
  tv2->computeAt(tv6, -1);

  // Thread and Block binding
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 32, N = 810;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t1 = at::randn({M, N}, options);
  at::Tensor t2 = at::randn({M, N}, options);
  at::Tensor t3 = at::randn({M, N}, options);

  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionCacheBcast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  TensorView* tv0 = makeSymbolicTensor(1); // (M, 1)
  TensorView* tv1 = broadcast(tv0, {false, true});
  TensorView* tv2 = makeSymbolicTensor(1); // (1, N)
  TensorView* tv3 = broadcast(tv2, {true, false});
  TensorView* tv4 = mul(tv1, tv3);
  fusion.addInput(tv0);
  fusion.addInput(tv2);
  fusion.addOutput(tv4);

  // Case 1
  tv0->cache_after();

  // Case 2
  tv1->cache_before();

  // Case 3
  tv1->cache_after();

  // Case 4
  TensorView* tv8 = tv4->cache_before();

  constexpr int BSX = 128;
  tv4->split(0, BSX);
  tv4->split(-1, BSX);
  tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); // M/BSX, N/BSY, BSX, BSY
  tv0->computeAt(tv4, 2);
  tv2->computeAt(tv4, 2);
  // 0, 1 | 2, 3, 4

  tv4->axis(0)->parallelize(ParallelType::BIDx);
  tv4->axis(1)->parallelize(ParallelType::BIDy);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);

  // Manual Replay on TV3
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv8->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 92, N = 500;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M}, options);
  at::Tensor t1 = at::randn({N}, options);
  std::vector<IValue> aten_inputs = {t0, t1};
  at::Tensor aten_output =
      t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0));

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}
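// In the next test, each output chain gets its own cache_before; a single
// cache_after on the shared input tv0 would have to be recomputed for both
// consumers (see the commented-out tv7 inside the test).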
TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv0, new Double(1));
  TensorView* tv4 = add(tv3, new Double(2));

  fusion.addInput(tv0);
  fusion.addOutput(tv2);
  fusion.addOutput(tv4);

  auto tv5 = tv1->cache_before();
  auto tv6 = tv3->cache_before();
  tv5->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);

  tv1->computeAt(tv2, -1);
  tv3->computeAt(tv4, -1);

  // Fails because tensor must be recomputed twice
  // auto tv7 = tv0->cache_after();

  constexpr int N = 800;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({N}, options);
  auto aten_output = (aten_input + 1) + 2;

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output, aten_output},
      __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSmem_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  TensorView* tv0 = makeSymbolicTensor(2); // (M, N)
  TensorView* tv1 = makeSymbolicTensor(2); // (M, N)
  TensorView* tv2 = mul(tv0, tv1);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv2);

  // Schedule
  TensorView* tv3 = tv0->cache_after();
  TensorView* tv4 = tv1->cache_after();
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);

  constexpr int BSY = 32;
  constexpr int BSX = 128;
  tv2->split(0, BSY);
  tv2->split(2, BSX); // M/BSX, BSX, N/BSX, BSX
  tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); // M/BSX, N/BSX, BSX, BSX

  tv0->computeAt(tv2, 2);
  tv1->computeAt(tv2, 2);

  // Thread and Block binding
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::BIDy);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  // Manual Binding
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 128, N = 10240;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t1 = at::randn({M, N}, options);
  at::Tensor aten_output = mul(t0, t1);

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0, t1});

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);

  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
}

TEST(NVFuserTest, FusionSmemReduce_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
  TensorView* tv1 = sum(tv0, {1}); // M, R, N
  fusion.addInput(tv0);
  fusion.addOutput(tv1);

  TensorView* tv2 = tv0->cache_after();
  tv2->setMemoryType(MemoryType::Shared);

  // Schedule
  constexpr int BSX = 32;
  tv1->split(2, BSX);
  tv1->split(1, 128);
  tv1->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
  TensorView* tv3 = tv1->rFactor({-2});

  tv0->computeAt(tv1, -2);
  tv0->computeAt(tv3, -2);

  // Thread and Block binding
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::BIDy);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  // Manual Binding
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 154, K = 45, N = 1524;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({M, K, N}, options);
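  // The shared-memory cache tv2 is rewritten on every reduction tile, so a
  // write-after-read hazard sync is expected; the check at the end of this
  // test asserts exactly one such sync.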
  at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1});

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);

  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
}

TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
  TensorView* tv4 = mul(tv2, tv3); // M, K, N
  TensorView* tv5 = sum(tv4, {1}); // M, R, N
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);

  // Schedule
  constexpr int BSX = 16;
  tv5->split(2, BSX);
  tv5->split(1, BSX);
  tv5->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}});
  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
  TensorView* tv6 = tv5->rFactor({-1});

  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);

  tv0->computeAt(tv5, 3);
  tv1->computeAt(tv5, 3);

  // Thread and Block binding
  tv5->axis(0)->parallelize(ParallelType::BIDx);
  tv5->axis(1)->parallelize(ParallelType::BIDy);
  tv5->axis(-2)->parallelize(ParallelType::TIDy);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  // Manual Binding
  tv2->axis(-3)->parallelize(ParallelType::TIDy);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-3)->parallelize(ParallelType::TIDy);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv6->axis(-3)->parallelize(ParallelType::TIDy);
  tv6->axis(-2)->parallelize(ParallelType::TIDx);

  constexpr int M = 154, K = 45, N = 1524;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);

  std::vector<IValue> aten_inputs = {t0, t1};
  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0, t1});

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);

  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
}

TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
  TensorView* tv4 = mul(tv2, tv3); // M, K, N
  TensorView* tv5 = sum(tv4, {1}); // M, R, N
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);

  // Schedule
  // Remove reduction axis from tv5
  // tv6 = (M, R, N)
  // tv5 = (M, N)
  TensorView* tv6 = tv5->cache_before();

  constexpr int BSX = 16;
  tv5->split(1, BSX);
  tv5->split(0, BSX); // M/BSX, BSX, N/BSX, BSX
  tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
  // tv5 = M/BSX, N/BSX, MSX, NSX

  tv6->computeAt(tv5, 2);
  tv6->computeAt(tv5, 2);

  tv6->split(-1, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}});
  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
  TensorView* tv7 = tv6->rFactor({-1});
  // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr
  // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX
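  // tv7 accumulates per-thread partial products over the serial KSX loop;
  // tv6 then finishes the reduction across the K/BSX tiles before tv5
  // writes out the (M, N) result.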
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv0->computeAt(tv7, 3);
  tv1->computeAt(tv7, 3);

  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);
  tv7->setMemoryType(MemoryType::Shared); // Memory Type

  // Thread and Block binding
  tv5->axis(0)->parallelize(ParallelType::BIDx);
  tv5->axis(1)->parallelize(ParallelType::BIDy);
  tv5->axis(-2)->parallelize(ParallelType::TIDy);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  // Manual Binding
  tv2->axis(-3)->parallelize(ParallelType::TIDy);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-3)->parallelize(ParallelType::TIDy);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv7->axis(-3)->parallelize(ParallelType::TIDy);
  tv7->axis(-2)->parallelize(ParallelType::TIDx);
  tv6->axis(-2)->parallelize(ParallelType::TIDy);
  tv6->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 154, K = 45, N = 1524;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);
  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));

  std::vector<IValue> aten_inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);

  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
}

TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* x = makeSymbolicTensor(2);
  fusion.addInput(x);
  TensorView* max_val =
      reductionOp(BinaryOpType::Max, {-1}, new Double(FLT_MIN), x); // (M)
  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)
  TensorView* x_max_sub = sub(x, bcast_max); // (M, N)
  TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N)
  TensorView* sum_exp = sum(exp, {-1}); // (M, R)
  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)
  TensorView* softmax = div(exp, bcast_sum); // (M, N)
  fusion.addOutput(softmax);

  // Read Input into Shared Memory
  // Load Input + Pwise into shared memory
  auto cache_x = x->cache_after();
  cache_x->setMemoryType(MemoryType::Shared);
  exp->setMemoryType(MemoryType::Shared);

  std::vector<TensorView*> all_tensors(
      {x,
       cache_x,
       max_val,
       bcast_max,
       x_max_sub,
       exp,
       sum_exp,
       bcast_sum,
       softmax});

  auto tidx = new Int();
  fusion.addInput(tidx);

  for (auto tensor : all_tensors) {
    tensor->split(-1, tidx);
  }

  auto sum_exp_rf = sum_exp->rFactor({1});
  all_tensors.push_back(sum_exp_rf);

  // computeAt
  x->computeAt(x_max_sub, 1);
  exp->computeAt(softmax, 1);
  x_max_sub->computeAt(exp, 2);

  softmax->axis(0)->parallelize(ParallelType::BIDx);
  for (auto tensor : all_tensors) {
    tensor->axis(-1)->parallelize(ParallelType::TIDx);
  }

  const size_t dimx = 1024;
  const size_t dimy = 4096;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({dimx, dimy}, options);
  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input, 128});

  testValidate(
      &fusion, cg_outputs, {aten_input, 128}, {aten_output}, __LINE__,
      __FILE__);
}

TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int kReductionAxis = 3;
  std::vector<int64_t> input_shape{10, 10, 10, 67};
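  // softmax() is a composite op that expands to the same
  // max / subtract / exp / sum / divide graph built by hand in the previous
  // test; here the normalization scheduler picks the schedule instead.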
  TensorView* input = makeSymbolicTensor(input_shape.size());
  fusion.addInput(input);

  auto output = softmax(input, kReductionAxis);
  fusion.addOutput(output);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(input_shape, options);
  auto aten_output =
      at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false);

  auto reduction_params = getNormalizationHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleNormalization(&fusion, reduction_params.value());

  auto lparams = reduction_params.value().lparams;

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  std::vector<int64_t> shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  std::vector<int64_t> outer_shape;
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);
  auto at_weight = c10::optional<at::Tensor>(aten_weight);
  auto at_bias = c10::optional<at::Tensor>(aten_bias);

  const float kEps = 1e-5;
  auto aten_results =
      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
  auto aten_output = std::get<0>(aten_results);
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  FusionExecutorCache fec(std::move(fusion_ptr));
  std::vector<IValue> aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);

  auto aten_gradients = at::native_layer_norm_backward(
      aten_grad_out.to(at::kDouble),
      aten_input.to(at::kDouble),
      norm_shape,
      aten_mean.to(at::kDouble),
      aten_rstd.to(at::kDouble),
      c10::optional<at::Tensor>(aten_weight.to(at::kDouble)),
      c10::optional<at::Tensor>(aten_bias.to(at::kDouble)),
      {true, true, true});

  testValidate(
      &fusion,
      cg_outputs,
      aten_inputs,
      {std::get<0>(aten_gradients),
       std::get<1>(aten_gradients),
       std::get<2>(aten_gradients)},
      __LINE__,
      __FILE__);
}

TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  const float kEps = 1e-5;
  Double* eps_ptr = new Double(kEps);

  std::vector<int64_t> input_shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  auto input = makeSymbolicTensor(input_shape.size());
  fusion.addInput(input);

  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(input_shape, options);
  c10::optional<at::Tensor> aten_weight = c10::nullopt;
  c10::optional<at::Tensor> aten_bias = c10::nullopt;
  auto aten_outputs = at::native_layer_norm(
      aten_input, norm_shape, aten_weight, aten_bias, kEps);

  // Check reduction axis is same for all reductions
  // Generate Launch Parameters
  auto reduction_params = getNormalizationHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleNormalization(&fusion, reduction_params.value());

  auto lparams = reduction_params.value().lparams;

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion,
      cg_outputs,
      {aten_input},
      {std::get<0>(aten_outputs),
       std::get<1>(aten_outputs),
       std::get<2>(aten_outputs)},
      __LINE__,
      __FILE__,
      "",
      lparams);
}

TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  const float kMomentum = 0.1;
  const float kEps = 1e-5;
  const bool kTraining = true;
  std::vector<int64_t> input_shape{20, 100, 35, 45};

  auto input = makeSymbolicTensor(input_shape.size());
  auto weight = makeSymbolicTensor(1);
  auto bias = makeSymbolicTensor(1);
  auto running_mean = makeSymbolicTensor(1);
  auto running_var = makeSymbolicTensor(1);
  fusion->addInput(input);
  fusion->addInput(weight);
  fusion->addInput(bias);
  fusion->addInput(running_mean);
  fusion->addInput(running_var);

  Double* momentum = new Double(kMomentum);
  Double* eps = new Double(kEps);

  auto result = batch_norm(
      input, weight, bias, running_mean, running_var, kTraining, momentum, eps);

  fusion->addOutput(result.output);
  fusion->addOutput(result.mean);
  fusion->addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto at_input = at::randn(input_shape, options);
  auto at_weight = at::ones({input_shape[1]}, options);
  auto at_bias = at::zeros({input_shape[1]}, options);
  auto at_run_mean = at::zeros({input_shape[1]}, options);
  auto at_run_var = at::ones({input_shape[1]}, options);

  std::vector<IValue> aten_inputs = {
      at_input, at_weight, at_bias, at_run_mean, at_run_var};

  FusionExecutorCache executor_cache(std::move(fusion));

  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

  auto aten_outputs = at::native_batch_norm(
      at_input,
      c10::optional<at::Tensor>(at_weight),
      c10::optional<at::Tensor>(at_bias),
      c10::optional<at::Tensor>(at_run_mean),
      c10::optional<at::Tensor>(at_run_var),
      kTraining,
      kMomentum,
      kEps);

  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      aten_inputs,
      {at_run_mean,
       at_run_var,
       std::get<0>(aten_outputs),
       std::get<1>(aten_outputs),
       std::get<2>(aten_outputs)},
      __LINE__,
      __FILE__,
      "");
}

// Disabling for now because memory reuse pass needs to be fixed.
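// The #if 0 block below keeps a mixed static/dynamic persistent-softmax
// schedule around until the memory reuse pass is fixed: it splits one
// logical softmax input into a compile-time-sized half and a runtime-sized
// half and merges their partial max/sum reductions.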
#if 0
TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int pixels_per_thread = 64;
  const int TIDX = 128;
  const int static_size = pixels_per_thread * TIDX;

  TensorView* sx = makeConcreteTensor({-1, static_size});
  TensorView* dx = makeSymbolicTensor(2);
  fusion.addInput(sx);
  fusion.addInput(dx);

  TensorView* max_sx =
      reductionOp(BinaryOpType::Max, {-1}, new Double(FLT_MIN), sx); // (M)
  TensorView* max_dx =
      reductionOp(BinaryOpType::Max, {-1}, new Double(FLT_MIN), dx); // (M)

  // Reduction => merge local and shared memory TensorViews
  TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx);
  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)

  TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N)
  TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N)

  TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N)
  TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N)

  TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R)
  TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R)

  // Reduction => merge local and shared memory TensorViews
  TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp);
  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)

  TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N)
  TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N)
  fusion.addOutput(sx_softmax);
  fusion.addOutput(dx_softmax);

  auto sx_cache = sx->cache_after();
  auto dx_cache = dx->cache_after();
  dx_cache->setMemoryType(MemoryType::Shared);
  dx_exp->setMemoryType(MemoryType::Shared);

  // Reduction and Broadcast Tensors common to both memory TVs
  std::vector<TensorView*> common_tensors(
      {max_val, sum_exp, bcast_max, bcast_sum});

  // Static Local Memory TVs
  std::vector<TensorView*> static_tensors(
      {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax});

  // Dynamic Local Memory TVs
  std::vector<TensorView*> dynamic_tensors(
      {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax});

  std::vector<TensorView*> all_tensors;
  all_tensors.insert(
      all_tensors.end(), common_tensors.begin(), common_tensors.end());
  all_tensors.insert(
      all_tensors.end(), static_tensors.begin(), static_tensors.end());
  all_tensors.insert(
      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());

  // M => M
  // M, N => M, N/128, 128
  for (auto tensor : all_tensors) {
    if (tensor->nDims() > 1) {
      tensor->split(-1, TIDX);
    }
  }

  auto sx_sum_exp_rf = sx_sum_exp->rFactor({1});
  auto dx_sum_exp_rf = dx_sum_exp->rFactor({1});
  all_tensors.push_back(sx_sum_exp_rf);
  all_tensors.push_back(dx_sum_exp_rf);

  // computeAt
  sx->computeAt(sx_max_sub, 1);
  dx->computeAt(dx_max_sub, 1);

  sx_exp->computeAt(sx_softmax, 1);
  dx_exp->computeAt(dx_softmax, 1);

  sx_max_sub->computeAt(sx_exp, 2);
  dx_max_sub->computeAt(dx_exp, 2);

  sx_softmax->axis(0)->parallelize(ParallelType::BIDx);
  dx_softmax->axis(0)->parallelize(ParallelType::BIDx);
  for (auto tensor : all_tensors) {
    if (tensor->nDims() > 1) {
      tensor->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  const size_t dimx = 1024;
  const size_t dimy = 16384;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({dimx, dimy}, options);
  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
  at::Tensor aten_dynamic_in =
      aten_input.narrow(1, static_size, dimy - static_size);

  at::Tensor out = at::zeros({dimx, dimy}, options);
  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);
  std::vector<at::Tensor> aten_outputs;

  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);
  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
  at::Tensor aten_dynamic_out =
      aten_output.narrow(1, static_size, dimy - static_size);

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(
      {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out});

  testValidate(
      &fusion,
      {cg_static_out, cg_dynamic_out},
      {aten_static_in, aten_dynamic_in},
      {aten_static_out, aten_dynamic_out},
      __LINE__,
      __FILE__);
}
#endif

// DISABLED. TODO: https://github.com/csarofeen/pytorch/issues/743
TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
  return;
  Fusion fusion;
  FusionGuard fg(&fusion);

  const int pixels_per_thread = 64;
  const int TIDX = 128;
  const int static_size = pixels_per_thread * TIDX;

  TensorView* sx = makeConcreteTensor({-1, static_size});
  TensorView* dx = makeSymbolicTensor(2);
  fusion.addInput(sx);
  fusion.addInput(dx);

  Double* gamma = new Double();
  Double* beta = new Double();
  Double* eps = new Double();
  Int* N = new Int();
  fusion.addInput(gamma);
  fusion.addInput(beta);
  fusion.addInput(eps);
  fusion.addInput(N);

  // Reduction
  auto sx_sum = sum(sx, {-1}); // (M, R)
  auto dx_sum = sum(dx, {-1}); // (M, R)
  // Reduction => merge local and shared memory TensorViews
  auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum);

  // Broadcast
  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
  // Pwise
  auto x_mean = div(x_sum_bcast, N); // (M, B)

  auto sx_mean_sub = sub(sx, x_mean); // (M, N)
  auto dx_mean_sub = sub(dx, x_mean); // (M, N)

  auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N)
  auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N)

  // Reduction
  auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R)
  auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R)
  // Reduction => merge local and shared memory TensorViews
  auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum);

  // Broadcast
  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
  // Pwise
  auto var = div(var_sum_bcast, N); // (M, B)
  auto var_eps = add(var, eps); // (M, B)
  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)

  auto sx_norm = mul(sx_mean_sub, rvar);
  auto dx_norm = mul(dx_mean_sub, rvar);

  auto sx_norm_gamma = mul(sx_norm, gamma);
  auto dx_norm_gamma = mul(dx_norm, gamma);

  auto sx_norm_gamma_beta = add(sx_norm_gamma, beta);
  auto dx_norm_gamma_beta = add(dx_norm_gamma, beta);
  fusion.addOutput(sx_norm_gamma_beta);
  fusion.addOutput(dx_norm_gamma_beta);

  // Read Input into Shared Memory
  // Read Input minus Input_Mean into Shared Memory
  auto sx_cache = sx->cache_after();
  auto dx_cache = dx->cache_after();
  dx_cache->setMemoryType(MemoryType::Shared);
  dx_mean_sub->setMemoryType(MemoryType::Shared);

  std::vector<TensorView*> common_tensors(
      {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar});

  std::vector<TensorView*> static_tensors(
      {sx,
       sx_cache,
       sx_sum,
       sx_mean_sub,
       sx_mean_sub_pow,
       sx_var_sum,
       sx_norm,
       sx_norm_gamma,
       sx_norm_gamma_beta});

  std::vector<TensorView*> dynamic_tensors(
      {dx,
       dx_cache,
       dx_sum,
       dx_mean_sub,
       dx_mean_sub_pow,
       dx_var_sum,
       dx_norm,
       dx_norm_gamma,
       dx_norm_gamma_beta});

  std::vector<TensorView*> all_tensors;
  all_tensors.insert(
      all_tensors.end(), common_tensors.begin(), common_tensors.end());
  all_tensors.insert(
      all_tensors.end(), static_tensors.begin(), static_tensors.end());
  all_tensors.insert(
      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());

  // M => M
  // M, N => M, N/128, 128
  for (auto tensor : all_tensors) {
    if (tensor->nDims() > 1) {
      tensor->split(-1, TIDX);
    }
  }

  // Local Sum => Block Broadcast
  TensorView* sx_sum_rf = sx_sum->rFactor({1});
  TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1});
  TensorView* dx_sum_rf = dx_sum->rFactor({1});
  TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1});
  all_tensors.push_back(sx_sum_rf);
  all_tensors.push_back(sx_var_sum_rf);
  all_tensors.push_back(dx_sum_rf);
  all_tensors.push_back(dx_var_sum_rf);

  // ComputeAt
  sx->computeAt(sx_mean_sub_pow, 1);
  dx->computeAt(dx_mean_sub_pow, 1);

  var_sum->computeAt(rvar, 1);

  sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2);
  dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2);

  sx_norm->computeAt(sx_norm_gamma_beta, 2);
  dx_norm->computeAt(dx_norm_gamma_beta, 2);

  sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
  dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
  for (auto tensor : all_tensors) {
    if (tensor->nDims() > 1) {
      tensor->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }

  const int dimx = 1024;
  const int dimy = 16384;
  const float kGamma = 1.0f;
  const float kBeta = 0.0f;
  const float kEps = 1e-5;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({dimx, dimy}, options);
  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
  at::Tensor aten_dynamic_in =
      aten_input.narrow(1, static_size, dimy - static_size);

  at::Tensor out = at::zeros({dimx, dimy}, options);
  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);

  std::vector<IValue> aten_inputs = {
      aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy};

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out});

  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
  auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1);
  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);
  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
  at::Tensor aten_dynamic_out =
      aten_output.narrow(1, static_size, dimy - static_size);

  testValidate(
      &fusion,
      {cg_static_out, cg_dynamic_out},
      aten_inputs,
      {aten_static_out, aten_dynamic_out},
      __LINE__,
      __FILE__);
}

TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  auto x = makeSymbolicTensor(2);
  Double* gamma = new Double();
  Double* beta = new Double();
  Double* eps = new Double();
  Int* N = new Int();
  fusion.addInput(x);
  fusion.addInput(gamma);
  fusion.addInput(beta);
  fusion.addInput(eps);
  fusion.addInput(N);

  // Reduction
  auto x_sum = sum(x, {-1}); // (M, R)
  // Broadcast
  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
  // Pwise
  auto x_mean = div(x_sum_bcast, N); // (M, B)
  auto x_mean_sub = sub(x, x_mean); // (M, N)
  auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N)
  // Reduction
  auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R)
  // Broadcast
  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
  // Pwise
  auto var = div(var_sum_bcast, N); // (M, B)
  auto var_eps = add(var, eps); // (M, B)
  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)
  auto norm = mul(x_mean_sub, rvar);
  auto norm_gamma = mul(norm, gamma);
  auto norm_gamma_beta = add(norm_gamma, beta);
  fusion.addOutput(norm_gamma_beta);

  // Read Input into Shared Memory
  // Read Input minus Input_Mean into Shared Memory
  auto cache_x = x->cache_after();
  cache_x->setMemoryType(MemoryType::Shared);
  x_mean_sub->setMemoryType(MemoryType::Shared);

  std::vector<TensorView*> all_tensors(
      {x_sum,
       x_mean,
       cache_x,
       x_sum_bcast,
       x_mean_sub,
       x_mean_sub_pow,
       var_sum,
       var_sum_bcast,
       var,
       var_eps,
       rvar,
       norm,
       norm_gamma,
       norm_gamma_beta});

  auto tidx = new Int();
  fusion.addInput(tidx);

  for (auto tensor : all_tensors) {
    tensor->split(-1, tidx);
  }

  // Local Sum => Block Broadcast
  TensorView* x_sum_rf = x_sum->rFactor({1});
  TensorView* var_sum_rf = var_sum->rFactor({1});
  all_tensors.push_back(x_sum_rf);
  all_tensors.push_back(var_sum_rf);

  // ComputeAt
  x->computeAt(x_mean_sub_pow, 1);
  var_sum->computeAt(rvar, 1);
  x_mean_sub_pow->computeAt(var_sum_rf, 2);
  norm->computeAt(norm_gamma_beta, 2);

  for (auto tv : all_tensors) {
    tv->axis(0)->parallelize(ParallelType::BIDx);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  const int dimx = 128;
  const int dimy = 2048;
  const float kGamma = 1.0f;
  const float kBeta = 0.0f;
  const float kEps = 1e-5;
  const int TIDX = 128;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({dimx, dimy}, options);
  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
  auto at_var = at::var(aten_input.to(at::kDouble), -1).unsqueeze(1);
  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);

  std::vector<IValue> aten_inputs = {
      aten_input, kGamma, kBeta, kEps, dimy, TIDX};

  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);

  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addInput(tv0);
  fusion.addOutput(tv1);
  // tv1[I0, R1] = tv0[I0, I1]

  // Interface should just be a direct split with a Parallel type. We can
  // include the parallelize call if we do this.
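  // The split factor below is the symbolic blockDim.x; the concrete value
  // (128 in this test) is supplied through LaunchParams when the kernel is
  // launched, with -1 entries left for the executor to infer.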
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
  TensorView* tv2 = tv1->rFactor({2});
  tv2->setMemoryType(MemoryType::Shared);
  // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1]
  // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}]

  tv0->computeAt(tv1, 1);

  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(0)->parallelize(ParallelType::BIDx);

  constexpr int numel_x = 65000, numel_y = 1024;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
  auto aten_output = aten_input.to(at::kDouble).sum({1});

  // How many threads to use for the block reduction
  constexpr int runtime_threadIdx_dim = 128;

  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
}

TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Algorithm
  Int* sym_bsx = new Int();
  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
  fusion.addInput(tv0);
  fusion.addInput(sym_bsx);

  TensorView* tv1 = sum(tv0, {1}); // M, R, N
  fusion.addOutput(tv1);

  TensorView* tv2 = tv0->cache_after();
  tv2->setMemoryType(MemoryType::Shared);

  // Schedule
  constexpr int BSX = 32;
  tv1->split(2, BSX);
  tv1->split(1, sym_bsx);
  tv1->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
  TensorView* tv3 = tv1->rFactor({-2});

  tv0->computeAt(tv1, -2);
  tv0->computeAt(tv3, -2);

  // Thread and Block binding
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(1)->parallelize(ParallelType::BIDy);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  // Manual Binding
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  constexpr int M = 154, K = 45, N = 1524;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({M, K, N}, options);
  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});

  // How many threads to use for the block reduction
  constexpr int runtime_threadIdx_dim = 128;

  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams);

  testValidate(
      &fusion, cg_outputs, {aten_input, runtime_threadIdx_dim},
      {aten_output}, __LINE__, __FILE__, "", lparams);
  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
}

TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  Int* sym_bsx = new Int();
  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
  TensorView* tv4 = mul(tv2, tv3); // M, K, N
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(sym_bsx);
  fusion.addOutput(tv4);
  // Algorithm

  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);

  constexpr int BSX = 32;
  tv4->split(2, BSX);
  tv4->split(1, sym_bsx);
  tv4->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}});
  // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX
TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  Int* sym_bsx = new Int();
  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
  TensorView* tv4 = mul(tv2, tv3); // M, K, N
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(sym_bsx);
  fusion.addOutput(tv4);
  // Algorithm
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  constexpr int BSX = 32;
  tv4->split(2, BSX);
  tv4->split(1, sym_bsx);
  tv4->split(0, BSX);
  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
  tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}});
  // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX
  tv0->computeAt(tv4, 3);
  tv1->computeAt(tv4, 3);
  // Schedule
  tv4->axis(0)->parallelize(ParallelType::BIDx);
  tv4->axis(2)->parallelize(ParallelType::BIDy);
  // Manual Binding
  tv2->axis(-2)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  // Thread and Block binding
  constexpr int M = 128, K = 457, N = 1024;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);
  at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0));
  std::vector<IValue> aten_inputs = {t0, t1, BSX};
  LaunchParams lparams(-1, -1, -1, BSX, -1, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__,
      "", lparams);
  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
}
TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Symbolic integers we will use for runtime tiling
  Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z
  Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x
  Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x
  // Compile-time integer for tiling
  int n_smem_tile = 8; // bound to threadIdx.y
  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  // Broadcast tv0 to [M, K, *]
  TensorView* tv2 = broadcast(tv0, {false, false, true});
  // Broadcast tv1 to [*, K, N]
  TensorView* tv3 = broadcast(tv1, {true, false, false});
  // Pointwise multiplication resulting in tv4[M, K, N]
  TensorView* tv4 = mul(tv2, tv3);
  // Turn the K-dimension of tv4 into a reduction dimension
  TensorView* tv5 = sum(tv4, {1});
  // Register inputs and outputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);
  // Register runtime tile dims as inputs
  fusion.addInput(symbolic_m_tile_dim);
  fusion.addInput(symbolic_split_k_tile_dim);
  fusion.addInput(symbolic_block_k_tile_dim);
  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
  // dims are inserted
  tv5->split(2, n_smem_tile);
  tv5->split(1, symbolic_block_k_tile_dim);
  tv5->split(1, symbolic_split_k_tile_dim);
  tv5->split(0, symbolic_m_tile_dim);
  // Reorder so all outer tiles are in the leftmost 3 positions
  tv5->reorder({{1, 5}, {5, 1}});
  // Factor out the outer reduction IterDomain, then run the inter-cta
  // reduction, and intra-cta reduction
  auto tv6 = tv5->rFactor({2});
  // Scope computations
  tv6->computeAt(tv5, 2);
  // RFactor moves reduction axes around, reorder to match ordering of tv5
  tv6->reorder({
      {2, -2},
      {3, -1},
      {4, 2},
      {5, 3},
      {6, 4},
  });
  // Setup compute at schedule
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv4->computeAt(tv6, -1);
  //
  // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3)
  // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3)
  // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni]
  // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni]
  // T5[ Mo, No, rKoi, rKii, Mi, Ni]
  // Cache smem tiles
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Local);
  tv6->setMemoryType(MemoryType::Local);
  tv5->axis(0)->parallelize(ParallelType::BIDz);
  tv5->axis(1)->parallelize(ParallelType::BIDy);
  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
  for (auto tv : tv_list) {
    tv->axis(-2)->parallelize(ParallelType::TIDz);
    tv->axis(-1)->parallelize(ParallelType::TIDy);
  }
  tv2->axis(3)->parallelize(ParallelType::TIDx);
  tv3->axis(3)->parallelize(ParallelType::TIDx);
  tv4->axis(3)->parallelize(ParallelType::TIDx);
  tv6->axis(3)->parallelize(ParallelType::TIDx);
  tv5->axis(2)->parallelize(ParallelType::TIDx);
  tv2->axis(4)->parallelize(ParallelType::BIDx);
  tv3->axis(4)->parallelize(ParallelType::BIDx);
  tv4->axis(4)->parallelize(ParallelType::BIDx);
  tv6->axis(4)->parallelize(ParallelType::BIDx);
  tv5->axis(3)->parallelize(ParallelType::BIDx);
  constexpr int M = 31, K = 65, N = 33;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);
  FusionExecutor fe;
  // Generate CUDA and compile with nvRTC
  fe.compileFusion(&fusion);
  // Runtime tiling
  int m_tile = 4; // bound to threadIdx.z
  int split_k = 7; // bound to blockIdx.x
  int intra_cta = 8; // bound to threadIdx.x
  std::vector<IValue> aten_inputs = {t0, t1, m_tile, split_k, intra_cta};
  at::Tensor aten_output =
      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
  auto cg_outputs = fe.runFusion(aten_inputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
}
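// The "do in reverse order because dims are inserted" comments above come
// from how split works: split(axis, factor) replaces one IterDomain with an
// outer/inner pair, shifting every axis to its right by one. Splitting from
// the innermost axis outwards keeps the earlier axis indices stable. A
// standalone sketch of just that bookkeeping (the shapes, factors, and test
// name are illustrative assumptions, not part of the original suite):
TEST(NVFuserTest, FusionSplitOrderSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(3); // [I0, I1, I2]
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  fusion.addOutput(tv1);
  tv1->split(2, 8); // [I0, I1, I2o, I2i{8}]
  tv1->split(1, 4); // [I0, I1o, I1i{4}, I2o, I2i{8}]
  tv1->split(0, 2); // [I0o, I0i{2}, I1o, I1i{4}, I2o, I2i{8}]
  // Had we split axis 0 first, axes 1 and 2 would have been renumbered
  // before the later splits -- which is why the gemm tests split
  // right-to-left.
  TORCH_CHECK(tv1->nDims() == 6);
}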
TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addInput(tv0);
  fusion.addOutput(tv1);
  // tv1[I0, R1] = tv0[I0, I1]
  // Interface should just be a direct split with a Parallel type. We can
  // include the parallelize call if we do this.
  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
  // tv1[I0, R1o, R1i{TIDx}] = tv0[I0, I1]
  TensorView* tv2 = tv1->rFactor({2});
  tv2->setMemoryType(MemoryType::Global);
  // tv2[I0, R1oo, Ir1i{TIDx}] = tv0[I0, I1]
  // tv1[I0, R1i{TIDx}] = tv2[I0, R1oo, Ir1i{TIDx}]
  tv0->computeAt(tv1, 1);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->axis(0)->parallelize(ParallelType::BIDx);
  constexpr int numel_x = 65000, numel_y = 1024;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({numel_x, numel_y}, options);
  // How many threads to use for the block reduction
  constexpr int runtime_threadIdx_dim = 128;
  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input}, lparams);
  auto aten_output = input.to(at::kDouble).sum({1});
  testValidate(
      &fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__, "",
      lparams);
}

TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = makeSymbolicTensor(2);
  TensorView* tv3 = makeSymbolicTensor(2);
  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  fusion.addInput(tv3);
  fusion.addOutput(tv6);
  // t6 = ((t1 + (t2 - t3)) - t0)
  tv4->setMemoryType(MemoryType::Global);
  tv5->setMemoryType(MemoryType::Global);
  tv6->setMemoryType(MemoryType::Global);
  constexpr int M = 32, N = 810;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t1 = at::randn({M, N}, options);
  at::Tensor t2 = at::randn({M, N}, options);
  at::Tensor t3 = at::randn({M, N}, options);
  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;
  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0, t1, t2, t3});
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionConstCheck_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto one = new Int(1);
  TORCH_CHECK(one->isConstScalar());
  auto one_x2 = mul(one, one);
  TORCH_CHECK(one_x2->isConstScalar());
  auto one_x3 = mul(one_x2, one);
  TORCH_CHECK(one_x3->isConstScalar());
  auto one_x4 = mul(one_x3, one);
  TORCH_CHECK(one_x4->isConstScalar());
}

TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) {
  const std::vector<int64_t> tensor_dims_in = {128, 128};
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(0));
  TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1);
  fusion.addOutput(tv2);
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn(tensor_dims_in, options);
  at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options);
  // Schedule
  tv2->split(1, 32);
  tv2->split(1, 4); // unroll
  auto tv2_rf = tv2->rFactor({-3, -2});
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv2_rf->axis(0)->parallelize(ParallelType::BIDx);
  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
  tv2_rf->axis(-2)->parallelize(ParallelType::Unroll);
  tv1->computeAt(tv2_rf, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});
  auto aten_output = (input + 0).to(at::kDouble).sum(1);
  testValidate(
      &fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
}
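// FusionUnrollWithAlloc above applies ParallelType::Unroll to an rFactor'ed
// reduction axis. Unlike TIDx/BIDx, Unroll does not bind the axis to a
// hardware dimension; it only asks codegen to unroll that serial loop. A
// pointwise-only sketch of the same knob, mirroring the structure of
// FusionLoopUnswitch later in this file with Unswitch swapped for Unroll
// (the name and sizes are illustrative assumptions, not part of the
// original suite):
TEST(NVFuserTest, FusionLoopUnrollSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(1));
  fusion.addInput(tv0);
  fusion.addOutput(tv2);
  tv2->split(0, 32);
  tv1->computeAt(tv2, -1);
  // Unroll the inner 32-wide loop; the outer loop stays serial.
  tv2->axis(1)->parallelize(ParallelType::Unroll);
  constexpr int M = 1000;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({t0});
  at::Tensor aten_output = t0 + 1 + 1;
  testValidate(&fusion, outputs, {t0}, {aten_output}, __LINE__, __FILE__);
}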
// Test isZeroInt
TEST(NVFuserTest, FusionIsZeroInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  Int* x = new Int(0);
  Int* y = new Int(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isZeroInt());
  TORCH_CHECK(!y->isZeroInt());
  TORCH_CHECK(!z->isZeroInt());
}

// Test isOneInt
TEST(NVFuserTest, FusionIsOneInt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  Int* x = new Int(1);
  Int* y = new Int(1);
  Val* z = mul(x, y);
  TORCH_CHECK(x->isOneInt());
  TORCH_CHECK(y->isOneInt());
  TORCH_CHECK(!z->isOneInt());
}

// This is to verify no cycle of computeAt is created. A more complex
// variation of this pattern appears in one of the Python tests
// (test_random_topo).
TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  // Common intermediate tensor
  auto tv1 = add(tv0, new Double(1));
  // tv1 -> tv2
  auto tv2 = add(tv1, new Double(2));
  // tv1 -> tv3 -> tv4
  auto tv3 = add(tv1, new Double(3));
  auto tv4 = add(tv3, new Double(4));
  // NOTE: This should no longer occur as of PR #201.
  // The order of adding outputs matters. If tv3 is added before tv4,
  // it should be fine. However, if tv4 is added before tv3, there
  // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created
  // first, and then tv4->tv3 is created at the final phase of
  // computeAt (ComputeAt::setupOutputs).
  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv3);
  tv0->computeAt(tv2, -1);
  TORCH_CHECK(tv3->hasComputeAt());
  TORCH_CHECK(!tv4->hasComputeAt());
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(100, options);
  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t4 = t3 + 4;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});
  std::vector<at::Tensor> aten_outputs = {t2, t4, t3};
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTraversalOrder1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv0, new Double(2));
  TensorView* tv3 = add(tv1, new Double(3));
  TensorView* tv4 = add(tv1, new Double(4));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  fusion.addOutput(tv4);
  tv1->computeAt(tv3, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);
  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + 3;
  auto t4 = t1 + 4;
  std::vector<at::Tensor> aten_outputs = {t2, t3, t4};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};
  fe.runFusion({aten_input}, cg_outputs);
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTraversalOrder2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv0, new Double(3));
  TensorView* tv4 = add(tv3, new Double(4));
  TensorView* tv5 = add(tv1, tv3);
  fusion.addOutput(tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);
  tv1->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 10}, options);
  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t1 + t3;
  std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};
  fe.runFusion({aten_input}, cg_outputs);
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}
TEST(NVFuserTest, FusionTraversalOrder3_CUDA) {
  for (int i = 0; i < 2; ++i) {
    Fusion fusion;
    FusionGuard fg(&fusion);
    TensorView* tv0 = makeSymbolicTensor(1);
    fusion.addInput(tv0);
    TensorView* tv1 = add(tv0, new Double(1));
    TensorView* tv2 = add(tv1, new Double(2));
    TensorView* tv3 = add(tv0, new Double(3));
    TensorView* tv4 = add(tv3, new Double(4));
    TensorView* tv5 = add(tv1, tv3);
    fusion.addOutput(tv2);
    fusion.addOutput(tv4);
    fusion.addOutput(tv5);
    const int tile = 32;
    tv1->split(-1, tile);
    tv2->split(-1, tile);
    tv3->split(-1, tile);
    tv4->split(-1, tile);
    tv5->split(-1, tile);
    auto compute_at_outer = tv1;
    auto compute_at_inner = tv3;
    if (i == 1) {
      std::swap(compute_at_inner, compute_at_outer);
    }
    compute_at_outer->computeAt(tv5, -2);
    compute_at_inner->computeAt(tv5, -1);
    FusionExecutor fe;
    fe.compileFusion(&fusion);
    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor aten_input = at::randn({100}, options);
    auto t1 = aten_input + 1;
    auto t2 = t1 + 2;
    auto t3 = aten_input + 3;
    auto t4 = t3 + 4;
    auto t5 = t1 + t3;
    std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
    std::vector<at::Tensor> cg_outputs = {
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options),
        at::empty_like(aten_input, options)};
    fe.runFusion({aten_input}, cg_outputs);
    testValidate(
        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
  }
}

TEST(NVFuserTest, FusionTraversalOrder4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // First tree
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv1, new Double(3));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  // Second tree
  TensorView* tv4 = makeSymbolicTensor(1);
  fusion.addInput(tv4);
  TensorView* tv5 = add(tv4, new Double(5));
  TensorView* tv6 = add(tv5, new Double(6));
  TensorView* tv7 = add(tv5, new Double(7));
  fusion.addOutput(tv6);
  fusion.addOutput(tv7);
  tv1->computeAt(tv2, -1);
  tv5->computeAt(tv6, -1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  at::Tensor t4 = at::rand_like(t0, options);
  auto t1 = t0 + 1;
  auto t2 = t1 + 2;
  auto t3 = t1 + 3;
  auto t5 = t4 + 5;
  auto t6 = t5 + 6;
  auto t7 = t5 + 7;
  std::vector<at::Tensor> aten_outputs = {t2, t3, t6, t7};
  std::vector<IValue> aten_inputs = {t0, t4};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options),
      at::empty_like(t0, options)};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion(aten_inputs, cg_outputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTraversalOrder5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv0, new Double(3));
  TensorView* tv4 = add(tv3, new Double(4));
  TensorView* tv5 = add(tv2, tv4);
  fusion.addOutput(tv1);
  fusion.addOutput(tv3);
  fusion.addOutput(tv5);
  tv2->computeAt(tv5, -1);
  tv4->computeAt(tv5, -1);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options),
      at::empty_like(aten_input, options)};
  fe.runFusion({aten_input}, cg_outputs);
  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto t5 = t2 + t4;
  std::vector<at::Tensor> aten_outputs = {t1, t3, t5};
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}
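// The TraversalOrder tests above pass caller-allocated output tensors to
// runFusion rather than letting the executor allocate them; both modes appear
// throughout this file. A side-by-side sketch of the two calling conventions
// (the trivial fusion and test name are illustrative assumptions, not part of
// the original suite):
TEST(NVFuserTest, FusionRunFusionOutputModesSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  fusion.addOutput(tv1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // Mode 1: the executor allocates and returns the outputs.
  auto cg_outputs = fe.runFusion({t0});
  // Mode 2: the caller provides preallocated output buffers.
  at::Tensor cg_output = at::empty_like(t0, options);
  fe.runFusion({t0}, {cg_output});
  testValidate(&fusion, cg_outputs, {t0}, {t0 + 1}, __LINE__, __FILE__);
  testValidate(&fusion, {cg_output}, {t0}, {t0 + 1}, __LINE__, __FILE__);
}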
TEST(NVFuserTest, FusionTraversalOrder6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv0, new Double(2));
  TensorView* tv3 = add(tv1, tv2);
  TensorView* tv4 = add(tv3, new Double(4));
  fusion.addOutput(tv4);
  tv1->split(0, 32);
  tv2->split(0, 32);
  tv3->split(0, 32);
  tv4->split(0, 32);
  tv3->computeAt(tv4, -2);
  tv1->computeAt(tv3, -1);
  tv2->computeAt(tv3, -2);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  auto t1 = aten_input + 1;
  auto t2 = aten_input + 2;
  auto t3 = t1 + t2;
  auto aten_output = t3 + 4;
  at::Tensor cg_output = at::empty_like(aten_input, options);
  fe.runFusion({aten_input}, {cg_output});
  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTraversalOrder7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = add(tv0, new Double(1));
  TensorView* tv2 = add(tv1, new Double(2));
  TensorView* tv3 = add(tv0, new Double(3));
  TensorView* tv4 = add(tv3, new Double(4));
  TensorView* tv5 = add(tv2, tv4);
  fusion.addOutput(tv5);
  TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5};
  for (auto tv : tvs) {
    tv->split(0, 2);
    tv->split(0, 4);
    tv->split(0, 8);
  }
  // computeAt into inner loop nests
  tv1->computeAt(tv2, -1);
  tv3->computeAt(tv4, -2);
  tv2->computeAt(tv5, -4);
  tv4->computeAt(tv5, -3);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100}, options);
  auto t1 = aten_input + 1;
  auto t2 = t1 + 2;
  auto t3 = aten_input + 3;
  auto t4 = t3 + 4;
  auto aten_output = t2 + t4;
  at::Tensor cg_output = at::empty_like(aten_input, options);
  fe.runFusion({aten_input}, {cg_output});
  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

// Test predication of grid reduction
TEST(NVFuserTest, FusionThreadPredicate_CUDA) {
  const int gdimx = 4;
  const int bdimx = 128;
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1);
  TensorView* tv3 = add(tv0, new Double(2));
  fusion.addOutput(tv3);
  fusion.addOutput(tv2);
  tv1->split(1, bdimx);
  tv1->split(1, gdimx);
  tv3->split(1, bdimx);
  tv3->split(1, gdimx);
  TensorView* tv1_rf = tv1->rFactor({1});
  tv1->computeAt(tv2, -1);
  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1_rf->axis(0)->parallelize(ParallelType::BIDy);
  tv2->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(-2)->parallelize(ParallelType::BIDx);
  tv1_rf->axis(-2)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(3)->parallelize(ParallelType::TIDx);
  tv3->axis(2)->parallelize(ParallelType::BIDx);
  tv3->axis(0)->parallelize(ParallelType::BIDy);
  int numel_x = 100;
  int numel_y = 1000;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
  auto t2 = -aten_input.to(at::kDouble).sum({1});
  auto t3 = aten_input + 2.0;
  std::vector<at::Tensor> aten_outputs = {t3, t2};
  std::vector<at::Tensor> cg_outputs = {
      at::empty_like(aten_input, options), at::empty({numel_x}, options)};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({aten_input}, cg_outputs);
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}
TEST(NVFuserTest, FusionLSTMCell_CUDA) {
  const int hidden_features = 512;
  const int batch_size = 64;
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tvs[16];
  for (size_t i = 0; i < 16; i++) {
    tvs[i] = makeSymbolicTensor(2);
    fusion.addInput(tvs[i]);
  }
  auto ingate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]));
  auto forgetgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]));
  auto cellgate = unaryOp(
      UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]));
  auto outgate = unaryOp(
      UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]));
  auto cx = makeContigTensor(2);
  fusion.addInput(cx);
  auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate));
  auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy));
  fusion.addOutput(cy);
  fusion.addOutput(hy);
  std::vector<IValue> aten_inputs;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor large_tensor0 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor1 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor2 =
      at::randn({batch_size, hidden_features * 4}, options);
  at::Tensor large_tensor3 =
      at::randn({batch_size, hidden_features * 4}, options);
  auto chunked0 = large_tensor0.chunk(4, 1);
  auto chunked1 = large_tensor1.chunk(4, 1);
  auto chunked2 = large_tensor2.chunk(4, 1);
  auto chunked3 = large_tensor3.chunk(4, 1);
  aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end());
  aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end());
  aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end());
  aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end());
  auto at_ingate =
      chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid();
  auto at_forgetgate =
      chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid();
  auto at_cellgate =
      chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh();
  auto at_outgate =
      chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid();
  auto at_cx = at::randn({batch_size, hidden_features}, options);
  aten_inputs.push_back(at_cx);
  auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate));
  auto at_hy = at_outgate.mul(at_cy.tanh());
  auto lparams = schedulePointwise(&fusion, aten_inputs);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__);
}
TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = broadcast(tv1, {true, false});
  TensorView* tv3 = broadcast(tv1, {false, true});
  TensorView* tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  // Not possible to do computeAt at position -1 as recomputation
  // would be required. An exception should be thrown.
  ASSERT_ANY_THROW(tv1->computeAt(tv3, -1));
}

TEST(NVFuserTest, FusionReductionHalf_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(tv0);
  auto tv1 = castOp(DataType::Float, tv0);
  auto tv2 = add(tv1, new Double(1.0));
  auto tv3 = sum(tv2, {2});
  auto tv4 = castOp(DataType::Half, tv3);
  fusion.addOutput(tv4);
  const auto options =
      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({8, 8, 16}, options);
  auto reduction_tv = tv3;
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionReduceSingle_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({100, 1});
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({100, 1}, options);
  // Grab only tensor views, though there shouldn't be any other type
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input});
  auto aten_output = aten_input.to(at::kDouble).sum({1});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {red_dim, 2}, new Double(0), tv0);
  fusion.addOutput(tv1);
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}
TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0);
  TensorView* tv2 =
      reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv1);
  fusion.addOutput(tv2);
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({1, 2});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}

TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) {
  constexpr int bid_x = 80;
  constexpr int tid_x = 4096;
  constexpr int red_dim = 1;
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
  fusion.addInput(tv0);
  TensorView* tv1 =
      reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0);
  TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1);
  fusion.addOutput(tv2);
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
  // Apply reduction heuristic
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // no broadcasting needed, omitting the last optional argument
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({2, 1});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}
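// The reduction tests above all follow the same heuristic-driven pipeline:
// query a schedule for the concrete inputs, apply it, and forward the
// LaunchParams it computed to runFusion. Distilled into one place as a sketch
// (the 128x4096 input size and the test name are illustrative assumptions,
// not part of the original suite):
TEST(NVFuserTest, FusionReductionSchedulerSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({128, 4096}, options);
  // 1) derive heuristics for these inputs, 2) schedule the fusion with them,
  // 3) reuse the launch parameters they selected.
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input}, lparams);
  auto aten_output = aten_input.to(at::kDouble).sum({1});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__,
      "", lparams);
}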
TEST(NVFuserTest, FusionTrivialReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeConcreteTensor({10, 20, 1});
  fusion.addInput(tv0);
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0);
  fusion.addOutput(tv1);
  TORCH_CHECK(!fusion.hasReduction(), "Trivial reduction picked up by fusion");
  const auto options =
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 20, 1}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});
  auto aten_output = aten_input.to(at::kDouble).sum({2});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  int w = 1, x = 1, y = 7, z = 8;
  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  auto tv2 = sum(tv1, {0});
  auto tv3 = sum(tv2, {0});
  auto tv4 = add(tv3, tv0);
  fusion.addOutput(tv4);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({w, x, y, z}, options);
  auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0);
  std::vector<IValue> aten_inputs = {t0, t1};
  auto lparams = schedulePointwise(&fusion, aten_inputs);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTrivialReduction3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  int v = 1, w = 1, x = 1, y = 7, z = 8;
  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeConcreteTensor({v, w, x, y, z});
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  auto tv2 = sum(tv1, {0, 1, 2});
  auto tv3 = add(tv2, tv0);
  fusion.addOutput(tv3);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({y, z}, options);
  at::Tensor t1 = at::randn({v, w, x, y, z}, options);
  auto aten_output = t1.sum({0, 1, 2}).add(t0);
  std::vector<IValue> aten_inputs = {t0, t1};
  auto lparams = schedulePointwise(&fusion, aten_inputs);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Make sure trivial reductions are correctly detected even with
// scheduling applied.
TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = broadcast(tv0, {false, true});
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);
  tv2->split(1, 4);
  tv2->split(1, 8);
  auto tv3 = tv2->rFactor({-1});
  auto tv4 = tv2->rFactor({-1});
  auto tv5 = broadcast(tv0, {true, false});
  auto tv6 = add(tv5, new Double(1));
  auto tv7 = sub(tv6, new Double(1));
  auto tv8 = sum(tv7, {0});
  fusion.addOutput(tv8);
  auto tv9 = broadcast(tv0, {false, true, true});
  auto tv10 = sum(tv9, {1});
  auto tv11 = sum(tv10, {1});
  fusion.addOutput(tv11);
  tv8->split(0, 3);
  tv10->split(1, 4);
  tv11->split(1, 5);
  tv0->computeAt(tv2, -1);
  tv0->computeAt(tv8, -1);
  tv0->computeAt(tv11, 1);
  // Test indexing to gmem-backed tensors
  tv3->setMemoryType(MemoryType::Global);
  tv8->setMemoryType(MemoryType::Global);
  GpuLower gpulw(&fusion);
  // No kir::ReductionOp should be generated as all the reduction
  // exprs should be replaced with a unary set op.
  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
    TORCH_CHECK(!kir_node->isA<kir::ReductionOp>());
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  std::vector<IValue> aten_inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
}
// Test detection of partially trivial reduction
TEST(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  auto tv2 = add(tv1, new Double(1));
  fusion.addOutput(tv2);
  tv1->split(1, 1);
  // tv1->axis(1): non-trivial
  // tv1->axis(2): trivial
  auto tv3 = tv1->rFactor({-1});
  GpuLower gpulw(&fusion);
  // tv3's reduction axis is a trivial reduction. The only
  // kir::ReductionOp should be for tv1.
  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
    if (kir_node->isA<kir::ReductionOp>()) {
      auto reduction_out = kir_node->as<kir::ReductionOp>()
                               ->outputs()[0]
                               ->as<kir::TensorView>();
      TORCH_CHECK(reduction_out->fuserTv() == tv1);
    }
  }
}
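// Distilling the lowering check used in the two tests above: a reduction over
// an axis known to have extent 1 carries no cross-element dependency, so it
// lowers to a plain copy and no kir::ReductionOp reaches the kernel. A
// minimal sketch (the 17x1 shape and the test name are illustrative
// assumptions, not part of the original suite):
TEST(NVFuserTest, FusionTrivialReductionLoweringSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeConcreteTensor({17, 1});
  fusion.addInput(tv0);
  TensorView* tv1 = sum(tv0, {1}); // reduces a known size-1 axis
  fusion.addOutput(tv1);
  GpuLower gpulw(&fusion);
  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
    TORCH_CHECK(!kir_node->isA<kir::ReductionOp>());
  }
}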
TEST(NVFuserTest, FusionInputsIdLookup_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({16, 8, 8}, options);
  at::Tensor t1 = at::randn({8, 8}, options);
  at::Tensor t2 = at::randn({6, 4}, options);
  // create a cache with max size 2
  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);
  // basic check: identical input signatures get the same encoding (the value
  // of the scalar input does not change the signature)
  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
  TORCH_CHECK(id_0.id == id_0_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 1);
  TORCH_CHECK(id_0.eviction == false);
  // new entry: same tensor shapes, but a different signature because the
  // scalar input is missing
  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1.id == id_1_lookup.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_1.eviction == false);
  // eviction should happen at this point
  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
  TORCH_CHECK(id_2.id != id_0.id);
  TORCH_CHECK(id_2.id != id_1.id);
  TORCH_CHECK(inputs_id_lookup.size() == 2);
  TORCH_CHECK(id_2.eviction == true);
  TORCH_CHECK(id_2.evict_id == id_0.id);
  // look at input 1 again
  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
  TORCH_CHECK(id_1_relook.id == id_1.id);
  TORCH_CHECK(id_1_relook.eviction == false);
}

TEST(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // pass with identical shape
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));
  // pass with dynamic shape
  auto t1 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t1, tensor_type));
  // broadcasting semantic change failure
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(!complyWith(t2, tensor_type));
  // contiguity failure via slicing
  auto t3 = t0.slice(1, 0, 8, 2);
  TORCH_CHECK(!complyWith(t3, tensor_type));
  // contiguity failure via slicing
  auto t4 = t0.slice(2, 0, 8, 2);
  TORCH_CHECK(!complyWith(t4, tensor_type));
  // rank failure
  auto t5 = at::randn({16, 8, 8, 8}, options);
  TORCH_CHECK(!complyWith(t5, tensor_type));
  // contiguity on stride 1 dimension with implicit broadcasting
  auto t = at::randn({4}, options);
  auto t6 = t.unsqueeze(1).expand({4, 8});
  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
}

TEST(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 1, 8});
  std::vector<int64_t> strides_vec({8, 8, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // broadcasting semantic change
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));
  // dtype failure
  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
  TORCH_CHECK(!complyWith(t1, tensor_type));
  // passing dtype check
  auto t2 = at::randn({16, 1, 8}, options);
  TORCH_CHECK(complyWith(t2, tensor_type));
  // device inconsistency shouldn't fail
  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
  TORCH_CHECK(complyWith(t3, tensor_type));
}

TEST(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({64, 1, 8});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // failing permutation
  auto t0 = at::randn({16, 8, 8}, options);
  TORCH_CHECK(!complyWith(t0, tensor_type));
  // passing with dynamic shape
  auto t1 = t0.permute({0, 2, 1});
  TORCH_CHECK(complyWith(t1, tensor_type));
}

TEST(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
  std::vector<int64_t> sizes_vec({16, 8, 8});
  std::vector<int64_t> strides_vec({128, 16, 1});
  auto tensor_type = TensorType::create(
      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // contiguity check passes although it differs
  auto t0 = at::randn({16, 16, 8}, options);
  TORCH_CHECK(complyWith(t0, tensor_type));
  // passing with dynamic shape
  auto t1 = t0.slice(1, 0, 16, 2);
  TORCH_CHECK(complyWith(t1, tensor_type));
}
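// The four GroupGuard tests above probe complyWith, which checks a runtime
// tensor against the rank, dtype, stride order, and contiguity recorded in a
// TensorType guard. A compact sketch that builds the guard directly from a
// sample tensor instead of explicit size/stride vectors (the 8x8 shape and
// test name are illustrative assumptions, not part of the original suite):
TEST(NVFuserTest, FusionGroupGuardSketch_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({8, 8}, options);
  auto tensor_type = TensorType::create(t0);
  // The sample tensor itself complies with its own guard...
  TORCH_CHECK(complyWith(t0, tensor_type));
  // ...but a transposed view should not, since its stride order flips.
  TORCH_CHECK(!complyWith(t0.t(), tensor_type));
}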
TEST(NVFuserTest, FusionDisjointSet_CUDA) {
  DisjointSet<int> set;
  const std::set<int> group_x({0, 1, 2});
  const std::set<int> group_y({3, 4, 5});
  const std::set<int> group_z({6, 7, 8});
  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
  std::set<int> group_all;
  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
    group_all.insert(g.begin(), g.end());
  });
  // Initially, nothing should be considered equivalent
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.areEquivalent(i, j));
    }
  }
  // Mark the values in group_x as equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      set.join(i, j);
      TORCH_CHECK(set.contains(i));
      TORCH_CHECK(set.contains(j));
    }
  }
  // All values in group_x should be equivalent with each other
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.areEquivalent(i, j));
    }
  }
  // But nothing else should be equivalent
  for (auto i : group_all) {
    for (auto j : group_y) {
      TORCH_CHECK(!set.areEquivalent(i, j));
    }
    for (auto j : group_z) {
      TORCH_CHECK(!set.areEquivalent(i, j));
    }
  }
  // Mark the values in group_y as equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      set.join(i, j);
      TORCH_CHECK(set.contains(i));
      TORCH_CHECK(set.contains(j));
    }
  }
  // group_x should still be equivalent
  for (auto i : group_x) {
    for (auto j : group_x) {
      TORCH_CHECK(set.areEquivalent(i, j));
    }
  }
  // group_y should now be equivalent
  for (auto i : group_y) {
    for (auto j : group_y) {
      TORCH_CHECK(set.areEquivalent(i, j));
    }
  }
  // But group_z should not be equivalent with anything yet
  for (auto i : group_all) {
    for (auto j : group_z) {
      TORCH_CHECK(!set.areEquivalent(i, j));
    }
  }
  // Mark the values in group_z as equivalent
  for (auto i : group_z) {
    for (auto j : group_z) {
      set.join(i, j);
      TORCH_CHECK(set.contains(i));
      TORCH_CHECK(set.contains(j));
    }
  }
  // Now each of the three groups should be equivalent within each
  // group
  for (size_t gi = 0; gi < groups.size(); ++gi) {
    for (size_t gj = 0; gj < groups.size(); ++gj) {
      for (auto i : groups[gi]) {
        for (auto j : groups[gj]) {
          TORCH_CHECK(
              (gi == gj && set.areEquivalent(i, j)) ||
              (gi != gj && !set.areEquivalent(i, j)));
        }
      }
    }
  }
  auto all_elements = set.getAllElements();
  std::sort(all_elements.begin(), all_elements.end());
  std::vector<int> group_all_vec(group_all.begin(), group_all.end());
  std::sort(group_all_vec.begin(), group_all_vec.end());
  TORCH_CHECK(all_elements == group_all_vec);
  set.clear();
  all_elements = set.getAllElements();
  TORCH_CHECK(all_elements.size() == 0);
  // All cleared. Nothing should be considered equivalent.
  for (auto i : group_all) {
    for (auto j : group_all) {
      TORCH_CHECK(!set.areEquivalent(i, j));
    }
  }
}

TEST(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  auto tv1 = makeSymbolicTensor(2);
  auto tv2 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  auto tv3 = broadcast(tv0, {false, true});
  auto tv4 = add(tv3, tv1);
  auto tv5 = add(tv3, tv2);
  fusion.addOutput(tv4);
  fusion.addOutput(tv5);
  // In order to do this, tv1->axis(1) and tv2->axis(1) must have the
  // same size, but we can't prove it, so this should throw an error.
  ASSERT_ANY_THROW(tv3->computeAt(tv4, -1));
}

TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;
  // bias vector
  auto t0 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // input tensor
  auto t2 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  auto t4 = broadcast(t1, {true, true, false});
  auto t5 = add(t4, t3);
  auto t6 = mul(t5, new Double(0.5));
  auto t7 = mul(t5, new Double(k_079));
  auto t8 = mul(t5, new Double(k_004));
  auto t9 = mul(t8, t5);
  auto t10 = add(t9, new Int(1));
  auto t11 = mul(t7, t10);
  auto t12 = unaryOp(UnaryOpType::Tanh, t11);
  auto t13 = add(t12, new Double(1));
  auto t14 = mul(t6, t13);
  auto t15 = castOp(DataType::Half, t14);
  fusion.addOutput(t15);
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(0);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};
  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);
  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto aten_output_float =
      at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh());
  auto aten_output = aten_output_float.to(c10::ScalarType::Half);
  std::vector<IValue> aten_inputs = {at_bias, at_input};
  auto lparams = schedulePointwise(&fusion, aten_inputs);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}
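// For reference, the constants in the gelu tests come from the tanh
// approximation of GELU: 0.79788456 ~= sqrt(2/pi), and 0.044715 is the cubic
// correction term, i.e. gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x +
// 0.044715*x^3))). A small ATen-only sanity sketch comparing it against the
// exact erf-based form (the tolerance and test name are illustrative
// assumptions; the tanh form is an approximation, not bitwise equal):
TEST(NVFuserTest, FusionGeluApproxSketch_CUDA) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto x = at::randn({1000}, options);
  auto approx =
      x * 0.5 * (1.0 + (0.79788456 * x * (1 + 0.044715 * x * x)).tanh());
  auto exact = x * 0.5 * (1.0 + (x * 0.70710678).erf());
  // The tanh approximation deviates from exact GELU by well under 1e-2.
  TORCH_CHECK((approx - exact).abs().max().item<double>() < 1e-2);
}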
TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) {
  // skipping on pre-volta device
  if (at::cuda::getDeviceProperties(c10::cuda::current_device())->major < 7) {
    return;
  }
  Fusion fusion;
  FusionGuard fg(&fusion);
  const float k_079 = 0.79788456;
  const float k_004 = 0.044715;
  const float k_010 = 0.1070322243;
  // gradient tensor
  auto t0 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t0);
  auto t1 = castOp(DataType::Float, t0);
  // bias tensor
  auto t2 = makeSymbolicTensor(1, DataType::Half);
  fusion.addInput(t2);
  auto t3 = castOp(DataType::Float, t2);
  // input tensor
  auto t4 = makeSymbolicTensor(3, DataType::Half);
  fusion.addInput(t4);
  auto t5 = castOp(DataType::Float, t4);
  auto t6 = broadcast(t3, {true, true, false});
  auto t7 = add(t6, t5);
  auto t8 = mul(t7, new Double(k_079));
  auto t9 = mul(t7, new Double(k_004));
  auto t10 = mul(t9, t7);
  auto t11 = add(t10, new Int(1));
  auto t12 = mul(t8, t11);
  auto t13 = unaryOp(UnaryOpType::Tanh, t12);
  auto t14 = mul(t7, new Double(0.5));
  auto t15 = mul(t13, t13);
  auto t16 = unaryOp(UnaryOpType::Neg, t15);
  auto t17 = add(t16, new Int(1));
  auto t18 = mul(t7, new Double(k_010));
  auto t19 = mul(t18, t7);
  auto t20 = add(t19, new Double(k_079));
  auto t21 = mul(t17, t20);
  auto t22 = mul(t14, t21);
  auto t23 = add(t13, new Int(1));
  auto t24 = mul(t23, new Double(0.5));
  auto t25 = add(t22, t24);
  auto t26 = mul(t25, t1);
  // Save float output for validation
  fusion.addOutput(t26);
  auto t27 = castOp(DataType::Half, t26);
  fusion.addOutput(t27);
  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  at::manual_seed(1);
  std::vector<int64_t> input_shape{6, 512, 4096};
  std::vector<int64_t> bias_shape{4096};
  auto at_input = at::randn(input_shape, options);
  auto at_bias = at::randn(bias_shape, options);
  auto at_grad = at::randn(input_shape, options);
  auto at_x =
      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
  auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh();
  auto at_ff = 0.5 * at_x *
          ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) +
      0.5 * (1 + at_tanh_out);
  auto at_out = at_ff * at_grad;
  auto at_out_half = at_out.to(c10::ScalarType::Half);
  std::vector<IValue> aten_inputs = {at_grad, at_bias, at_input};
  std::vector<at::Tensor> aten_outputs = {at_out, at_out_half};
  auto lparams = schedulePointwise(&fusion, aten_inputs);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
  testValidate(
      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
}

// Reproducer of issue #459
TEST(NVFuserTest, FusionIssue459_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = add(tv0, new Double(1));
  auto tv3 = broadcast(tv2, {true, false});
  auto tv4 = add(tv1, tv3);
  // Create two outputs from the final arithmetic result
  auto tv5 = add(tv4, new Double(1));
  fusion.addOutput(tv5);
  auto tv6 = add(tv4, new Double(1));
  fusion.addOutput(tv6);
  // Scheduling
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->merge(-2, -1);
  }
  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
    output->split(0, 128);
  }
  tv0->computeAt(tv5, -1);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv6->axis(1)->parallelize(ParallelType::TIDx);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  const int numel_x = 10;
  const int numel_y = 20;
  auto t0 = at::randn({numel_x}, options);
  auto t1 = at::randn({numel_y, numel_x}, options);
  auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1;
  std::vector<IValue> aten_inputs = {t0, t1};
  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output, aten_output},
      __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSmemIndexingSimple_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = add(tv1, new Double(1));
  auto tv3 = add(tv2, new Double(1));
  fusion.addOutput(tv3);
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  tv0->computeAt(tv3, -1);
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Global);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto aten_input = at::randn({12, 34}, options);
  at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0;
  auto cg_outputs = fe.runFusion({aten_input});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}
TEST(NVFuserTest, FusionSmemIndexing_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Symbolic integers we will use for runtime tiling
  Int* symbolic_m_tile_dim = new Int();
  Int* symbolic_split_k_tile_dim = new Int();
  Int* symbolic_block_k_tile_dim = new Int();
  // Compile-time integer for tiling
  int n_smem_tile = 32;
  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  // Broadcast tv0 to [M, K, *]
  TensorView* tv2 = broadcast(tv0, {false, false, true});
  // Broadcast tv1 to [*, K, N]
  TensorView* tv3 = broadcast(tv1, {true, false, false});
  // Pointwise multiplication resulting in tv4[M, K, N]
  TensorView* tv4 = mul(tv2, tv3);
  // Sum the K-dim
  TensorView* tv5 = sum(tv4, {1});
  // Register inputs and outputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);
  // Register runtime tile dims as inputs
  fusion.addInput(symbolic_m_tile_dim);
  fusion.addInput(symbolic_split_k_tile_dim);
  fusion.addInput(symbolic_block_k_tile_dim);
  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
  // dims are inserted
  // [M, rK, N]
  tv5->split(2, n_smem_tile);
  // [M, rK, No, Ni{32}]
  tv5->split(1, symbolic_block_k_tile_dim);
  // [M, rKo, rKi{i2}, No, Ni{32}]
  tv5->split(1, symbolic_split_k_tile_dim);
  // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  tv5->split(0, symbolic_m_tile_dim);
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  // Reorder so all outer tiles are in the leftmost 3 positions
  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
  // [Mo, No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  tv5->reorder({{1, 5}, {5, 1}});
  // Factor out the outer reduction IterDomain, then run the inter-cta
  // reduction, and intra-cta reduction
  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
  auto tv6 = tv5->rFactor({2});
  // Scope computations
  tv6->computeAt(tv5, 2);
  // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}]
  // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}]
  tv6->reorder({
      {2, -2},
      {3, -1},
      {4, 2},
      {5, 3},
      {6, 4},
  });
  // Setup compute at schedule
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv4->computeAt(tv6, -1);
  // Cache smem tiles
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);
  tv6->setMemoryType(MemoryType::Shared);
  tv5->axis(0)->parallelize(ParallelType::BIDz);
  tv5->axis(1)->parallelize(ParallelType::BIDy);
  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
  for (auto tv : tv_list) {
    tv->axis(-2)->parallelize(ParallelType::TIDz);
    tv->axis(-1)->parallelize(ParallelType::TIDy);
  }
  constexpr int M = 31, K = 65, N = 32;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);
  at::Tensor aten_output =
      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
  // A, B, m_tile_dim, split_k, intra_cta_tile
  std::vector<IValue> aten_inputs = {t0, t1, 3, 4, 5};
  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}
// Reproducer of issue 408
TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = sum(tv1, {1});
  fusion.addOutput(tv2);
  tv2->split(0, 4);
  auto tv3 = tv2->cache_before();
  tv0->computeAt(tv3, -1);
  tv3->computeAt(tv2, -1);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  const int numel_x = 100;
  const int numel_y = 200;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
  at::Tensor cg_output = at::empty({numel_x}, options);
  auto aten_output = (aten_input + 1).to(at::kDouble).sum({1});
  fe.runFusion({aten_input}, {cg_output});
  testValidate(
      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(3);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = sum(tv1, {1});
  auto tv3 = add(tv2, new Double(1));
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  auto tv4 = tv2->cache_before();
  tv4->computeAt(tv3, 1);
  tv0->computeAt(tv4, -1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  const int numel_x = 10;
  const int numel_y = 20;
  const int numel_z = 30;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options);
  auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
  auto t3 = t2 + 1;
  std::vector<at::Tensor> aten_outputs = {t2, t3};
  auto cg_outputs = fe.runFusion({aten_input});
  testValidate(
      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}
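// cache_before (used twice above) inserts a fresh TensorView between a
// tensor's defining expression and the tensor itself, so the global-memory
// write can be scheduled separately from the computation feeding it;
// cache_after, used in FusionSmemDynamicReductionSymbolicArg, is the mirror
// image for reads. A minimal sketch, assuming Val::definition() as used
// elsewhere in this file (the shape and test name are illustrative
// assumptions, not part of the original suite):
TEST(NVFuserTest, FusionCacheBeforeSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  fusion.addOutput(tv1);
  // After cache_before, tv2 holds the computation and tv1 merely copies
  // tv2 out to global memory.
  auto tv2 = tv1->cache_before();
  TORCH_CHECK(tv1->definition()->inputs()[0] == tv2);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({100}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({t0});
  testValidate(&fusion, cg_outputs, {t0}, {t0 + 1}, __LINE__, __FILE__);
}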
TEST(NVFuserTest, FusionIssue367_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Symbolic integers we will use for runtime tiling
  Int* symbolic_m_tile_dim = new Int();
  Int* symbolic_split_k_tile_dim = new Int();
  Int* symbolic_block_k_tile_dim = new Int();
  // Compile-time integer for tiling
  int n_smem_tile = 32;
  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  // Broadcast tv0 to [M, K, *]
  TensorView* tv2 = broadcast(tv0, {false, false, true});
  // Broadcast tv1 to [*, K, N]
  TensorView* tv3 = broadcast(tv1, {true, false, false});
  // Pointwise multiplication resulting in tv4[M, K, N]
  TensorView* tv4 = mul(tv2, tv3);
  // Sum the K-dim
  TensorView* tv5 = sum(tv4, {1});
  // Register inputs and outputs
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv5);
  // Register runtime tile dims as inputs
  fusion.addInput(symbolic_m_tile_dim);
  fusion.addInput(symbolic_split_k_tile_dim);
  fusion.addInput(symbolic_block_k_tile_dim);
  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
  // dims are inserted
  tv5->split(2, n_smem_tile);
  tv5->split(1, symbolic_block_k_tile_dim);
  tv5->split(1, symbolic_split_k_tile_dim);
  tv5->split(0, symbolic_m_tile_dim);
  // tv5[M/m_tile, m_tile, r{K/split_k/block_k}, r{split_k}, r{block_k}, N/32,
  // 32]
  tv5->reorder({{1, 5}, {5, 1}});
  // tv5[M/m_tile, N/32, r{K/split_k/block_k}, r{split_k}, r{block_k}, m_tile,
  // 32]
  auto tv6 = tv5->rFactor({2});
  auto tv7 = tv5->rFactor({2});
  // Scope computations
  tv6->computeAt(tv5, 2);
  tv6->reorder({
      {2, -2},
      {3, -1},
      {4, 2},
      {5, 3},
      {6, 4},
  });
  tv7->reorder({
      {2, -2},
      {3, -1},
      {-2, 2},
      {-1, 3},
  });
  tv0->computeAt(tv6, 3);
  tv1->computeAt(tv6, 3);
  tv4->computeAt(tv6, -1);
  // Cache smem tiles
  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Local);
  tv6->setMemoryType(MemoryType::Local);
  tv7->setMemoryType(MemoryType::Local);
  tv5->axis(0)->parallelize(ParallelType::BIDz);
  tv5->axis(1)->parallelize(ParallelType::BIDy);
  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6, tv7};
  for (auto tv : tv_list) {
    tv->axis(-2)->parallelize(ParallelType::TIDz);
    tv->axis(-1)->parallelize(ParallelType::TIDy);
  }
  tv2->axis(3)->parallelize(ParallelType::TIDx);
  tv3->axis(3)->parallelize(ParallelType::TIDx);
  tv4->axis(3)->parallelize(ParallelType::TIDx);
  tv6->axis(3)->parallelize(ParallelType::TIDx);
  tv7->axis(2)->parallelize(ParallelType::TIDx);
  tv2->axis(4)->parallelize(ParallelType::BIDx);
  tv3->axis(4)->parallelize(ParallelType::BIDx);
  tv4->axis(4)->parallelize(ParallelType::BIDx);
  tv6->axis(4)->parallelize(ParallelType::BIDx);
  tv7->axis(3)->parallelize(ParallelType::BIDx);
  tv5->axis(2)->parallelize(ParallelType::BIDx);
  constexpr int M = 3, K = 6, N = 16;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({M, K}, options);
  at::Tensor t1 = at::randn({K, N}, options);
  // A, B, m, split_k, block_k
  std::vector<IValue> aten_inputs = {t0, t1, 2, 2, 3};
  at::Tensor aten_output =
      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  testValidate(
      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionIssue468_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  auto tv2 = sum(tv1, {0});
  fusion.addOutput(tv2);
  tv1->axis(0)->parallelize(ParallelType::TIDy);
  tv1->axis(1)->parallelize(ParallelType::TIDx);
  tv2->axis(0)->parallelize(ParallelType::TIDy);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({10, 100}, options);
  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0});
  torch::jit::fuser::cuda::FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});
  testValidate(
      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
}
aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionIssue484_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); auto tv2 = add(tv1, new Double(0)); fusion.addOutput(tv2); tv1->setMemoryType(MemoryType::Global); tv1->axis(1)->parallelize(ParallelType::TIDx); constexpr int M = 100; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, M}, options); at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, Issue329_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = sum(tv1, {1}); fusion.addOutput(tv2); auto tv3 = sum(tv1, {1}); fusion.addOutput(tv3); tv1->computeAt(tv2, -1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); auto t2 = (aten_input + 1).to(at::kDouble).sum({1}); auto t3 = (aten_input + 1).to(at::kDouble).sum({1}); std::vector aten_outputs = {t2, t3}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } TEST(NVFuserTest, FusionIssue382_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = broadcast(tv1, {false, false, true}); auto tv3 = makeSymbolicTensor(3); fusion.addInput(tv3); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); tv2->merge(1); tv4->merge(1); tv1->computeAt(tv4, 1); tv4->axis(0)->parallelize(ParallelType::BIDx); tv1->setMemoryType(MemoryType::Global); tv2->setMemoryType(MemoryType::Global); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); const int numel_x = 12; const int numel_y = 34; const int numel_z = 56; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); auto t0 = at::randn({numel_x, numel_y}, options); auto t3 = at::randn({numel_x, numel_y, numel_z}, options); std::vector aten_inputs = {t0, t3}; auto aten_output = (t0 + 1).unsqueeze(-1) + t3; auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, Issue507_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->setMemoryType(MemoryType::Shared); tv1->axis(1)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::TIDx); tv1->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(0)->parallelize(ParallelType::BIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); auto t1 = (aten_input + 1); auto aten_output = (t1 + 1); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionIssue532_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm 
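// The algorithm is just two chained pointwise adds over a 1D tensor; what
// this test exercises is the scheduling below, where producer and consumer
// are split by different inner factors (M_BLOCK / M_THREAD vs. M_THREAD)
// after computeAt.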
TensorView* tv0 = makeSymbolicTensor(1); TensorView* tv1 = add(tv0, new Double(1)); TensorView* tv2 = add(tv1, new Double(1)); fusion.addInput(tv0); fusion.addOutput(tv2); const int M_BLOCK = 64; const int M_THREAD = 4; tv2->split(0, M_BLOCK); // tv2: [M/M_BLOCK, M_BLOCK] tv1->computeAt(tv2, 1); // tv1: [M/M_BLOCK, M_BLOCK] tv1->split(-1, M_BLOCK / M_THREAD); // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD] tv2->split(-1, M_THREAD); // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD] constexpr int M = 1000; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0 + 1 + 1; testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm TensorView* tv0 = makeSymbolicTensor(1); TensorView* tv1 = add(tv0, new Double(1)); TensorView* tv2 = add(tv1, new Double(1)); fusion.addInput(tv0); fusion.addOutput(tv2); tv2->split(0, 32); tv1->computeAt(tv2, -1); tv2->axis(1)->parallelize(ParallelType::Unswitch); constexpr int M = 1000; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0 + 1 + 1; testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionIssue549_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); // M, K TensorView* tv1 = makeSymbolicTensor(2); // K, N fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(1)); TensorView* tv3 = broadcast(tv2, {false, false, true}); // tv3[I0, I1, B] = tv0[I0, I1] TensorView* tv4 = broadcast(tv1, {true, false, false}); // tv4[B, I1, I2] = tv1[I1, I2] // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2] TensorView* tv5 = mul(tv3, tv4); // tv6[I0, R1, I2] = tv5[I0, I1, I2] TensorView* tv6 = sum(tv5, {1}); fusion.addOutput(tv6); tv6->split(1, 32); // tv6[I0, R1o, R1i{32}, I2] auto tv7 = tv6->rFactor({1}); // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2] // tv6[I0, , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2] tv6->split(0, 4); tv6->split(-1, 4); // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] tv0->computeAt(tv6, -1); tv1->computeAt(tv6, -1); // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] // tv6[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] //--> (line symbolizes compute at location) // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv0->computeAt(tv7, -1); tv1->computeAt(tv7, -1); // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv6->axis(0)->parallelize(ParallelType::BIDz); tv6->axis(1)->parallelize(ParallelType::TIDz); tv6->axis(-2)->parallelize(ParallelType::BIDy); tv6->axis(-1)->parallelize(ParallelType::TIDy); tv6->axis(2)->parallelize(ParallelType::TIDx); tv7->axis(2)->parallelize(ParallelType::TIDx); constexpr int M = 65, K = 33, N = 17; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, 
options); at::Tensor t1 = at::randn({K, N}, options); FusionExecutor fe; fe.compileFusion(&fusion); // Lets specify a few bounds in launch params to make sure it works fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params auto cg_outputs = fe.runFusion({t0, t1}); auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); testValidate( &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, simplecompileRtc_CUDA) { FusionExecutor fe; std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { if(threadIdx.x==0){ for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) { T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2; } } } )"; fe.compileRtc(kernel, "CudaCodeGen::kernel1"); LaunchParams lp( 256, // gdimx 1, // gdimy 1, // gdimz 1, // bdimx 1, // bdimy 1 // bdimz ); lp.setSmem(0); const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const std::vector tensor_dims = {8}; auto in0 = at::randn(tensor_dims, options); auto out0 = at::empty_like(in0); fe.runRtc(lp, {in0, out0}); auto out_ref = in0 * 2; TORCH_CHECK(out_ref.allclose(out0)); } TEST(NVFuserTest, serialWelford_CUDA) { FusionExecutor fe; int x = 128, y = 64, z = 64; std::string kernel = R"( __global__ void kernel1( Tensor inp, Tensor out_var, Tensor out_avg ){ for(int i0=0;i0 tensor_dims = {x, y, z}; auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); fe.runRtc(lp, {in0, out_var, out_avg}); TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST(NVFuserTest, blockWelford_CUDA) { FusionExecutor fe; int x = 7, y = 8, z = 9; std::string kernel = R"( __global__ void kernel1( Tensor inp, Tensor out_avg, Tensor out_var, Tensor init_avg, Tensor init_var, Tensor init_N ){ //actual generated kernel will use dynamic shared mem, // here is just for prototype __shared__ float mem_avg[512]; __shared__ float mem_M2[512]; __shared__ long mem_N[512]; float in=inp[threadIdx.x*inp.stride[0]+ threadIdx.y*inp.stride[1]]; float tmp_avg=0; float tmp_M2=0; long tmp_N=0; blockWelford( tmp_avg, tmp_M2, tmp_N, in, 0.f, (long)1, threadIdx, blockDim, (float*)mem_avg, (float*)mem_M2, (long*)mem_N, (bool)(threadIdx.x tensor_dims = {x, y}; const std::vector init_dims = {x, z}; // generate initial values auto init_in = at::randn(init_dims, options); auto init_var = init_in.var({1}, false); auto init_avg = init_in.mean({1}); auto init_N = at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0)); auto in0 = at::randn(tensor_dims, options); // run kernel auto out_var = at::zeros({x}, options); auto out_avg = at::zeros({x}, options); fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N}); // compare with reference output auto cat_tensor = at::cat({init_in, in0}, 1); TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var)); TORCH_CHECK( cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST(NVFuserTest, blockWelfordNoInit_CUDA) { FusionExecutor fe; int x = 7, y = 8, z = 9; // need support IValue for integer input as initial count std::string kernel = R"( __global__ void kernel1( Tensor inp, Tensor out_avg, Tensor out_var ){ //actual generated kernel will use dynamic shared mem, // 
here is just for prototype __shared__ float mem_avg[512]; __shared__ float mem_M2[512]; __shared__ long mem_N[512]; float in=inp[threadIdx.x*inp.stride[0]+ threadIdx.y*inp.stride[1]+ threadIdx.z*inp.stride[2]]; float tmp_avg=0; float tmp_M2=0; long tmp_N=0; block_sync::init(); blockWelford( tmp_avg, tmp_M2, tmp_N, in, 0.f, (long) 1, threadIdx, blockDim, (float*)mem_avg, (float*)mem_M2, (long*)mem_N, (bool)(threadIdx.x tensor_dims = {x, y, z}; auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); fe.runRtc(lp, {in0, out_avg, out_var}); TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST(NVFuserTest, gridWelfordNoInit_CUDA) { FusionExecutor fe; int x = 128, y = 64, z = 128; std::string kernel = R"( __global__ void kernel1( Tensor inp, Tensor out_avg, Tensor out_var, Tensor work_buf_avg, Tensor work_buf_M2, Tensor work_buf_N, Tensor sync_flag ){ __shared__ float shared_buf_avg[512]; __shared__ float shared_buf_M2[512]; __shared__ long shared_buf_N[512]; float tmp_avg=0; float tmp_M2=0; long tmp_N=0; float in = inp[ blockIdx.x * inp.stride[0]+ blockIdx.y * inp.stride[1]+ threadIdx.x * inp.stride[2]]; bool T_pred; block_sync::init(); T_pred=welford::gridWelford< true,true,false, true,false,false >( tmp_avg, tmp_M2, tmp_N, in, 0.f, (long) 1, &work_buf_avg[0], &work_buf_M2[0], &work_buf_N[0], sync_flag, (float*)shared_buf_avg, (float*)shared_buf_M2, (long*)shared_buf_N, threadIdx.x tensor_dims = {x, y, z}; auto in0 = at::randn(tensor_dims, options); auto out_avg = at::empty({z}, options); auto out_var = at::empty({z}, options); auto work_buf_avg = at::empty({x * y * z}, options); auto work_buf_var = at::empty({x * y * z}, options); auto work_buf_N = at::empty({x * y * z}, options_int); auto sync_flag = at::zeros({1}, options_int); fe.runRtc( lp, {in0, out_avg, out_var, work_buf_avg, work_buf_var, work_buf_N, sync_flag}); std::vector dims{0, 1}; TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); TORCH_CHECK(in0.var(dims, false).allclose(out_var)); } TEST(NVFuserTest, FusionWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int M = 64, N = 128; auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = mul(tv0, new Double(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; auto tv_N = tvs.n; fusion.addOutput(tv_avg); fusion.addOutput(tv_M2); fusion.addOutput(tv_N); tv_avg->split(1, 32); tv_avg->split(0, 32); tv_avg->split(0, 4); tv_avg->reorder({{-1, -3}, {-3, -1}}); tv1->computeAt(tv_avg, -1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M, N}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( &fusion, outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int M = 64, N = 128; auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = mul(tv0, new Double(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; auto tv_N = tvs.n; fusion.addOutput(tv_avg); fusion.addOutput(tv_M2); 
fusion.addOutput(tv_N);
  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->computeAt(tv_avg, -1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t_var = at::empty({M}, options);
  at::Tensor t_avg = at::empty({M}, options);
  at::Tensor t_N = at::empty({M}, options_int);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({t0});
  // by default Welford outputs the sum of squared differences, so divide by N
  // to get the variance
  outputs[1] /= N;
  testValidate( &fusion, outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionGridWelfordOp_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  int M = 64, N = 128;
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = mul(tv0, new Double(1));
  auto tvs = Welford(tv1, {1});
  auto tv_avg = tvs.avg;
  auto tv_M2 = tvs.var_sum;
  auto tv_N = tvs.n;
  fusion.addOutput(tv_avg);
  fusion.addOutput(tv_M2);
  fusion.addOutput(tv_N);
  tv_avg->axis(0)->parallelize(ParallelType::TIDx);
  tv_avg->axis(-1)->parallelize(ParallelType::BIDx);
  tv1->computeAt(tv_avg, -1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t_avg = at::empty({M}, options);
  at::Tensor t_var = at::empty({M}, options);
  at::Tensor t_N = at::empty({M}, options_int);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({t0});
  // by default Welford outputs the sum of squared differences, so divide by N
  // to get the variance
  outputs[1] /= N;
  testValidate( &fusion, outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  int M = 64, N = 128;
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = mul(tv0, new Double(1));
  auto tvs = Welford(tv1, {1});
  auto tv_avg = tvs.avg;
  auto tv_M2 = tvs.var_sum;
  auto tv_N = tvs.n;
  fusion.addOutput(tv_avg);
  fusion.addOutput(tv_M2);
  fusion.addOutput(tv_N);
  tv_avg->split(1, 4);
  auto rtvs = tvs.rFactor({2});
  tv1->computeAt(tv_avg, -1);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({M, N}, options);
  at::Tensor t_avg = at::empty({M}, options);
  at::Tensor t_var = at::empty({M}, options);
  at::Tensor t_N = at::empty({M}, options_int);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({t0});
  // by default Welford outputs the sum of squared differences, so divide by N
  // to get the variance
  outputs[1] /= N;
  testValidate( &fusion, outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionWelfordSchedule_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  int M = 64, N = 128;
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = mul(tv0, new Double(1));
  auto tvs = Welford(tv1, {1});
  auto tv_avg = tvs.avg;
  auto tv_M2 = tvs.var_sum;
  auto tv_N = tvs.n;
  fusion.addOutput(tv_avg);
  fusion.addOutput(tv_M2);
  fusion.addOutput(tv_N);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA,
0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({M, N}, options);
  // TODO: Why do we use launch params from here, but not scheduling???
  auto reduction_params = getReductionHeuristics(&fusion, {t0});
  scheduleReduction(&fusion, reduction_params.value());
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({t0}, reduction_params.value().lparams);
  // by default Welford outputs the sum of squared differences, so divide by N
  // to get the variance
  outputs[1] /= N;
  auto at_avg = t0.mean({1});
  auto at_var = t0.var({1}, false);
  auto at_n = at::ones({M}, options_int) * N;
  testValidate( &fusion, outputs, {t0}, {at_avg, at_var, at_n}, __LINE__, __FILE__, "validate welford", reduction_params.value().lparams);
}

namespace {
void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
  const int axis = red_axis;
  at::ScalarType aten_dtype = data_type_to_aten(dtype);
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2, dtype);
  bool is_fp16 = dtype == DataType::Half;
  TensorView* tv0_cast = tv0;
  if (is_fp16) {
    tv0_cast = castOp(DataType::Float, tv0);
  }
  fusion.addInput(tv0);
  auto tv1 = mul(tv0_cast, new Double(1));
  auto tvs = Welford(tv1, {axis});
  auto tv_avg = tvs.avg;
  auto tv_M2 = tvs.var_sum;
  auto tv_N = tvs.n;
  TensorView* avg_cast = tv_avg;
  TensorView* M2_cast = tv_M2;
  if (is_fp16) {
    avg_cast = castOp(DataType::Half, tv_avg);
    M2_cast = castOp(DataType::Half, tv_M2);
  }
  fusion.addOutput(avg_cast);
  fusion.addOutput(M2_cast);
  fusion.addOutput(tv_N);
  auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  std::vector<TensorView*> outputs_of_red;
  at::Tensor aten_input = (axis ? at::randn({odim, rdim}, options) : at::randn({rdim, odim}, options));
  if (is_fp16) {
    outputs_of_red.push_back(avg_cast);
    outputs_of_red.push_back(M2_cast);
  }
  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
  scheduleReduction(&fusion, reduction_params.value());
  auto lparams = reduction_params.value().lparams;
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({aten_input}, reduction_params.value().lparams);
  // by default Welford outputs the sum of squared differences, so divide by
  // rdim to get the variance
  outputs[1] /= rdim;
  auto at_avg = aten_input.mean({axis});
  auto at_var = aten_input.var({axis}, false);
  auto at_n = (axis ? at::ones({odim, rdim}, options) : at::ones({rdim, odim}, options));
  at_n = at_n.sum({axis});
  testValidate( &fusion, outputs, {aten_input}, {at_avg, at_var, at_n}, __LINE__, __FILE__, "validate welford", reduction_params.value().lparams);
}
} // namespace

TEST(NVFuserTest, FusionWelfordShmoo_CUDA) {
  std::vector<DataType> dtypes = {DataType::Double, DataType::Float, DataType::Half};
  std::vector<int> red_axis = {1, 0};
  std::vector<int> output_dims = {160, 320};
  std::vector<int> red_dims;
  // Tried to cut down the number of iterations by doing only every other
  // power of 2.
  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
    red_dims.push_back(i);
  }
  for (auto dtype : dtypes) {
    for (auto& axis : red_axis) {
      for (auto& odim : output_dims) {
        for (auto& rdim : red_dims) {
          // TODO: the original Welford algorithm keeps a running sum of
          // squares, i.e. M_{2n} in the notation of
          // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
          // and it can reach inf for large numbers with half precision.
          // Skipping too-large volumes for half for now; this might need
          // further numerical experiments to re-design.
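          // For reference, Welford's running update for each new element x_k:
          //   delta = x_k - avg; avg += delta / k; M2 += delta * (x_k - avg);
          // with var = M2 / k (population) or M2 / (k - 1) (sample) at the
          // end. M2 grows with the reduction size, which is why it can
          // overflow to inf in half precision for the volumes skipped here.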
if (rdim > 32768 && dtype == DataType::Half) { continue; } testWelford(dtype, axis, odim, rdim); } } } } } TEST(NVFuserTest, FusionTranspose1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int M = 10; constexpr int N = 20; auto tv0 = makeSymbolicTensor(2); auto tv1 = transpose(tv0, {{0, 1}}); fusion.addInput(tv0); fusion.addOutput(tv1); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M, N}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.t(); testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionTranspose2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int M = 10; constexpr int N = 20; auto tv0 = makeSymbolicTensor(2); auto tv1 = transpose(tv0, {{0, 1}}); fusion.addInput(tv0); fusion.addOutput(tv1); tv1->merge(0); tv1->split(0, 32); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M, N}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.t(); testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); // K, M TensorView* tv1 = makeSymbolicTensor(2); // N, K fusion.addInput(tv0); fusion.addInput(tv1); TensorView* tv0_t = transpose(tv0, {{0, 1}}); TensorView* tv1_t = transpose(tv1, {{0, 1}}); TensorView* tv2 = broadcast(tv0_t, {false, false, true}); // tv2[I0, I1, B] = tv0[I0, I1] TensorView* tv3 = broadcast(tv1_t, {true, false, false}); // tv3[B, I1, I2] = tv1[I1, I2] // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] TensorView* tv4 = mul(tv2, tv3); // tv5[I0, R1, I2] = tv4[I0, I1, I2] TensorView* tv5 = sum(tv4, {1}); fusion.addOutput(tv5); tv5->split(1, 32); // tv5[I0, R1o, R1i{32}, I2] auto tv6 = tv5->rFactor({1}); // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] tv5->split(0, 4); tv5->split(-1, 4); // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] tv0_t->computeAt(tv5, -1); tv1_t->computeAt(tv5, -1); // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] //--> (line symbolizes compute at location) // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv0_t->computeAt(tv6, -1); tv1_t->computeAt(tv6, -1); // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::TIDz); tv5->axis(-2)->parallelize(ParallelType::BIDy); tv5->axis(-1)->parallelize(ParallelType::TIDy); tv5->axis(2)->parallelize(ParallelType::TIDx); tv6->axis(2)->parallelize(ParallelType::TIDx); constexpr int M = 65, K = 33, N = 17; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({K, M}, options); 
at::Tensor t1 = at::randn({N, K}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // Let's specify a few bounds in launch params to make sure it works
  fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));
  // Don't specify any launch params
  auto cg_outputs = fe.runFusion({t0, t1});
  auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble));
  testValidate(&fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const int tidx = 32;
  const int dimx = 32;
  const int dimy = 16;
  const int dimz = 130;
  // Set up your input tensor views
  TensorView* input_tv0 = makeSymbolicTensor(3);
  fusion.addInput(input_tv0);
  TensorView* input_t = transpose(input_tv0, {{1, 2}});
  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t);
  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
  // Replicate input_t and exp_tv1 as input_t_copy and exp_tv1_copy because
  // exp_tv1 is going to be computed at sum_exp_rf_tv5.
  TensorView* input_t_copy = transpose(input_tv0, {{1, 2}});
  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy);
  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
  fusion.addOutput(output_tv4);
  bcast_sum_tv3->split(-1, tidx);
  sum_exp_tv2->split(-1, tidx);
  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
  output_tv4->split(-1, tidx);
  input_t->computeAt(sum_exp_rf_tv5, -1);
  input_t_copy->computeAt(output_tv4, -1);
  TensorView* tensors_to_parallelize[] = {sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
  for (auto tv : tensors_to_parallelize) {
    tv->axis(0)->parallelize(ParallelType::BIDx);
    tv->axis(1)->parallelize(ParallelType::BIDy);
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({dimx, dimz, dimy}, options);
  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  fe.runFusion({input}, {cg_output});
  auto aten_input_t = at::transpose(input, 1, 2);
  auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false);
  testValidate(&fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) {
  // Case 1
  // tv1 = tv0 * 0.5
  // tv2 = tv1 * -1
  // tv3 = tv1 + 3
  // tv4 = tv1 * 2
  // tv5 = tv3 + tv2
  // tv6 = tv5 + tv4
  // tv7 = tv1 + tv4
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  tv0 = transpose(tv0, {{0, 1}});
  TensorView* tv1 = mul(tv0, new Double(0.5));
  TensorView* tv2 = mul(tv1, new Double(-1.0));
  TensorView* tv3 = add(tv1, new Double(3.0));
  TensorView* tv4 = mul(tv1, new Double(2.0));
  TensorView* tv5 = add(tv3, tv2);
  TensorView* tv6 = add(tv5, tv4);
  TensorView* tv7 = add(tv1, tv4);
  fusion.addOutput(tv6);
  fusion.addOutput(tv7);
  // Let's set up to actually run
  tv7->merge(0);
  tv7->split(0, 128);
  tv7->split(0, 4);
  tv7->axis(0)->parallelize(ParallelType::BIDx);
  tv0->computeAt(tv7, 1);
  // The compute-at position of the last tensor should be zero.
  TORCH_CHECK( tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && tv7->getMaxProducerPosition() == 1);
  TORCH_CHECK( tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 && tv6->getMaxProducerPosition() == 1);
  // The position of every other tensor should be 1.
for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
  }
  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn({129, 127}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({aten_input});
  at::Tensor aten_input_t = aten_input.t();
  auto t1 = aten_input_t.mul({0.5});
  auto t2 = t1.mul({-1.0});
  auto t3 = t1.add({3.0});
  auto t4 = t1.mul({2.0});
  auto t5 = t3.add(t2);
  auto t6 = t5.add(t4);
  auto t7 = t1.add(t4);
  std::vector<at::Tensor> aten_outputs = {t6, t7};
  testValidate(&fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) {
  // Case 2
  // tv1 = tv0 * -1
  // tv2 = tv0 + 3
  // tv3 = tv0 * 2
  // tv4 = tv2 + tv1
  // tv5 = tv4 + tv3
  // tv6 = tv5 + tv3
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  tv0 = transpose(tv0, {{0, 1}});
  TensorView* tv1 = mul(tv0, new Double(-1.0));
  TensorView* tv2 = add(tv0, new Double(3.0));
  TensorView* tv3 = mul(tv0, new Double(2.0));
  TensorView* tv4 = add(tv2, tv1);
  TensorView* tv5 = add(tv4, tv3);
  TensorView* tv6 = add(tv5, tv3);
  fusion.addOutput(tv5);
  fusion.addOutput(tv6);
  // Let's set up to actually run
  tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  tv0->computeAt(tv6, 1);
  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({129, 127}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion({input});
  auto input_t = input.t();
  auto t1 = input_t.mul({-1.0});
  auto t2 = input_t.add({3.0});
  auto t3 = input_t.mul({2.0});
  auto t4 = t2.add(t1);
  auto t5 = t4.add(t3);
  auto t6 = t5.add(t3);
  std::vector<at::Tensor> aten_outputs = {t5, t6};
  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) {
  // Case 3
  // T2 = T1 * 0.979361
  // T3 = T2 * T0
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);
  tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);
  tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv2 = mul(tv1, new Double(.979361));
  TensorView* tv3 = mul(tv2, tv0);
  fusion.addOutput(tv3);
  // Let's set up to actually run
  while (tv3->nDims() > 1) tv3->merge(0);
  tv3->split(0, 128);
  tv3->split(0, 4);
  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);
std::vector<IValue> aten_inputs = {t0, t1};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto t0_t = t0.permute({3, 0, 1, 2});
  auto t1_t = t1.permute({3, 0, 1, 2});
  auto t2 = t1_t.mul({0.979361});
  auto aten_output = t2.mul(t0_t);
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) {
  // Case 4
  // T4 = T2 - T3
  // T5 = T1 + T4
  // T6 = T5 - T0
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);
  tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv1 = makeSymbolicTensor(4);
  fusion.addInput(tv1);
  tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv2 = makeSymbolicTensor(4);
  fusion.addInput(tv2);
  tv2 = transpose(tv2, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv3 = makeSymbolicTensor(4);
  fusion.addInput(tv3);
  tv3 = transpose(tv3, {{0, 1}, {1, 2}, {2, 3}, {3, 0}});
  TensorView* tv4 = sub(tv2, tv3);
  TensorView* tv5 = add(tv1, tv4);
  TensorView* tv6 = sub(tv5, tv0);
  fusion.addOutput(tv6);
  // Let's set up to actually run
  while (tv6->nDims() > 1) tv6->merge(0);
  tv6->split(0, 128);
  tv6->split(0, 4);
  tv0->computeAt(tv6, 1);
  tv1->computeAt(tv6, 1);
  tv2->computeAt(tv6, 1);
  tv3->computeAt(tv6, 1);
  tv6->axis(0)->parallelize(ParallelType::BIDx);
  for (Val* val : fusion.vals()) {
    if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) {
      TensorView* tv = static_cast<TensorView*>(val);
      tv->axis(1)->parallelize(ParallelType::Unroll);
      tv->axis(-1)->parallelize(ParallelType::TIDx);
    }
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);
  at::Tensor t2 = at::rand_like(t0, options);
  at::Tensor t3 = at::rand_like(t0, options);
  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto t0_t = t0.permute({3, 0, 1, 2});
  auto t1_t = t1.permute({3, 0, 1, 2});
  auto t2_t = t2.permute({3, 0, 1, 2});
  auto t3_t = t3.permute({3, 0, 1, 2});
  auto t4 = t2_t.sub(t3_t);
  auto t5 = t1_t.add(t4);
  auto aten_output = t5.sub(t0_t);
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) {
  // Case 5
  // tv2 = tv0 + 2.0
  // tv3 = tv1 * tv2
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Set up your input tensor views
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  tv0 = transpose(tv0, {{0, 1}});
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  tv1 = transpose(tv1, {{0, 1}});
  TensorView* tv2 = add(tv0, new Double(2.0));
  TensorView* tv3 = mul(tv1, tv2);
  fusion.addOutput(tv3);
  tv3->merge(0);
  tv3->split(-1, 8);
  tv3->split(-1, 4);
  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({63, 65}, options);
  at::Tensor t1 = at::rand_like(t0, options);
  std::vector<IValue> aten_inputs = {t0, t1};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto t2 = t0.t().add(2.0);
  auto aten_output = t1.t().mul(t2);
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
tv0 = transpose(tv0, {{0, 1}}); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); tv1 = transpose(tv1, {{0, 1}}); TensorView* tv2 = add(tv0, new Double(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); tv2->merge(0); tv2->split(-1, 8); tv2->split(-1, 4); tv3->merge(0); tv3->split(-1, 8); tv0->computeAt(tv3, 1); tv1->computeAt(tv3, 1); tv3->axis(0)->parallelize(ParallelType::BIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({63, 65}, options); at::Tensor t1 = at::rand_like(t0, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(1); TensorView* tv2 = makeSymbolicTensor(2); fusion->addInput(tv0); fusion->addInput(tv1); fusion->addInput(tv2); TensorView* tv3 = add(tv0, new Double(1)); // Group 0 TensorView* tv4 = max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, // keeps normalization scheduler away) TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) fusion->addOutput(tv6); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, 65}, options); at::Tensor t1 = at::randn({65}, options); at::Tensor t2 = at::randn({128, 65}, options); auto t3 = t0.add(1.0); auto t4 = std::get<0>(at::max(t3, 0)); auto t5 = t4.add(t1); auto t6 = t5.add(t2); FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime()->isSegmented(), "segmentation didn't happen"); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime() ->fusionSegments() ->groups() .size() == 2, "segmentation didn't happen as expected"); testValidate( executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionMultipleVectorize_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); TensorView* tv0 = makeContigTensor(1); TensorView* tv1 = makeContigTensor(1); fusion->addInput(tv0); fusion->addInput(tv1); TensorView* tv3 = add(tv0, tv1); fusion->addOutput(tv3); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({40960}, options); at::Tensor t1 = at::randn({40960}, options); auto t2 = t0 + t1; FusionExecutorCache executor_cache(std::move(fusion)); executor_cache.profile(true); auto outputs = executor_cache.runFusionWithInputs({t0, t1}); auto runtime1 = executor_cache.getMostRecentKernelRuntime(); auto log1 = executor_cache.getMostRecentExecutorInfo().pointwise_params; TORCH_CHECK(log1.has_value()); TORCH_CHECK(log1->vectorize); testValidate( executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); t0 = at::randn({40964}, options); t1 = at::randn({40964}, options); t2 = t0 + t1; outputs = executor_cache.runFusionWithInputs({t0, t1}); auto runtime2 = executor_cache.getMostRecentKernelRuntime(); auto log2 = executor_cache.getMostRecentExecutorInfo().pointwise_params; TORCH_CHECK(log2.has_value()); TORCH_CHECK(log2->vectorize); testValidate( 
executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); t0 = at::randn({40962}, options); t1 = at::randn({40962}, options); t2 = t0 + t1; outputs = executor_cache.runFusionWithInputs({t0, t1}); auto runtime3 = executor_cache.getMostRecentKernelRuntime(); auto log3 = executor_cache.getMostRecentExecutorInfo().pointwise_params; TORCH_CHECK(log3.has_value()); TORCH_CHECK(log3->vectorize); testValidate( executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); TORCH_CHECK(runtime1 == runtime2); TORCH_CHECK(runtime1 != runtime3); } TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeContigTensor(3); fusion.addInput(tv0); auto tv1 = unaryOp(UnaryOpType::Sin, tv0); fusion.addOutput(tv1); auto tv0_cache = tv0->cache_after(); auto tv1_cache = tv1->cache_before(); tv1->merge(0); tv1->merge(0); tv1->split(0, 4); tv1->split(0, 128); tv1->axis(0)->parallelize(ParallelType::BIDx); tv1->axis(1)->parallelize(ParallelType::TIDx); tv0->computeAt(tv1, 2); tv0_cache->axis(2)->parallelize(ParallelType::Vectorize); tv1->axis(2)->parallelize(ParallelType::Vectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::empty({2, 6, 32}, options); FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({aten_input}); at::Tensor aten_output = aten_input.sin(); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); std::vector input_shape{32, 64, 8}; const int kReductionAxis = 1; auto tv0 = TensorViewBuilder() .ndims(input_shape.size()) .dtype(DataType::Double) .build(); fusion->addInput(tv0); auto tv1 = add(tv0, new Double(1.0)); auto tv2 = sum(tv1, {2}); // Group 0 auto output = softmax(tv2, kReductionAxis); // Group 1 fusion->addOutput(output); auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({at_x}); auto t1 = at_x.add(1.0); auto t2 = t1.sum({2}); auto t3 = at::_softmax(t2.to(at::kDouble), -1, false); auto optimized_fusion = executor_cache.getMostRecentKernelRuntime(); TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen"); TORCH_CHECK( optimized_fusion->fusionSegments()->groups().size() == 2, "segmentation didn't happen as expected"); testValidate( executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSwizzle1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = mul(tv1, new Double(2)); fusion.addOutput(tv2); tv2->split(0, 7); tv2->split(0, 9); tv0->computeAt(tv2, 1); tv2->axis(0)->parallelize(ParallelType::BIDx); tv1->setMemoryType(MemoryType::Shared); tv1->swizzle(SwizzleType::Transpose, {1, 2}); tv1->axis(1)->parallelize(ParallelType::TIDx); tv1->axis(2)->parallelize(ParallelType::TIDy); tv2->axis(1)->parallelize(ParallelType::TIDx); tv2->axis(2)->parallelize(ParallelType::TIDy); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({100}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = (t0 + 1) * 2; testValidate( &fusion, 
cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSwizzle2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = mul(tv1, new Double(2)); fusion.addOutput(tv2); tv1->split(-1, 4); tv1->split(-2, 4); tv2->split(-1, 4); tv2->split(-2, 4); tv0->computeAt(tv2, 1); tv2->reorder({{-1, -2}}); tv1->setMemoryType(MemoryType::Shared); tv1->swizzle(SwizzleType::Transpose, {-2, -1}); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-2)->parallelize(ParallelType::TIDy); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-2)->parallelize(ParallelType::TIDy); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({123}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = (t0 + 1) * 2; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = transpose(tv0, {{0, 1}}); fusion.addOutput(tv1); // tv0: [I0, I1] // tv1: [I1, I0] const int BS = 32; // CTA tiling by BS*BS tv1->split(1, BS); tv1->split(0, BS); tv1->reorder({{1, 2}}); // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] // Create a smem buffer to cache each tile auto tv0_cache = tv0->cache_after(); tv0_cache->setMemoryType(MemoryType::Shared); tv0->computeAt(tv1, 2); // tv0: [I0, I1] // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)] // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] // Assign each thread block to a tile tv1->axis(0)->parallelize(ParallelType::BIDy); tv1->axis(1)->parallelize(ParallelType::BIDx); // Thread mapping for each tile. For both of the input and output // tiles, map TIDx to the fastest-changing dimension to facilitate // coalesced gmem accesses. tv1->axis(2)->parallelize(ParallelType::TIDy); tv1->axis(3)->parallelize(ParallelType::TIDx); // Note that the fastest-changing axis is next to the inner-most // axis since computeAt reorders the axes as the output tensor. 
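// Roughly: shared memory is organized in 32 banks, so if a warp reads a
// 32x32 tile column-wise, all of its accesses land in the same bank and are
// serialized. The Transpose swizzle permutes where each element is stored
// within the tile so that the transposed access pattern spreads across banks.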
tv0_cache->axis(2)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(3)->parallelize(ParallelType::TIDy);
  // Swizzles the smem cache to avoid bank conflicts
  tv0_cache->swizzle(SwizzleType::Transpose, {3, 2});
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  const int bx = 100;
  const int by = 200;
  at::Tensor t0 = at::randn({bx, by}, options);
  std::vector<IValue> aten_inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto aten_output = t0.t();
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = transpose(tv0, {{0, 1}});
  fusion.addOutput(tv1);
  // tv0: [I0, I1]
  // tv1: [I1, I0]
  const int BS = 32;
  const int BDIM = 256;
  // CTA tiling by BS*BS
  tv1->split(1, BS);
  tv1->split(0, BS);
  tv1->reorder({{1, 2}});
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]
  // Create a smem buffer to cache each tile
  auto tv0_cache = tv0->cache_after();
  tv0_cache->setMemoryType(MemoryType::Shared);
  tv0->computeAt(tv1, 2);
  // tv0: [I0, I1]
  // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)]
  // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)]
  // Transform the tile axes for 1D thread mapping
  tv1->merge(-2, -1);
  tv1->split(-1, BDIM);
  // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM]
  // Transform the cache similarly but apply swizzle to the 2D tile axes.
  tv0_cache->reorder({{-2, -1}});
  tv0_cache->swizzle(SwizzleType::Transpose, {2, 3});
  tv0_cache->merge(-2, -1);
  tv0_cache->split(-1, BDIM);
  // tv0_cache: [I1/BS, I0/BS, BS*BS/BDIM, BDIM]
  // Assign each thread block to a tile
  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);
  // Thread mapping for each tile.
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  const int bx = 100;
  const int by = 200;
  at::Tensor t0 = at::randn({bx, by}, options);
  std::vector<IValue> aten_inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto aten_output = t0.t();
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

// Grid reduction can be executed only once in a kernel. Should result
// in an error at the time of compilation.
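// (Roughly: a grid reduction synchronizes all thread blocks through
// global-memory work buffers and a semaphore that the generated kernel
// allocates and initializes for a single pass, so lowering rejects a
// schedule that would re-enter the grid reduction inside a loop.)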
TEST(NVFuserTest, FusionGridReductionInLoop_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = sum(tv0, {1});
  fusion.addOutput(tv1);
  tv1->axis(1)->parallelize(ParallelType::BIDx);
  FusionExecutor fe;
  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
}

TEST(NVFuserTest, FusionIssue633_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const int dx = 10;
  const int dy = 11;
  const int dz = 12;
  auto tv0 = makeConcreteTensor({dx, dy, dz});
  fusion.addInput(tv0);
  auto tv1 = makeConcreteTensor({dx, dy, 1});
  fusion.addInput(tv1);
  auto tv2 = add(tv0, tv1);
  fusion.addOutput(tv2);
  tv2->merge(1);
  tv2->merge(0);
  tv2->split(-1, 128);
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::TIDx);
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({dx, dy, dz}, options);
  at::Tensor t1 = at::randn({dx, dy, 1}, options);
  std::vector<IValue> aten_inputs = {t0, t1};
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto aten_output = t0 + t1;
  testValidate(&fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionKirScoping_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = add(tv1, new Double(2));
  fusion.addOutput(tv2);
  tv2->merge(0);
  tv2->split(0, 4);
  tv0->computeAt(tv2, -1);
  GpuLower gpulw(&fusion);
  auto kir_tv1 = gpulw.lowerValue(tv1);
  auto tv1_scope = kir_tv1->definition()->scope();
  TORCH_CHECK(tv1_scope != nullptr);
  TORCH_CHECK(tv1_scope->owner()->as<kir::ForLoop>());
  auto kir_tv2 = gpulw.lowerValue(tv2);
  auto tv2_scope = kir_tv2->definition()->scope();
  TORCH_CHECK(tv2_scope != nullptr);
  TORCH_CHECK(tv2_scope->owner()->as<kir::ForLoop>());
  TORCH_CHECK(tv1_scope != tv2_scope);
  // tv1 and tv2 should have the same inner-most ForLoop
  auto parent_scope = tv1_scope->owner()->scope();
  TORCH_CHECK(parent_scope == tv2_scope->owner()->scope());
  TORCH_CHECK(parent_scope->owner()->as<kir::ForLoop>());
  // There should be one more loop
  parent_scope = parent_scope->owner()->scope();
  TORCH_CHECK(parent_scope->owner()->as<kir::ForLoop>());
  // scope() should return nullptr for top-level exprs
  auto top_level_scope = parent_scope->owner()->scope();
  TORCH_CHECK(top_level_scope == nullptr);
}

TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  std::vector<int64_t> shape{17, 19};
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = broadcast(tv0, {false, true});
  auto tv3 = add(tv1, tv2);
  fusion.addOutput(tv3);
  tv3->split(1, 128);
  tv0->computeAt(tv3, 2);
  for (auto tv : {tv2, tv3}) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({shape[0]}, options);
  at::Tensor t1 = at::randn(shape, options);
  std::vector<IValue> aten_inputs = {t0, t1};
  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto cg_outputs = fe.runFusion(aten_inputs);
  auto t3 = t0.unsqueeze(-1).expand(shape) + t1;
  testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeContigTensor(2);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  auto tv2 = add(tv0, tv1);
  fusion.addOutput(tv2);
  const int kTDX = 64;
  const int kVecSize = 4;
  const int kNumElems = kTDX * kVecSize;
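// The schedule below stages each input through a cache tensor and marks the
// innermost kVecSize loop as MisalignedVectorize. Unlike plain
// ParallelType::Vectorize, this variant is meant to tolerate base addresses
// that are not aligned to the full vector width (compare
// FusionVectorizeMisalignedStride below, which runs on sliced inputs, with
// FusionVectorization3, where the same slicing makes plain Vectorize throw).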
tv2->split(1, kNumElems); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); tv2->split(-1, kVecSize); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-2)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 457; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); auto tv1 = makeContigTensor(4); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); tv2->reorder({{0, 1}, {1, 0}}); tv2->merge(-2); const int kTDX = 64; const int kVecSize = 2; const int kNumElems = kTDX * kVecSize; tv2->split(-1, kNumElems); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); tv2->split(0, 128); tv2->split(-1, kVecSize); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::BIDy); tv2->axis(-2)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int n = 32; const int c = 127; const int h = 51; const int w = 23; at::Tensor t0 = at::randn({n, c, h, w}, options); at::Tensor t1 = at::randn({n, c, h, w}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int kNumDims = 4; constexpr int kTDX = 64; constexpr int kVecSize = 2; constexpr int kNumElems = kTDX * kVecSize; auto tv0 = makeSymbolicTensor(kNumDims); auto tv1 = makeSymbolicTensor(kNumDims); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); // Create caches for vectorization auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); // Merge all dimensions together except inner-most dim for (int idx = 0; idx < kNumDims - 2; ++idx) { tv2->merge(0); } // Split inner-most dim tv2->split(-1, kNumElems); tv2->split(-1, kVecSize); TransformPropagator::from(tv2); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); // Parallelization Strategy c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(2)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); 
const int n = 5; const int c = 3; const int h = 51; const int w = 257; at::Tensor t0 = at::randn({n, c, h, w}, options); at::Tensor t1 = at::randn({n, c, h, w}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int kNumDims = 4; constexpr int kTDX = 64; constexpr int kVecSize = 2; constexpr int kNumElems = kTDX * kVecSize; std::vector bcast_shape{1, 1, 1, -1}; auto tv0 = makeContigTensor(kNumDims); auto tv1 = TensorViewBuilder().shape(bcast_shape).build(); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); // Create caches for vectorization auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); // Merge all dimensions together // Backward merge order is necessary for vectorize validation for (int idx = kNumDims - 1; idx > 0; --idx) { tv2->merge(idx - 1); } tv2->split(-1, kNumElems); tv2->split(-1, kVecSize); TransformPropagator::from(tv2); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); // Parallelization Strategy c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int n = 32; const int c = 128; const int h = 51; const int w = 23; at::Tensor t0 = at::randn({n, c, h, w}, options); at::Tensor t1 = at::randn({1, 1, 1, w}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; // TODO: throw assertion - cannot merge non-contiguous vectorization axes // Make sure compilation fails ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(2); auto tv1 = makeContigTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); auto tv3 = sum(tv2, {-1}); fusion.addOutput(tv3); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); tv3->split(-1, 128 * 4); tv3->split(-1, 4); // Reduce outer dim first auto tv4 = tv3->rFactor({-3, -1}); // Tv3 will reduce threads tv0->computeAt(tv3, 1); tv1->computeAt(tv3, 1); tv3->axis(0)->parallelize(ParallelType::BIDx); tv0->computeAt(tv4, -2); tv1->computeAt(tv4, -2); c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv4->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(1)->parallelize(ParallelType::TIDx); tv2->computeAt(tv4, -1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2050; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(2); auto tv1 = 
makeContigTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); tv2->split(1, 16); tv2->split(1, 64); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(2)->parallelize(ParallelType::TIDx); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); std::vector vectorized_tvs = {c0, c1, tv2}; for (auto tv : vectorized_tvs) { tv->split(-1, 4); // Vectorize the wrong dimension tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); } FusionExecutor fe; // Make sure compilation fails ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); const int kTDX = 64; const int kVecSize = 4; const int kNumElems = kTDX * kVecSize; tv2->split(1, kNumElems); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); tv2->split(-1, kVecSize); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-2)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2049; at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); const int kTDX = 64; const int kVecSize = 4; const int kNumElems = kTDX * kVecSize; tv2->split(1, kNumElems); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); tv2->split(-1, kVecSize); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-2)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2049; at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); // Failure because the input + output tensors do not have the same stride ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); } TEST(NVFuserTest, FusionVectorization1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); tv2->split(1, 16); tv2->split(1, 64); tv2->axis(0)->parallelize(ParallelType::BIDx); 
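// tv2 is now [I0, I1/16/64, 64, 16]: blockIdx.x drives the outer axis, and
// threadIdx.x (next line) drives the 64-wide axis. The 16 elements left per
// thread are split below into four vectorized loads of width 4.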
tv2->axis(2)->parallelize(ParallelType::TIDx); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); std::vector vectorized_tvs = {c0, c1, tv2}; for (auto tv : vectorized_tvs) { tv->split(-1, 4); tv->axis(-1)->parallelize(ParallelType::Vectorize); } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2048; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorization2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); tv2->split(1, 16); tv2->split(1, 64); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(2)->parallelize(ParallelType::TIDx); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); std::vector vectorized_tvs = {c0, c1, tv2}; for (auto tv : vectorized_tvs) { tv->split(-1, 4); // Vectorize the wrong dimension tv->axis(-2)->parallelize(ParallelType::Vectorize); } FusionExecutor fe; // Make sure compilation fails ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionVectorization3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); fusion.addOutput(tv2); tv2->split(1, 16); tv2->split(1, 64); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(2)->parallelize(ParallelType::TIDx); auto c0 = tv0->cache_after(); auto c1 = tv1->cache_after(); auto c2 = tv2->cache_before(); c0->computeAt(tv2, -2); c1->computeAt(tv2, -2); std::vector vectorized_tvs = {c0, c1, tv2}; for (auto tv : vectorized_tvs) { tv->split(-1, 4); tv->axis(-1)->parallelize(ParallelType::Vectorize); } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2049; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); FusionExecutor fe; fe.compileFusion(&fusion); std::vector aten_inputs = {t0, t1}; ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); aten_inputs[0] = t0.index({"...", Slice(1)}); aten_inputs[1] = t1.index({"...", Slice(1)}); ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); aten_inputs = {t0, t1}; auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, tv1); auto tv3 = sum(tv2, {-1}); fusion.addOutput(tv3); tv3->split(-1, 128 * 4); tv3->split(-1, 4); // Reduce outer dim first auto tv4 = tv3->rFactor({-3, -1}); // Tv3 will reduce threads auto tv6 = tv0->cache_after(); auto tv7 = tv1->cache_after(); tv0->computeAt(tv3, 1); tv1->computeAt(tv3, 1); 
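// Recall tv3's reduction axis was split into [R/512, 128, 4] above;
// rFactor({-3, -1}) moved the serial and vector factors into tv4, leaving
// tv3 a 128-wide cross-thread (TIDx) reduction.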
tv3->axis(0)->parallelize(ParallelType::BIDx); tv0->computeAt(tv4, -2); tv1->computeAt(tv4, -2); tv6->axis(-1)->parallelize(ParallelType::Vectorize); tv7->axis(-1)->parallelize(ParallelType::Vectorize); tv4->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(1)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 128; const int by = 2048; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); std::vector<IValue> aten_inputs = {t0, t1}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } // Unswitched loops with extent one may omit the else clause. TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Progressively broadcast tensors TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); TensorView* tv2 = makeSymbolicTensor(3); fusion.addInput(tv2); TensorView* tv3 = broadcast(tv0, {false, true}); TensorView* tv4 = add(tv3, tv1); TensorView* tv5 = add(tv4, tv2); fusion.addOutput(tv5); // Split inner dimension tv5->split(1, 8); // Merge middle dims with outer dimensions tv5->merge(2); tv5->merge(0); // tv5[I0*I1o, I1i*I2] // Get a dim of size 1 to unswitch tv5->split(0, 1, false); // Compute everything inline tv0->computeAt(tv5, -1); tv5->axis(0)->parallelize(ParallelType::Unswitch); tv5->axis(1)->parallelize(ParallelType::BIDx); tv5->axis(2)->parallelize(ParallelType::TIDx); // Make sure the unswitched loop does not have an else clause. GpuLower gpulw(&fusion); for (const auto& kir_node : gpulw.kernel()->irNodes()) { if (auto fl = dynamic_cast<kir::ForLoop*>(kir_node.get())) { if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { continue; } if (auto pred = dynamic_cast<kir::IfThenElse*>(fl->parentScope())) { TORCH_CHECK(!pred->hasElse()); } } } const int x = 11; const int y = 12; const int z = 13; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x}, options); at::Tensor t1 = at::randn({x, y}, options); at::Tensor t2 = at::randn({z, x, y}, options); std::vector<IValue> aten_inputs = {t0, t1, t2}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2; testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__); } // The unswitched loop has extent one but the inner loops don't, so the else // part should not be omitted. TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int x = 15; auto tv0 = makeConcreteTensor({x}); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); fusion.addOutput(tv1); tv1->split(-1, 4); tv1->split(-2, 1); tv1->axis(-2)->parallelize(ParallelType::Unswitch); // Make sure the size-one unswitched loop does not omit the else clause.
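// Here x = 15 is not divisible by the inner split of 4, so although the
// unswitched axis has extent one, its predicate still needs the else branch
// to guard the out-of-bounds remainder.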
GpuLower gpulw(&fusion); for (const auto& kir_node : gpulw.kernel()->irNodes()) { if (auto fl = dynamic_cast<kir::ForLoop*>(kir_node.get())) { if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { continue; } if (auto pred = dynamic_cast<kir::IfThenElse*>(fl->parentScope())) { TORCH_CHECK(pred->hasElse()); } } } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x}, options); std::vector<IValue> aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion(aten_inputs); auto t1 = t0 + 1; testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionValidateParallelize1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDy); // Invalid as tv1 and tv2 do not have the same ParallelType FusionExecutor fe; ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionValidateParallelize2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(-1)->parallelize(ParallelType::TIDy); tv1->setMemoryType(MemoryType::Shared); // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared // memory, so it is valid FusionExecutor fe; fe.compileFusion(&fusion); } TEST(NVFuserTest, FusionValidateParallelize3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->split(-1, 4); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->split(-1, 4); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 have the same shape and ParallelType FusionExecutor fe; fe.compileFusion(&fusion); } TEST(NVFuserTest, FusionValidateParallelize4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->split(-1, 4); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv2->split(-1, 8); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 do not have the same shape FusionExecutor fe; ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } TEST(NVFuserTest, FusionValidateParallelize5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(1)); fusion.addOutput(tv2); tv1->split(-1, 4); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv1->setMemoryType(MemoryType::Shared); tv2->split(-1, 8); tv2->axis(-1)->parallelize(ParallelType::TIDx); // tv1 and tv2 do not have the same shape, but tv1 is on shared // memory, so it is valid FusionExecutor fe; fe.compileFusion(&fusion); } // See issue #995 TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(4); fusion.addInput(tv0); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(1)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1);
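// tv3 prepends a broadcast axis so the 3D tv2 can be added to the 4D tv1;
// the schedule below merges all axes and parallelizes both tensors.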
fusion.addOutput(tv4); tv4->merge(0); tv4->merge(0); tv4->merge(0); tv4->split(0, 128); tv4->split(0, 1); tv4->split(0, 1); TransformPropagator::from(tv4); tv0->computeAt(tv2, 2); tv3->computeAt(tv4, 2); tv4->axis(0)->parallelize(ParallelType::BIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); // Validation should throw an exception saying the first axes of tv2 // and tv3 have incompatible parallelization. See also issue #995. ASSERT_ANY_THROW(fusion.printKernel()); } TEST(NVFuserTest, FusionDAGMerging_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(5); auto tv1 = makeSymbolicTensor(1); fusion.addInput(tv0); fusion.addInput(tv1); // Branch 0 auto tv2 = sum(tv0, {0}); // 0 auto tv3 = sum(tv2, {0}); // 1 auto tv4 = sum(tv3, {0}); // 2 auto tv5 = sum(tv4, {0}); // 3 // Branch 1 auto tv6 = add(tv1, new Double(1)); // 4 // Merge auto tv7 = add(tv6, tv5); // 5 // Maximum expected output groups (can improve over time): // {0}, {1}, {2}, {3,4,5} // without final merge would have been {0}, {1}, {2}, {3,4}, {5} fusion.addOutput(tv7); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options); at::Tensor t1 = at::randn({2}, options); auto fusion_segments = fusion.segment({t0, t1}); TORCH_CHECK(fusion_segments->groups().size() <= 4); } TEST(NVFuserTest, FusionDAGScalarMerging_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); auto i0 = new Double(); fusion->addInput(tv0); fusion->addInput(i0); auto i1 = add(i0, new Double(1.0)); auto i2 = mul(i1, i1); auto i3 = add(i2, i1); // Branch 0 auto tv1 = sum(tv0, {0}); // 0 auto tv2 = add(tv1, i2); // Branch 1 auto tv3 = sum(tv2, {0}); // 1 auto tv4 = add(tv3, i3); auto tv5 = add(tv4, i0); fusion->addOutput(tv5); FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({16, 16, 16}, options); double s0 = 0.5; auto s1 = s0 + 1.0; auto s2 = s1 * s1; auto s3 = s2 + s1; auto t1 = t0.sum({0}); auto t2 = t1 + s2; auto t3 = sum(t2, {0}); auto t4 = t3 + s3; auto t5 = t4 + s0; auto outputs = executor_cache.runFusionWithInputs({t0, s0}); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime()->isSegmented(), "segmentation didn't happen"); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime() ->fusionSegments() ->groups() .size() == 2, "segmentation didn't happen as expected"); testValidate( executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int M = 10; constexpr int N = 20; constexpr int K = 20; auto tv0 = makeSymbolicTensor(3); auto tv1 = sum(tv0, {{1, 2}}); fusion.addInput(tv0); fusion.addOutput(tv1); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(0)->parallelize(ParallelType::BIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M, N, K}, options); std::vector<IValue> aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.sum({1, 2}); testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int M = 10;
constexpr int N = 20; constexpr int K = 20; auto tv0 = makeSymbolicTensor(3); auto tvs = Welford(tv0, {{1, 2}}); fusion.addInput(tv0); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; auto tv_N = tvs.n; fusion.addOutput(tv_avg); fusion.addOutput(tv_M2); tv_avg->axis(-1)->parallelize(ParallelType::TIDx); tv_avg->axis(0)->parallelize(ParallelType::BIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; testValidate( &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__); } // See Issue #716 TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int M = 10; constexpr int N = 11; auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); std::vector reduction_axes = {1}; std::vector broadcast_mask = {false, true}; auto tv0_bcast = broadcast(tv0, broadcast_mask); auto path1_bcast = add(tv0_bcast, new Double(1.0)); auto path1 = sum(path1_bcast, reduction_axes); fusion.addOutput(path1); auto p = path1->split(1, 1); path1->rFactor({1}); path1->axis(0)->parallelize(ParallelType::BIDx); tv0->computeAt(path1, 1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({M}, options); at::Tensor t0_ref = t0.clone(); std::vector aten_inputs = {t0}; FusionExecutor fe; fe.compileFusion(&fusion); // inplace op, we are adding t0 to itself auto outputs = fe.runFusion(aten_inputs, {t0}); TORCH_CHECK(outputs[0].allclose(t0_ref.add(1))); } TEST(NVFuserTest, FusionReductionPredicate_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {0}); fusion.addOutput(tv1); auto tv2 = tv0->cache_after(); const int bdimx = 128; tv1->split(1, bdimx); tv1->split(1, 4); tv1->split(1, 1); tv1->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(2)->parallelize(ParallelType::Unroll); tv1->split(0, 10); tv0->computeAt(tv1, 4); tv2->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 650; int numel_y = 102; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_y}, options); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionIssue728_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addOutput(tv0); auto tv1 = makeSymbolicTensor(1); fusion.addOutput(tv1); auto tv2 = makeSymbolicTensor(1); fusion.addOutput(tv2); auto tv3 = add(tv0, new Double(1)); auto tv4 = add(tv3, tv1); auto tv5 = add(tv4, new Double(1)); auto tv6 = add(tv2, new Double(1)); fusion.addOutput(tv5); fusion.addOutput(tv6); // tv0 -> tv3 -+ // tv1 --------+-> tv4 -> tv5 // // tv2 -> tv6 auto all_vals_under_tv3 = DependencyCheck::getAllValsBetween({tv3}, fusion.outputs()); std::unordered_set included_tensors({tv3, tv4, tv5}); for (auto tv : included_tensors) { TORCH_CHECK( std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) != all_vals_under_tv3.end(), "TV", tv->name(), " not found"); } for (auto tv : 
ir_utils::filterByType(fusion.vals())) { if (included_tensors.find(tv) == included_tensors.end()) { TORCH_CHECK( std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) == all_vals_under_tv3.end(), "TV", tv->name(), " should not be found"); } } auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs()); TORCH_CHECK(no_dependency.empty(), "No val should be returned"); auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6}); TORCH_CHECK(no_dep_path.empty(), "No val should be returned"); auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5}); TORCH_CHECK(no_dep_path2.empty(), "No val should be returned"); auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3}); TORCH_CHECK( just_tv3.size() == 1 && *(just_tv3.begin()) == tv3, "Only tv3 should be included"); } TEST(NVFuserTest, FusionIssue757_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); auto tv2 = broadcast(tv1, {false, true}); auto tv3 = makeSymbolicTensor(2); fusion.addInput(tv3); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); tv1->computeAt(tv4, -1); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); tv1->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 650; int numel_y = 102; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(inputs); auto t1 = t0.sum({1}); auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); auto t4 = t2 + t3; testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); } // See issue #759 TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); auto tv2 = broadcast(tv1, {false, true}); auto tv3 = makeSymbolicTensor(2); fusion.addInput(tv3); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); tv4->split(0, 4); tv1->computeAt(tv4, -1); tv2->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::TIDy); tv4->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(1)->parallelize(ParallelType::TIDy); tv1->axis(-1)->parallelize(ParallelType::TIDx); int numel_x = 100; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; FusionExecutor fe; fe.compileFusion(&fusion); auto outputs = fe.runFusion(inputs); auto t1 = t0.sum({1}); auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); auto t4 = t2 + t3; testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); fusion->addInput(tv0); // {first kernel} auto tv1 = sum(tv0, {0}); auto tv2 = add(tv1, tv0); auto tv3 = sum(tv2, {0}); auto tv4 = add(tv3, tv0); auto tv5 = sum(tv4, {0}); auto tv6 = sum(tv5, {0}); // {second kernel} auto tv7 = add(tv6, tv5); auto tv8 = add(tv7, tv5); auto tv9 = sum(tv8, {0}); fusion->addOutput(tv9); SegmentCandidateFinderOptions segment_options; segment_options.run_herrmann_merge = false; segment_options.run_final_merge = false; auto 
options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 2}, options); auto segmented_fusion = SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); TORCH_CHECK(segmented_fusion->groups().size() == 2); } TEST(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); auto i0 = new Double(); fusion->addInput(tv0); fusion->addInput(i0); // Branch 0 {first kernel} auto tv1 = sum(tv0, {0}); auto tv2 = add(tv0, i0); auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2); auto tv4 = sum(tv3, {0}); // Branch 1 {first kernel} auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3); auto tv6 = sum(tv5, {0}); // Incompatible {second kernel} auto tv7 = sum(tv6, {0}); fusion->addOutput(tv1); fusion->addOutput(tv4); fusion->addOutput(tv7); SegmentCandidateFinderOptions segment_options; segment_options.run_herrmann_merge = false; segment_options.run_final_merge = false; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 2}, options); auto segmented_fusion = SegmentCandidateFinder::segment(fusion.get(), {t0, 1.0}, segment_options); TORCH_CHECK(segmented_fusion->groups().size() == 2); } TEST(NVFuserTest, FusionSegmentMixReduction_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); fusion->addInput(tv0); // def of tv1 in kernel 1 through horizontal auto tv1 = sum(tv0, {0, 1}); // kernel 2 auto tv2 = sum(tv0, {2}); auto tv3 = broadcast(tv2, {false, false, true}); auto tv4 = add(tv0, tv3); auto tv5 = sum(tv4, {2}); // end of kernel 2 // kernel 1 auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0); auto tv7 = sum(tv6, {0, 1}); auto tv8 = sum(tv6, {0, 1}); fusion->addOutput(tv1); fusion->addOutput(tv5); fusion->addOutput(tv7); fusion->addOutput(tv8); SegmentCandidateFinderOptions segment_options; segment_options.run_herrmann_merge = false; segment_options.run_final_merge = false; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 2}, options); auto segmented_fusion = SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); TORCH_CHECK(segmented_fusion->groups().size() <= 2); } TEST(NVFuserTest, FusionSBAR_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // N, H, W, C format std::vector input_shape{656, 7, 7, 64}; auto x = makeContigTensor(4); auto y = makeContigTensor(4); auto weight = makeContigTensor(1); auto bias = makeContigTensor(1); fusion.addInput(x); fusion.addInput(y); fusion.addInput(weight); fusion.addInput(bias); const size_t kNumberOfDims = x->nDims(); std::vector broadcast_mask(kNumberOfDims, false); for (size_t axis = 0; axis < kNumberOfDims - 1; ++axis) { broadcast_mask[axis] = true; } auto weight_bcast = broadcast(weight, broadcast_mask); auto scale = mul(x, weight_bcast); auto bias_bcast = broadcast(bias, broadcast_mask); auto scale_bias = add(scale, bias_bcast); auto scale_bias_add = add(scale_bias, y); auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add); fusion.addOutput(scale_bias_add_relu); // inputs at::manual_seed(0); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); at::Tensor at_y = at::randn(input_shape, options); at::Tensor at_weight = at::ones({input_shape[3]}, options); at::Tensor at_bias = at::zeros({input_shape[3]}, options); // inputs std::vector inputs = {at_x, at_y, at_weight, at_bias}; // 
std::vector<at::Tensor> outputs; auto lparams = schedulePointwise(&fusion, c10::ArrayRef<IValue>(inputs)); FusionExecutor executor; executor.compileFusion(&fusion); outputs = executor.runFusion(c10::ArrayRef<IValue>(inputs), lparams); auto at_scale = at::mul(at_x, at_weight); auto at_scale_bias = at::add(at_scale, at_bias); auto pwise_add = at::add(at_scale_bias, at_y); auto output = at::relu(pwise_add); testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionSingleElement_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(0); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(2.5)); auto tv2 = add(tv1, new Double(3.5)); fusion.addOutput(tv2); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({}, options); at::Tensor cg_output = at::empty({}, options); auto lparams = schedulePointwise(&fusion, {input}); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input}, {cg_output}, lparams); auto aten_output = input.add(2.5).add(3.5); testValidate( &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int batch = 4; int c = 4; int h = 4; int w = 4; int numDims = 4; auto input = makeSymbolicTensor(numDims); fusion.addInput(input); auto weight = makeSymbolicTensor(1); fusion.addInput(weight); auto running_mean = makeSymbolicTensor(1); fusion.addInput(running_mean); auto running_var = makeSymbolicTensor(1); fusion.addInput(running_var); auto save_mean = makeSymbolicTensor(1); fusion.addInput(save_mean); auto save_invstd = makeSymbolicTensor(1); fusion.addInput(save_invstd); auto grad_out_prev = makeSymbolicTensor(numDims); fusion.addInput(grad_out_prev); auto gt_0 = makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. fusion.addInput(gt_0); auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); auto gt_float = castOp(DataType::Float, gt_bool); auto grad_out = mul(grad_out_prev, gt_float); Val* eps_ptr = new Double(1e-5); auto grads = batch_norm_backward( input, grad_out, weight, running_mean, running_var, save_mean, save_invstd, true, eps_ptr, {true, true, true}); fusion.addOutput(grads.grad_input); fusion.addOutput(grads.grad_weight); fusion.addOutput(grads.grad_bias); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({c}, options); at::Tensor input2 = at::randn_like(input1); at::Tensor input3 = at::randn_like(input1); at::Tensor input4 = at::randn_like(input1); at::Tensor input5 = at::randn_like(input1); at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector<IValue> inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; auto outputs = fec.runFusionWithInputs(inputs); } // TODO: We only changed inputs, merge this with the test above.
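// The variant below appears to differ only in using concrete {-1, -1, 1, 1}
// inputs, so the trailing size-one spatial dimensions are known when the
// fusion is defined (the symbolic versions are kept as comments).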
TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int batch = 2; int c = 81; int h = 1; int w = 1; int numDims = 4; // auto input = makeSymbolicTensor(numDims); auto input = makeConcreteTensor({-1, -1, 1, 1}); fusion.addInput(input); auto weight = makeSymbolicTensor(1); fusion.addInput(weight); auto running_mean = makeSymbolicTensor(1); fusion.addInput(running_mean); auto running_var = makeSymbolicTensor(1); fusion.addInput(running_var); auto save_mean = makeSymbolicTensor(1); fusion.addInput(save_mean); auto save_invstd = makeSymbolicTensor(1); fusion.addInput(save_invstd); // auto grad_out_prev = makeSymbolicTensor(numDims); auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1}); fusion.addInput(grad_out_prev); // auto gt_0 = // makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. auto gt_0 = makeConcreteTensor({-1, -1, 1, 1}); fusion.addInput(gt_0); auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); auto gt_float = castOp(DataType::Float, gt_bool); auto grad_out = mul(grad_out_prev, gt_float); Val* eps_ptr = new Double(1e-5); auto grads = batch_norm_backward( input, grad_out, weight, running_mean, running_var, save_mean, save_invstd, true, eps_ptr, {true, true, true}); fusion.addOutput(grads.grad_input); fusion.addOutput(grads.grad_weight); fusion.addOutput(grads.grad_bias); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({c}, options); at::Tensor input2 = at::randn_like(input1); at::Tensor input3 = at::randn_like(input1); at::Tensor input4 = at::randn_like(input1); at::Tensor input5 = at::randn_like(input1); at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; auto outputs = fec.runFusionWithInputs(inputs); } TEST(NVFuserTest, FusionBNRepro_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); const bool kTraining = true; const float kMomentum = 0.1; const float kEps = 1e-5; int batch = 14; int c = 65; int h = 7; int w = 7; int numDims = 4; auto input = makeSymbolicTensor(numDims); fusion.addInput(input); auto weight = makeSymbolicTensor(1); fusion.addInput(weight); auto bias = makeSymbolicTensor(1); fusion.addInput(bias); auto running_mean = makeSymbolicTensor(1); fusion.addInput(running_mean); auto running_var = makeSymbolicTensor(1); fusion.addInput(running_var); auto momentum_ptr = new Double(kMomentum); auto eps_ptr = new Double(kEps); auto result = batch_norm( input, weight, bias, running_mean, running_var, kTraining, momentum_ptr, eps_ptr); fusion.addOutput(result.output); fusion.addOutput(result.mean); fusion.addOutput(result.invstd); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({batch, c, h, w}, options); at::Tensor input2 = at::randn({c}, options); at::Tensor input3 = at::randn_like(input2); at::Tensor input4 = at::randn_like(input2); at::Tensor input5 = at::randn_like(input2); auto input1_ref = input1.clone(); auto input2_ref = input2.clone(); auto input3_ref = input3.clone(); auto input4_ref = input4.clone(); auto input5_ref = input5.clone(); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector aten_inputs = {input1, input2, 
input3, input4, input5}; auto cg_outputs = fec.runFusionWithInputs(aten_inputs); auto at_results = at::native_batch_norm( input1_ref, input2_ref, input3_ref, input4_ref, input5_ref, kTraining, kMomentum, kEps); auto at_output = std::get<0>(at_results); auto at_mean = std::get<1>(at_results); auto at_invstd = std::get<2>(at_results); std::vector aten_outputs = { input4_ref, input5_ref, at_output, at_mean, at_invstd}; testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } TEST(NVFuserTest, FusionBNRepro2_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); const bool kTraining = true; const float kMomentum = 0.1; const float kEps = 1e-5; int batch = 2; int c = 4; int h = 17; int w = 17; int numDims = 4; auto input = makeSymbolicTensor(numDims); fusion.addInput(input); Val* momentum_ptr = new Double(kMomentum); Val* eps_ptr = new Double(kEps); auto result = batch_norm( input, nullptr, nullptr, nullptr, nullptr, kTraining, momentum_ptr, eps_ptr); fusion.addOutput(result.output); fusion.addOutput(result.mean); fusion.addOutput(result.invstd); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({batch, c, h, w}, options); auto input1_ref = input1.clone(); at::Tensor r_m; at::Tensor r_v; at::Tensor weight; at::Tensor bias; FusionExecutorCache fec(std::move(fusion_ptr)); std::vector aten_inputs = {input1}; auto cg_outputs = fec.runFusionWithInputs(aten_inputs); auto at_results = at::native_batch_norm( input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps); auto at_output = std::get<0>(at_results); auto at_mean = std::get<1>(at_results); auto at_invstd = std::get<2>(at_results); std::vector aten_outputs = {at_output, at_mean, at_invstd}; testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = makeConcreteTensor({0}); fusion.addInput(tv1); auto tv2 = add(tv0, new Double(2.5)); fusion.addOutput(tv2); auto tv3 = makeConcreteTensor({0}); fusion.addOutput(tv3); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({2}, options); at::Tensor input1 = at::randn({0}, options); at::Tensor cg_output2 = at::empty({2}, options); at::Tensor cg_output3 = at::empty({0}, options); auto lparams = schedulePointwise(&fusion, {input0, input1}); FusionExecutor fe; fe.compileFusion(&fusion); fe.runFusion({input0, input1}, {cg_output2, cg_output3}, lparams); auto aten_output2 = input0.add(2.5); at::Tensor aten_output3 = at::empty({0}, options); testValidate( &fusion, {cg_output2, cg_output3}, {input0, input1}, {aten_output2, aten_output3}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = makeConcreteTensor({0}); fusion.addInput(tv1); auto tv2 = sum(tv0, {1}); fusion.addOutput(tv2); auto tv3 = makeConcreteTensor({0}); fusion.addOutput(tv3); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({2, 4}, options); at::Tensor input1 = at::randn({0}, options); at::Tensor cg_output2 = at::empty({2}, options); at::Tensor cg_output3 = at::empty({0}, options); auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); 
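// The zero-size tensor contributes no work, but the heuristics still have to
// produce a valid schedule for the non-empty reduction next to it.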
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); scheduleReduction(&fusion, reduction_params.value()); auto lparams = reduction_params.value().lparams; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input0, input1}, lparams); auto aten_output2 = input0.sum({1}); at::Tensor aten_output3 = at::empty({0}, options); testValidate( &fusion, cg_outputs, {input0, input1}, {aten_output2, aten_output3}, __LINE__, __FILE__, "", lparams); } TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = makeConcreteTensor({0}); fusion.addInput(tv1); auto tv2 = sum(tv0, {0}); auto tv3 = broadcast(tv2, {true, false}); auto tv4 = add(tv0, tv3); fusion.addOutput(tv4); auto tv5 = makeConcreteTensor({0}); fusion.addOutput(tv5); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({2, 4}, options); at::Tensor input1 = at::randn({0}, options); at::Tensor cg_output2 = at::empty({2, 4}, options); at::Tensor cg_output3 = at::empty({0}, options); auto reduction_params = getNormalizationHeuristics(&fusion, {input0, input1}); TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); scheduleNormalization(&fusion, reduction_params.value()); auto lparams = reduction_params.value().lparams; FusionExecutor fe; fe.compileFusion(&fusion); auto cg_outputs = fe.runFusion({input0, input1}, lparams); auto aten_output2 = input0.sum({0}).add(input0); at::Tensor aten_output3 = at::empty({0}, options); testValidate( &fusion, cg_outputs, {input0, input1}, {aten_output2, aten_output3}, __LINE__, __FILE__, "", lparams); } TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); TensorView* tv0 = makeSymbolicTensor(2); TensorView* tv1 = makeSymbolicTensor(1); TensorView* tv2 = makeSymbolicTensor(2); fusion->addInput(tv0); fusion->addInput(tv1); fusion->addInput(tv2); TensorView* tv3 = add(tv0, new Double(1)); // Group 0 TensorView* tv4 = max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, // keeps normalization scheduler away) TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) fusion->addOutput(tv6); // Note: test alias; fusion->aliasOutputToInput(tv6, tv0); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, 65}, options); at::Tensor t1 = at::randn({65}, options); at::Tensor t2 = at::randn({128, 65}, options); auto t3 = t0.add(1.0); auto t4 = std::get<0>(at::max(t3, 0)); auto t5 = t4.add(t1); auto t6 = t5.add(t2); FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); // validating aliasing TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr()); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime()->isSegmented(), "segmentation didn't happen"); TORCH_CHECK( executor_cache.getMostRecentKernelRuntime() ->fusionSegments() ->groups() .size() == 2, "segmentation didn't happen as expected"); testValidate( executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionWelford1Output_CUDA) { auto fusion_ptr = std::make_unique<Fusion>(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2);
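// Welford computes avg, var_sum, and n in a single pass; only var_sum is an
// output here. The reference is the unnormalized variance, i.e.
// var(unbiased=false) scaled by the reduced extent (65).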
fusion->addInput(tv0); auto tvs = Welford(tv0, {1}); fusion->addOutput(tvs.var_sum); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, 65}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.var({1}, false) * 65; testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { auto fusion_ptr = std::make_unique<Fusion>(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2); fusion->addInput(tv0); auto tvs = Welford(tv0, {1}); fusion->addOutput(tvs.var_sum); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto run_test = [&executor_cache, fusion](auto inner_size) -> FusionKernelRuntime* { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, inner_size}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); // Square sums do not fit well in the testValidate assumptions, // so we just compare the divided output here. outputs[0] /= inner_size; auto t1 = t0.var({1}, false); testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; // Run a translated welford auto runtime1 = run_test(64); // Check it was translated TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 2); TORCH_CHECK( runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == ScheduleHeuristic::Normalization); // Run an un-translated welford auto runtime2 = run_test(65536); // Check it was not translated TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 1); TORCH_CHECK( runtime2->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == ScheduleHeuristic::Reduction); } TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { auto fusion_ptr = std::make_unique<Fusion>(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2); fusion->addInput(tv0); auto tvs1 = Welford(tv0, {1}); auto tvs2 = Welford(tv0, {1}); fusion->addOutput(tvs1.var_sum); fusion->addOutput(tvs2.var_sum); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto run_test = [&executor_cache, fusion](auto inner_size) -> FusionKernelRuntime* { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, inner_size}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); // Square sums do not fit well in the testValidate assumptions, // so we just compare the divided output here.
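// (var_sum scales with the number of reduced elements, so dividing by
// inner_size yields a quantity directly comparable to t0.var.)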
outputs[0] /= inner_size; outputs[1] /= inner_size; auto t1 = t0.var({1}, false); testValidate(fusion, outputs, {t0}, {t1, t1}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; // Run a translated welford auto runtime1 = run_test(64); // Check it was translated TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 4); TORCH_CHECK( runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == ScheduleHeuristic::Normalization); // Run an un-translated welford auto runtime2 = run_test(65536); // Check it was not translated TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 2); } TEST(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { auto fusion_ptr = std::make_unique<Fusion>(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2); fusion->addInput(tv0); auto tvs1 = Welford(tv0, {1}); auto sum_of_tv0 = sum(tv0, {1}); auto sum_plus_avg = add(tvs1.avg, sum_of_tv0); fusion->addOutput(sum_plus_avg); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto run_test = [&executor_cache, fusion](auto inner_size) -> FusionKernelRuntime* { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, inner_size}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.mean({1}) + t0.sum({1}); testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; auto runtime = run_test(65536); TORCH_CHECK(!runtime->isSegmented()); } TEST(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { auto fusion_ptr = std::make_unique<Fusion>(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2); fusion->addInput(tv0); auto tvs1 = Welford(tv0, {1}); auto sum_of_tv0 = sum(tv0, {1}); auto sum_bcasted = broadcast(sum_of_tv0, {false, true}); auto avg_bcasted = broadcast(tvs1.avg, {false, true}); auto tv0_plus_sum = add(tv0, sum_bcasted); auto tv0_plus_avg = add(tv0, avg_bcasted); fusion->addOutput(tv0_plus_sum); fusion->addOutput(tv0_plus_avg); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto run_test = [&executor_cache, fusion](auto inner_size) -> FusionKernelRuntime* { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({128, inner_size}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.mean({1}).unsqueeze(1) + t0; auto t2 = t0.sum({1}).unsqueeze(1) + t0; testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; for (auto inner_size : {4096, 8192, 32768}) { auto runtime = run_test(inner_size); TORCH_CHECK(!runtime->isSegmented()); } } TEST(NVFuserTest, TestSegmentIslands_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion->addInput(tv0); fusion->addInput(tv1); auto tv2 = sum(tv0, {0}); auto tv3 = sum(tv1, {1}); fusion->addOutput(tv2); fusion->addOutput(tv3); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({16, 16}, options); at::Tensor t1 = at::randn({16, 16}, options); FusionExecutorCache fusion_executor_cache(std::move(fusion)); fusion_executor_cache.runFusionWithInputs({t0, t1}); } TEST(NVFuserTest, TestBackOffInnerBroadcast_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(1); auto tv1 = makeSymbolicTensor(2);
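// Inputs of rank 1, 2, and 4; the broadcasts below introduce inner axes that
// computeAt has to back off from, which the position checks verify.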
auto tv2 = makeSymbolicTensor(4); fusion->addInput(tv0); fusion->addInput(tv1); fusion->addInput(tv2); auto tv3 = broadcast(tv0, {false, true, true, true}); auto tv4 = broadcast(tv1, {false, false, true, true}); auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2); auto tv6 = add(tv3, tv5); auto tv7 = add(tv4, tv5); auto tv8 = add(tv3, tv4); auto tv9 = add(tv6, tv7); auto tv10 = add(tv9, tv8); fusion->addOutput(tv10); tv0->computeAt(tv10, -2); tv1->computeAt(tv10, -2); tv2->computeAt(tv10, -2); TORCH_CHECK(tv3->getComputeAtPosition() == 1); TORCH_CHECK(tv4->getComputeAtPosition() == 2); TORCH_CHECK(tv5->getComputeAtPosition() == 3); TORCH_CHECK(tv6->getMaxProducerPosition() == 3); TORCH_CHECK(tv7->getMaxProducerPosition() == 3); TORCH_CHECK(tv8->getMaxProducerPosition() == 2); } TEST(NVFuserTest, TestBackOffInnerBroadcast2_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(3); fusion->addInput(tv0); fusion->addInput(tv1); auto tv2 = broadcast(tv0, {false, false, true}); auto tv3 = add(tv2, tv1); fusion->addOutput(tv3); tv3->split(-2, 4); tv3->reorder({{-1, -2}}); tv0->computeAt(tv3, -2); tv1->computeAt(tv3, -2); TORCH_CHECK(tv2->getComputeAtPosition() == 2); TORCH_CHECK(tv3->getMaxProducerPosition() == 2); } TEST(NVFuserTest, TestBackOffInnerBroadcast3_CUDA) { auto fusion = std::make_unique<Fusion>(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(4); fusion->addInput(tv0); fusion->addInput(tv1); auto tv2 = broadcast(tv0, {false, false, true}); auto tv3 = broadcast(tv2, {false, true, false, false}); auto tv4 = add(tv3, tv1); fusion->addOutput(tv4); tv0->computeAt(tv4, -1); tv1->computeAt(tv4, -1); TORCH_CHECK(tv2->getComputeAtPosition() == 2); TORCH_CHECK(tv3->getMaxProducerPosition() == 3); } TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); int batch = 2; int c = 1; int h = 1; int w = 1; int numDims = 4; auto input = makeConcreteTensor({-1, 1, 1, 1}); fusion.addInput(input); auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1}); fusion.addInput(bcast_bias); std::vector<int64_t> at_sum_axes; std::vector<int> outer_reduction_axes; std::vector<bool> outer_broadcast_mask(numDims, false); Val* N = new Double(1); for (size_t axis = 0; axis < numDims; ++axis) { if (axis != 1) { outer_reduction_axes.push_back(axis); at_sum_axes.push_back(axis); outer_broadcast_mask[axis] = true; N = mul(N, input->domain()->domain()[axis]->extent()); } } auto output0 = mul(input, bcast_bias); fusion.addOutput(output0); auto output1 = sum(output0, outer_reduction_axes); fusion.addOutput(output1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({batch, c, h, w}, options); auto at_output0 = input0.mul(input1); auto at_output1 = at_output0.sum(at_sum_axes); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector<IValue> inputs = {input0, input1}; auto outputs = fec.runFusionWithInputs(inputs); testValidate( &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionPredicateElimination_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(2)); auto tv3 = add(tv2, new Double(3)); fusion.addOutput(tv3); tv3->split(0, 32); tv0->computeAt(tv3, 1);
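// With matching splits, the predicate for tv2 can be elided when its loop is
// unswitched (first check below); the extra split(1, 5) then changes tv2's
// iteration space relative to tv3, so the predicate has to come back.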
tv2->axis(1)->parallelize(ParallelType::Unswitch); { GpuLower gpulw(&fusion); TORCH_CHECK(!isPredicated(tv2, gpulw)); } tv2->axis(1)->parallelize(ParallelType::Serial); tv2->split(1, 5); { GpuLower gpulw(&fusion); TORCH_CHECK(isPredicated(tv2, gpulw)); } } TEST(NVFuserTest, ForceFp16Simple_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(2); auto tv1 = makeSymbolicTensor(2); fusion->addInput(tv0); fusion->addInput(tv1); // Group 1 auto tv2 = sum(tv0, {1}); auto tv3 = broadcast(tv2, {false, true}); // Group 2 auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast auto tv5 = castOp(DataType::Half, tv4); fusion->addOutput(tv5); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); fec.runFusionWithInputs({in0, in1}); // Check the segmented edge is fp16 auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); for (auto edge : segmented_fusion->edges()) { auto edge_tv = edge->val->as(); TORCH_CHECK(edge_tv->getDataType() == DataType::Half); } } TEST(NVFuserTest, ForceFp16NotAllCast_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(3); fusion->addInput(tv0); fusion->addInput(tv1); // Group 1 auto tv3 = sum(tv0, {1}); auto tv4 = broadcast(tv3, {false, true, false}); auto tv5 = sum(tv0, {1}); // Group 2 auto tv6 = add(tv4, tv1); // edge tv4, expect cast auto tv7 = castOp(DataType::Half, tv6); // Group 3 auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast fusion->addOutput(tv7); fusion->addOutput(tv8); FusionExecutorCache fec(std::move(fusion_ptr)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); fec.runFusionWithInputs({in0, in1}); auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); auto complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the // reduction op, i.e. tv8 = sum(tv5,{1});. 
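// Reduction inputs are presumably left in fp32 so that the accumulation does
// not lose precision to a half-precision round trip.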
for (auto edge : segmented_fusion->edges()) { auto edge_tv = edge->val->as(); if (edge_tv->getDataType() == DataType::Float) { auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); TORCH_CHECK(consumer->isA()); } } } TEST(NVFuserTest, FusionIssue970_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int nelm = 10; // tv3 = tv0 + sum(tv0) auto tv0 = makeConcreteTensor({nelm, nelm}); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); auto tv2 = broadcast(tv1, {false, true}); auto tv3 = add(tv2, tv0); fusion.addOutput(tv3); tv1->split(1, 4); FusionExecutor fe; fe.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({nelm, nelm}, options); auto outputs = fe.runFusion({t0}); auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0; testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); } // Reproducer of #1016 TEST(NVFuserTest, FusionIssue1016_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv1, new Double(2)); fusion.addOutput(tv2); tv1->setMemoryType(MemoryType::Shared); tv2->split(-1, 8); FusionExecutor fe; fe.compileFusion(&fusion); int numel_x = 10; int numel_y = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; auto outputs = fe.runFusion(inputs); auto ref = t0 + 1 + 2; testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); } // Reproducer of #1021 TEST(NVFuserTest, FusionIssue1021_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = broadcast(tv1, {false, true}); fusion.addOutput(tv2); auto tv3 = tv2->cache_before(); tv2->split(0, 2); tv1->computeAt(tv2, 1); tv2->axis(0)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::Vectorize); FusionExecutor fe; fe.compileFusion(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({10}, options); std::vector inputs = {t0}; auto outputs = fe.runFusion(inputs); auto ref = (t0 + 1).unsqueeze(-1); testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } // Reproducer of issue #1053 TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(1); fusion->addInput(tv0); auto tv1 = sum(tv0, {0}); fusion->addOutput(tv1); auto tv2 = add(tv0, new Double(1)); fusion->addOutput(tv2); tv1->split(0, 8); auto tv1_rf = tv1->rFactor({-1}); tv1_rf->computeAt(tv1, 1); tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); tv2->axis(0)->parallelize(ParallelType::TIDx); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({32}, options); auto at_tv1 = (input1).sum({0}); auto at_tv2 = input1 + 1; FusionExecutor fe; fe.compileFusion(fusion.get()); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(1); fusion->addInput(tv0); auto tv1 = add(tv0, new Double(1)); auto tv2 = add(tv0, new Double(1)); 
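// tv1 and tv2 receive the same outer split below, so TIDx should map to a
// single exact extent represented by blockDim.x, even though the sizes are
// only known at runtime.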
TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  auto tv0 = makeSymbolicTensor(1);
  fusion->addInput(tv0);
  auto tv1 = add(tv0, new Double(1));
  auto tv2 = add(tv0, new Double(1));
  fusion->addOutput(tv1);
  fusion->addOutput(tv2);

  tv1->split(0, 8, false);
  tv1->axis(1)->parallelize(ParallelType::TIDx);
  tv2->split(0, 8, false);
  tv2->axis(1)->parallelize(ParallelType::TIDx);

  // The extents of the tv1 and tv2 axes are equal even though their
  // actual values are not statically known
  GpuLower gpulw(fusion.get());
  const auto& pdmap = gpulw.parallelDimensionMap();
  auto kir_tv1 = gpulw.lowerValue(tv1)->as<kir::TensorView>();
  auto kir_tv2 = gpulw.lowerValue(tv2)->as<kir::TensorView>();
  for (size_t i = 0; i < kir_tv1->domain()->domain().size(); ++i) {
    auto dom1 = kir_tv1->domain()->domain()[i];
    auto dom2 = kir_tv2->domain()->domain()[i];
    TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent()));
  }

  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDx)->isA<kir::NamedScalar>() &&
      pdmap.get(ParallelType::TIDx)->as<kir::NamedScalar>()->name() ==
          "blockDim.x");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({32}, options);

  FusionExecutor fe;
  fe.compileFusion(fusion.get());
  auto outputs = fe.runFusion({input1});

  testValidate(
      fusion.get(),
      outputs,
      {input1},
      {input1 + 1, input1 + 1},
      __LINE__,
      __FILE__);
}

TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  auto tv0 = makeSymbolicTensor(1);
  fusion->addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion->addInput(tv1);
  auto tv2 = broadcast(tv0, {false, true});
  auto tv3 = add(tv1, tv2);
  fusion->addOutput(tv3);

  tv3->split(-1, 8, false);
  tv2->computeAt(tv3, -1);

  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);

  GpuLower gpulw(fusion.get());
  const auto& pdmap = gpulw.parallelDimensionMap();
  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDx)->isA<kir::NamedScalar>() &&
      pdmap.get(ParallelType::TIDx)->as<kir::NamedScalar>()->name() ==
          "blockDim.x");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({11}, options);
  at::Tensor input2 = at::randn({11, 13}, options);

  FusionExecutor fe;
  fe.compileFusion(fusion.get());
  auto outputs = fe.runFusion({input1, input2});

  auto ref = input1.unsqueeze(-1) + input2;

  testValidate(
      fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
}

// Mix symbolic and concrete tensors
TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  auto tv0 = makeSymbolicTensor(1);
  fusion->addInput(tv0);

  auto tv2 = add(tv0, new Double(1));
  fusion->addOutput(tv2);
  auto tv3 = add(tv0, new Double(1));
  fusion->addOutput(tv3);

  tv2->split(0, 10);
  tv3->split(0, 20);

  auto tv4 = add(tv0, new Double(1));
  fusion->addOutput(tv4);
  auto tv5 = add(tv0, new Double(1));
  fusion->addOutput(tv5);

  // Not mapped but equal extent
  tv4->split(0, 10);
  tv5->split(0, 10);

  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);

  tv4->axis(-1)->parallelize(ParallelType::TIDy);
  tv5->axis(-1)->parallelize(ParallelType::TIDy);

  GpuLower gpulw(fusion.get());
  const auto& pdmap = gpulw.parallelDimensionMap();
  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDx)->isA<kir::NamedScalar>() &&
      pdmap.get(ParallelType::TIDx)->as<kir::NamedScalar>()->name() ==
          "blockDim.x");
  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDy)->isConst() &&
      pdmap.get(ParallelType::TIDy)->as<kir::Int>()->value().value() == 10);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({13}, options);

  FusionExecutor fe;
  fe.compileFusion(fusion.get());
  auto outputs = fe.runFusion({input1});

  testValidate(
      fusion.get(),
      outputs,
      {input1},
      {input1 + 1, input1 + 1, input1 + 1, input1 + 1},
      __LINE__,
      __FILE__);
}
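// Illustrative helper sketch (hypothetical, not in the original file): the
// pdmap checks in the tests above repeat the same isA/as pattern for
// kir::NamedScalar. Assuming ParallelDimensionMap::get returns a kir::Val*
// as used above, the pattern could be factored out like this.
namespace {

bool mapsToNamedScalar(
    const ParallelDimensionMap& pdmap,
    ParallelType pt,
    const std::string& expected_name) {
  auto val = pdmap.get(pt);
  return val->isA<kir::NamedScalar>() &&
      val->as<kir::NamedScalar>()->name() == expected_name;
}

} // namespace

// Usage would mirror the checks above, e.g.:
//   TORCH_CHECK(mapsToNamedScalar(pdmap, ParallelType::TIDx, "blockDim.x"));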
// Parallelizing merged broadcast domains
TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = add(tv0, new Double(1));
  auto tv3 = broadcast(tv2, {true, false});
  auto tv4 = add(tv3, tv1);
  fusion.addOutput(tv4);

  tv4->split(1, 4);
  tv4->reorder({{1, 2}, {2, 1}});
  tv4->merge(0);
  tv0->computeAt(tv4, 1);
  tv1->computeAt(tv4, 1);

  // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not
  // exact.
  tv4->axis(0)->parallelize(ParallelType::TIDx);

  tv2->setMemoryType(MemoryType::Shared);
  tv3->setMemoryType(MemoryType::Shared);

  GpuLower gpulw(&fusion);
  const auto& pdmap = gpulw.parallelDimensionMap();
  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDx)->isA<kir::NamedScalar>() &&
      pdmap.get(ParallelType::TIDx)->as<kir::NamedScalar>()->name() ==
          "blockDim.x");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({13}, options);
  at::Tensor input2 = at::randn({15, 13}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({input1, input2});

  auto ref = (input1 + 1).unsqueeze(0) + input2;

  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
}

TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv3 = broadcast(tv0, {false, true});
  auto tv4 = add(tv3, tv1);
  fusion.addOutput(tv4);

  tv4->split(1, 4);
  tv0->computeAt(tv4, -1);
  tv1->computeAt(tv4, -1);

  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-2)->parallelize(ParallelType::TIDy);
  tv3->axis(-2)->parallelize(ParallelType::TIDy);

  GpuLower gpulw(&fusion);
  const auto& pdmap = gpulw.parallelDimensionMap();
  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDx)->isConst() &&
      pdmap.get(ParallelType::TIDx)->as<kir::Int>()->value().value() == 4);
  TORCH_CHECK(
      pdmap.get(ParallelType::TIDy)->isA<kir::NamedScalar>() &&
      pdmap.get(ParallelType::TIDy)->as<kir::NamedScalar>()->name() ==
          "blockDim.y");

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input1 = at::randn({13}, options);
  at::Tensor input2 = at::randn({13, 15}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  auto outputs = fe.runFusion({input1, input2});

  auto ref = input1.unsqueeze(-1) + input2;

  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)