pytorch/test/cpp/jit/tests.h
Christian Sarofeen f11c4f90c2 New CUDA Fuser: Unrolling support, interface refactor (#36435)
Summary:
Unrolling support has been added in a way that we get good performing code on GPUs. Not sure how long this link will last but an example of a generated unrolled kernel is:
https://godbolt.org/z/i0uAv3

What can be seen from there is multiple calls of "ld.global.f32" without "st.global.f32" in between them (and vice versa). This means that we are launching multiple loads that can run in parallel, as well as multiple stores that can run in parallel. This can be a crucial optimization for memory-bound kernels. This was generally a point of concern in TVM: an attempt at a similar kernel from TVM produces https://godbolt.org/z/Vu97vG, which surrounds load-store pairs in conditional branches, preventing the benefits of unrolling.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/36435

Reviewed By: ZolotukhinM

Differential Revision: D21024011

Pulled By: soumith

fbshipit-source-id: e852e282fa7a304aba962e1926f756098c011fe0
2020-04-16 09:20:24 -07:00

147 lines
5.1 KiB
C++

#pragma once
/**
* See README.md for instructions on how to add a new test.
*/
#include <c10/macros/Export.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
namespace torch {
namespace jit {
// Registry of every platform-independent JIT C++ test (X-macro pattern).
// Each entry _(Name) is expanded below by DECLARE_JIT_TEST into a forward
// declaration `void testName();`; the definitions live in the test .cpp
// files. Test runners apply their own one-argument macro to this list to
// declare, register, or invoke every test.
// NOTE: do not put '//' comments inside the list — they would swallow the
// trailing line-continuation backslash and truncate the macro.
#define TH_FORALL_TESTS(_) \
_(ADFormulas) \
_(Attributes) \
_(Blocks) \
_(CallStack) \
_(CallStackCaching) \
_(CodeTemplate) \
_(ControlFlow) \
_(CreateAutodiffSubgraphs) \
_(CustomOperators) \
_(CustomOperatorAliasing) \
_(IValueKWargs) \
_(CustomFusion) \
_(SchemaMatching) \
_(Differentiate) \
_(DifferentiateWithRequiresGrad) \
_(FromQualString) \
_(InternedStrings) \
_(PassManagement) \
_(Proto) \
_(RegisterFusionCachesKernel) \
_(SchemaParser) \
_(TopologicalIndex) \
_(TopologicalMove) \
_(SubgraphUtils) \
_(AliasAnalysis) \
_(ContainerAliasing) \
_(AliasRegistration) \
_(WriteTracking) \
_(Wildcards) \
_(MemoryDAG) \
_(IRParser) \
_(ConstantPooling) \
_(THNNConv) \
_(ATenNativeBatchNorm) \
_(NoneSchemaMatch) \
_(ClassParser) \
_(UnifyTypes) \
_(Profiler) \
_(InsertAndEliminateRedundantGuards) \
_(InsertBailOuts) \
_(PeepholeOptimize) \
_(RecordFunction) \
_(ThreadLocalDebugInfo) \
_(SubgraphMatching) \
_(SubgraphRewriter) \
_(ModuleClone) \
_(ModuleCloneInstance) \
_(ModuleConstant) \
_(ModuleParameter) \
_(ModuleDefine) \
_(QualifiedName) \
_(ClassImport) \
_(ProfiledTensorTypeHashing) \
_(ScriptObject) \
_(SaveExtraFilesHook) \
_(TypeTags) \
_(DCE) \
_(CustomFusionNestedBlocks) \
_(ClassDerive) \
_(SaveLoadTorchbind) \
_(ModuleInterfaceSerialization) \
_(ClassTypeAddRemoveAttr) \
_(Inliner) \
_(LiteInterpreterAdd) \
_(LiteInterpreterConv) \
_(LiteInterpreterInline) \
_(LiteInterpreterTuple) \
_(LiteInterpreterUpsampleNearest2d) \
_(CommonAncestor) \
_(AutogradSymbols) \
_(MobileTypeParser) \
_(LiteInterpreterBuiltinFunction) \
_(LiteInterpreterPrim) \
_(LiteInterpreterLoadOrigJit) \
_(LiteInterpreterWrongMethodName) \
_(LiteInterpreterParams) \
_(LiteInterpreterSetState) \
_(TorchbindIValueAPI) \
_(LiteInterpreterDict)
// Tests that exercise the GPU-related machinery. Six "core" tests run in
// every build configuration; the GPU_Fusion* tests are only registered when
// the build has CUDA support.
//
// The common subset is factored into TH_FORALL_TESTS_CUDA_CORE so the two
// branches of the #if cannot drift apart. Both branches expand
// TH_FORALL_TESTS_CUDA(_) to exactly the same entries as before.
// NOTE: do not put '//' comments inside the lists — they would swallow the
// trailing line-continuation backslash and truncate the macro.
#define TH_FORALL_TESTS_CUDA_CORE(_) \
  _(ArgumentSpec) \
  _(CompleteArgumentSpec) \
  _(Fusion) \
  _(GraphExecutor) \
  _(ModuleConversion) \
  _(Interp)
#if defined(USE_CUDA)
#define TH_FORALL_TESTS_CUDA(_) \
  TH_FORALL_TESTS_CUDA_CORE(_) \
  _(GPU_FusionDispatch) \
  _(GPU_FusionSimpleArith) \
  _(GPU_FusionSimpleTypePromote) \
  _(GPU_FusionCastOp) \
  _(GPU_FusionMutator) \
  _(GPU_FusionRegister) \
  _(GPU_FusionTopoSort) \
  _(GPU_FusionTensor) \
  _(GPU_FusionTensorContiguity) \
  _(GPU_FusionTVSplit) \
  _(GPU_FusionTVMerge) \
  _(GPU_FusionTVReorder) \
  _(GPU_FusionEquality) \
  _(GPU_FusionReplaceAll) \
  _(GPU_FusionParser) \
  _(GPU_FusionDependency) \
  _(GPU_FusionCodeGen) \
  _(GPU_FusionCodeGen2) \
  _(GPU_FusionSimplePWise) \
  _(GPU_FusionExecKernel) \
  _(GPU_FusionForLoop) \
  _(GPU_FusionLoopUnroll)
#else
#define TH_FORALL_TESTS_CUDA(_) TH_FORALL_TESTS_CUDA_CORE(_)
#endif
// Expand each registered test name Foo into the forward declaration
// `void testFoo();`. The definitions live in the per-suite test .cpp files;
// both the gtest runner and the python test runner call the tests through
// these declarations. The helper macro is #undef'd immediately so it cannot
// leak into files that include this header.
#define DECLARE_JIT_TEST(name) void test##name();
TH_FORALL_TESTS(DECLARE_JIT_TEST)
TH_FORALL_TESTS_CUDA(DECLARE_JIT_TEST)
#undef DECLARE_JIT_TEST
// These tests are special since they require prior setup in python.
// So they are not part of the general test list (which is shared between the
// gtest and python test runners), but are instead invoked manually by the
// torch_python_test.cpp
void testEvalModeForLoadedModule();
void testSerializationInterop();
void testTorchSaveError();
} // namespace jit
} // namespace torch