diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
index 69a5c5a586c..ea0a48154fa 100644
--- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -54,7 +54,7 @@ set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
 set CUDNN_ROOT_DIR=%CUDA_PATH%
 set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
-set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%CUDA_PATH%\extras\CUPTI\lib64;%PATH%
+set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
 set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice
 set NUMBAPRO_NVVM=%CUDA_PATH%\nvvm\bin\nvvm64_32_0.dll
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 7bc554f2e8a..ca560288a41 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1901,7 +1901,7 @@ if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE
 endif()
 
 if(USE_KINETO)
-  if(NOT USE_CUDA)
+  if((NOT USE_CUDA) OR MSVC)
     set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
   else()
     set(LIBKINETO_NOCUPTI OFF CACHE STRING "")
@@ -1983,7 +1983,6 @@ if(USE_KINETO)
     string(APPEND CMAKE_CXX_FLAGS " -DLIBKINETO_NOCUPTI")
     message(STATUS "Configured Kineto (CPU)")
   else()
-    list(APPEND Caffe2_DEPENDENCY_LIBS ${CUDA_CUDART_LIBRARY})
     message(STATUS "Configured Kineto")
   endif()
 endif()
diff --git a/test/test_profiler.py b/test/test_profiler.py
index d1f6bba1b92..8b9428ec41f 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -2,7 +2,6 @@ import collections
 import gc
 import io
 import json
-import time
 import os
 import unittest
 
@@ -588,16 +587,8 @@ class TestProfiler(TestCase):
                 assert is_int, "Invalid stacks record"
 
     @unittest.skipIf(not kineto_available(), "Kineto is required")
+    @unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
     def test_tensorboard_trace_handler(self):
-        def delayed(func, time_to_sleep=0.005):
-            """"The payload in this test might be too small. tensorboard_trace_handler use time.time()
-            to generate a filename. Delaying it to avoid generate the same filename on Windows.
-            """
-            def wrapper(*args, **kwargs):
-                time.sleep(time_to_sleep)
-                func(*args, **kwargs)
-            return wrapper
-
         use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
         with _profile(use_cuda=use_cuda, use_kineto=True):
             self.payload(use_cuda=use_cuda)
@@ -614,7 +605,7 @@ class TestProfiler(TestCase):
                     warmup=1,
                     active=2,
                     repeat=3),
-                on_trace_ready=delayed(torch.profiler.tensorboard_trace_handler(dname))
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(dname)
             ) as p:
                 for _ in range(18):
                     self.payload(use_cuda=use_cuda)
@@ -643,7 +634,7 @@ class TestProfiler(TestCase):
                     warmup=1,
                     active=2,
                     repeat=3),
-                on_trace_ready=delayed(torch.profiler.tensorboard_trace_handler(dname, use_gzip=True))
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(dname, use_gzip=True)
             )
             p.start()
             for _ in range(18):
diff --git a/torch/__init__.py b/torch/__init__.py
index 492ed456531..5d02330a5c9 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -78,14 +78,11 @@ if sys.platform == 'win32':
         cuda_version_1 = cuda_version.replace('.', '_')
         cuda_path_var = 'CUDA_PATH_V' + cuda_version_1
         default_path = os.path.join(pfiles_path, 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v' + cuda_version)
-        cuda_base = os.getenv(cuda_path_var, default_path)
-        cuda_path = os.path.join(cuda_base, 'bin')
-        cupti_path = os.path.join(cuda_base, 'extras', 'CUPTI', 'lib64')
+        cuda_path = os.path.join(os.getenv(cuda_path_var, default_path), 'bin')
     else:
         cuda_path = ''
-        cupti_path = ''
 
-    dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path, cupti_path]))
+    dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path]))
 
     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')