diff --git a/mypy.ini b/mypy.ini
index 5b297c89d54..b9c87aca583 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -198,9 +198,6 @@ ignore_errors = True
 [mypy-torch.cuda.comm]
 ignore_errors = True
 
-[mypy-torch.cuda.memory]
-ignore_errors = True
-
 [mypy-torch.cuda.nccl]
 ignore_errors = True
 
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 0fb0372b7fa..9e1e0ba1b3e 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -2,7 +2,7 @@
 
 import torch
 from torch import Tensor
-from typing import (Any, BinaryIO, Callable, ContextManager, Iterator, List, NamedTuple,
+from typing import (Any, BinaryIO, Callable, ContextManager, Dict, Iterator, List, NamedTuple,
                     Optional, overload, Sequence, Tuple, TypeVar, Type, Union)
 from torch._six import inf
 
@@ -300,6 +300,24 @@ class _TensorBase(object):
     ${tensor_method_hints}
 
 # Defined in torch/csrc/cuda/Module.cpp
+def _cuda_getCurrentStream(device: _int) -> _int: ...  # NOTE(review): int result is presumably an opaque stream handle - confirm against Module.cpp
+def _cuda_getDefaultStream(device: _int) -> _int: ...
+def _cuda_getCurrentBlasHandle() -> _int: ...
+def _cuda_setStream(cuda_stream: _int) -> None: ...
+def _cuda_getCompiledVersion() -> _int: ...
+def _cuda_cudaHostAllocator() -> _int: ...
+def _cuda_cudaCachingAllocator_raw_alloc(size: _int, cuda_stream: _int) -> _int: ...  # NOTE(review): presumably returns the raw allocation address as an int - confirm
+def _cuda_cudaCachingAllocator_raw_delete(ptr: _int) -> None: ...  # NOTE(review): ptr is presumably a value obtained from raw_alloc - confirm
+def _cuda_emptyCache() -> None: ...
+def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ...  # value types are left as Any in this stub
+def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ...
+def _cuda_resetPeakMemoryStats(device: _int) -> None: ...
+def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ...  # entry schema is not typed here (Dict[str, Any])
+def _cuda_lock_mutex() -> None: ...
+def _cuda_unlock_mutex() -> None: ...
+def _nccl_version() -> _int: ...  # NOTE(review): listed under the Module.cpp header - confirm the NCCL bindings are actually defined there
+def _nccl_unique_id() -> bytes: ...
+
+
 class _CudaDeviceProperties:
     name: str
     major: _int
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index 3afeb1bafdb..299cb56aa2a 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -4,7 +4,7 @@ import warnings
 from typing import Any, Dict, Union
 
 import torch
-from . import is_initialized, _get_device_index
+from . import is_initialized, _get_device_index, _lazy_init
 from torch.types import Device
 
 def _host_allocator():
@@ -31,7 +31,7 @@ def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None
 
     Arguments:
         size (int): number of bytes to be allocated.
-        device (torch.device or int, optional): selected device. If it is 
+        device (torch.device or int, optional): selected device. If it is
             ``None`` the default CUDA device is used.
         stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then
             the default stream for the selected device is used.