Enable TensorPipe's SHM transport (#50760)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50760

The SHM transport uses shared-memory-backed ringbuffers to transfer small payloads between processes on the same machine.

It was disabled in v1.6 due to a CMake mishap, but we've since realized that it also doesn't work that well in Docker and other setups. Enabling it here to see whether CircleCI fails.
ghstack-source-id: 120470890

Test Plan: Exported three times to CircleCI with tests consistently passing

Reviewed By: mrshenli

Differential Revision: D23814828

fbshipit-source-id: f355cb6515776debad536924de4f4d3fbb05a874
This commit is contained in:
Luca Wehrstedt 2021-01-27 11:41:58 -08:00 committed by Facebook GitHub Bot
parent d3ec204ef2
commit b77f72b5a0
4 changed files with 8 additions and 4 deletions

View File

@ -574,7 +574,7 @@ jobs:
hostname
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
else
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
echo "id=${id}" >> "${BASH_ENV}"

View File

@ -133,7 +133,7 @@ jobs:
hostname
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
else
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
echo "id=${id}" >> "${BASH_ENV}"

View File

@ -1347,7 +1347,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
endif()
set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
set(TP_ENABLE_SHM OFF CACHE BOOL "" FORCE)
set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe)
@ -1851,4 +1850,3 @@ if(USE_KINETO)
set(USE_KINETO OFF)
endif()
endif()

View File

@ -14,6 +14,12 @@
#include <ATen/cuda/CUDAMultiStreamGuard.h>
#endif
#if TENSORPIPE_HAS_SHM_TRANSPORT
// Needed for ::getpid(), which is used to create a unique address.
#include <sys/types.h>
#include <unistd.h>
#endif
namespace torch {
namespace distributed {
namespace rpc {