expandable_segments <-> other allocator options (#134338)

Previously, setting garbage_collection_threshold or max_split_size_mb along with expandable_segments:True could cause the allocator to hit assert failures when running nearly out of memory. This PR ensures that garbage collection and max-split freeing do not accidentally try to release expandable segments.
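A minimal sketch of the previously problematic combination (sizes and values are illustrative, not from the PR; _set_allocator_settings is the private hook the new tests below also use):

import torch

# Lower the usable limit so allocations run near out-of-memory quickly.
torch.cuda.memory.set_per_process_memory_fraction(0.05)
# Combine expandable segments with the GC threshold and a max split size.
torch.cuda.memory._set_allocator_settings(
    "expandable_segments:True,garbage_collection_threshold:0.5,max_split_size_mb:40"
)
blocks = [torch.empty(40 * 1024 * 1024, dtype=torch.int8, device="cuda")
          for _ in range(2)]
del blocks  # leaves expandable-segment blocks in the cached free lists
# Allocating again near the cap drives the GC / release_cached_blocks paths
# that previously could hit the asserts this PR fixes.
torch.empty(80 * 1024 * 1024, dtype=torch.int8, device="cuda")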

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134338
Approved by: https://github.com/ezyang
zdevito 2024-08-29 09:19:05 -07:00 committed by PyTorch MergeBot
parent 3fc6e47d42
commit d91b49dbaa
2 changed files with 66 additions and 6 deletions

c10/cuda/CUDACachingAllocator.cpp

@@ -2611,7 +2611,7 @@ class DeviceCachingAllocator {
       while (it != large_blocks.blocks.end()) {
         Block* block = *it;
         ++it;
-        if (!block->is_split() &&
+        if (!block->is_split() && !block->expandable_segment_ &&
             static_cast<double>(block->gc_count()) >= age_threshold) {
           block_freed = true;
           gc_reclaimed += block->size;
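In words, this hunk tightens the garbage collector's eligibility test: a cached block may be reclaimed only if it is whole (not split), not backed by an expandable segment, and older than the age threshold. A toy Python model of that predicate (not PyTorch code; attribute names mirror the C++ members):

from dataclasses import dataclass

@dataclass
class Block:
    size: int
    is_split: bool
    expandable_segment: bool  # mirrors expandable_segment_ in the C++
    gc_count: int

def gc_eligible(block, age_threshold):
    # Expandable segments are now excluded, which is the point of the fix.
    return (not block.is_split
            and not block.expandable_segment
            and block.gc_count >= age_threshold)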
@@ -2754,7 +2754,8 @@ class DeviceCachingAllocator {
         ? CUDAAllocatorConfig::max_split_size()
         : key.size;
     auto it = pool.blocks.lower_bound(&key);
-    if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
+    if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
+        (*it)->expandable_segment_) {
       // No single block is large enough; free multiple oversize blocks,
       // starting with the largest
       if (it == pool.blocks.begin())
@@ -2766,12 +2767,15 @@
              ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
              ((*it)->stream == p.stream())) {
         auto cur = it;
-        totalReleased += (*it)->size;
-        if (it != pool.blocks.begin()) {
+        bool is_first = cur == pool.blocks.begin();
+        if (!is_first) {
           --it;
+        }
+        if (!(*cur)->expandable_segment_) {
           release_block(*cur, context);
-        } else {
-          release_block(*cur, context);
+          totalReleased += (*cur)->size;
+        }
+        if (is_first) {
           break;
         }
       }
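These two hunks adjust release_available_cached_blocks: an expandable-segment block found by lower_bound no longer counts as a usable single block, and the oversize-freeing loop now advances its iterator before deciding whether to free, releases only non-expandable blocks, and counts totalReleased only for blocks actually freed. A toy Python model of the reworked loop (not PyTorch code; release is a caller-supplied stand-in for release_block):

def release_oversize_blocks(blocks_largest_first, want_bytes, max_split_size, release):
    # Walk the free list from the largest block down, as the C++ does by
    # stepping the iterator backwards from the upper bound.
    total_released = 0
    for block in blocks_largest_first:
        if total_released >= want_bytes:
            break
        if block.size < max_split_size:
            break  # remaining blocks are too small to count as "oversize"
        if not block.expandable_segment:  # the new skip, as in the C++ above
            release(block)                # the actual free (release_block)
            total_released += block.size  # counted only when actually freed
    return total_released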

test/test_cuda.py

@@ -4098,6 +4098,62 @@ class TestCudaMallocAsync(TestCase):
         finally:
             torch.cuda.memory._record_memory_history(None)
 
+    def test_max_split_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,max_split_size_mb:40"
+        )
+        b = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        c = alloc(40)
+        with self.assertRaises(torch.OutOfMemoryError):
+            alloc(40)
+        del a, b, c
+        # force release_cached_blocks to run with some expandable segments in the free list
+        alloc(120)
+
+    def test_garbage_collect_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,garbage_collection_threshold:0.5"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,garbage_collection_threshold:0.5"
+        )
+        b = alloc(40)
+        del a, b
+        # causes GC to run. The expandable segment block will be split
+        # so GC would not attempt to free it anyway, but this at least makes sure
+        # expandable_segment blocks can be in the free list when this is called.
+        alloc(80)
+
     def test_allocator_settings(self):
         def power2_div(size, div_factor):
             pow2 = 1
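Assuming a standard PyTorch source checkout and a CUDA-capable GPU, the two new tests can be selected with pytest's keyword filter (command is illustrative):

pytest test/test_cuda.py -k "expandable" -v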