blob: 3106fc1b46baeed494fa3724b706599620bf808c [file] [log] [blame] [edit]
#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>
#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
namespace c10::cuda::CUDACachingAllocator {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
static size_t max_split_size() {
return instance().m_max_split_size;
}
static double garbage_collection_threshold() {
return instance().m_garbage_collection_threshold;
}
static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
if (instance().m_expandable_segments) {
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
}
return false;
#else
return instance().m_expandable_segments;
#endif
}
static bool release_lock_on_cudamalloc() {
return instance().m_release_lock_on_cudamalloc;
}
/** Pinned memory allocator settings */
static bool pinned_use_cuda_host_register() {
return instance().m_pinned_use_cuda_host_register;
}
static size_t pinned_num_register_threads() {
return instance().m_pinned_num_register_threads;
}
static size_t pinned_max_register_threads() {
// Based on the benchmark results, we see better allocation performance
// with 8 threads. However on future systems, we may need more threads
// and limiting this to 128 threads.
return 128;
}
// This is used to round-up allocation size to nearest power of 2 divisions.
// More description below in function roundup_power2_next_division
// As ane example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size);
static std::vector<size_t> roundup_power2_divisions() {
return instance().m_roundup_power2_divisions;
}
static std::string last_allocator_settings() {
std::lock_guard<std::mutex> lock(
instance().m_last_allocator_settings_mutex);
return instance().m_last_allocator_settings;
}
static CUDAAllocatorConfig& instance() {
static CUDAAllocatorConfig* s_instance = ([]() {
auto inst = new CUDAAllocatorConfig();
const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
inst->parseArgs(env);
return inst;
})();
return *s_instance;
}
void parseArgs(const char* env);
private:
CUDAAllocatorConfig();
static void lexArgs(const char* env, std::vector<std::string>& config);
static void consumeToken(
const std::vector<std::string>& config,
size_t i,
const char c);
size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
size_t parseGarbageCollectionThreshold(
const std::vector<std::string>& config,
size_t i);
size_t parseRoundUpPower2Divisions(
const std::vector<std::string>& config,
size_t i);
size_t parseAllocatorConfig(
const std::vector<std::string>& config,
size_t i,
bool& used_cudaMallocAsync);
size_t parsePinnedUseCudaHostRegister(
const std::vector<std::string>& config,
size_t i);
size_t parsePinnedNumRegisterThreads(
const std::vector<std::string>& config,
size_t i);
std::atomic<size_t> m_max_split_size;
std::vector<size_t> m_roundup_power2_divisions;
std::atomic<double> m_garbage_collection_threshold;
std::atomic<size_t> m_pinned_num_register_threads;
std::atomic<bool> m_expandable_segments;
std::atomic<bool> m_release_lock_on_cudamalloc;
std::atomic<bool> m_pinned_use_cuda_host_register;
std::string m_last_allocator_settings;
std::mutex m_last_allocator_settings_mutex;
};
// General caching allocator utilities
C10_CUDA_API void setAllocatorSettings(const std::string& env);
} // namespace c10::cuda::CUDACachingAllocator