#pragma once

#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Exception.h>

#include <atomic>
#include <cstddef>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>

namespace c10::cuda::CUDACachingAllocator {

// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
 public:
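  // These limits come from the max_split_size_mb and
  // garbage_collection_threshold options of PYTORCH_CUDA_ALLOC_CONF,
  // e.g. (illustrative):
  //   PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,garbage_collection_threshold:0.8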
  static size_t max_split_size() {
    return instance().m_max_split_size;
  }
  static double garbage_collection_threshold() {
    return instance().m_garbage_collection_threshold;
  }

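  // Expandable segments are opted into via the expandable_segments:True
  // option; on builds without driver API support the flag is ignored and a
  // one-time warning is emitted, as below.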
  static bool expandable_segments() {
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
    if (instance().m_expandable_segments) {
      TORCH_WARN_ONCE("expandable_segments not supported on this platform")
    }
    return false;
#else
    return instance().m_expandable_segments;
#endif
  }

  static bool release_lock_on_cudamalloc() {
    return instance().m_release_lock_on_cudamalloc;
  }

  /** Pinned memory allocator settings */
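  // For example (illustrative):
  //   PYTORCH_CUDA_ALLOC_CONF=pinned_use_cuda_host_register:True,pinned_num_register_threads:8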
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
  }

  static size_t pinned_num_register_threads() {
    return instance().m_pinned_num_register_threads;
  }

  static size_t pinned_max_register_threads() {
    // Based on benchmark results, we see better allocation performance with
    // 8 threads. However, future systems may need more threads, so we cap
    // the limit at 128.
    return 128;
  }

  // This is used to round up the allocation size to the nearest power-of-2
  // division. See the roundup_power2_next_division function for more details.
  // As an example, if we want 4 divisions between consecutive powers of 2,
  // this can be done via the env variable:
  // PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
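  // For instance, with 4 divisions a request of size 1200 falls between 1024
  // and 2048; the candidate rounded sizes are 1024, 1280, 1536, and 1792, so
  // the request is rounded up to 1280 (an illustrative example of the
  // rounding scheme).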
  static size_t roundup_power2_divisions(size_t size);

  static std::vector<size_t> roundup_power2_divisions() {
    return instance().m_roundup_power2_divisions;
  }

  static std::string last_allocator_settings() {
    std::lock_guard<std::mutex> lock(
        instance().m_last_allocator_settings_mutex);
    return instance().m_last_allocator_settings;
  }

  static CUDAAllocatorConfig& instance() {
    static CUDAAllocatorConfig* s_instance = ([]() {
      auto inst = new CUDAAllocatorConfig();
      const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF");
      inst->parseArgs(env);
      return inst;
    })();
    return *s_instance;
  }

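  // Expected to accept the same comma-separated key:value format as the
  // PYTORCH_CUDA_ALLOC_CONF environment variable, e.g. (illustrative):
  //   "max_split_size_mb:128,roundup_power2_divisions:4,expandable_segments:True"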
  void parseArgs(const char* env);

 private:
  CUDAAllocatorConfig();

  static void lexArgs(const char* env, std::vector<std::string>& config);
  static void consumeToken(
      const std::vector<std::string>& config,
      size_t i,
      const char c);
  size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
  size_t parseGarbageCollectionThreshold(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseRoundUpPower2Divisions(
      const std::vector<std::string>& config,
      size_t i);
  size_t parseAllocatorConfig(
      const std::vector<std::string>& config,
      size_t i,
      bool& used_cudaMallocAsync);
  size_t parsePinnedUseCudaHostRegister(
      const std::vector<std::string>& config,
      size_t i);
  size_t parsePinnedNumRegisterThreads(
      const std::vector<std::string>& config,
      size_t i);

  std::atomic<size_t> m_max_split_size;
  std::vector<size_t> m_roundup_power2_divisions;
  std::atomic<double> m_garbage_collection_threshold;
  std::atomic<size_t> m_pinned_num_register_threads;
  std::atomic<bool> m_expandable_segments;
  std::atomic<bool> m_release_lock_on_cudamalloc;
  std::atomic<bool> m_pinned_use_cuda_host_register;
  std::string m_last_allocator_settings;
  std::mutex m_last_allocator_settings_mutex;
};

// General caching allocator utilities
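// setAllocatorSettings is expected to re-parse the allocator configuration at
// runtime from the same key:value string format as PYTORCH_CUDA_ALLOC_CONF,
// e.g. (illustrative):
//   setAllocatorSettings("garbage_collection_threshold:0.6");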
C10_CUDA_API void setAllocatorSettings(const std::string& env);

} // namespace c10::cuda::CUDACachingAllocator