{
  "id": "llm/huggingface-model-load-oom-on-cpu",
  "signature": "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has 8.00 GiB total capacity; 7.80 GiB already allocated.",
  "signature_zh": "torch.cuda.OutOfMemoryError: CUDA 内存不足。尝试分配 2.00 GiB。GPU 0 总容量 8.00 GiB；已分配 7.80 GiB。",
  "regex": "torch\\.cuda\\.OutOfMemoryError.*CUDA out of memory.*Tried to allocate",
  "domain": "llm",
  "category": "resource_error",
  "subcategory": null,
  "root_cause": "Hugging Face model loading tries to allocate the full model on GPU, but the available VRAM is insufficient due to other processes (e.g., previous model instances, data loaders) consuming memory, or the model itself is too large for the GPU.",
  "root_cause_type": "generic",
  "root_cause_zh": "Hugging Face 模型加载尝试在 GPU 上分配完整模型，但由于其他进程（例如，先前的模型实例、数据加载器）消耗了内存，或者模型本身对于 GPU 来说太大，导致可用 VRAM 不足。",
  "versions": [
    {
      "version": "transformers==4.36.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "torch==2.1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "accelerate==0.25.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
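    {
      "action": "Restart the Python kernel or process to release GPU memory, then load the model again with the same settings.",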
      "why_fails": "While this frees GPU memory from the current session, it doesn't prevent the underlying memory fragmentation or model size issue. The error returns if the model is loaded again without adjustments.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "Call torch.cuda.empty_cache() before loading the model.",
      "why_fails": "empty_cache() only releases unused cached memory allocator blocks, not memory actively held by other tensors. It often has minimal effect when VRAM is fully consumed by model parameters.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "Reduce the batch size.",
      "why_fails": "The OOM occurs during model loading, not inference. Batch size doesn't affect model parameter memory allocation.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Load the model with device_map='auto' and offload to CPU or disk: `model = AutoModelForCausalLM.from_pretrained('model-name', device_map='auto', torch_dtype=torch.float16, offload_folder='/tmp/offload')`. This splits the model across GPU, CPU, and disk if needed.",
      "success_rate": 0.95,
      "how": "Load the model with device_map='auto' and offload to CPU or disk: `model = AutoModelForCausalLM.from_pretrained('model-name', device_map='auto', torch_dtype=torch.float16, offload_folder='/tmp/offload')`. This splits the model across GPU, CPU, and disk if needed.",
      "condition": "",
      "sources": []
    },
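    {
      "action": "Use gradient checkpointing to reduce memory during training: `model.gradient_checkpointing_enable()` before training, which trades compute for memory by recomputing activations.",
      "success_rate": 0.85,
      "how": "Use gradient checkpointing to reduce memory during training: `model.gradient_checkpointing_enable()` before training, which trades compute for memory by recomputing activations.",
      "condition": "Applies only when the OOM is raised during training, after the model has already been loaded onto the GPU. Gradient checkpointing reduces activation memory; it does not lower the memory needed to load the model parameters.",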
      "sources": []
    },
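    {
      "action": "Explicitly clear GPU memory before loading: `import gc; gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()` and then load the model with `low_cpu_mem_usage=True`.",
      "success_rate": 0.75,
      "how": "Explicitly clear GPU memory before loading: `import gc; gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()` and then load the model with `low_cpu_mem_usage=True`.",
      "condition": "Only helps when the VRAM is held by objects in the same process that are no longer referenced (for example, a previous model instance left over in a notebook session); delete those references first (e.g., `del model`). `torch.cuda.reset_peak_memory_stats()` only resets statistics counters and frees no memory, and `low_cpu_mem_usage=True` reduces host RAM usage, not VRAM.",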
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 device_map='auto' 加载模型并卸载到 CPU 或磁盘：`model = AutoModelForCausalLM.from_pretrained('model-name', device_map='auto', torch_dtype=torch.float16, offload_folder='/tmp/offload')`。这会根据需要将模型拆分到 GPU、CPU 和磁盘。",
    "在训练前使用梯度检查点以减少内存：`model.gradient_checkpointing_enable()`，通过重新计算激活值来用计算换取内存。",
    "在加载模型前显式清除 GPU 内存：`import gc; gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()`，然后使用 `low_cpu_mem_usage=True` 加载模型。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/transformers/en/troubleshooting#out-of-memory",
  "official_doc_section": null,
  "error_code": "CUDA-OOM-001",
  "verification_tier": "ai_generated",
  "confidence": 0.9,
  "fix_success_rate": 0.88,
  "resolvable": "true",
  "first_seen": "2023-12-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}