{
  "id": "llm/vllm-cuda-oom-batch",
  "signature": "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 79.15 GiB of which 2.00 GiB is free. Including non-PyTorch memory, this process has 77.15 GiB memory in use.",
  "signature_zh": "torch.cuda.OutOfMemoryError：CUDA 内存不足。尝试分配 2.00 GiB。GPU 0 总容量为 79.15 GiB，其中 2.00 GiB 空闲。包括非 PyTorch 内存在内，此进程已使用 77.15 GiB 内存。",
  "regex": "torch\\.cuda\\.OutOfMemoryError: CUDA out of memory\\. Tried to allocate [\\d.]+ GiB\\. GPU \\d+ has a total capacity of [\\d.]+ GiB",
  "domain": "llm",
  "category": "resource_error",
  "subcategory": null,
  "root_cause": "vLLM's dynamic batching allocates KV cache blocks per request, and under high concurrency or long sequences, the cumulative allocation exceeds GPU memory, even though the model weights fit.",
  "root_cause_type": "generic",
  "root_cause_zh": "vLLM 的动态批处理按请求分配 KV 缓存块，在高并发或长序列下，累积分配超过 GPU 内存，即使模型权重本身可以容纳。",
  "versions": [
    {
      "version": "vLLM 0.4.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.2.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "A100 80GB",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "H100 80GB",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Switch to a smaller model to free up GPU memory",
      "why_fails": "The OOM is from KV cache allocation, not model weights. Even a 7B model can OOM with very long sequences or high concurrency.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    },
    {
      "action": "Call `torch.cuda.empty_cache()` to release GPU memory",
      "why_fails": "vLLM manages its own memory pool and does not release KV cache blocks to PyTorch's cache; empty_cache has no effect on vLLM allocations.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Add more GPUs (increase `tensor_parallel_size`) without adjusting `max_num_seqs` or `max_model_len`",
      "why_fails": "vLLM uses tensor parallelism across GPUs, but KV cache is still per-GPU; adding GPUs without adjusting max_num_seqs or max_model_len may still OOM on each GPU.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Reduce `max_num_seqs` to limit the number of concurrent requests.",
      "success_rate": 0.9,
      "how": "Reduce `max_num_seqs` in vLLM config (e.g., from 256 to 64) to limit concurrent requests. In code: `LLM(model='meta-llama/Llama-2-7b-hf', max_num_seqs=64)`.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Decrease `max_model_len` to limit the maximum sequence length.",
      "success_rate": 0.85,
      "how": "Decrease `max_model_len` to limit sequence length, e.g., `LLM(model='...', max_model_len=4096)`. This reduces KV cache size per sequence.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Enable prefix caching (`enable_prefix_caching=True`) to reuse KV cache blocks for common prefixes.",
      "success_rate": 0.8,
      "how": "Enable `enable_prefix_caching=True` in vLLM to reuse KV cache blocks for common prefixes, reducing memory usage for repeated prompts.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在 vLLM 配置中减少 `max_num_seqs`（例如从 256 减到 64）以限制并发请求。代码示例：`LLM(model='meta-llama/Llama-2-7b-hf', max_num_seqs=64)`。",
    "减小 `max_model_len` 以限制序列长度，例如 `LLM(model='...', max_model_len=4096)`。这会减少每个序列的 KV 缓存大小。",
    "在 vLLM 中启用 `enable_prefix_caching=True` 以重用常见前缀的 KV 缓存块，从而减少重复提示的内存使用。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://vllm.readthedocs.io/en/latest/performance/optimization.html#memory-management",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.85,
  "resolvable": "true",
  "first_seen": "2024-06-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}