{
  "id": "cuda/mps-heap-limit-exceeded",
  "signature": "CUDA error: MPS heap memory limit exceeded (cudaErrorMpsHeapMemoryLimitExceeded)",
  "signature_zh": "CUDA 错误：MPS 堆内存限制超出 (cudaErrorMpsHeapMemoryLimitExceeded)",
  "regex": "MPS heap memory limit exceeded",
  "domain": "cuda",
  "category": "resource_error",
  "subcategory": null,
  "root_cause": "Under NVIDIA Multi-Process Service (MPS), the per-client heap memory limit set by the MPS server (via CUDA_MPS_HEAP_SIZE) has been exhausted by the current process, typically due to allocating too many small tensors or not freeing memory in a long-running training loop.",
  "root_cause_type": "generic",
  "root_cause_zh": "在 NVIDIA 多进程服务 (MPS) 下，MPS 服务器设置的每客户端堆内存限制（通过 CUDA_MPS_HEAP_SIZE）已被当前进程耗尽，通常是由于分配了太多小张量或在长时间训练循环中未释放内存。",
  "versions": [
    {
      "version": "CUDA 11.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.2",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "MPS 1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "NVIDIA Driver 535.54",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Increasing `torch.cuda.max_memory_allocated` via `torch.cuda.set_per_process_memory_fraction`",
      "why_fails": "The MPS heap limit is independent of the per-process memory fraction; changing the PyTorch memory limit does not affect the MPS server's heap allocation.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Restarting only the CUDA process without restarting the MPS server",
      "why_fails": "The MPS server's heap limit is persistent across client restarts; the limit is still in effect unless the server is restarted.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "Setting `CUDA_MPS_HEAP_SIZE=0` to disable the limit",
      "why_fails": "Setting heap size to 0 may cause undefined behavior or default to a very small limit; the environment variable must be set to a positive value or unset to use the default (which is usually larger).",
      "fail_rate": 0.75,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Increase the MPS heap size by setting the environment variable before starting the MPS daemon: `export CUDA_MPS_HEAP_SIZE=4G` (or a larger value like `8G`), then restart MPS with `nvidia-cuda-mps-control -d`. This allocates more heap memory per client.",
      "success_rate": 0.9,
      "how": "Increase the MPS heap size by setting the environment variable before starting the MPS daemon: `export CUDA_MPS_HEAP_SIZE=4G` (or a larger value like `8G`), then restart MPS with `nvidia-cuda-mps-control -d`. This allocates more heap memory per client.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Reduce memory fragmentation by using `torch.cuda.empty_cache()` periodically in your training loop, or by reusing tensors with `torch.zeros` or `torch.empty` instead of creating new ones each iteration.",
      "success_rate": 0.75,
      "how": "Reduce memory fragmentation by using `torch.cuda.empty_cache()` periodically in your training loop, or by reusing tensors with `torch.zeros` or `torch.empty` instead of creating new ones each iteration.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Switch from MPS to a single process per GPU (disable MPS) by stopping the MPS daemon: `echo quit | nvidia-cuda-mps-control`. This removes the heap limit entirely but loses MPS's inter-process communication benefits.",
      "success_rate": 0.95,
      "how": "Switch from MPS to a single process per GPU (disable MPS) by stopping the MPS daemon: `echo quit | nvidia-cuda-mps-control`. This removes the heap limit entirely but loses MPS's inter-process communication benefits.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在启动 MPS 守护进程前设置环境变量以增加 MPS 堆大小：`export CUDA_MPS_HEAP_SIZE=4G`（或更大的值如 `8G`），然后使用 `nvidia-cuda-mps-control -d` 重启 MPS。这会为每个客户端分配更多堆内存。",
    "通过在训练循环中定期使用 `torch.cuda.empty_cache()` 减少内存碎片，或使用 `torch.zeros` 或 `torch.empty` 重用张量，而不是每次迭代创建新张量。",
    "通过停止 MPS 守护进程从 MPS 切换到每 GPU 单进程（禁用 MPS）：`echo quit | nvidia-cuda-mps-control`。这完全移除堆限制，但会失去 MPS 的进程间通信优势。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://docs.nvidia.com/deploy/mps/index.html#topic_3_4_2",
  "official_doc_section": null,
  "error_code": "cudaErrorMpsHeapMemoryLimitExceeded",
  "verification_tier": "ai_generated",
  "confidence": 0.87,
  "fix_success_rate": 0.8,
  "resolvable": "true",
  "first_seen": "2023-09-05",
  "last_confirmed": "2024-06-01",
  "last_updated": "2025-04-10",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}