{
  "id": "pytorch/cuda-assert-triggered",
  "signature": "RuntimeError: CUDA error: device-side assert triggered. Compile with TORCH_USE_CUDA_DSA to enable device-side assertions",
  "signature_zh": "运行时错误：CUDA 错误：触发了设备端断言。使用 TORCH_USE_CUDA_DSA 编译以启用设备端断言",
  "regex": "RuntimeError: CUDA error: device-side assert triggered",
  "domain": "pytorch",
  "category": "assertion_error",
  "subcategory": null,
  "root_cause": "A CUDA kernel performed an illegal operation (e.g., out-of-bounds index, NaN in loss) that triggered a device-side assertion, but detailed info is suppressed without DSA build.",
  "root_cause_type": "generic",
  "root_cause_zh": "CUDA 内核执行了非法操作（例如，越界索引、损失中的 NaN），触发了设备端断言，但在没有 DSA 构建的情况下详细信息被抑制。",
  "versions": [
    {
      "version": "torch 1.13.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "torch 2.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuda 11.7",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuda 12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Catch the RuntimeError and retry the failing operation",
      "why_fails": "Simply catching the exception and retrying may mask the root cause (e.g., invalid index) and cause silent data corruption.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    },
    {
      "action": "Increase the batch size or change the learning rate",
      "why_fails": "Increasing batch size or changing learning rate does not fix illegal memory access or index errors.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Disable CUDA and fall back to running on the CPU",
      "why_fails": "Disabling CUDA and falling back to CPU may work but is not a real fix and may be impractically slow.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Rebuild PyTorch from source with TORCH_USE_CUDA_DSA=1 to get detailed error messages:\nexport TORCH_USE_CUDA_DSA=1\npip install --no-cache-dir --verbose torch --no-binary torch\nThen rerun and check the exact line causing the assertion.",
      "success_rate": 0.9,
      "how": "Rebuild PyTorch from source with TORCH_USE_CUDA_DSA=1 to get detailed error messages:\nexport TORCH_USE_CUDA_DSA=1\npip install --no-cache-dir --verbose torch --no-binary torch\nThen rerun and check the exact line causing the assertion.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Add assertions in your code before CUDA operations, e.g., check index bounds:\nassert (indices >= 0).all() and (indices < tensor.size(0)).all(), \"Index out of bounds\"\nAlso check for NaN/Inf in loss: assert not torch.isnan(loss).any()",
      "success_rate": 0.85,
      "how": "Add assertions in your code before CUDA operations, e.g., check index bounds:\nassert (indices >= 0).all() and (indices < tensor.size(0)).all(), \"Index out of bounds\"\nAlso check for NaN/Inf in loss: assert not torch.isnan(loss).any()",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 TORCH_USE_CUDA_DSA=1 从源码重新构建 PyTorch 以获取详细的错误信息：\nexport TORCH_USE_CUDA_DSA=1\npip install --no-cache-dir --verbose torch --no-binary torch\n然后重新运行，检查触发断言的具体代码行。",
    "在执行 CUDA 操作之前在代码中添加断言，例如检查索引边界：\nassert (indices >= 0).all() and (indices < tensor.size(0)).all(), \"Index out of bounds\"\n同时检查损失中是否存在 NaN/Inf：assert not torch.isnan(loss).any()"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://pytorch.org/docs/stable/notes/cuda.html#cuda-errors",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.9,
  "resolvable": "true",
  "first_seen": "2023-03-10",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}