{
  "id": "pytorch/cuda-error-invalid-device-ordinal",
  "signature": "RuntimeError: CUDA error: invalid device ordinal",
  "signature_zh": "运行时错误：CUDA 错误：无效的设备序号",
  "regex": "RuntimeError: CUDA error: invalid device ordinal",
  "domain": "pytorch",
  "category": "runtime_error",
  "subcategory": null,
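  "root_cause": "The requested GPU device index (e.g., `cuda:1` on a machine with a single GPU) is outside the range of devices visible to the process. Either the system has fewer GPUs than the index implies, or the CUDA_VISIBLE_DEVICES environment variable hides or renumbers devices so that the index no longer exists.",
  "root_cause_type": "generic",
  "root_cause_zh": "请求的 GPU 设备索引（例如在只有一块 GPU 的机器上使用 `cuda:1`）超出了当前进程可见设备的范围。要么系统中的 GPU 数量少于该索引所要求的数量，要么 CUDA_VISIBLE_DEVICES 环境变量隐藏或重新编号了设备，导致该索引不存在。",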
  "versions": [
    {
      "version": "pytorch>=2.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuda>=11.7",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Reinstall PyTorch, the CUDA toolkit, or the NVIDIA driver",
      "why_fails": "The issue is configuration (device index), not installation. Reinstalling does not fix the index mismatch.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Set CUDA_VISIBLE_DEVICES to a different value without first checking which GPUs actually exist",
      "why_fails": "The environment variable is still incorrect after the change; users may set it to a non-existent device.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    },
    {
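      "action": "Change the device string to plain `cuda` (or another hard-coded index) without verifying that a GPU is visible",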
      "why_fails": "This still fails if no GPU is available; the root cause is the ordinal, not the device type.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check available GPU devices with `torch.cuda.device_count()` and list them using `nvidia-smi`. Then set the device to a valid index, e.g., `torch.device('cuda:0')` if at least one GPU exists.",
      "success_rate": 0.9,
      "how": "Check available GPU devices with `torch.cuda.device_count()` and list them using `nvidia-smi`. Valid indices run from 0 to `torch.cuda.device_count() - 1`. Then set the device to a valid index, e.g., `torch.device('cuda:0')` if at least one GPU exists.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Verify the CUDA_VISIBLE_DEVICES environment variable. In bash, run `echo $CUDA_VISIBLE_DEVICES`. If set, ensure it contains valid indices, or unset it: `unset CUDA_VISIBLE_DEVICES`.",
      "success_rate": 0.85,
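      "how": "Verify the CUDA_VISIBLE_DEVICES environment variable. In bash, run `echo $CUDA_VISIBLE_DEVICES`. If set, ensure it contains valid indices, or unset it: `unset CUDA_VISIBLE_DEVICES`. Note that visible devices are renumbered from 0 inside the process, so with `CUDA_VISIBLE_DEVICES=2,3` the only valid devices are `cuda:0` and `cuda:1`.",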
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 `torch.cuda.device_count()` 检查可用 GPU 设备，并通过 `nvidia-smi` 列出。然后设置有效的设备索引，例如 `torch.device('cuda:0')`。",
    "检查 CUDA_VISIBLE_DEVICES 环境变量。在 bash 中运行 `echo $CUDA_VISIBLE_DEVICES`，如果已设置，确保包含有效索引，或取消设置：`unset CUDA_VISIBLE_DEVICES`。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://pytorch.org/docs/stable/notes/cuda.html#device-handling",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.85,
  "resolvable": "true",
  "first_seen": "2024-03-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}