{
  "id": "pytorch/cuda-error-illegal-memory-access",
  "signature": "RuntimeError: CUDA error: an illegal memory access was encountered",
  "signature_zh": "运行时错误：CUDA 错误：遇到非法内存访问",
  "regex": "RuntimeError: CUDA error: an illegal memory access was encountered",
  "domain": "pytorch",
  "category": "runtime_error",
  "subcategory": null,
  "root_cause": "A kernel attempted to read or write memory outside its allocated region, often caused by out-of-bounds tensor indexing or corrupted pointers.",
  "root_cause_type": "generic",
  "root_cause_zh": "内核尝试读取或写入其分配区域之外的内存，通常由张量越界索引或指针损坏引起。",
  "versions": [
    {
      "version": "pytorch>=1.10",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuda>=11.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cudnn>=8.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Increasing GPU memory or adding more GPUs",
      "why_fails": "The error is not about memory capacity but invalid access; more memory doesn't fix invalid pointers.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Rebooting the machine or resetting CUDA context",
      "why_fails": "The root cause is in the code logic; a reboot may temporarily mask the issue but it reoccurs.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Switching to CPU mode entirely",
      "why_fails": "Avoids the error but defeats the purpose of using GPU acceleration.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Enable CUDA synchronous debugging to pinpoint the exact line: set environment variable CUDA_LAUNCH_BLOCKING=1 before running the script. Then run the script and check the traceback.",
      "success_rate": 0.8,
      "how": "Enable CUDA synchronous debugging to pinpoint the exact line: set environment variable CUDA_LAUNCH_BLOCKING=1 before running the script. Then run the script and check the traceback.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Bound dynamic indices with torch.clamp or torch.where before indexing so they stay within range. For example: `idx = torch.clamp(idx, 0, tensor.size(0)-1)`",
      "success_rate": 0.75,
      "how": "Bound dynamic indices with torch.clamp or torch.where before indexing so they stay within range. For example: `idx = torch.clamp(idx, 0, tensor.size(0)-1)`",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use torch.cuda.synchronize() after suspicious operations to force synchronization and catch the error earlier.",
      "success_rate": 0.7,
      "how": "Use torch.cuda.synchronize() after suspicious operations to force synchronization and catch the error earlier.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "启用 CUDA 同步调试以定位出错的确切代码行：在运行脚本之前设置环境变量 CUDA_LAUNCH_BLOCKING=1，然后运行脚本并查看回溯信息。",
    "对所有动态索引使用 torch.clamp 或 torch.where 进行约束，确保索引保持在有效范围内。例如：`idx = torch.clamp(idx, 0, tensor.size(0)-1)`",
    "在可疑操作之后调用 torch.cuda.synchronize() 强制同步，以便更早地捕获错误。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://pytorch.org/docs/stable/notes/cuda.html#cuda-error-handling",
  "official_doc_section": null,
  "error_code": "CUDA_ERROR_ILLEGAL_ADDRESS",
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.75,
  "resolvable": "partial",
  "first_seen": "2023-03-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}