{ "id": "pytorch/cuda-error-devices-synchronize-abort", "signature": "RuntimeError: CUDA error: device-side assert triggered. Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.", "signature_zh": "RuntimeError: CUDA 错误：触发了设备端断言。请使用 TORCH_USE_CUDA_DSA 编译以启用设备端断言。", "regex": "CUDA error: device-side assert triggered", "domain": "pytorch", "category": "runtime_error", "subcategory": null, "root_cause": "A CUDA kernel encountered an assertion failure on the device (e.g., invalid index in embedding, negative dimension, or NaN in loss), which often causes subsequent operations to fail silently before this error surfaces.", "root_cause_type": "generic", "root_cause_zh": "CUDA 内核在设备上遇到了断言失败（例如，嵌入层中的无效索引、负维度或损失中的 NaN），这通常会导致后续操作静默失败，然后此错误才会显现。", "versions": [ { "version": "PyTorch 2.0.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "CUDA 11.7", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "CUDA 11.8", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "CUDA 12.1", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "Ubuntu 22.04", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "Set torch.backends.cudnn.deterministic = True", "why_fails": "Deterministic mode does not fix invalid tensor values or index errors; it only ensures reproducibility of operations.", "fail_rate": 0.95, "condition": "", "sources": [] }, { "action": "Increase batch size to trigger error less often", "why_fails": "Larger batch sizes may hide the issue temporarily but do not address the root cause (e.g., out-of-range indices in embedding). The error will reappear on different data.", "fail_rate": 0.9, "condition": "", "sources": [] }, { "action": "Set CUDA_LAUNCH_BLOCKING=1 environment variable", "why_fails": "While this helps identify the exact operation causing the error, it does not fix the underlying problem such as index errors or NaN values.", "fail_rate": 0.85, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Enable device-side assertions by setting environment variable TORCH_USE_CUDA_DSA=1 before running the script, then re-run to get a detailed stack trace pointing to the failing operation (e.g., embedding lookup with out-of-range index). Example: TORCH_USE_CUDA_DSA=1 python train.py", "success_rate": 0.9, "how": "Enable device-side assertions by setting environment variable TORCH_USE_CUDA_DSA=1 before running the script, then re-run to get a detailed stack trace pointing to the failing operation (e.g., embedding lookup with out-of-range index). Example: TORCH_USE_CUDA_DSA=1 python train.py", "condition": "", "sources": [] }, { "action": "Add gradient clipping and NaN checks in the training loop: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0); if torch.isnan(loss): print('NaN loss'); return", "success_rate": 0.8, "how": "Add gradient clipping and NaN checks in the training loop: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0); if torch.isnan(loss): print('NaN loss'); return", "condition": "", "sources": [] }, { "action": "Wrap the problematic operation in a try-except block and use torch.cuda.synchronize() to catch the exact error location. For example: try: output = model(input); torch.cuda.synchronize(); except RuntimeError as e: print(f'Error at iteration {i}: {e}')", "success_rate": 0.85, "how": "Wrap the problematic operation in a try-except block and use torch.cuda.synchronize() to catch the exact error location. For example: try: output = model(input); torch.cuda.synchronize(); except RuntimeError as e: print(f'Error at iteration {i}: {e}')", "condition": "", "sources": [] } ], "workarounds_zh": [ "Enable device-side assertions by setting environment variable TORCH_USE_CUDA_DSA=1 before running the script, then re-run to get a detailed stack trace pointing to the failing operation (e.g., embedding lookup with out-of-range index). Example: TORCH_USE_CUDA_DSA=1 python train.py", "Add gradient clipping and NaN checks in the training loop: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0); if torch.isnan(loss): print('NaN loss'); return", "Wrap the problematic operation in a try-except block and use torch.cuda.synchronize() to catch the exact error location. For example: try: output = model(input); torch.cuda.synchronize(); except RuntimeError as e: print(f'Error at iteration {i}: {e}')" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://pytorch.org/docs/stable/notes/cuda.html#device-side-assertions", "official_doc_section": null, "error_code": "CUDA_ERROR_ILLEGAL_ADDRESS", "verification_tier": "ai_generated", "confidence": 0.88, "fix_success_rate": 0.85, "resolvable": "true", "first_seen": "2023-03-15", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }