{
  "id": "cuda/cuda-error-driver-unloading",
  "signature": "CUDA error: driver shutting down (cudaErrorCudartUnloading)",
  "signature_zh": "CUDA 错误：驱动程序正在关闭 (cudaErrorCudartUnloading)",
  "regex": "CUDA error: driver shutting down",
  "domain": "cuda",
  "category": "runtime_error",
  "subcategory": null,
  "root_cause": "The CUDA driver is in the process of being unloaded or has been partially unloaded due to a race condition in multi-threaded application shutdown, often when cudaDeviceReset() is called while other threads still hold CUDA contexts.",
  "root_cause_type": "generic",
  "root_cause_zh": "CUDA 驱动程序正在被卸载或已部分卸载，这是由于多线程应用程序关闭时的竞态条件，通常是在其他线程仍持有 CUDA 上下文时调用 cudaDeviceReset() 导致的。",
  "versions": [
    {
      "version": "CUDA 11.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.2",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.2.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Restart the application or process and retry",
      "why_fails": "The error occurs during shutdown, so restarting only delays the issue; the race condition persists on subsequent shutdowns.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Call cudaDeviceSynchronize() (or torch.cuda.synchronize()) immediately before cudaDeviceReset()",
      "why_fails": "Synchronization does not guarantee that all threads have released their contexts; the driver may still be in an invalid state if other threads are mid-operation.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Ensure every thread has finished its CUDA work and released its resources before cudaDeviceReset() is called, e.g. by guarding the reset with a thread-safe reference counter and joining all worker threads first. In Python with PyTorch, join the worker threads, then release GPU objects before interpreter shutdown: `import torch; del model; torch.cuda.synchronize(); torch.cuda.empty_cache()`",
      "success_rate": 0.85,
      "how": "Ensure every thread has finished its CUDA work and released its resources before cudaDeviceReset() is called, e.g. by guarding the reset with a thread-safe reference counter and joining all worker threads first. In Python with PyTorch, join the worker threads, then release GPU objects before interpreter shutdown: `import torch; del model; torch.cuda.synchronize(); torch.cuda.empty_cache()`",
      "condition": "",
      "sources": []
    },
    {
      "action": "Avoid calling cudaDeviceReset() in multi-threaded environments; instead, rely on the driver to clean up contexts at process exit. In C++, remove explicit `cudaDeviceReset()` calls from destructors or atexit handlers.",
      "success_rate": 0.9,
      "how": "Avoid calling cudaDeviceReset() in multi-threaded environments; instead, rely on the driver to clean up contexts at process exit. In C++, remove explicit `cudaDeviceReset()` calls from destructors or atexit handlers.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Check the return code of the reset call and ignore the error if it occurs during shutdown (cudaDeviceReset() is a C API that returns cudaError_t and never throws, so a try-catch has no effect): `cudaError_t err = cudaDeviceReset(); if (err != cudaSuccess) { /* log cudaGetErrorString(err) and continue; do not abort during shutdown */ }`",
      "success_rate": 0.75,
      "how": "Check the return code of the reset call and ignore the error if it occurs during shutdown (cudaDeviceReset() is a C API that returns cudaError_t and never throws, so a try-catch has no effect): `cudaError_t err = cudaDeviceReset(); if (err != cudaSuccess) { /* log cudaGetErrorString(err) and continue; do not abort during shutdown */ }`",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在调用 cudaDeviceReset() 之前，确保所有线程都已完成 CUDA 工作并释放其资源，例如使用线程安全的引用计数器保护重置调用，并先等待（join）所有工作线程结束。在使用 PyTorch 的 Python 中，先 join 工作线程，然后在解释器关闭前释放 GPU 对象：`import torch; del model; torch.cuda.synchronize(); torch.cuda.empty_cache()`",
    "避免在多线程环境中调用 cudaDeviceReset()；改为依靠驱动程序在进程退出时清理上下文。在 C++ 中，从析构函数或 atexit 处理程序中移除显式的 `cudaDeviceReset()` 调用。",
    "检查重置调用的返回码，如果错误发生在关闭期间则忽略它（cudaDeviceReset() 是返回 cudaError_t 的 C API，不会抛出异常，因此 try-catch 不起作用）：`cudaError_t err = cudaDeviceReset(); if (err != cudaSuccess) { /* log cudaGetErrorString(err) and continue; do not abort during shutdown */ }`"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html",
  "official_doc_section": null,
  "error_code": "cudaErrorCudartUnloading (4)",
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.78,
  "resolvable": "partial",
  "first_seen": "2024-03-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}