{
  "id": "pytorch/dataloader-worker-segfault-shm",
  "signature": "RuntimeError: DataLoader worker (pid 12345) received signal 11 (Segmentation fault). Possible causes: shared memory exhaustion or corrupted shared memory files in /dev/shm.",
  "signature_zh": "RuntimeError: DataLoader 工作进程（pid 12345）收到信号 11（段错误）。可能原因：共享内存耗尽或 /dev/shm 中的共享内存文件损坏。",
  "regex": "DataLoader worker.*received signal 11",
  "domain": "pytorch",
  "category": "system_error",
  "subcategory": null,
  "root_cause": "DataLoader workers use shared memory (via /dev/shm) for zero-copy data transfer; when /dev/shm is full (e.g., due to large num_workers, large batch sizes, or other processes), workers crash with a segmentation fault.",
  "root_cause_type": "generic",
  "root_cause_zh": "DataLoader 工作进程使用共享内存（通过 /dev/shm）进行零拷贝数据传输；当 /dev/shm 已满（例如，由于大量工作进程、大批量大小或其他进程）时，工作进程会因段错误而崩溃。",
  "versions": [
    {
      "version": "PyTorch 1.10.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Linux kernel 5.15",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Ubuntu 20.04",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Ubuntu 22.04",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Docker containers",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Increase num_workers to speed up data loading",
      "why_fails": "More workers consume more shared memory, exacerbating the exhaustion problem and causing more frequent crashes.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    },
    {
      "action": "Set pin_memory=False in DataLoader",
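      "why_fails": "pin_memory only controls page-locked (pinned) host memory used for faster host-to-GPU copies; worker processes still pass batches to the main process through /dev/shm, so this setting alone does not relieve shared memory exhaustion caused by other processes, many workers, or large batch sizes.",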
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Restart the system to clear /dev/shm",
      "why_fails": "This is a temporary fix; the problem recurs when the training runs again with the same configuration.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Reduce the number of DataLoader workers: DataLoader(dataset, batch_size=64, num_workers=4, ...). Start with num_workers=2 and increase gradually.",
      "success_rate": 0.85,
      "how": "Reduce the number of DataLoader workers: DataLoader(dataset, batch_size=64, num_workers=4, ...). Start with num_workers=2 and increase gradually.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Increase the size of /dev/shm by remounting with a larger size: sudo mount -o remount,size=16G /dev/shm. Alternatively, in Docker, use --shm-size=16g flag.",
      "success_rate": 0.95,
      "how": "Increase the size of /dev/shm by remounting with a larger size: sudo mount -o remount,size=16G /dev/shm. Alternatively, in Docker, use --shm-size=16g flag.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use multiprocessing_context='spawn' in DataLoader and avoid shared memory by setting pin_memory=False and prefetch_factor=2: DataLoader(..., multiprocessing_context='spawn', pin_memory=False, prefetch_factor=2)",
      "success_rate": 0.8,
      "how": "Use multiprocessing_context='spawn' in DataLoader and avoid shared memory by setting pin_memory=False and prefetch_factor=2: DataLoader(..., multiprocessing_context='spawn', pin_memory=False, prefetch_factor=2)",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "减少 DataLoader 工作进程数量：DataLoader(dataset, batch_size=64, num_workers=4, ...)。建议从 num_workers=2 开始，再逐步增加。",
    "通过重新挂载来增大 /dev/shm 的容量：sudo mount -o remount,size=16G /dev/shm。在 Docker 中，也可以使用 --shm-size=16g 参数。",
    "在 DataLoader 中使用 multiprocessing_context='spawn'，并通过设置 pin_memory=False 和 prefetch_factor=2 来避免使用共享内存：DataLoader(..., multiprocessing_context='spawn', pin_memory=False, prefetch_factor=2)"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://pytorch.org/docs/stable/data.html#multi-process-data-loading",
  "official_doc_section": null,
  "error_code": "SIGSEGV",
  "verification_tier": "ai_generated",
  "confidence": 0.87,
  "fix_success_rate": 0.85,
  "resolvable": "true",
  "first_seen": "2023-02-14",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}