{ "id": "pytorch/dataloader-worker-segfault-shared-memory", "signature": "RuntimeError: DataLoader worker (pid 12345) received signal 11 (Segmentation fault). Possible causes: shared memory exhausted", "signature_zh": "运行时错误：DataLoader 工作进程（pid 12345）收到信号 11（段错误）。可能原因：共享内存耗尽", "regex": "RuntimeError: DataLoader worker \\(pid \\d+\\) received signal 11 \\(Segmentation fault\\)\\. Possible causes: shared memory exhausted", "domain": "pytorch", "category": "system_error", "subcategory": null, "root_cause": "The shared memory (/dev/shm) is full or too small to accommodate the data being transferred from DataLoader workers, often due to large batch sizes or high-resolution images.", "root_cause_type": "generic", "root_cause_zh": "共享内存（/dev/shm）已满或太小，无法容纳 DataLoader 工作进程传输的数据，通常是由于批量大小过大或图像分辨率过高。", "versions": [ { "version": "pytorch>=1.10.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "linux", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "", "why_fails": "This is a workaround that changes behavior, but it doesn't fix the underlying shared memory limit.", "fail_rate": 0.4, "condition": "", "sources": [] }, { "action": "", "why_fails": "Larger batches increase shared memory usage, exacerbating the issue.", "fail_rate": 0.9, "condition": "", "sources": [] }, { "action": "", "why_fails": "Shared memory is recreated at boot, but the limit remains the same; it will fill up again.", "fail_rate": 0.7, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Increase the size of /dev/shm by remounting with a larger size. In Docker, use --shm-size=8g. On bare metal, edit /etc/fstab or use mount -o remount,size=8G /dev/shm.", "success_rate": 0.9, "how": "Increase the size of /dev/shm by remounting with a larger size. In Docker, use --shm-size=8g. On bare metal, edit /etc/fstab or use mount -o remount,size=8G /dev/shm.", "condition": "", "sources": [] }, { "action": "Reduce the batch size or use pin_memory=False in DataLoader to avoid copying tensors to pinned memory, which uses shared memory.", "success_rate": 0.8, "how": "Reduce the batch size or use pin_memory=False in DataLoader to avoid copying tensors to pinned memory, which uses shared memory.", "condition": "", "sources": [] } ], "workarounds_zh": [ "通过重新挂载增加 /dev/shm 的大小。在 Docker 中使用 --shm-size=8g。在裸机上编辑 /etc/fstab 或使用 mount -o remount,size=8G /dev/shm。", "减小批量大小，或在 DataLoader 中设置 pin_memory=False，以避免将张量复制到会占用共享内存的固定内存中。" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://pytorch.org/docs/stable/data.html#multi-process-data-loading", "official_doc_section": null, "error_code": null, "verification_tier": "ai_generated", "confidence": 0.82, "fix_success_rate": 0.75, "resolvable": "partial", "first_seen": "2024-07-10", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }