{ "id": "pytorch/ddp-nccl-timeout-during-allreduce", "signature": "RuntimeError: NCCL communicator was aborted on rank 2. Original reason for failure was: watchdog callback timed out", "signature_zh": "运行时错误：NCCL 通信器在 rank 2 上被中止。原始失败原因：看门狗回调超时", "regex": "RuntimeError: NCCL communicator was aborted on rank \\d+\\. Original reason for failure was: watchdog callback timed out", "domain": "pytorch", "category": "network_error", "subcategory": null, "root_cause": "A NCCL collective operation (e.g., allreduce) timed out because one rank is slow or unresponsive, often due to network congestion, GPU compute imbalance, or hardware failure.", "root_cause_type": "generic", "root_cause_zh": "NCCL 集合操作（如 allreduce）超时，因为某个 rank 缓慢或无响应，通常由于网络拥塞、GPU 计算不平衡或硬件故障。", "versions": [ { "version": "pytorch>=1.12.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "nccl>=2.14", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "", "why_fails": "The timeout is a symptom, not the root cause; increasing it delays failure but doesn't prevent it.", "fail_rate": 0.6, "condition": "", "sources": [] }, { "action": "", "why_fails": "Barriers can cause all ranks to wait for the slow one, potentially increasing the timeout likelihood.", "fail_rate": 0.7, "condition": "", "sources": [] }, { "action": "", "why_fails": "The root cause (e.g., network latency) persists across restarts.", "fail_rate": 0.8, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Check for GPU compute imbalance by profiling each rank's forward/backward time. Ensure all ranks process similar amounts of data (e.g., use DistributedSampler with drop_last=True).", "success_rate": 0.8, "how": "Check for GPU compute imbalance by profiling each rank's forward/backward time. Ensure all ranks process similar amounts of data (e.g., use DistributedSampler with drop_last=True).", "condition": "", "sources": [] }, { "action": "Increase the NCCL timeout environment variable to a higher value (e.g., 600 seconds) to accommodate slow networks or large models.", "success_rate": 0.75, "how": "Increase the NCCL timeout environment variable to a higher value (e.g., 600 seconds) to accommodate slow networks or large models.", "condition": "", "sources": [] }, { "action": "Use the NCCL_DEBUG=INFO environment variable to get detailed debug logs and identify the slow rank or network issue.", "success_rate": 0.85, "how": "Use the NCCL_DEBUG=INFO environment variable to get detailed debug logs and identify the slow rank or network issue.", "condition": "", "sources": [] } ], "workarounds_zh": [ "通过分析每个 rank 的前向/反向传播时间检查 GPU 计算不平衡。确保所有 rank 处理相似数量的数据（例如，使用 DistributedSampler 并设置 drop_last=True）。", "将 NCCL 超时环境变量增加到更高值（例如 600 秒），以适应慢速网络或大型模型。", "使用 NCCL_DEBUG=INFO 环境变量获取详细调试日志，并识别慢速 rank 或网络问题。" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group", "official_doc_section": null, "error_code": null, "verification_tier": "ai_generated", "confidence": 0.8, "fix_success_rate": 0.7, "resolvable": "partial", "first_seen": "2024-09-01", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }