{
  "id": "pytorch/ddp-nccl-timeout-during-allreduce",
  "signature": "RuntimeError: NCCL communicator was aborted on rank 2. Original reason for failure was: watchdog callback timed out",
  "signature_zh": "运行时错误：NCCL 通信器在 rank 2 上被中止。原始失败原因：看门狗回调超时",
  "regex": "RuntimeError: NCCL communicator was aborted on rank \\d+\\. Original reason for failure was: watchdog callback timed out",
  "domain": "pytorch",
  "category": "network_error",
  "subcategory": null,
  "root_cause": "A NCCL collective operation (e.g., allreduce) timed out because one rank is slow or unresponsive, often due to network congestion, GPU compute imbalance, or hardware failure.",
  "root_cause_type": "generic",
  "root_cause_zh": "NCCL 集合操作（如 allreduce）超时，因为某个 rank 缓慢或无响应，通常由于网络拥塞、GPU 计算不平衡或硬件故障。",
  "versions": [
    {
      "version": "pytorch>=1.12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "nccl>=2.14",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "The timeout is a symptom, not the root cause; increasing it delays failure but doesn't prevent it.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Barriers can cause all ranks to wait for the slow one, potentially increasing the timeout likelihood.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "The root cause (e.g., network latency) persists across restarts.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check for GPU compute imbalance by profiling each rank's forward/backward time. Ensure all ranks process similar amounts of data (e.g., use DistributedSampler with drop_last=True).",
      "success_rate": 0.8,
      "how": "Check for GPU compute imbalance by profiling each rank's forward/backward time. Ensure all ranks process similar amounts of data (e.g., use DistributedSampler with drop_last=True).",
      "condition": "",
      "sources": []
    },
    {
      "action": "Increase the NCCL timeout environment variable to a higher value (e.g., 600 seconds) to accommodate slow networks or large models.",
      "success_rate": 0.75,
      "how": "Increase the NCCL timeout environment variable to a higher value (e.g., 600 seconds) to accommodate slow networks or large models.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use the NCCL_DEBUG=INFO environment variable to get detailed debug logs and identify the slow rank or network issue.",
      "success_rate": 0.85,
      "how": "Use the NCCL_DEBUG=INFO environment variable to get detailed debug logs and identify the slow rank or network issue.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "通过分析每个 rank 的前向/反向传播时间检查 GPU 计算不平衡。确保所有 rank 处理相似数量的数据（例如，使用 DistributedSampler 并设置 drop_last=True）。",
    "将 NCCL 超时环境变量增加到更高值（例如 600 秒），以适应慢速网络或大型模型。",
    "使用 NCCL_DEBUG=INFO 环境变量获取详细调试日志，并识别慢速 rank 或网络问题。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.8,
  "fix_success_rate": 0.7,
  "resolvable": "partial",
  "first_seen": "2024-09-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}