{
  "id": "kubernetes/etcd-leader-election-failure",
  "signature": "etcdserver: request timed out, possible leader election",
  "signature_zh": "etcdserver：请求超时，可能正在进行领导者选举",
  "regex": "etcdserver: request timed out, possible leader election",
  "domain": "kubernetes",
  "category": "system_error",
  "subcategory": null,
  "root_cause": "etcd cluster is experiencing network partition or disk I/O latency, causing leader election to fail or take too long, resulting in timeouts for Kubernetes API requests.",
  "root_cause_type": "generic",
  "root_cause_zh": "etcd 集群遇到网络分区或磁盘 I/O 延迟，导致领导者选举失败或耗时过长，从而导致 Kubernetes API 请求超时。",
  "versions": [
    {
      "version": "etcd 3.5.7",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "etcd 3.5.9",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Kubernetes 1.27",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Kubernetes 1.29",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Restart a single etcd member",
      "why_fails": "Simply restarting one etcd member may worsen the situation by triggering another leader election.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Increase the etcd request timeout without fixing the underlying disk or network issues",
      "why_fails": "Increasing etcd request timeout without fixing underlying disk or network issues only masks the problem temporarily.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check etcd cluster health: `etcdctl endpoint health --cluster`. Identify unhealthy members and check their disk I/O with `iostat -x 1` or network latency with `ping` between etcd nodes.",
      "success_rate": 0.8,
      "how": "Check etcd cluster health: `etcdctl endpoint health --cluster`. Identify unhealthy members and check their disk I/O with `iostat -x 1` or network latency with `ping` between etcd nodes.",
      "condition": "",
      "sources": []
    },
    {
      "action": "If disk I/O is high, move etcd data directory to a faster disk (e.g., SSD) by updating the etcd pod spec's hostPath or using a dedicated volume: `--data-dir=/var/lib/etcd-ssd`.",
      "success_rate": 0.75,
      "how": "If disk I/O is high, move etcd data directory to a faster disk (e.g., SSD) by updating the etcd pod spec's hostPath or using a dedicated volume: `--data-dir=/var/lib/etcd-ssd`.",
      "condition": "",
      "sources": []
    },
    {
      "action": "If network partition is suspected, ensure all etcd members can communicate on port 2380 (peer communication). Check firewall rules and network policies.",
      "success_rate": 0.7,
      "how": "If network partition is suspected, ensure all etcd members can communicate on port 2380 (peer communication). Check firewall rules and network policies.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "检查 etcd 集群健康状态：`etcdctl endpoint health --cluster`。找出不健康的成员，并使用 `iostat -x 1` 检查其磁盘 I/O，或使用 `ping` 检查 etcd 节点之间的网络延迟。",
    "如果磁盘 I/O 较高，请通过更新 etcd Pod 规约中的 hostPath 或使用专用卷，将 etcd 数据目录迁移到更快的磁盘（例如 SSD）：`--data-dir=/var/lib/etcd-ssd`。",
    "如果怀疑存在网络分区，请确保所有 etcd 成员都能通过 2380 端口（对等通信）相互通信。检查防火墙规则和网络策略。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://etcd.io/docs/v3.5/faq/#what-does-request-timed-out-mean",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.7,
  "resolvable": "partial",
  "first_seen": "2023-09-05",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}