{
  "id": "tensorflow/gpu-peer-access-error",
  "signature": "InternalError: Peer access from GPU:0 to GPU:1 is not supported by the current CUDA driver or device topology",
  "signature_zh": "InternalError：当前CUDA驱动程序或设备拓扑不支持从GPU:0到GPU:1的直连访问",
  "regex": "InternalError: Peer access from GPU:\\d+ to GPU:\\d+ is not supported by the current CUDA driver or device topology",
  "domain": "tensorflow",
  "category": "system_error",
  "subcategory": null,
  "root_cause": "The GPUs in the system do not support peer-to-peer memory access (e.g., via NVLink) due to hardware limitations, driver version, or PCIe topology constraints, but TensorFlow's multi-GPU distribution strategy attempted to enable it.",
  "root_cause_type": "generic",
  "root_cause_zh": "由于硬件限制、驱动程序版本或PCIe拓扑约束，系统中的GPU不支持点对点内存访问（例如通过NVLink），但TensorFlow的多GPU分发策略尝试启用它。",
  "versions": [
    {
      "version": "tensorflow>=2.13.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuda>=11.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "nvidia-driver>=525",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Upgrading to the latest CUDA toolkit without checking driver compatibility.",
      "why_fails": "Peer access support depends on both hardware (e.g., NVLink) and driver version; a newer CUDA toolkit may not help if the driver is outdated or hardware lacks NVLink.",
      "fail_rate": 0.65,
      "condition": "",
      "sources": []
    },
    {
      "action": "Setting CUDA_VISIBLE_DEVICES to a single GPU to avoid multi-GPU errors.",
      "why_fails": "This bypasses the error but reduces the effective GPU count to 1, defeating the purpose of multi-GPU training; the error is not fixed, just avoided.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Disable peer access in TensorFlow by setting the environment variable `TF_GPU_ALLOCATOR=cuda_malloc_async` or using `tf.config.experimental.set_memory_growth` per GPU. Alternatively, use `tf.distribute.MirroredStrategy` with `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()` which does not require peer access.",
      "success_rate": 0.8,
      "how": "Disable peer access in TensorFlow by setting the environment variable `TF_GPU_ALLOCATOR=cuda_malloc_async` or using `tf.config.experimental.set_memory_growth` per GPU. Alternatively, use `tf.distribute.MirroredStrategy` with `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()` which does not require peer access.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Check GPU topology with `nvidia-smi topo -m` and if peer access is unsupported, place GPUs on the same PCIe switch if possible, or use a distribution strategy that avoids peer access (e.g., `tf.distribute.experimental.MultiWorkerMirroredStrategy` with RPC).",
      "success_rate": 0.75,
      "how": "Check GPU topology with `nvidia-smi topo -m` and if peer access is unsupported, place GPUs on the same PCIe switch if possible, or use a distribution strategy that avoids peer access (e.g., `tf.distribute.experimental.MultiWorkerMirroredStrategy` with RPC).",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "Disable peer access in TensorFlow by setting the environment variable `TF_GPU_ALLOCATOR=cuda_malloc_async` or using `tf.config.experimental.set_memory_growth` per GPU. Alternatively, use `tf.distribute.MirroredStrategy` with `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()` which does not require peer access.",
    "Check GPU topology with `nvidia-smi topo -m` and if peer access is unsupported, place GPUs on the same PCIe switch if possible, or use a distribution strategy that avoids peer access (e.g., `tf.distribute.experimental.MultiWorkerMirroredStrategy` with RPC)."
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://www.tensorflow.org/guide/gpu#multi-gpu_setup",
  "official_doc_section": null,
  "error_code": "GPA",
  "verification_tier": "ai_generated",
  "confidence": 0.84,
  "fix_success_rate": 0.8,
  "resolvable": "partial",
  "first_seen": "2024-02-14",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}