{ "id": "tensorflow/gpu-peer-access-error", "signature": "InternalError: Peer access from GPU:0 to GPU:1 is not supported by the current CUDA driver or device topology", "signature_zh": "InternalError：当前CUDA驱动程序或设备拓扑不支持从GPU:0到GPU:1的直连访问", "regex": "InternalError: Peer access from GPU:\\d+ to GPU:\\d+ is not supported by the current CUDA driver or device topology", "domain": "tensorflow", "category": "system_error", "subcategory": null, "root_cause": "The GPUs in the system do not support peer-to-peer memory access (e.g., via NVLink) due to hardware limitations, driver version, or PCIe topology constraints, but TensorFlow's multi-GPU distribution strategy attempted to enable it.", "root_cause_type": "generic", "root_cause_zh": "由于硬件限制、驱动程序版本或PCIe拓扑约束，系统中的GPU不支持点对点内存访问（例如通过NVLink），但TensorFlow的多GPU分发策略尝试启用它。", "versions": [ { "version": "tensorflow>=2.13.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "cuda>=11.8", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "nvidia-driver>=525", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "Upgrading to the latest CUDA toolkit without checking driver compatibility.", "why_fails": "Peer access support depends on both hardware (e.g., NVLink) and driver version; a newer CUDA toolkit may not help if the driver is outdated or hardware lacks NVLink.", "fail_rate": 0.65, "condition": "", "sources": [] }, { "action": "Setting CUDA_VISIBLE_DEVICES to a single GPU to avoid multi-GPU errors.", "why_fails": "This bypasses the error but reduces the effective GPU count to 1, defeating the purpose of multi-GPU training; the error is not fixed, just avoided.", "fail_rate": 0.5, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Disable peer access in TensorFlow by setting the 
environment variable `TF_GPU_ALLOCATOR=cuda_malloc_async` or using `tf.config.experimental.set_memory_growth` per GPU. Alternatively, use `tf.distribute.MirroredStrategy` with `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()` which does not require peer access.", "success_rate": 0.8, "how": "Disable peer access in TensorFlow by setting the environment variable `TF_GPU_ALLOCATOR=cuda_malloc_async` or using `tf.config.experimental.set_memory_growth` per GPU. Alternatively, use `tf.distribute.MirroredStrategy` with `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()` which does not require peer access.", "condition": "", "sources": [] }, { "action": "Check GPU topology with `nvidia-smi topo -m` and if peer access is unsupported, place GPUs on the same PCIe switch if possible, or use a distribution strategy that avoids peer access (e.g., `tf.distribute.experimental.MultiWorkerMirroredStrategy` with RPC).", "success_rate": 0.75, "how": "Check GPU topology with `nvidia-smi topo -m` and if peer access is unsupported, place GPUs on the same PCIe switch if possible, or use a distribution strategy that avoids peer access (e.g., `tf.distribute.experimental.MultiWorkerMirroredStrategy` with RPC).", "condition": "", "sources": [] } ], "workarounds_zh": [ "在TensorFlow中禁用点对点访问：设置环境变量 `TF_GPU_ALLOCATOR=cuda_malloc_async`，或对每个GPU使用 `tf.config.experimental.set_memory_growth`。或者，使用 `tf.distribute.MirroredStrategy` 并设置 `cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()`，该方式不需要点对点访问。", "使用 `nvidia-smi topo -m` 检查GPU拓扑；如果不支持点对点访问，请尽可能将GPU置于同一PCIe交换机下，或使用避免点对点访问的分发策略（例如基于RPC的 `tf.distribute.experimental.MultiWorkerMirroredStrategy`）。" 
], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://www.tensorflow.org/guide/gpu#multi-gpu_setup", "official_doc_section": null, "error_code": "GPA", "verification_tier": "ai_generated", "confidence": 0.84, "fix_success_rate": 0.8, "resolvable": "partial", "first_seen": "2024-02-14", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }