{ "id": "cuda/cublas-gemm-broadcast-invalid-config", "signature": "RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasSgemmStridedBatchedEx with invalid broadcast dimensions", "signature_zh": "运行时错误：CUDA错误：调用cublasSgemmStridedBatchedEx时因无效广播维度导致CUBLAS_STATUS_INVALID_VALUE", "regex": "CUBLAS_STATUS_INVALID_VALUE.*cublasSgemmStridedBatchedEx.*broadcast", "domain": "cuda", "category": "runtime_error", "subcategory": null, "root_cause": "cuBLAS batched GEMM requires strictly matching batch dimensions for strided inputs; implicit broadcasting is not supported, causing an invalid value error when batch counts differ between A and B.", "root_cause_type": "generic", "root_cause_zh": "cuBLAS批处理GEMM要求带步长输入的批次维度严格匹配；不支持隐式广播，当A和B的批次计数不同时会导致无效值错误。", "versions": [ { "version": "CUDA 11.8", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "CUDA 12.1", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "cuBLAS 11.11.3.6", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "PyTorch 2.1.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "Cast the input tensors to a different precision (e.g. float16 or float32) before calling the batched GEMM.", "why_fails": "The error is not about precision but about batch dimension mismatch; casting does not change the shape.", "fail_rate": 0.95, "condition": "", "sources": [] }, { "action": "Reduce the batch size or free GPU memory (e.g. torch.cuda.empty_cache()), assuming the failure is an out-of-memory condition.", "why_fails": "Out-of-memory is not the root cause; the kernel fails validation before execution.", "fail_rate": 0.98, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Explicitly expand the smaller batch dimension to match using torch.broadcast_to or .expand before calling the batched GEMM. Example: if A has shape (batch_a, m, k) and B has shape (batch_b, k, n) with batch_a != batch_b, expand A to (max(batch_a,batch_b), m, k) using A = A.expand(max_batch, -1, -1).", "success_rate": 0.88, "how": "Explicitly expand the smaller batch dimension to match using torch.broadcast_to or .expand before calling the batched GEMM. Example: if A has shape (batch_a, m, k) and B has shape (batch_b, k, n) with batch_a != batch_b, expand A to (max(batch_a,batch_b), m, k) using A = A.expand(max_batch, -1, -1).", "condition": "", "sources": [] }, { "action": "Use torch.bmm instead of torch.baddbmm with explicit broadcasting via unsqueeze: C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n)).", "success_rate": 0.8, "how": "Use torch.bmm instead of torch.baddbmm with explicit broadcasting via unsqueeze: C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n)).", "condition": "", "sources": [] } ], "workarounds_zh": [ "在调用批处理GEMM之前，使用 torch.broadcast_to 或 .expand 显式扩展较小的批次维度以使其匹配。示例：如果A的形状为 (batch_a, m, k)，B的形状为 (batch_b, k, n)，且 batch_a != batch_b，则使用 A = A.expand(max_batch, -1, -1) 将A扩展为 (max(batch_a,batch_b), m, k)。", "使用 torch.bmm 代替 torch.baddbmm，并通过 unsqueeze 进行显式广播：C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n))。"
], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-batched", "official_doc_section": null, "error_code": "CUBLAS_STATUS_INVALID_VALUE", "verification_tier": "ai_generated", "confidence": 0.85, "fix_success_rate": 0.82, "resolvable": "true", "first_seen": "2024-06-01", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }