{
  "id": "cuda/cublas-gemm-broadcast-invalid-config",
  "signature": "RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasSgemmStridedBatchedEx with invalid broadcast dimensions",
  "signature_zh": "运行时错误：CUDA错误：调用cublasSgemmStridedBatchedEx时因无效广播维度导致CUBLAS_STATUS_INVALID_VALUE",
  "regex": "CUBLAS_STATUS_INVALID_VALUE.*cublasSgemmStridedBatchedEx.*broadcast",
  "domain": "cuda",
  "category": "runtime_error",
  "subcategory": null,
  "root_cause": "cuBLAS batched GEMM requires strictly matching batch dimensions for strided inputs; implicit broadcasting is not supported, causing invalid value error when batch counts differ between A and B.",
  "root_cause_type": "generic",
  "root_cause_zh": "cuBLAS批处理GEMM要求带步长输入的批次维度严格匹配；不支持隐式广播，当A和B的批次计数不同时会导致无效值错误。",
  "versions": [
    {
      "version": "CUDA 11.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuBLAS 11.11.3.6",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "The error is not about precision but about batch dimension mismatch; casting does not change the shape.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Out-of-memory is not the root cause; the kernel fails validation before execution.",
      "fail_rate": 0.98,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Explicitly expand the smaller batch dimension to match using torch.broadcast_to or .expand before calling the batched GEMM. Example: if A has shape (batch_a, m, k) and B has shape (batch_b, k, n) with batch_a != batch_b, expand A to (max(batch_a,batch_b), m, k) using A = A.expand(max_batch, -1, -1).",
      "success_rate": 0.88,
      "how": "Explicitly expand the smaller batch dimension to match using torch.broadcast_to or .expand before calling the batched GEMM. Example: if A has shape (batch_a, m, k) and B has shape (batch_b, k, n) with batch_a != batch_b, expand A to (max(batch_a,batch_b), m, k) using A = A.expand(max_batch, -1, -1).",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use torch.bmm instead of torch.baddbmm with explicit broadcasting via unsqueeze: C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n)).",
      "success_rate": 0.8,
      "how": "Use torch.bmm instead of torch.baddbmm with explicit broadcasting via unsqueeze: C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n)).",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "Explicitly expand the smaller batch dimension to match using torch.broadcast_to or .expand before calling the batched GEMM. Example: if A has shape (batch_a, m, k) and B has shape (batch_b, k, n) with batch_a != batch_b, expand A to (max(batch_a,batch_b), m, k) using A = A.expand(max_batch, -1, -1).",
    "Use torch.bmm instead of torch.baddbmm with explicit broadcasting via unsqueeze: C = torch.bmm(A.unsqueeze(1).expand(-1, B.size(0), -1, -1).reshape(-1, m, k), B.unsqueeze(0).expand(A.size(0), -1, -1, -1).reshape(-1, k, n))."
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-batched",
  "official_doc_section": null,
  "error_code": "CUBLAS_STATUS_INVALID_VALUE",
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.82,
  "resolvable": "true",
  "first_seen": "2024-06-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}