{
  "id": "cuda/cublas-gemm-params-unsupported-combination",
  "signature": "RuntimeError: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasGemmEx( handle, opa, opb, m, n, k, &alpha, a, atype, lda, b, btype, ldb, &beta, c, ctype, ldc, compute_type, algo)",
  "signature_zh": "运行时错误：调用 cublasGemmEx 时返回 CUBLAS_STATUS_NOT_SUPPORTED",
  "regex": "CUBLAS_STATUS_NOT_SUPPORTED when calling cublasGemmEx",
  "domain": "cuda",
  "category": "runtime_error",
  "subcategory": null,
  "root_cause": "The combination of input matrix data types (atype, btype, ctype) and compute type is not supported by the cuBLAS library on the current GPU architecture.",
  "root_cause_type": "generic",
  "root_cause_zh": "当前 GPU 架构上的 cuBLAS 库不支持输入矩阵数据类型（atype、btype、ctype）与计算类型的组合。",
  "versions": [
    {
      "version": "CUDA 11.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "CUDA 12.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuBLAS 11.11",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "cuBLAS 12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "CUDA version alone doesn't guarantee support; the GPU's compute capability (e.g., sm_70 vs sm_80) determines which type combinations are valid.",
      "fail_rate": 0.65,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "The algorithm parameter doesn't change data type compatibility; it only affects performance and precision for supported type combinations.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "While float32 is widely supported, this workaround may cause out-of-memory errors for large models or reduce performance if the original types were optimized (e.g., half precision).",
      "fail_rate": 0.4,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Use torch.cuda.is_bf16_supported() to check bfloat16 support before using it. For example: if torch.cuda.is_bf16_supported(): model = model.to(torch.bfloat16) else: model = model.to(torch.float16)",
      "success_rate": 0.85,
      "how": "Use torch.cuda.is_bf16_supported() to check bfloat16 support before using it. For example: if torch.cuda.is_bf16_supported(): model = model.to(torch.bfloat16) else: model = model.to(torch.float16)",
      "condition": "",
      "sources": []
    },
    {
      "action": "Explicitly set the compute type to match the input type. In PyTorch, use torch.set_default_dtype(torch.float32) or cast tensors to a supported combination like float16 for Ampere+ GPUs.",
      "success_rate": 0.78,
      "how": "Explicitly set the compute type to match the input type. In PyTorch, use torch.set_default_dtype(torch.float32) or cast tensors to a supported combination like float16 for Ampere+ GPUs.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Disable cuBLAS and fall back to a custom kernel by setting environment variable: CUBLAS_WORKSPACE_CONFIG=:4096:8. This forces cuBLAS to use a different code path that may support the type combination.",
      "success_rate": 0.7,
      "how": "Disable cuBLAS and fall back to a custom kernel by setting environment variable: CUBLAS_WORKSPACE_CONFIG=:4096:8. This forces cuBLAS to use a different code path that may support the type combination.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 torch.cuda.is_bf16_supported() 检查 bfloat16 支持情况后再使用。例如：if torch.cuda.is_bf16_supported(): model = model.to(torch.bfloat16) else: model = model.to(torch.float16)",
    "显式设置计算类型以匹配输入类型。在 PyTorch 中，使用 torch.set_default_dtype(torch.float32) 或将张量转换为支持的类型组合，如 Ampere+ GPU 上的 float16。",
    "通过设置环境变量 CUBLAS_WORKSPACE_CONFIG=:4096:8 禁用 cuBLAS 并回退到自定义内核。这会强制 cuBLAS 使用可能支持该类型组合的不同代码路径。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://docs.nvidia.com/cuda/cublas/index.html#cublas-status-not-supported",
  "official_doc_section": null,
  "error_code": "CUBLAS_STATUS_NOT_SUPPORTED",
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.82,
  "resolvable": "true",
  "first_seen": "2023-05-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}