{
  "id": "data/parquet-uint64-overflow-cast",
  "signature": "Parquet UINT64 column overflows when cast to signed INT64 in Spark or Arrow",
  "signature_zh": "Parquet UINT64 列在 Spark 或 Arrow 中转换为有符号 INT64 时溢出",
  "regex": "(?i)(uint64.*overflow|overflow.*uint64|cannot cast.*uint64)",
  "domain": "data",
  "category": "type_error",
  "subcategory": null,
  "root_cause": "Parquet format supports UINT64 logical type, but many engines (Spark, Arrow) lack native UINT64 support and silently cast to INT64, causing overflow for values > 2^63-1.",
  "root_cause_type": "generic",
  "root_cause_zh": "Parquet 格式支持 UINT64 逻辑类型，但许多引擎（Spark、Arrow）缺乏原生 UINT64 支持并静默转换为 INT64，导致大于 2^63-1 的值溢出。",
  "versions": [
    {
      "version": "Apache Parquet 2.8.0+",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Spark 3.4.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Arrow 12.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Casting to Decimal(38,0) to hold larger values",
      "why_fails": "Decimal(38,0) can hold up to 10^38-1, but Spark's decimal precision is limited and arithmetic may still overflow or lose precision when converting back.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "Using Double type to avoid overflow",
      "why_fails": "Double cannot represent all integers exactly beyond 2^53, causing silent precision loss for large UINT64 values.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "Disabling Parquet type promotion entirely",
      "why_fails": "This may cause schema compatibility errors for other columns and does not address the root issue of UINT64 handling.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Read UINT64 as String type in Spark: spark.read.parquet(path).withColumn(\"col\", col(\"col\").cast(\"string\"))",
      "success_rate": 0.85,
      "how": "Read UINT64 as String type in Spark: spark.read.parquet(path).withColumn(\"col\", col(\"col\").cast(\"string\"))",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use PyArrow with safe_cast=False to preserve UINT64 as binary: pq.read_table(path, safe_cast=False), then convert to Python int via struct.unpack.",
      "success_rate": 0.9,
      "how": "Use PyArrow with safe_cast=False to preserve UINT64 as binary: pq.read_table(path, safe_cast=False), then convert to Python int via struct.unpack.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Pre-process data to ensure UINT64 values fit within INT64 range before writing Parquet.",
      "success_rate": 0.7,
      "how": "Pre-process data to ensure UINT64 values fit within INT64 range before writing Parquet.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在 Spark 中将 UINT64 读取为字符串类型：spark.read.parquet(path).withColumn(\"col\", col(\"col\").cast(\"string\"))",
    "使用 PyArrow 并设置 safe_cast=False，将 UINT64 保留为二进制：pq.read_table(path, safe_cast=False)，然后通过 struct.unpack 转换为 Python int。",
    "在写入 Parquet 之前对数据进行预处理，确保 UINT64 值处于 INT64 范围之内。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://spark.apache.org/docs/latest/sql-ref-datatypes.html",
  "official_doc_section": null,
  "error_code": "org.apache.spark.sql.AnalysisException: Overflow in sum of UINT64",
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.75,
  "resolvable": "partial",
  "first_seen": "2024-01-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}