{
  "id": "data/parquet-int96-timestamp-millennium-bug",
  "signature": "Parquet INT96 timestamp reads as year 5000+ due to Julian date conversion error",
  "signature_zh": "由于朱利安日期转换错误，Parquet INT96时间戳读取为5000年以上",
  "regex": "INT96.*timestamp.*future|INT96.*Julian|impala.*timestamp.*out of range",
  "domain": "data",
  "category": "data_error",
  "subcategory": null,
  "root_cause": "Parquet INT96 timestamps store a Julian day number (days since 4713 BC) and time of day; some readers (e.g., older Hive, Impala) incorrectly interpret the Julian date as a Unix epoch offset, causing dates to be centuries off.",
  "root_cause_type": "generic",
  "root_cause_zh": "Parquet INT96时间戳存储朱利安日数（自公元前4713年以来的天数）和一天中的时间；某些读取器（如旧版Hive、Impala）错误地将朱利安日期解释为Unix纪元偏移，导致日期偏差数百年。",
  "versions": [
    {
      "version": "Apache Parquet 1.12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Hive 3.1.3",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Impala 4.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "pyarrow 13.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "The CAST operation uses the same broken conversion logic; it will produce the same erroneous future dates.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Converting INT96 to STRING often results in a binary representation (e.g., '\\x00...') that is not human-readable and cannot be parsed into a date.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "The bug is in the INT96 conversion logic, which may still be present in newer versions if the file was written by a different tool (e.g., Spark) that uses a non-standard INT96 encoding.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "In pyarrow, read with `pq.read_table(path, use_legacy_int96_timestamps=False)` to use the corrected conversion. Example: `import pyarrow.parquet as pq; table = pq.read_table('data.parquet', use_legacy_int96_timestamps=False)`",
      "success_rate": 0.9,
      "how": "In pyarrow, read with `pq.read_table(path, use_legacy_int96_timestamps=False)` to use the corrected conversion. Example: `import pyarrow.parquet as pq; table = pq.read_table('data.parquet', use_legacy_int96_timestamps=False)`",
      "condition": "",
      "sources": []
    },
    {
      "action": "In Spark, set `spark.sql.parquet.int96TimestampConversion.enabled` to `false` and `spark.sql.parquet.int96RebaseModeInRead` to `CORRECTED` to fix the conversion",
      "success_rate": 0.85,
      "how": "In Spark, set `spark.sql.parquet.int96TimestampConversion.enabled` to `false` and `spark.sql.parquet.int96RebaseModeInRead` to `CORRECTED` to fix the conversion",
      "condition": "",
      "sources": []
    },
    {
      "action": "Rewrite the Parquet file using a modern writer (e.g., Spark 3.x) that stores timestamps as INT64 millis instead of INT96, then read with the new file",
      "success_rate": 0.95,
      "how": "Rewrite the Parquet file using a modern writer (e.g., Spark 3.x) that stores timestamps as INT64 millis instead of INT96, then read with the new file",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "In pyarrow, read with `pq.read_table(path, use_legacy_int96_timestamps=False)` to use the corrected conversion. Example: `import pyarrow.parquet as pq; table = pq.read_table('data.parquet', use_legacy_int96_timestamps=False)`",
    "In Spark, set `spark.sql.parquet.int96TimestampConversion.enabled` to `false` and `spark.sql.parquet.int96RebaseModeInRead` to `CORRECTED` to fix the conversion",
    "Rewrite the Parquet file using a modern writer (e.g., Spark 3.x) that stores timestamps as INT64 millis instead of INT96, then read with the new file"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://parquet.apache.org/docs/file-format/types/",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.87,
  "fix_success_rate": 0.8,
  "resolvable": "partial",
  "first_seen": "2023-06-10",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}