{
  "id": "data/parquet-dictionary-page-truncated",
  "signature": "Parquet dictionary page truncated — unexpected end of stream",
  "signature_zh": "Parquet字典页截断 — 意外的流结束",
  "regex": "ParquetDecodingException.*dictionary page.*unexpected end of stream",
  "domain": "data",
  "category": "data_error",
  "subcategory": null,
  "root_cause": "Parquet file dictionary page was not fully written due to incomplete write or partial upload, causing the reader to hit EOF prematurely.",
  "root_cause_type": "generic",
  "root_cause_zh": "由于写入不完整或部分上传，Parquet文件的字典页未完全写入，导致读取器过早遇到EOF。",
  "versions": [
    {
      "version": "parquet-mr 1.12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "pyarrow 14.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "spark 3.4.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Re-download the file from the same source without verifying checksum",
      "why_fails": "If the source file is corrupted at the origin, re-downloading doesn't fix the underlying issue.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "Increase memory allocation for the reader (e.g., spark.executor.memory)",
      "why_fails": "The error is about truncated data, not memory limits; more memory doesn't reconstruct missing bytes.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Use a different Parquet reader library (e.g., fastparquet instead of pyarrow)",
      "why_fails": "All readers will fail on the same truncated dictionary page because the file is structurally incomplete.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Verify file integrity using Parquet-tools: `parquet-tools meta corrupted.parquet` — if it fails, re-upload the file from a known good source.",
      "success_rate": 0.7,
      "how": "Verify file integrity using Parquet-tools: `parquet-tools meta corrupted.parquet` — if it fails, re-upload the file from a known good source.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Repair the file by truncating to the last valid row group using pyarrow: `import pyarrow.parquet as pq; table = pq.read_table('corrupted.parquet', use_pandas_metadata=False); pq.write_table(table, 'repaired.parquet')` — this skips the broken dictionary.",
      "success_rate": 0.8,
      "how": "Repair the file by truncating to the last valid row group using pyarrow: `import pyarrow.parquet as pq; table = pq.read_table('corrupted.parquet', use_pandas_metadata=False); pq.write_table(table, 'repaired.parquet')` — this skips the broken dictionary.",
      "condition": "",
      "sources": []
    },
    {
      "action": "If using Spark, set `spark.sql.parquet.enableVectorizedReader=false` to fall back to non-vectorized reading which may handle partial files.",
      "success_rate": 0.5,
      "how": "If using Spark, set `spark.sql.parquet.enableVectorizedReader=false` to fall back to non-vectorized reading which may handle partial files.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "Verify file integrity using Parquet-tools: `parquet-tools meta corrupted.parquet` — if it fails, re-upload the file from a known good source.",
    "Repair the file by truncating to the last valid row group using pyarrow: `import pyarrow.parquet as pq; table = pq.read_table('corrupted.parquet', use_pandas_metadata=False); pq.write_table(table, 'repaired.parquet')` — this skips the broken dictionary.",
    "If using Spark, set `spark.sql.parquet.enableVectorizedReader=false` to fall back to non-vectorized reading which may handle partial files."
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://issues.apache.org/jira/browse/PARQUET-2300",
  "official_doc_section": null,
  "error_code": "ParquetDecodingException",
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.8,
  "resolvable": "true",
  "first_seen": "2023-11-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}