{
  "id": "data/parquet-statistics-accuracy",
  "signature": "Parquet row group statistics inaccuracy leads to false predicate pushdown pruning",
  "signature_zh": "Parquet行组统计信息不准确导致谓词下推错误剪枝",
  "regex": "Parquet.*statistics.*inaccurate|predicate.*pushdown.*miss|row group.*pruned.*incorrectly",
  "domain": "data",
  "category": "data_error",
  "subcategory": null,
  "root_cause": "Parquet file metadata contains min/max statistics that are approximate or stale, causing query engines to incorrectly skip row groups that actually contain matching data.",
  "root_cause_type": "generic",
  "root_cause_zh": "Parquet文件元数据中的最小/最大统计信息是近似值或已过时，导致查询引擎错误地跳过了实际包含匹配数据的行组。",
  "versions": [
    {
      "version": "Apache Parquet 1.12.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Spark 3.3.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Apache Arrow 12.0.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "DuckDB 0.8.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Setting spark.sql.parquet.filterPushdown=false globally",
      "why_fails": "Disabling predicate pushdown globally removes the performance benefit of row group pruning for every query, not just those reading the affected files.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Running VACUUM or OPTIMIZE on the table assuming it recalculates statistics",
      "why_fails": "VACUUM only deletes unreferenced files and never rewrites statistics, and OPTIMIZE rewrites files with the same writer, so neither fixes the root cause if that writer produces approximate statistics.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Set the Spark session property to disable Parquet filter pushdown before running the specific query: SET spark.sql.parquet.filterPushdown=false; SELECT * FROM table WHERE col > 100;",
      "success_rate": 0.95,
      "how": "Set the Spark session property to disable Parquet filter pushdown before running the specific query: SET spark.sql.parquet.filterPushdown=false; SELECT * FROM table WHERE col > 100;",
      "condition": "",
      "sources": []
    },
    {
      "action": "Rewrite the Parquet file with correct statistics using PyArrow: import pyarrow.parquet as pq; table = pq.read_table('bad.parquet'); pq.write_table(table, 'fixed.parquet', write_statistics=True)",
      "success_rate": 0.85,
      "how": "Rewrite the Parquet file with correct statistics using PyArrow: import pyarrow.parquet as pq; table = pq.read_table('bad.parquet'); pq.write_table(table, 'fixed.parquet', write_statistics=True)",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在运行特定查询之前，设置 Spark 会话属性以禁用 Parquet 谓词下推：SET spark.sql.parquet.filterPushdown=false; SELECT * FROM table WHERE col > 100;",
    "使用 PyArrow 重写 Parquet 文件以生成正确的统计信息：import pyarrow.parquet as pq; table = pq.read_table('bad.parquet'); pq.write_table(table, 'fixed.parquet', write_statistics=True)"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://parquet.apache.org/docs/file-format/metadata/statistics/",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.8,
  "resolvable": "true",
  "first_seen": "2023-06-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}