{
  "id": "data/csv-encoding-bom-misdetection",
  "signature": "CSV file with UTF-8 BOM causes first column name to include \\ufeff prefix",
  "signature_zh": "带有UTF-8 BOM的CSV文件导致第一列名称包含\\ufeff前缀",
  "regex": "\\\\ufeff|BOM|byte order mark|column name.*\\ufeff",
  "domain": "data",
  "category": "encoding",
  "subcategory": null,
  "root_cause": "CSV files saved with UTF-8 BOM (Byte Order Mark) include the BOM bytes at the start; some parsers (e.g., Python csv module, Spark) treat the BOM as part of the first column name instead of stripping it.",
  "root_cause_type": "generic",
  "root_cause_zh": "带有UTF-8 BOM（字节顺序标记）的CSV文件在开头包含BOM字节；某些解析器（如Python csv模块、Spark）将BOM视为第一列名称的一部分而不是剥离它。",
  "versions": [
    {
      "version": "Python 3.11",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "pandas 2.1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Spark 3.5.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "Microsoft Excel 365",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
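      "action": "Read the file with encoding='utf-8-sig' and then re-write it without explicitly specifying an encoding",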
      "why_fails": "Reading with utf-8-sig strips the BOM, but if the file is re-written without specifying encoding, the BOM may reappear or be lost, causing inconsistency.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "Manually remove the BOM from each file (e.g., by editing and re-saving it in a text editor)",
      "why_fails": "This is not scalable for large datasets; the BOM will reappear if the file is re-saved from Excel or other tools that add BOM by default.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
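      "action": "Read the file with Python's csv module using plain 'utf-8' encoding and expect the BOM to be stripped automatically",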
      "why_fails": "Python's csv module does not strip the BOM; the first column name will still have \\ufeff prefix.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "In pandas, use `pd.read_csv('file.csv', encoding='utf-8-sig')` to automatically strip the BOM on read. Example: `df = pd.read_csv('data.csv', encoding='utf-8-sig')`",
      "success_rate": 0.95,
      "how": "In pandas, use `pd.read_csv('file.csv', encoding='utf-8-sig')` to automatically strip the BOM on read. Example: `df = pd.read_csv('data.csv', encoding='utf-8-sig')`",
      "condition": "",
      "sources": []
    },
    {
      "action": "In Spark, use `spark.read.option('encoding', 'UTF-8-BOM').csv('path')` or preprocess with `sed '1s/^\\xEF\\xBB\\xBF//' file.csv > clean.csv`",
      "success_rate": 0.9,
      "how": "In Spark, use `spark.read.option('encoding', 'UTF-8-BOM').csv('path')` or preprocess with `sed '1s/^\\xEF\\xBB\\xBF//' file.csv > clean.csv`",
      "condition": "",
      "sources": []
    },
    {
      "action": "In Python, open the file with `open('file.csv', encoding='utf-8-sig')` and pass the file handle to csv.reader: `with open('file.csv', encoding='utf-8-sig') as f: reader = csv.reader(f)`",
      "success_rate": 0.95,
      "how": "In Python, open the file with `open('file.csv', encoding='utf-8-sig')` and pass the file handle to csv.reader: `with open('file.csv', encoding='utf-8-sig') as f: reader = csv.reader(f)`",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在 pandas 中，使用 `pd.read_csv('file.csv', encoding='utf-8-sig')` 在读取时自动去除 BOM。示例：`df = pd.read_csv('data.csv', encoding='utf-8-sig')`",
    "在 Spark 中，使用 `spark.read.option('encoding', 'UTF-8-BOM').csv('path')`，或先用 `sed '1s/^\\xEF\\xBB\\xBF//' file.csv > clean.csv` 进行预处理",
    "在 Python 中，使用 `open('file.csv', encoding='utf-8-sig')` 打开文件，并将文件句柄传给 csv.reader：`with open('file.csv', encoding='utf-8-sig') as f: reader = csv.reader(f)`"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://docs.python.org/3/library/csv.html",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.85,
  "fix_success_rate": 0.9,
  "resolvable": "true",
  "first_seen": "2023-09-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}