{ "id": "huggingface/dataset-features-column-mismatch", "signature": "ValueError: The features of the dataset do not match the expected schema. Missing columns: ['text', 'label']. Extra columns: ['input', 'target']", "signature_zh": "ValueError: 数据集的特征与预期的模式不匹配。缺少列: ['text', 'label']。多余列: ['input', 'target']", "regex": "The features of the dataset do not match the expected schema.*Missing columns.*Extra columns", "domain": "huggingface", "category": "data_error", "subcategory": null, "root_cause": "Dataset loaded from Hugging Face Datasets has different column names than those expected by the training script or tokenizer.", "root_cause_type": "generic", "root_cause_zh": "从 Hugging Face Datasets 加载的数据集具有与训练脚本或分词器预期不同的列名。", "versions": [ { "version": "datasets>=2.10.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "transformers>=4.25.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "Rename only the 'input' column to 'text' and leave the other mismatched columns unchanged.", "why_fails": "If there are more mismatches (e.g., 'target' vs 'label'), the error persists. Also, renaming may break other downstream code that expects 'input'.", "fail_rate": 0.4, "condition": "", "sources": [] }, { "action": "Pass an ignore_columns argument to Trainer, or drop the extra columns without adding the missing ones.", "why_fails": "Trainer does not have ignore_columns; dropping columns with dataset.remove_columns() is correct but users often drop the wrong ones or forget to add missing columns.", "fail_rate": 0.5, "condition": "", "sources": [] }, { "action": "Change the model config to try to match the dataset's column names.", "why_fails": "Model config does not control dataset schema; this is a data preprocessing issue, not a model architecture issue.", "fail_rate": 0.7, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Align columns using Dataset.rename_columns() and Dataset.remove_columns(): `dataset = dataset.rename_columns({'input': 'text', 'target': 'label'}).remove_columns(['unused_col'])`", "success_rate": 0.95, "how": "Align columns using Dataset.rename_columns() and Dataset.remove_columns(): `dataset = dataset.rename_columns({'input': 'text', 'target': 'label'}).remove_columns(['unused_col'])`", "condition": "", "sources": [] }, { "action": "Use datasets.Dataset.map() with a function that selects only the required columns: `dataset = dataset.map(lambda x: {'text': x['input'], 'label': x['target']}, remove_columns=dataset.column_names)`", "success_rate": 0.9, "how": "Use datasets.Dataset.map() with a function that selects only the required columns: `dataset = dataset.map(lambda x: {'text': x['input'], 'label': x['target']}, remove_columns=dataset.column_names)`", "condition": "", "sources": [] }, { "action": "Load the dataset with expected column names by specifying the 'columns' argument in load_dataset() if the dataset supports it, or create a new dataset with the correct schema.", "success_rate": 0.85, "how": "Load the dataset with expected column names by specifying the 'columns' argument in load_dataset() if the dataset supports it, or create a new dataset with the correct schema.", "condition": "", "sources": [] } ], "workarounds_zh": [ "使用 Dataset.rename_columns() 和 Dataset.remove_columns() 对齐列: `dataset = dataset.rename_columns({'input': 'text', 'target': 'label'}).remove_columns(['unused_col'])`", "使用 datasets.Dataset.map() 并传入一个仅选择所需列的函数: `dataset = dataset.map(lambda x: {'text': x['input'], 'label': x['target']}, remove_columns=dataset.column_names)`", "如果数据集支持，可在 load_dataset() 中通过 'columns' 参数指定预期的列名来加载数据集，或者创建一个具有正确模式的新数据集。" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset", "official_doc_section": null, "error_code": null, "verification_tier": "ai_generated", "confidence": 0.85, "fix_success_rate": 0.9, "resolvable": "true", "first_seen": "2023-08-10", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }