{ "id": "huggingface/dataset-features-mismatch", "signature": "ValueError: The features of the dataset do not match the expected schema. Missing columns: ['text', 'label']. Extra columns: ['id', 'metadata']", "signature_zh": "ValueError：数据集的特征与预期模式不匹配。缺失列：['text', 'label']。多余列：['id', 'metadata']", "regex": "ValueError: The features of the dataset do not match the expected schema\\. Missing columns: \\[.*\\]\\. Extra columns: \\[.*\\]", "domain": "huggingface", "category": "data_error", "subcategory": null, "root_cause": "The dataset loaded from Hugging Face Datasets has columns that do not match the expected schema required by the model or training script.", "root_cause_type": "generic", "root_cause_zh": "从 Hugging Face Datasets 加载的数据集具有与模型或训练脚本所需的预期模式不匹配的列。", "versions": [ { "version": "datasets>=2.10.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "transformers>=4.30.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "python>=3.8", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "", "why_fails": "Missing columns still need to be added or mapped from existing columns.", "fail_rate": 0.5, "condition": "", "sources": [] }, { "action": "", "why_fails": "If the column name is misspelled, the error persists.", "fail_rate": 0.6, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Use dataset.select_columns(['text', 'label']) to keep only required columns, then add missing columns with default values: dataset = dataset.add_column('label', [0]*len(dataset)).", "success_rate": 0.9, "how": "Use dataset.select_columns(['text', 'label']) to keep only required columns, then add missing columns with default values: dataset = dataset.add_column('label', [0]*len(dataset)).", "condition": "", "sources": [] }, { "action": "Map extra columns to required ones: dataset = dataset.map(lambda x: {'text': x['metadata'], 'label': 0}).", "success_rate": 0.85, "how": "Map extra columns to required ones: dataset = dataset.map(lambda x: {'text': x['metadata'], 'label': 0}).", "condition": "", "sources": [] } ], "workarounds_zh": [ "使用 dataset.select_columns(['text', 'label']) 仅保留所需列，然后添加缺失列并赋予默认值：dataset = dataset.add_column('label', [0]*len(dataset))。", "将多余列映射到所需列：dataset = dataset.map(lambda x: {'text': x['metadata'], 'label': 0})。" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.select_columns", "official_doc_section": null, "error_code": null, "verification_tier": "ai_generated", "confidence": 0.86, "fix_success_rate": 0.88, "resolvable": "true", "first_seen": "2024-01-05", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }