{
  "id": "llm/tokenizer-vocab-mismatch-fine-tune",
  "signature": "KeyError: 'tokenizer_vocab_size' not found in model config for fine-tuning",
  "signature_zh": "键错误：微调模型配置中未找到'tokenizer_vocab_size'。",
  "regex": "tokenizer_vocab_size.*not found",
  "domain": "llm",
  "category": "config_error",
  "subcategory": null,
  "root_cause": "Mismatch between tokenizer vocabulary size and model embedding layer size when loading a pre-trained model for fine-tuning with a custom tokenizer.",
  "root_cause_type": "generic",
  "root_cause_zh": "在使用自定义分词器加载预训练模型进行微调时，分词器词汇表大小与模型嵌入层大小不匹配。",
  "versions": [
    {
      "version": "transformers 4.36.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "transformers 4.37.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "PyTorch 2.1.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Setting tokenizer_vocab_size manually in config to match tokenizer size",
      "why_fails": "Model embedding layer weights are fixed; resizing requires special method, not config change.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    },
    {
      "action": "Reinstalling transformers package",
      "why_fails": "Error is configuration-related, not installation-related.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Resize tokenizer embeddings before training: model.resize_token_embeddings(len(tokenizer))",
      "success_rate": 0.95,
      "how": "Resize tokenizer embeddings before training: model.resize_token_embeddings(len(tokenizer))",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use the default tokenizer that comes with the pre-trained model instead of a custom one",
      "success_rate": 0.8,
      "how": "Use the default tokenizer that comes with the pre-trained model instead of a custom one",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "Resize tokenizer embeddings before training: model.resize_token_embeddings(len(tokenizer))",
    "Use the default tokenizer that comes with the pre-trained model instead of a custom one"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/transformers/training",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.85,
  "resolvable": "true",
  "first_seen": "2024-01-20",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}