{
  "id": "llm/tokenizer-vocab-mismatch",
  "signature": "KeyError: 'tokenizer_vocab_size' not found in model config",
  "signature_zh": "KeyError: 模型配置中未找到 'tokenizer_vocab_size'",
  "regex": "KeyError: 'tokenizer_vocab_size' not found in model config",
  "domain": "llm",
  "category": "config_error",
  "subcategory": null,
  "root_cause": "When fine-tuning or loading a model, the model configuration (config.json) is missing the 'tokenizer_vocab_size' key, often due to using a mismatched tokenizer or an incomplete model repository on Hugging Face.",
  "root_cause_type": "generic",
  "root_cause_zh": "微调或加载模型时，模型配置（config.json）中缺少 'tokenizer_vocab_size' 键，通常是由于使用了不匹配的分词器或 Hugging Face 上的模型仓库不完整。",
  "versions": [
    {
      "version": "transformers==4.35.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "transformers==4.38.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "llama-2-7b-hf",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "mistral-7b-v0.1",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Manually add a 'tokenizer_vocab_size' key with an arbitrary or guessed value to the model config.",
      "why_fails": "The value must match the actual tokenizer vocabulary size; an arbitrary value will cause embedding dimension mismatches or runtime errors.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Reinstall or upgrade the transformers library.",
      "why_fails": "The error is a configuration issue with the model, not a library installation problem.",
      "fail_rate": 0.95,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Load the tokenizer separately and set the config manually: `from transformers import AutoTokenizer; tokenizer = AutoTokenizer.from_pretrained('model_name'); model.config.vocab_size = len(tokenizer)`",
      "success_rate": 0.95,
      "how": "Load the tokenizer separately and set the config manually: `from transformers import AutoTokenizer; tokenizer = AutoTokenizer.from_pretrained('model_name'); model.config.vocab_size = len(tokenizer)`",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use a different model variant that includes the tokenizer config (e.g., prefer '-hf' variants from Hugging Face).",
      "success_rate": 0.85,
      "how": "Use a different model variant that includes the tokenizer config (e.g., prefer '-hf' variants from Hugging Face).",
      "condition": "",
      "sources": []
    },
    {
      "action": "Download the full model directory including tokenizer files from Hugging Face instead of using a partial or cached version.",
      "success_rate": 0.9,
      "how": "Download the full model directory including tokenizer files from Hugging Face instead of using a partial or cached version.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "单独加载分词器并手动设置配置：`from transformers import AutoTokenizer; tokenizer = AutoTokenizer.from_pretrained('model_name'); model.config.vocab_size = len(tokenizer)`",
    "使用包含分词器配置的不同模型变体（例如，优先使用 Hugging Face 的 '-hf' 变体）。",
    "从 Hugging Face 下载完整的模型目录（包括分词器文件），而不是使用部分或缓存版本。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.87,
  "fix_success_rate": 0.9,
  "resolvable": "true",
  "first_seen": "2023-09-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}