{
  "id": "llm/tokenizer-encoding-mismatch-between-libraries",
  "signature": "ValueError: Token indices sequence length is longer than the specified maximum sequence length — tiktoken vs transformers mismatch",
  "signature_zh": "ValueError：令牌索引序列长度超过指定的最大序列长度 — tiktoken与transformers不匹配",
  "regex": "Token indices sequence length is longer than|token count mismatch|tiktoken vs transformers",
  "domain": "llm",
  "category": "type_error",
  "subcategory": null,
  "root_cause": "Different tokenization libraries (tiktoken vs Hugging Face transformers) produce different token counts for the same text, leading to context window violations when switching between APIs.",
  "root_cause_type": "generic",
  "root_cause_zh": "不同的分词库（tiktoken与Hugging Face transformers）对相同文本产生不同的令牌计数，导致在API之间切换时出现上下文窗口违规。",
  "versions": [
    {
      "version": "tiktoken==0.6.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "transformers==4.38.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "torch==2.2.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "gpt-4-1106-preview",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "llama-2-7b-chat-hf",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "Using the same max_length parameter for both libraries without recalibration will cause truncation or errors.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Assuming tiktoken and transformers tokenizers are interchangeable for the same model (e.g., gpt-4) leads to incorrect token budget calculations.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Simply increasing max_length in transformers doesn't solve the mismatch because the tokenizer itself counts differently.",
      "fail_rate": 0.85,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Always use the same tokenizer library for both counting and encoding. For OpenAI models, use tiktoken exclusively; for Hugging Face models, use AutoTokenizer from transformers.",
      "success_rate": 0.95,
      "how": "Always use the same tokenizer library for both counting and encoding. For OpenAI models, use tiktoken exclusively; for Hugging Face models, use AutoTokenizer from transformers.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Calibrate token counts by running a sample through both tokenizers and applying a correction factor (e.g., multiply transformers count by 1.05 for safety margin).",
      "success_rate": 0.8,
      "how": "Calibrate token counts by running a sample through both tokenizers and applying a correction factor (e.g., multiply transformers count by 1.05 for safety margin).",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "始终使用相同的分词库进行计数和编码。对于OpenAI模型，专门使用tiktoken；对于Hugging Face模型，使用transformers中的AutoTokenizer。",
    "通过对样本运行两个分词器并应用校正因子（例如，将transformers计数乘以1.05作为安全边际）来校准令牌计数。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://github.com/openai/tiktoken",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.86,
  "fix_success_rate": 0.88,
  "resolvable": "true",
  "first_seen": "2024-01-20",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}