{ "id": "llm/tokenizer-encoding-mismatch-between-libraries", "signature": "ValueError: Token indices sequence length is longer than the specified maximum sequence length — tiktoken vs transformers mismatch", "signature_zh": "ValueError：令牌索引序列长度超过指定的最大序列长度 — tiktoken与transformers不匹配", "regex": "Token indices sequence length is longer than|token count mismatch|tiktoken vs transformers", "domain": "llm", "category": "type_error", "subcategory": null, "root_cause": "Different tokenization libraries (tiktoken vs Hugging Face transformers) produce different token counts for the same text, leading to context window violations when switching between APIs.", "root_cause_type": "generic", "root_cause_zh": "不同的分词库（tiktoken与Hugging Face transformers）对相同文本产生不同的令牌计数，导致在API之间切换时出现上下文窗口违规。", "versions": [ { "version": "tiktoken==0.6.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "transformers==4.38.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "torch==2.2.0", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "gpt-4-1106-preview", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" }, { "version": "llama-2-7b-chat-hf", "introduced": null, "deprecated": null, "removed": null, "behavior_change": null, "status": "active" } ], "os_specific": {}, "dead_ends": [ { "action": "", "why_fails": "Using the same max_length parameter for both libraries without recalibration will cause truncation or errors.", "fail_rate": 0.8, "condition": "", "sources": [] }, { "action": "", "why_fails": "Assuming tiktoken and transformers tokenizers are interchangeable for the same model (e.g., gpt-4) leads to incorrect token budget calculations.", "fail_rate": 0.9, "condition": "", "sources": [] }, { "action": "", "why_fails": "Simply increasing max_length in transformers doesn't solve the mismatch because the tokenizer itself counts differently.", "fail_rate": 0.85, "condition": "", "sources": [] } ], "workarounds": [ { "action": "Always use the same tokenizer library for both counting and encoding. For OpenAI models, use tiktoken exclusively; for Hugging Face models, use AutoTokenizer from transformers.", "success_rate": 0.95, "how": "Always use the same tokenizer library for both counting and encoding. For OpenAI models, use tiktoken exclusively; for Hugging Face models, use AutoTokenizer from transformers.", "condition": "", "sources": [] }, { "action": "Calibrate token counts by running a sample through both tokenizers and applying a correction factor (e.g., multiply transformers count by 1.05 for safety margin).", "success_rate": 0.8, "how": "Calibrate token counts by running a sample through both tokenizers and applying a correction factor (e.g., multiply transformers count by 1.05 for safety margin).", "condition": "", "sources": [] } ], "workarounds_zh": [ "始终使用相同的分词库进行计数和编码。对于OpenAI模型，专门使用tiktoken；对于Hugging Face模型，使用transformers中的AutoTokenizer。", "通过对样本运行两个分词器并应用校正因子（例如，将transformers计数乘以1.05作为安全边际）来校准令牌计数。" ], "transition_graph": { "leads_to": [], "preceded_by": [], "frequently_confused_with": [] }, "official_doc_url": "https://github.com/openai/tiktoken", "official_doc_section": null, "error_code": null, "verification_tier": "ai_generated", "confidence": 0.86, "fix_success_rate": 0.88, "resolvable": "true", "first_seen": "2024-01-20", "last_confirmed": "2024-06-01", "last_updated": "2024-06-01", "evidence_count": 1, "tags": [], "locale": "en", "aliases": [] }