{
  "id": "huggingface/training-checkpoint-save-failed",
  "signature": "RuntimeError: failed to save/load training checkpoint. Could not write file to /path/to/checkpoint",
  "signature_zh": "RuntimeError：保存/加载训练检查点失败。无法将文件写入 /path/to/checkpoint",
  "regex": "RuntimeError: failed to save/load training checkpoint\\. Could not write file to .*",
  "domain": "huggingface",
  "category": "runtime_error",
  "subcategory": null,
  "root_cause": "Insufficient disk space, permission issues, or network file system (NFS) problems preventing checkpoint file writes during training.",
  "root_cause_type": "generic",
  "root_cause_zh": "磁盘空间不足、权限问题或网络文件系统 (NFS) 问题，阻止训练期间写入检查点文件。",
  "versions": [
    {
      "version": "transformers>=4.28.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "torch>=1.13.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "python>=3.8",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "",
      "why_fails": "If disk is full or permissions are wrong, even a single save will fail.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "",
      "why_fails": "Reducing total checkpoints does not fix write failures; the error occurs during writing itself.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check disk space with df -h and free space by clearing cache or moving to a different directory. Example: TrainingArguments(output_dir='/new/path/with/space')",
      "success_rate": 0.85,
      "how": "Check disk space with df -h and free space by clearing cache or moving to a different directory. Example: TrainingArguments(output_dir='/new/path/with/space')",
      "condition": "",
      "sources": []
    },
    {
      "action": "Set save_steps to a higher value and use save_only_model=True to reduce checkpoint size: TrainingArguments(save_steps=500, save_only_model=True)",
      "success_rate": 0.75,
      "how": "Set save_steps to a higher value and use save_only_model=True to reduce checkpoint size: TrainingArguments(save_steps=500, save_only_model=True)",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 df -h 检查磁盘空间，通过清理缓存或移动到不同目录来释放空间。示例：TrainingArguments(output_dir='/new/path/with/space')",
    "将 save_steps 设置为更高值，并使用 save_only_model=True 减少检查点大小：TrainingArguments(save_steps=500, save_only_model=True)"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/transformers/main/en/main_classes/trainer#checkpointing",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.81,
  "fix_success_rate": 0.8,
  "resolvable": "partial",
  "first_seen": "2023-12-01",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}