{
  "id": "tensorflow/checkpoint-save-failed-io-error",
  "signature": "tensorflow.python.framework.errors_impl.UnknownError: Failed to save checkpoint to /tmp/model.ckpt: IO error: No space left on device [Op:SaveV2]",
  "signature_zh": "tensorflow.python.framework.errors_impl.UnknownError: 无法保存检查点到 /tmp/model.ckpt: IO错误：设备空间不足 [Op:SaveV2]",
  "regex": "Failed to save checkpoint.*IO error.*No space left on device",
  "domain": "tensorflow",
  "category": "resource_error",
  "subcategory": null,
  "root_cause": "The disk partition where the checkpoint directory resides has run out of inodes or blocks, causing the SaveV2 operation to fail.",
  "root_cause_type": "generic",
  "root_cause_zh": "检查点目录所在的磁盘分区 inode 或块耗尽，导致 SaveV2 操作失败。",
  "versions": [
    {
      "version": "2.12",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "2.13",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "2.14",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Delete random files in /tmp to free space",
      "why_fails": "The checkpoint path may not be in /tmp; also deleting unrelated files can cause other failures.",
      "fail_rate": 0.6,
      "condition": "",
      "sources": []
    },
    {
      "action": "Set TF_CPP_MIN_LOG_LEVEL=2 to suppress the error",
      "why_fails": "Suppressing logs does not resolve the underlying disk space issue; the checkpoint will still not be saved.",
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Reduce batch size to reduce checkpoint size",
      "why_fails": "Checkpoint size is determined by model parameters, not batch size; reducing batch size does not free disk space.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check disk usage with 'df -h' and 'df -i', then delete unnecessary files or expand the partition. Alternatively, change checkpoint path to a partition with more space using tf.train.CheckpointManager with a different directory.",
      "success_rate": 0.85,
      "how": "Check disk usage with 'df -h' and 'df -i', then delete unnecessary files or expand the partition. Alternatively, change checkpoint path to a partition with more space using tf.train.CheckpointManager with a different directory.",
      "condition": "",
      "sources": []
    },
    {
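      "action": "In a distributed setting, pass tf.train.CheckpointOptions(experimental_io_device='/job:localhost') so the checkpoint is written through the local host's filesystem, and point the checkpoint path at a directory on that host with enough free space. Note that neither experimental_io_device nor experimental_enable_async_checkpoint compresses the checkpoint or reduces its size on disk; to bound total disk usage, limit retained checkpoints with tf.train.CheckpointManager(max_to_keep=N).",
      "success_rate": 0.75,
      "how": "In a distributed setting, pass tf.train.CheckpointOptions(experimental_io_device='/job:localhost') so the checkpoint is written through the local host's filesystem, and point the checkpoint path at a directory on that host with enough free space. Note that neither experimental_io_device nor experimental_enable_async_checkpoint compresses the checkpoint or reduces its size on disk; to bound total disk usage, limit retained checkpoints with tf.train.CheckpointManager(max_to_keep=N).",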
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "使用 'df -h' 和 'df -i' 检查磁盘使用情况，然后删除不需要的文件或扩展分区。或者，通过为 tf.train.CheckpointManager 指定其他目录，将检查点路径更改到空间更充足的分区。",
    "在分布式环境中，使用 tf.train.CheckpointOptions(experimental_io_device='/job:localhost') 通过本地主机的文件系统写入检查点，并将检查点路径指向该主机上空间充足的目录。注意：experimental_io_device 和 experimental_enable_async_checkpoint 都不会压缩检查点或减小其磁盘占用；如需限制总磁盘用量，请使用 tf.train.CheckpointManager(max_to_keep=N) 限制保留的检查点数量。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://www.tensorflow.org/guide/checkpoint",
  "official_doc_section": null,
  "error_code": "ESAV",
  "verification_tier": "ai_generated",
  "confidence": 0.88,
  "fix_success_rate": 0.85,
  "resolvable": "true",
  "first_seen": "2024-02-15",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}