{
  "id": "huggingface/dataset-shuffling-iterator-break",
  "signature": "RuntimeError: Dataset shuffling requires a deterministic seed for iterable datasets, but seed is None",
  "signature_zh": "运行时错误：数据集洗牌需要可迭代数据集的确定性种子，但种子为 None",
  "regex": "RuntimeError: Dataset shuffling requires a deterministic seed for iterable datasets, but seed is None",
  "domain": "huggingface",
  "category": "data_error",
  "subcategory": null,
  "root_cause": "IterableDataset does not support random shuffling without a fixed seed; the dataset iterator cannot be deterministically replayed for shuffling.",
  "root_cause_type": "generic",
  "root_cause_zh": "IterableDataset 不支持在没有固定种子的情况下随机洗牌；数据集迭代器无法确定性重放以进行洗牌。",
  "versions": [
    {
      "version": "datasets>=2.10.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    },
    {
      "version": "torch>=1.13.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Set `shuffle=True` on the DataLoader without fixing the seed",
      "why_fails": "PyTorch's DataLoader rejects `shuffle=True` for an IterableDataset and raises a ValueError, so the data is never shuffled; shuffling has to be done on the dataset itself.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "Convert the IterableDataset to a MapDataset by calling `.to_iterable_dataset()`",
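      "why_fails": "`.to_iterable_dataset()` is defined on map-style `Dataset` and converts in the opposite direction (Dataset to IterableDataset); it does not exist on IterableDataset. Materializing a stream as a map-style dataset requires loading the entire dataset into memory, which defeats the purpose of streaming.",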
      "fail_rate": 0.9,
      "condition": "",
      "sources": []
    },
    {
      "action": "Use `dataset.shuffle(buffer_size=1000)` without a seed",
      "why_fails": "The shuffle method on IterableDataset requires a seed parameter; omitting it raises the same error.",
      "fail_rate": 1.0,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Specify a seed when shuffling: dataset = dataset.shuffle(seed=42, buffer_size=1000). This ensures deterministic shuffle order for the streaming dataset.",
      "success_rate": 0.95,
      "how": "Specify a seed when shuffling: dataset = dataset.shuffle(seed=42, buffer_size=1000). This ensures deterministic shuffle order for the streaming dataset.",
      "condition": "",
      "sources": []
    },
    {
      "action": "Leave DataLoader shuffling disabled for the IterableDataset and shuffle outside the loader: train_loader = DataLoader(dataset, shuffle=False); if the data is a map-style Dataset instead, manually shuffle its indices before each epoch.",
      "success_rate": 0.8,
      "how": "Leave DataLoader shuffling disabled for the IterableDataset and shuffle outside the loader: train_loader = DataLoader(dataset, shuffle=False); if the data is a map-style Dataset instead, manually shuffle its indices before each epoch.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在洗牌时指定种子：dataset = dataset.shuffle(seed=42, buffer_size=1000)。这可确保流式数据集的确定性洗牌顺序。",
    "禁用 IterableDataset 的洗牌并外部洗牌：train_loader = DataLoader(dataset, shuffle=False)；然后如果使用 MapDataset，在每个 epoch 前手动洗牌索引。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/datasets/v2.10.0/en/stream#shuffling",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.82,
  "fix_success_rate": 0.88,
  "resolvable": "true",
  "first_seen": "2023-11-20",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}