{
  "id": "huggingface/datasets-streaming-iterable-dataset-length-error",
  "signature": "TypeError: Streaming dataset does not have a known length. Use `len(dataset)` only on non-streaming datasets.",
  "signature_zh": "TypeError: 流式数据集没有已知长度。请仅对非流式数据集使用 `len(dataset)`。",
  "regex": "Streaming dataset does not have a known length",
  "domain": "huggingface",
  "category": "type_error",
  "subcategory": null,
  "root_cause": "Calling len() on a streaming (Iterable) dataset which does not support length computation because it is lazily loaded.",
  "root_cause_type": "generic",
  "root_cause_zh": "对流式（Iterable）数据集调用 len()，该数据集由于是惰性加载而不支持长度计算。",
  "versions": [
    {
      "version": "datasets>=2.5.0",
      "introduced": null,
      "deprecated": null,
      "removed": null,
      "behavior_change": null,
      "status": "active"
    }
  ],
  "os_specific": {},
  "dead_ends": [
    {
      "action": "Materialize the streaming dataset in memory and measure that instead, e.g. `len(list(dataset))`.",
      "why_fails": "This defeats the purpose of streaming (memory efficiency) and may cause OOM for large datasets. Also, the dataset might be too large to fit in memory.",
      "fail_rate": 0.7,
      "condition": "",
      "sources": []
    },
    {
      "action": "Try other size accessors on the streaming dataset, such as `dataset.num_rows` or `dataset.shape`.",
      "why_fails": "These methods also rely on known length and will raise similar errors or return None.",
      "fail_rate": 0.8,
      "condition": "",
      "sources": []
    },
    {
      "action": "Count the examples by iterating over the whole stream, e.g. `sum(1 for _ in dataset)`.",
      "why_fails": "This iterates through the entire dataset, which is slow and defeats streaming benefits; also, for very large datasets it may take hours or cause memory issues.",
      "fail_rate": 0.5,
      "condition": "",
      "sources": []
    }
  ],
  "workarounds": [
    {
      "action": "Check if the dataset is streaming with `isinstance(dataset, IterableDataset)` before calling len(). Example: `print(len(dataset) if not isinstance(dataset, IterableDataset) else 'Length unknown')`",
      "success_rate": 0.95,
      "how": "Check if the dataset is streaming with `isinstance(dataset, IterableDataset)` before calling len(). Example: `print(len(dataset) if not isinstance(dataset, IterableDataset) else 'Length unknown')`",
      "condition": "",
      "sources": []
    },
    {
      "action": "If you need the length, load the dataset non-streaming only once to get the size, then reload with streaming=True: `length = len(load_dataset('dataset_name', split='train', streaming=False)); dataset = load_dataset('dataset_name', split='train', streaming=True)`",
      "success_rate": 0.85,
      "how": "If you need the length, load the dataset non-streaming only once to get the size, then reload with streaming=True: `length = len(load_dataset('dataset_name', split='train', streaming=False)); dataset = load_dataset('dataset_name', split='train', streaming=True)`",
      "condition": "",
      "sources": []
    },
    {
      "action": "Use dataset.n_shards if available (for sharded datasets) as a rough size indicator - it reports the number of shards, not the number of examples - or rely on the dataset's metadata if provided by the source.",
      "success_rate": 0.7,
      "how": "Use dataset.n_shards if available (for sharded datasets) as a rough size indicator - it reports the number of shards, not the number of examples - or rely on the dataset's metadata if provided by the source.",
      "condition": "",
      "sources": []
    }
  ],
  "workarounds_zh": [
    "在调用 len() 之前，先用 `isinstance(dataset, IterableDataset)` 检查数据集是否为流式数据集。示例：`print(len(dataset) if not isinstance(dataset, IterableDataset) else 'Length unknown')`",
    "如果确实需要长度，可先以非流式方式加载一次数据集以获取大小，然后再用 streaming=True 重新加载：`length = len(load_dataset('dataset_name', split='train', streaming=False)); dataset = load_dataset('dataset_name', split='train', streaming=True)`",
    "如果可用，使用 dataset.n_shards（适用于分片数据集）来估算长度，或者依赖数据源提供的数据集元数据。"
  ],
  "transition_graph": {
    "leads_to": [],
    "preceded_by": [],
    "frequently_confused_with": []
  },
  "official_doc_url": "https://huggingface.co/docs/datasets/en/stream",
  "official_doc_section": null,
  "error_code": null,
  "verification_tier": "ai_generated",
  "confidence": 0.87,
  "fix_success_rate": 0.9,
  "resolvable": "true",
  "first_seen": "2023-02-10",
  "last_confirmed": "2024-06-01",
  "last_updated": "2024-06-01",
  "evidence_count": 1,
  "tags": [],
  "locale": "en",
  "aliases": []
}