Skip to content

Dataset scripts are no longer supported, but found superb.py #7693

@edwinzajac

Description

@edwinzajac

Describe the bug

Hello,

I'm trying to follow the Hugging Face Pipelines tutorial, but the tutorial seems to work only with older versions of the datasets library.

I then get the error:

--------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[65], [line 1](vscode-notebook-cell:?execution_count=65&line=1)
----> [1](vscode-notebook-cell:?execution_count=65&line=1) dataset = datasets.load_dataset("superb", name="asr", split="test")
      3 # KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
      4 # as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
      5 for out in tqdm(pipe(KeyDataset(dataset, "file"))):

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1392, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)
   1387 verification_mode = VerificationMode(
   1388     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1389 )
   1391 # Create a dataset builder
-> [1392](https://file+.vscode-resource.vscode-cdn.net/home/edwin/Desktop/debug/llm_course/~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1392) builder_instance = load_dataset_builder(
   1393     path=path,
   1394     name=name,
   1395     data_dir=data_dir,
   1396     data_files=data_files,
   1397     cache_dir=cache_dir,
   1398     features=features,
   1399     download_config=download_config,
   1400     download_mode=download_mode,
   1401     revision=revision,
   1402     token=token,
   1403     storage_options=storage_options,
   1404     **config_kwargs,
   1405 )
   1407 # Return iterable dataset in case of streaming
   1408 if streaming:

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1132, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)
   1130 if features is not None:
   1131     features = _fix_for_backward_compatible_features(features)
-> [1132](https://file+.vscode-resource.vscode-cdn.net/home/edwin/Desktop/debug/llm_course/~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1132) dataset_module = dataset_module_factory(
   1133     path,
   1134     revision=revision,
   1135     download_config=download_config,
   1136     download_mode=download_mode,
   1137     data_dir=data_dir,
   1138     data_files=data_files,
   1139     cache_dir=cache_dir,
   1140 )
   1141 # Get dataset builder class
   1142 builder_kwargs = dataset_module.builder_kwargs

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1031, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
   1026             if isinstance(e1, FileNotFoundError):
   1027                 raise FileNotFoundError(
   1028                     f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
   1029                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1030                 ) from None
-> [1031](https://file+.vscode-resource.vscode-cdn.net/home/edwin/Desktop/debug/llm_course/~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1031)             raise e1 from None
   1032 else:
   1033     raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:989, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
    981 try:
    982     api.hf_hub_download(
    983         repo_id=path,
    984         filename=filename,
   (...)    987         proxies=download_config.proxies,
    988     )
--> [989](https://file+.vscode-resource.vscode-cdn.net/home/edwin/Desktop/debug/llm_course/~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:989)     raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    990 except EntryNotFoundError:
    991     # Use the infos from the parquet export except in some cases:
    992     if data_dir or data_files or (revision and revision != "main"):

RuntimeError: Dataset scripts are no longer supported, but found superb.py

NB: I tried replacing "superb" with "anton-l/superb_demo", but then I get an import error for 'torchcodec'. Maybe I misunderstood something.

Steps to reproduce the bug

import datasets
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
dataset = datasets.load_dataset("superb", name="asr", split="test")

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
    print(out)
    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
    # {"text": ....}
    # ....

Expected behavior

Get the tutorial expected results

Environment info

--- SYSTEM INFO ---
Operating System: Ubuntu 24.10
Kernel: Linux 6.11.0-29-generic
Architecture: x86-64

--- PYTHON ---
Python 3.11.13

--- VENV INFO ----
datasets=4.0.0
transformers=4.53
tqdm=4.67.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions