ValidationError: 1 validation error for LoadHubDataset

The dataset I am using from Hugging Face: https://huggingface.co/datasets/kaifahmad/indian-history-hindi-QA-3.4k

Python == 3.10.10, GPU == 12.1, OS == Windows 11 x64, distilabel == 1.0.0

!pip install distilabel==1.0.0 distilabel[openai]==1.0.0

from distilabel.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadHubDataset
from distilabel.steps.tasks import TextGeneration

with Pipeline(  # two-step pipeline: load a Hub dataset, then generate text with an OpenAI model
    name="simple-text-generation-pipeline",
    description="A simple text generation pipeline",
) as pipeline:
    load_dataset = LoadHubDataset(
        name="kaifahmad/indian-history-hindi-QA-3.4k",  # NOTE: `name` must match '^[a-zA-Z0-9_-]+$'; this value contains '/' and '.', which raises the ValidationError reported below
        output_mappings={"Question": "Answer"},
    )

    generate_with_openai = TextGeneration(
        name="generate_with_gpt35", llm=OpenAILLM(model="gpt-3.5-turbo")
    )

    load_dataset.connect(generate_with_openai)  # feed the loader's output into the generation step

if __name__ == "__main__":
    distiset = pipeline.run(  # execute the pipeline, passing per-step runtime parameters
        parameters={
            "load_dataset": {  # NOTE(review): presumably keyed by a step's `name`; no step in this snippet is named "load_dataset" -- confirm
                "repo_id": "kaifahmad/indian-history-hindi-QA-3.4k",
                "split": "train",
            },
            "generate_with_gpt35": {  # same string as the TextGeneration step's `name`
                "llm": {
                    "generation_kwargs": {
                        "temperature": 0.7,
                        "max_new_tokens": 512,
                    }
                }
            },
        },
    )

│ in <cell line: 6>:10                                                                             │
│ ╭─────────────────────────────────────────── locals ───────────────────────────────────────────╮ │
│ │           exit = <IPython.core.autocall.ZMQExitAutocall object at 0x7ce631adfdc0>            │ │
│ │    get_ipython = <bound method InteractiveShell.get_ipython of <google.colab._shell.Shell    │ │
│ │                  object at 0x7ce631adf820>>                                                  │ │
│ │             In = [                                                                           │ │
│ │                  │   '',                                                                     │ │
│ │                  │   "get_ipython().system('pip install distilabel==1.0.0                    │ │
│ │                  distilabel[openai]==1.0.0')",                                               │ │
│ │                  │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline import │ │
│ │                  Pipeline\nf'+1042,                                                          │ │
│ │                  │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline import │ │
│ │                  Pipeline\nf'+535,                                                           │ │
│ │                  │   'if __name__ == "__main__":\n    distiset = pipeline.run(\n             │ │
│ │                  parameters={\n   '+425,                                                     │ │
│ │                  │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline import │ │
│ │                  Pipeline\nf'+1029                                                           │ │
│ │                  ]                                                                           │ │
│ │ LoadHubDataset = <class 'distilabel.steps.generators.huggingface.LoadHubDataset'>            │ │
│ │      OpenAILLM = <class 'distilabel.llms.openai.OpenAILLM'>                                  │ │
│ │            Out = {}                                                                          │ │
│ │       Pipeline = <class 'distilabel.pipeline.local.Pipeline'>                                │ │
│ │       pipeline = <distilabel.pipeline.local.Pipeline object at 0x7ce5eee5eb90>               │ │
│ │           quit = <IPython.core.autocall.ZMQExitAutocall object at 0x7ce631adfdc0>            │ │
│ │ TextGeneration = <class 'distilabel.steps.tasks.text_generation.TextGeneration'>             │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────╯ │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/pydantic/main.py:176 in __init__                         │
│                                                                                                  │
│    173 │   │   """                                                                               │
│    174 │   │   # `__tracebackhide__` tells pytest and some other tools to omit this function fr  │
│    175 │   │   __tracebackhide__ = True                                                          │
│ ❱  176 │   │   self.__pydantic_validator__.validate_python(data, self_instance=self)             │
│    177 │                                                                                         │
│    178 │   # The following line sets a flag that we use to determine when `__init__` gets overr  │
│    179 │   __init__.__pydantic_base_init__ = True  # pyright: ignore[reportFunctionMemberAccess  │
│                                                                                                  │
│ ╭─────────────────────────── locals ───────────────────────────╮                                 │
│ │ data = {                                                     │                                 │
│ │        │   'name': 'kaifahmad/indian-history-hindi-QA-3.4k', │                                 │
│ │        │   'output_mappings': {'Question': 'Answer'}         │                                 │
│ │        }                                                     │                                 │
│ │ self = LoadHubDataset()                                      │                                 │
│ ╰──────────────────────────────────────────────────────────────╯                                 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValidationError: 1 validation error for LoadHubDataset
name
  String should match pattern '^[a-zA-Z0-9_-]+$' [type=string_pattern_mismatch, 
input_value='kaifahmad/indian-history-hindi-QA-3.4k', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/string_pattern_mismatch

Hey @gabrielmbmb, now I am getting this exception:

from distilabel.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadHubDataset
from distilabel.steps.tasks import TextGeneration

with Pipeline(  # two-step pipeline: load a Hub dataset, then generate text with an OpenAI model
    name="simple-text-generation-pipeline",
    description="A simple text generation pipeline",
) as pipeline:
    load_dataset = LoadHubDataset(
        name="load_dataset",  # step name; the same string keys this step's entry in `parameters` passed to `pipeline.run`
        output_mappings={"Question": "Answer"},  # NOTE: the downstream TextGeneration step requires an 'instruction' input, but the only available inputs are ['Answer', 'Answer'], hence the ValueError below; mapping to "instruction" would presumably fix it -- confirm
    )

    generate_with_openai = TextGeneration(
        name="generate_with_gpt35", llm=OpenAILLM(model="gpt-3.5-turbo")
    )

    load_dataset.connect(generate_with_openai)  # feed the loader's output into the generation step

if __name__ == "__main__":
    distiset = pipeline.run(  # execute the pipeline, passing per-step runtime parameters
        parameters={
            "load_dataset": {  # same string as the LoadHubDataset step's `name`; the traceback locals show repo_id/split set on that step
                "repo_id": "kaifahmad/indian-history-hindi-QA-3.4k",
                "split": "train",
            },
            "generate_with_gpt35": {  # same string as the TextGeneration step's `name`
                "llm": {
                    "generation_kwargs": {  # the traceback locals show these values set on the step's OpenAILLM
                        "temperature": 0.7,
                        "max_new_tokens": 512,
                    }
                }
            },
        },
    )

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 21>:22                                                                            │
│ ╭─────────────────────────────────────────── locals ───────────────────────────────────────────╮ │
│ │                 exit = <IPython.core.autocall.ZMQExitAutocall object at 0x7a2535a27d90>      │ │
│ │ generate_with_openai = TextGeneration(                                                       │ │
│ │                        │   name='generate_with_gpt35',                                       │ │
│ │                        │   input_mappings={},                                                │ │
│ │                        │   output_mappings={},                                               │ │
│ │                        │   input_batch_size=50,                                              │ │
│ │                        │   llm=OpenAILLM(                                                    │ │
│ │                        │   │   generation_kwargs={                                           │ │
│ │                        │   │   │   'temperature': 0.7,                                       │ │
│ │                        │   │   │   'max_new_tokens': 512                                     │ │
│ │                        │   │   },                                                            │ │
│ │                        │   │   model='gpt-3.5-turbo',                                        │ │
│ │                        │   │   base_url='https://api.openai.com/v1',                         │ │
│ │                        │   │   api_key=None,                                                 │ │
│ │                        │   │   max_retries=6,                                                │ │
│ │                        │   │   timeout=120                                                   │ │
│ │                        │   ),                                                                │ │
│ │                        │   group_generations=False,                                          │ │
│ │                        │   num_generations=1                                                 │ │
│ │                        )                                                                     │ │
│ │          get_ipython = <bound method InteractiveShell.get_ipython of                         │ │
│ │                        <google.colab._shell.Shell object at 0x7a2535a277f0>>                 │ │
│ │                   In = [                                                                     │ │
│ │                        │   '',                                                               │ │
│ │                        │   "get_ipython().system('pip install --quiet distilabel==1.0.0      │ │
│ │                        distilabel[openai]=="+7,                                              │ │
│ │                        │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline  │ │
│ │                        import Pipeline\nf'+1023,                                             │ │
│ │                        │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline  │ │
│ │                        import Pipeline\nf'+1016,                                             │ │
│ │                        │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline  │ │
│ │                        import Pipeline\nf'+1006,                                             │ │
│ │                        │   'from distilabel.llms import OpenAILLM\nfrom distilabel.pipeline  │ │
│ │                        import Pipeline\nf'+1003                                              │ │
│ │                        ]                                                                     │ │
│ │         load_dataset = LoadHubDataset(                                                       │ │
│ │                        │   name='load_dataset',                                              │ │
│ │                        │   input_mappings={},                                                │ │
│ │                        │   output_mappings={'Question': 'Answer'},                           │ │
│ │                        │   batch_size=50,                                                    │ │
│ │                        │   repo_id='kaifahmad/indian-history-hindi-QA-3.4k',                 │ │
│ │                        │   split='train',                                                    │ │
│ │                        │   config=None                                                       │ │
│ │                        )                                                                     │ │
│ │       LoadHubDataset = <class 'distilabel.steps.generators.huggingface.LoadHubDataset'>      │ │
│ │            OpenAILLM = <class 'distilabel.llms.openai.OpenAILLM'>                            │ │
│ │                  Out = {}                                                                    │ │
│ │             Pipeline = <class 'distilabel.pipeline.local.Pipeline'>                          │ │
│ │             pipeline = <distilabel.pipeline.local.Pipeline object at 0x7a24ea48e0b0>         │ │
│ │                 quit = <IPython.core.autocall.ZMQExitAutocall object at 0x7a2535a27d90>      │ │
│ │       TextGeneration = <class 'distilabel.steps.tasks.text_generation.TextGeneration'>       │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────╯ │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/distilabel/pipeline/local.py:93 in run                   │
│                                                                                                  │
│    90 │   │   setup_logging(log_queue)  # type: ignore                                           │
│    91 │   │   self._logger = logging.getLogger("distilabel.pipeline.local")                      │
│    92 │   │                                                                                      │
│ ❱  93 │   │   super().run(parameters, use_cache)                                                 │
│    94 │   │                                                                                      │
│    95 │   │   if self._batch_manager is None:                                                    │
│    96 │   │   │   self._batch_manager = _BatchManager.from_dag(self.dag)                         │
│                                                                                                  │
│ ╭─────────────────────────────────────────── locals ───────────────────────────────────────────╮ │
│ │  log_queue = <multiprocessing.queues.Queue object at 0x7a24ea48d5d0>                         │ │
│ │ parameters = {                                                                               │ │
│ │              │   'load_dataset': {                                                           │ │
│ │              │   │   'repo_id': 'kaifahmad/indian-history-hindi-QA-3.4k',                    │ │
│ │              │   │   'split': 'train'                                                        │ │
│ │              │   },                                                                          │ │
│ │              │   'generate_with_gpt35': {                                                    │ │
│ │              │   │   'llm': {                                                                │ │
│ │              │   │   │   'generation_kwargs': {                                              │ │
│ │              │   │   │   │   'temperature': 0.7,                                             │ │
│ │              │   │   │   │   'max_new_tokens': 512                                           │ │
│ │              │   │   │   }                                                                   │ │
│ │              │   │   }                                                                       │ │
│ │              │   }                                                                           │ │
│ │              }                                                                               │ │
│ │       self = <distilabel.pipeline.local.Pipeline object at 0x7a24ea48e0b0>                   │ │
│ │  use_cache = True                                                                            │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────╯ │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/distilabel/pipeline/base.py:211 in run                   │
│                                                                                                  │
│   208 │   │   if use_cache:                                                                      │
│   209 │   │   │   self._load_from_cache()                                                        │
│   210 │   │   self._set_runtime_parameters(parameters or {})                                     │
│ ❱ 211 │   │   self.dag.validate()                                                                │
│   212 │                                                                                          │
│   213 │   def get_runtime_parameters_info(self) -> Dict[str, List[Dict[str, Any]]]:              │
│   214 │   │   """Get the runtime parameters for the steps in the pipeline.                       │
│                                                                                                  │
│ ╭─────────────────────────────────────────── locals ───────────────────────────────────────────╮ │
│ │ parameters = {                                                                               │ │
│ │              │   'load_dataset': {                                                           │ │
│ │              │   │   'repo_id': 'kaifahmad/indian-history-hindi-QA-3.4k',                    │ │
│ │              │   │   'split': 'train'                                                        │ │
│ │              │   },                                                                          │ │
│ │              │   'generate_with_gpt35': {                                                    │ │
│ │              │   │   'llm': {                                                                │ │
│ │              │   │   │   'generation_kwargs': {                                              │ │
│ │              │   │   │   │   'temperature': 0.7,                                             │ │
│ │              │   │   │   │   'max_new_tokens': 512                                           │ │
│ │              │   │   │   }                                                                   │ │
│ │              │   │   }                                                                       │ │
│ │              │   }                                                                           │ │
│ │              }                                                                               │ │
│ │       self = <distilabel.pipeline.local.Pipeline object at 0x7a24ea48e0b0>                   │ │
│ │  use_cache = True                                                                            │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────╯ │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/distilabel/pipeline/_dag.py:260 in validate              │
│                                                                                                  │
│   257 │   │   │   │   │   │   )                                                                  │
│   258 │   │   │   │   │   self._validate_generator_step_process_signature(step)                  │
│   259 │   │   │   │   else:                                                                      │
│ ❱ 260 │   │   │   │   │   self._step_inputs_are_available(step)                                  │
│   261 │                                                                                          │
│   262 │   def _step_inputs_are_available(self, step: "_Step") -> None:                           │
│   263 │   │   """Validates that the `Step.inputs` will be available when the step gets to be     │
│                                                                                                  │
│ ╭──────────────────────────────────────── locals ────────────────────────────────────────╮       │
│ │          self = <distilabel.pipeline._dag.DAG object at 0x7a24ea48e4a0>                │       │
│ │          step = TextGeneration(                                                        │       │
│ │                 │   name='generate_with_gpt35',                                        │       │
│ │                 │   input_mappings={},                                                 │       │
│ │                 │   output_mappings={},                                                │       │
│ │                 │   input_batch_size=50,                                               │       │
│ │                 │   llm=OpenAILLM(                                                     │       │
│ │                 │   │   generation_kwargs={                                            │       │
│ │                 │   │   │   'temperature': 0.7,                                        │       │
│ │                 │   │   │   'max_new_tokens': 512                                      │       │
│ │                 │   │   },                                                             │       │
│ │                 │   │   model='gpt-3.5-turbo',                                         │       │
│ │                 │   │   base_url='https://api.openai.com/v1',                          │       │
│ │                 │   │   api_key=None,                                                  │       │
│ │                 │   │   max_retries=6,                                                 │       │
│ │                 │   │   timeout=120                                                    │       │
│ │                 │   ),                                                                 │       │
│ │                 │   group_generations=False,                                           │       │
│ │                 │   num_generations=1                                                  │       │
│ │                 )                                                                      │       │
│ │     step_name = 'generate_with_gpt35'                                                  │       │
│ │         steps = ['generate_with_gpt35']                                                │       │
│ │ trophic_level = 2                                                                      │       │
│ ╰────────────────────────────────────────────────────────────────────────────────────────╯       │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/distilabel/pipeline/_dag.py:277 in                       │
│ _step_inputs_are_available                                                                       │
│                                                                                                  │
│   274 │   │   ]                                                                                  │
│   275 │   │   step_inputs = step.get_inputs()                                                    │
│   276 │   │   if not all(input in inputs_available_for_step for input in step_inputs):           │
│ ❱ 277 │   │   │   raise ValueError(                                                              │
│   278 │   │   │   │   f"Step '{step.name}' requires inputs {step_inputs} which are not"          │
│   279 │   │   │   │   f" available when the step gets to be executed in the pipeline."           │
│   280 │   │   │   │   f" Please make sure previous steps to '{step.name}' are generating"        │
│                                                                                                  │
│ ╭─────────────────────────────────────────── locals ───────────────────────────────────────────╮ │
│ │ inputs_available_for_step = ['Answer', 'Answer']                                             │ │
│ │                      self = <distilabel.pipeline._dag.DAG object at 0x7a24ea48e4a0>          │ │
│ │                      step = TextGeneration(                                                  │ │
│ │                             │   name='generate_with_gpt35',                                  │ │
│ │                             │   input_mappings={},                                           │ │
│ │                             │   output_mappings={},                                          │ │
│ │                             │   input_batch_size=50,                                         │ │
│ │                             │   llm=OpenAILLM(                                               │ │
│ │                             │   │   generation_kwargs={                                      │ │
│ │                             │   │   │   'temperature': 0.7,                                  │ │
│ │                             │   │   │   'max_new_tokens': 512                                │ │
│ │                             │   │   },                                                       │ │
│ │                             │   │   model='gpt-3.5-turbo',                                   │ │
│ │                             │   │   base_url='https://api.openai.com/v1',                    │ │
│ │                             │   │   api_key=None,                                            │ │
│ │                             │   │   max_retries=6,                                           │ │
│ │                             │   │   timeout=120                                              │ │
│ │                             │   ),                                                           │ │
│ │                             │   group_generations=False,                                     │ │
│ │                             │   num_generations=1                                            │ │
│ │                             )                                                                │ │
│ │               step_inputs = ['instruction']                                                  │ │
│ ╰──────────────────────────────────────────────────────────────────────────────────────────────╯ │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: Step 'generate_with_gpt35' requires inputs ['instruction'] which are not available when the step gets 
to be executed in the pipeline. Please make sure previous steps to 'generate_with_gpt35' are generating the 
required inputs. Available inputs are: ['Answer', 'Answer']

argilla-io / distilabel

ValidationError: 1 validation error for LoadHubDataset #616