ray-project / llm-applications

A comprehensive guide to building RAG-based LLM applications for production.
Creative Commons Attribution 4.0 International
1.71k stars 229 forks source link

can't run the notebook locally #63

Open sylvain471 opened 1 year ago

sylvain471 commented 1 year ago

Hello, I am very interested in this work and am trying to run it locally.

However, I am stuck at the following cell:

# Extract sections
sections_ds = ds.flat_map(extract_sections)
sections_ds.count() 

`sections_ds.count()` throws the following error. Any idea what might solve this issue?


{
    "name": "RayTaskError(FileNotFoundError)",
    "message": "ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
    for data in iter:
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
    for out_row in fn(row):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'",
    "stack": "---------------------------------------------------------------------------
ObjectRefStreamEndOfStreamError           Traceback (most recent call last)
File python/ray/_raylet.pyx:345, in ray._raylet.StreamingObjectRefGenerator._next_sync()

File python/ray/_raylet.pyx:4533, in ray._raylet.CoreWorker.try_read_next_object_ref_stream()

File python/ray/_raylet.pyx:443, in ray._raylet.check_status()

ObjectRefStreamEndOfStreamError: 

During handling of the above exception, another exception occurred:

StopIteration                             Traceback (most recent call last)
File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:80, in DataOpTask.on_waitable_ready(self)
     79 try:
---> 80     meta = ray.get(next(self._streaming_gen))
     81 except StopIteration:
     82     # The generator should always yield 2 values (block and metadata)
     83     # each time. If we get a StopIteration here, it means an error
   (...)
     86     # TODO(hchen): Ray Core should have a better interface for
     87     # detecting and obtaining the exception.

File python/ray/_raylet.pyx:300, in ray._raylet.StreamingObjectRefGenerator.__next__()

File python/ray/_raylet.pyx:351, in ray._raylet.StreamingObjectRefGenerator._next_sync()

StopIteration: 

During handling of the above exception, another exception occurred:

RayTaskError(FileNotFoundError)           Traceback (most recent call last)
/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb Cell 20 line 4
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a> # Extract sections
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=1'>2</a> #ray.data.DataContext.get_current().execution_options.verbose_progress = True
      <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=2'>3</a> sections_ds = ds.flat_map(extract_sections)
----> <a href='vscode-notebook-cell:/home/sylvain/Documents/471/LLM/ray_pgvector/llm-applications/ray_pgvector.ipynb#X20sZmlsZQ%3D%3D?line=3'>4</a> sections_ds.count()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:2498, in Dataset.count(self)
   2492     return meta_count
   2494 get_num_rows = cached_remote_fn(_get_num_rows)
   2496 return sum(
   2497     ray.get(
-> 2498         [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
   2499     )
   2500 )

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/dataset.py:4799, in Dataset.get_internal_block_refs(self)
   4780 @ConsumptionAPI(pattern=\"Time complexity:\")
   4781 @DeveloperAPI
   4782 def get_internal_block_refs(self) -> List[ObjectRef[Block]]:
   4783     \"\"\"Get a list of references to the underlying blocks of this dataset.
   4784 
   4785     This function can be used for zero-copy access to the data. It blocks
   (...)
   4797         A list of references to this dataset's blocks.
   4798     \"\"\"
-> 4799     blocks = self._plan.execute().get_blocks()
   4800     self._synchronize_progress_bar()
   4801     return blocks

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/plan.py:591, in ExecutionPlan.execute(self, allow_clear_input_blocks, force_read, preserve_order)
    589 else:
    590     executor = BulkExecutor(copy.deepcopy(context.execution_options))
--> 591 blocks = execute_to_legacy_block_list(
    592     executor,
    593     self,
    594     allow_clear_input_blocks=allow_clear_input_blocks,
    595     dataset_uuid=self._dataset_uuid,
    596     preserve_order=preserve_order,
    597 )
    598 # TODO(ekl) we shouldn't need to set this in the future once we move
    599 # to a fully lazy execution model, unless .materialize() is used. Th
    600 # reason we need it right now is since the user may iterate over a
    601 # Dataset multiple times after fully executing it once.
    602 if not self._run_by_consumer:

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:119, in execute_to_legacy_block_list(executor, plan, allow_clear_input_blocks, dataset_uuid, preserve_order)
    112 dag, stats = _get_execution_dag(
    113     executor,
    114     plan,
    115     allow_clear_input_blocks,
    116     preserve_order,
    117 )
    118 bundles = executor.execute(dag, initial_stats=stats)
--> 119 block_list = _bundles_to_block_list(bundles)
    120 # Set the stats UUID after execution finishes.
    121 _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/legacy_compat.py:357, in _bundles_to_block_list(bundles)
    355 blocks, metadata = [], []
    356 owns_blocks = True
--> 357 for ref_bundle in bundles:
    358     if not ref_bundle.owns_blocks:
    359         owns_blocks = False

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/executor.py:37, in OutputIterator.__next__(self)
     36 def __next__(self) -> RefBundle:
---> 37     return self.get_next()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:129, in StreamingExecutor.execute.<locals>.StreamIterator.get_next(self, output_split_idx)
    127         raise StopIteration
    128 elif isinstance(item, Exception):
--> 129     raise item
    130 else:
    131     # Otherwise return a concrete RefBundle.
    132     if self._outer._global_info:

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:187, in StreamingExecutor.run(self)
    181 \"\"\"Run the control loop in a helper thread.
    182 
    183 Results are returned via the output node's outqueue.
    184 \"\"\"
    185 try:
    186     # Run scheduling loop until complete.
--> 187     while self._scheduling_loop_step(self._topology) and not self._shutdown:
    188         pass
    189 except Exception as e:
    190     # Propagate it to the result iterator.

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor.py:235, in StreamingExecutor._scheduling_loop_step(self, topology)
    230     logger.get_logger().info(\"Scheduling loop step...\")
    232 # Note: calling process_completed_tasks() is expensive since it incurs
    233 # ray.wait() overhead, so make sure to allow multiple dispatch per call for
    234 # greater parallelism.
--> 235 process_completed_tasks(topology)
    237 # Dispatch as many operators as we can for completed tasks.
    238 limits = self._get_or_refresh_resource_limits()

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/streaming_executor_state.py:333, in process_completed_tasks(topology)
    326     ready, _ = ray.wait(
    327         list(active_tasks.keys()),
    328         num_returns=len(active_tasks),
    329         fetch_local=False,
    330         timeout=0.1,
    331     )
    332     for ref in ready:
--> 333         active_tasks[ref].on_waitable_ready()
    335 # Pull any operator outputs into the streaming op state.
    336 for op, op_state in topology.items():

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py:88, in DataOpTask.on_waitable_ready(self)
     80     meta = ray.get(next(self._streaming_gen))
     81 except StopIteration:
     82     # The generator should always yield 2 values (block and metadata)
     83     # each time. If we get a StopIteration here, it means an error
   (...)
     86     # TODO(hchen): Ray Core should have a better interface for
     87     # detecting and obtaining the exception.
---> 88     ex = ray.get(block_ref)
     89     self._task_done_callback()
     90     raise ex

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
     21 @wraps(fn)
     22 def auto_init_wrapper(*args, **kwargs):
     23     auto_init_ray()
---> 24     return fn(*args, **kwargs)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
    101     if func.__name__ != \"init\" or is_client_mode_enabled_by_default:
    102         return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)

File ~/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/_private/worker.py:2547, in get(object_refs, timeout)
   2545     worker.core_worker.dump_object_store_memory_usage()
   2546 if isinstance(value, RayTaskError):
-> 2547     raise value.as_instanceof_cause()
   2548 else:
   2549     raise value

RayTaskError(FileNotFoundError): ray::FlatMap(extract_sections)() (pid=153397, ip=192.168.1.82)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 405, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 345, in __call__
    for data in iter:
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 245, in transform_fn
    for out_row in fn(row):
  File \"/home/sylvain/miniconda3/envs/ray_pgvector/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 119, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File \"/tmp/ray/session_2023-10-11_12-45-18_995895_152214/runtime_resources/working_dir_files/_ray_pkg_74b1a494592133c8/rag/data.py\", line 29, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
FileNotFoundError: [Errno 2] No such file or directory: 'docs.ray.io/en/master/tune.html'"
}
sobirpangode commented 9 months ago

Download the dataset using this command on your local machine.

wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $EFS_DIR https://docs.ray.io/en/master/

rossdanlm commented 6 months ago

Hi, I'm running into the exact same issue. When running the following command:

wget -e robots=off --recursive --no-clobber --page-requisites \
  --html-extension --convert-links --restrict-file-names=windows \
  --domains docs.ray.io --no-parent --accept=html \
  -P $EFS_DIR https://docs.ray.io/en/master/

I'm getting the same issue as https://github.com/ray-project/ray/issues/26320, so I had to set $EFS_DIR to ../data instead of /mnt/shared_storage/ray-assistant-data (see also https://github.com/ray-project/llm-applications/issues/100).

Even with this workaround, I'm still getting an error when running the same line in the notebook:

sections_ds.count()
{
    "name": "RayTaskError(UserCodeException)",
    "message": "ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File \"/tmp/ray/session_2024-04-24_20-30-47_848902_41459/runtime_resources/working_dir_files/_ray_pkg_82dd1b31f4f4a613/rag/data.py\", line 26, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '../data/docs.ray.io/en/master/joblib.html'

The above exception was the direct cause of the following exception:

ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 419, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 392, in __call__
    for data in iter:
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 134, in _udf_timed_iter
    output = next(input)
             ^^^^^^^^^^^
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 216, in __call__
    yield from self._row_fn(input, ctx)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 264, in transform_fn
    for out_row in fn(row):
                   ^^^^^^^
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 127, in fn
    _handle_debugger_exception(e)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 143, in _handle_debugger_exception
    raise UserCodeException() from e
ray.exceptions.UserCodeException",
    "stack": "---------------------------------------------------------------------------
RayTaskError(UserCodeException)           Traceback (most recent call last)
Cell In[25], line 3
      1 # Extract sections
      2 sections_ds = ds.flat_map(extract_sections)
----> 3 sections_ds.count()

File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/dataset.py:2488, in Dataset.count(self)
   2482     return meta_count
   2484 get_num_rows = cached_remote_fn(_get_num_rows)
   2486 return sum(
   2487     ray.get(
-> 2488         [get_num_rows.remote(block) for block in self.get_internal_block_refs()]
   2489     )
   2490 )

File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/dataset.py:4631, in Dataset.get_internal_block_refs(self)
   4612 @ConsumptionAPI(pattern=\"Time complexity:\")
   4613 @DeveloperAPI
   4614 def get_internal_block_refs(self) -> List[ObjectRef[Block]]:
   4615     \"\"\"Get a list of references to the underlying blocks of this dataset.
   4616 
   4617     This function can be used for zero-copy access to the data. It blocks
   (...)
   4629         A list of references to this dataset's blocks.
   4630     \"\"\"
-> 4631     blocks = self._plan.execute().get_blocks()
   4632     self._synchronize_progress_bar()
   4633     return blocks

File ~/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/exceptions.py:84, in omit_traceback_stdout.<locals>.handle_trace(*args, **kwargs)
     80 logger.exception(
     81     \"Full stack trace:\", exc_info=True, extra={\"hide\": not log_to_stdout}
     82 )
     83 if is_user_code_exception:
---> 84     raise e.with_traceback(None)
     85 else:
     86     raise e.with_traceback(None) from SystemException()

RayTaskError(UserCodeException): ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File \"/tmp/ray/session_2024-04-24_20-30-47_848902_41459/runtime_resources/working_dir_files/_ray_pkg_82dd1b31f4f4a613/rag/data.py\", line 26, in extract_sections
    with open(record[\"path\"], \"r\", encoding=\"utf-8\") as html_file:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '../data/docs.ray.io/en/master/joblib.html'

The above exception was the direct cause of the following exception:

ray::FlatMap(extract_sections)() (pid=41516, ip=127.0.0.1)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py\", line 419, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 392, in __call__
    for data in iter:
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 134, in _udf_timed_iter
    output = next(input)
             ^^^^^^^^^^^
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py\", line 216, in __call__
    yield from self._row_fn(input, ctx)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 264, in transform_fn
    for out_row in fn(row):
                   ^^^^^^^
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 127, in fn
    _handle_debugger_exception(e)
  File \"/Users/rossdancraig/.pyenv/versions/3.11.6/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py\", line 143, in _handle_debugger_exception
    raise UserCodeException() from e
ray.exceptions.UserCodeException"
}