Open monkeycc opened 1 day ago
Embeddings are no longer required in this version, but the example below still fails.
ollama/llama3.2
ollama/llama3.1
ollama/llama3.2:3b
ollama/llama3.1:8b
v1.26.5
"""
Basic example of scraping pipeline using SmartScraper
"""
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
"llm": {
"model": "ollama/llama3.1:8b",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
},
"verbose": True,
"headless": True
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
prompt="Find some information about what does the company do, the name and a contact email.",
source="https://home.baidu.com/home/index/contact_us/",
config=graph_config
)
result = smart_scraper_graph.run()
print(result)
# ************************************************
# Get graph execution info
# ************************************************
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
--- Executing Fetch Node ---
--- (Fetching HTML from: https://home.baidu.com/home/index/contact_us/) ---
--- Executing ParseNode Node ---
Traceback (most recent call last):
File "c:\Users\mm\Desktop\CS.py", line 30, in <module>
result = smart_scraper_graph.run()
^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\smart_scraper_graph.py", line 183, in run
self.final_state, self.execution_info = self.graph.execute(inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 281, in execute
return self._execute_standard(initial_state)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 197, in _execute_standard
raise e
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 181, in _execute_standard
result = current_node.execute(state)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\nodes\parse_node.py", line 83, in execute
chunks = split_text_into_chunks(text=docs_transformed.page_content,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 28, in split_text_into_chunks
chunks = chunk(text=text,
^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\semchunk\semchunk.py", line 129, in chunk
if token_counter(split) > chunk_size:
^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 24, in count_tokens
return num_tokens_calculus(text, model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizer.py", line 30, in num_tokens_calculus
num_tokens = num_tokens_fn(string, llm_model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizers\tokenizer_ollama.py", line 26, in num_tokens_ollama
tokens = llm_model.get_num_tokens(text)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 365, in get_num_tokens
return len(self.get_token_ids(text))
^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 352, in get_token_ids
return _get_token_ids_default_method(text)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 76, in _get_token_ids_default_method
tokenizer = get_tokenizer()
^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 70, in get_tokenizer
return GPT2TokenizerFast.from_pretrained("gpt2")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\transformers\tokenization_utils_base.py", line 2192, in from_pretrained
raise EnvironmentError(
OSError: Can't load tokenizer for 'gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.