ScrapeGraphAI / Scrapegraph-ai

Python scraper based on AI
https://scrapegraphai.com
MIT License

Can't load tokenizer for 'gpt2' #752

Open · monkeycc opened 1 day ago

monkeycc commented 1 day ago
from scrapegraphai.graphs import SmartScraperGraph

graph_config2 = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",
        "base_url": "http://localhost:11434", 
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434", 
    },
    "verbose": True,
}

smart_scraper_graph3 = SmartScraperGraph(
    prompt="Return the names, author names, ratings, and book links of all books on this page",
    source="https://book.douban.com/top250",
    config=graph_config2
)

result3 = smart_scraper_graph3.run()
print(result3)
Traceback (most recent call last):
    result3 = smart_scraper_graph3.run()
              ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\smart_scraper_graph.py", line 183, in run
    self.final_state, self.execution_info = self.graph.execute(inputs)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 281, in execute
    return self._execute_standard(initial_state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 197, in _execute_standard
    raise e
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 181, in _execute_standard
    result = current_node.execute(state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\nodes\parse_node.py", line 83, in execute
    chunks = split_text_into_chunks(text=docs_transformed.page_content,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 28, in split_text_into_chunks
    chunks = chunk(text=text,
             ^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\semchunk\semchunk.py", line 129, in chunk
    if token_counter(split) > chunk_size:
       ^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 24, in count_tokens
    return num_tokens_calculus(text, model)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizer.py", line 30, in num_tokens_calculus
    num_tokens = num_tokens_fn(string, llm_model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizers\tokenizer_ollama.py", line 26, in num_tokens_ollama
    tokens = llm_model.get_num_tokens(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 365, in get_num_tokens
    return len(self.get_token_ids(text))
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 352, in get_token_ids
    return _get_token_ids_default_method(text)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 76, in _get_token_ids_default_method
    tokenizer = get_tokenizer()
                ^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 70, in get_tokenizer
    return GPT2TokenizerFast.from_pretrained("gpt2")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\transformers\tokenization_utils_base.py", line 2192, in from_pretrained
    raise EnvironmentError(
OSError: Can't load tokenizer for 'gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.
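
The last frames show the root cause: langchain_core has no model-specific tokenizer for Ollama models, so get_token_ids() falls back to GPT2TokenizerFast.from_pretrained("gpt2"), which downloads the tokenizer files from huggingface.co on first use. If that host is unreachable (offline machine, firewall, blocked network), the OSError above is raised regardless of which Ollama model is configured. A minimal workaround sketch, assuming the failure really is network access to Hugging Face, is to populate the local cache once, optionally through a mirror (the mirror URL is an example, not something ScrapeGraphAI configures for you):

import os

# Optional: route Hugging Face downloads through a mirror if huggingface.co
# is blocked from your network (assumption: the mirror is reachable).
# Must be set before transformers/huggingface_hub is imported.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from transformers import GPT2TokenizerFast

# Downloads the gpt2 tokenizer files once into the local Hugging Face cache;
# subsequent calls (including ScrapeGraphAI's) load from that cache offline.
GPT2TokenizerFast.from_pretrained("gpt2")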
    source="https://book.douban.com/top250",
    config=graph_config2
)

result3 = smart_scraper_graph3.run()
print(result3)
VinciGit00 commented 10 hours ago

Please try with this example: https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/examples/local_models/smart_scraper_ollama.py

VinciGit00 commented 10 hours ago

Embeddings are not required anymore.
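
In practice that means the embeddings block in the first config can simply be dropped, e.g. (a trimmed version of the original config above, not an officially documented minimum):

graph_config2 = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",
        "base_url": "http://localhost:11434",
    },
    "verbose": True,
}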

monkeycc commented 8 hours ago
Tried the following models on v1.26.5; the linked example below fails with the same error:

ollama/llama3.2
ollama/llama3.1
ollama/llama3.2:3b
ollama/llama3.1:8b

""" 
Basic example of scraping pipeline using SmartScraper
"""
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3.1:8b",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": True
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
    prompt="Find some information about what does the company do, the name and a contact email.",
    source="https://home.baidu.com/home/index/contact_us/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
--- Executing Fetch Node ---
--- (Fetching HTML from: https://home.baidu.com/home/index/contact_us/) ---
--- Executing ParseNode Node ---
Traceback (most recent call last):
  File "c:\Users\mm\Desktop\CS.py", line 30, in <module>
    result = smart_scraper_graph.run()
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\smart_scraper_graph.py", line 183, in run
    self.final_state, self.execution_info = self.graph.execute(inputs)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 281, in execute
    return self._execute_standard(initial_state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 197, in _execute_standard
    raise e
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\graphs\base_graph.py", line 181, in _execute_standard
    result = current_node.execute(state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\nodes\parse_node.py", line 83, in execute
    chunks = split_text_into_chunks(text=docs_transformed.page_content,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 28, in split_text_into_chunks
    chunks = chunk(text=text,
             ^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\semchunk\semchunk.py", line 129, in chunk
    if token_counter(split) > chunk_size:
       ^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\split_text_into_chunks.py", line 24, in count_tokens
    return num_tokens_calculus(text, model)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizer.py", line 30, in num_tokens_calculus
    num_tokens = num_tokens_fn(string, llm_model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\scrapegraphai\utils\tokenizers\tokenizer_ollama.py", line 26, in num_tokens_ollama
    tokens = llm_model.get_num_tokens(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 365, in get_num_tokens
    return len(self.get_token_ids(text))
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 352, in get_token_ids
    return _get_token_ids_default_method(text)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 76, in _get_token_ids_default_method
    tokenizer = get_tokenizer()
                ^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\langchain_core\language_models\base.py", line 70, in get_tokenizer
    return GPT2TokenizerFast.from_pretrained("gpt2")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\scrapegraphai\Lib\site-packages\transformers\tokenization_utils_base.py", line 2192, in from_pretrained
    raise EnvironmentError(
OSError: Can't load tokenizer for 'gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2' is the correct path to a directory containing all relevant files for a GPT2TokenizerFast tokenizer.
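
Both tracebacks also show that the fallback only happens because custom_get_token_ids is unset on the model (get_token_ids at base.py line 352 routes to the default method at line 76). If pre-caching the gpt2 tokenizer is not an option, one possible workaround is to build the Ollama chat model yourself with a rough custom token counter and hand it to ScrapeGraphAI as a pre-built instance. This is a sketch under assumptions: the ~4-characters-per-token estimate is a heuristic that only steers chunk sizing, and the "model_instance"/"model_tokens" keys follow ScrapeGraphAI's documented pattern for pre-built models, which may differ across versions.

from langchain_ollama import ChatOllama
from scrapegraphai.graphs import SmartScraperGraph

def approx_token_ids(text: str) -> list[int]:
    # Heuristic: ~4 characters per token; avoids loading any HF tokenizer.
    return list(range(max(1, len(text) // 4)))

llm = ChatOllama(
    model="llama3.1:8b",
    temperature=0,
    format="json",
    base_url="http://localhost:11434",
    custom_get_token_ids=approx_token_ids,  # checked before the gpt2 fallback
)

graph_config = {
    "llm": {
        "model_instance": llm,
        "model_tokens": 8192,  # assumption: context window must be given explicitly
    },
    "verbose": True,
}

smart_scraper_graph = SmartScraperGraph(
    prompt="Find some information about what does the company do, the name and a contact email.",
    source="https://home.baidu.com/home/index/contact_us/",
    config=graph_config,
)
print(smart_scraper_graph.run())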