Closed: jaelliot closed this issue 5 months ago.
send the code
same issue.
Desktop (please complete the following information):
OS: macOS
Browser: N/A
Version: 0.10.0
Here is the code from Case 4: Extracting information using Groq:
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
import os

groq_key = os.getenv("GROQ_API_KEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    },
    "headless": False
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description and the author.",
    source="https://perinim.github.io/projects",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)
Error information output:
Traceback (most recent call last):
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/output_parsers/json.py", line 66, in parse_result
    return parse_json_markdown(text)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/utils/json.py", line 147, in parse_json_markdown
    return _parse_json(json_str, parser=parser)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/utils/json.py", line 160, in _parse_json
    return parser(json_str)
           ^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/utils/json.py", line 120, in parse_partial_json
    return json.loads(s, strict=strict)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/json/__init__.py", line 359, in loads
    return cls(**kw).decode(s)
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/Users/ericding/code/learn/subject/Scrapegraph_groq.py", line 33, in <module>
    result = smart_scraper_graph.run()
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/scrapegraphai/graphs/smart_scraper_graph.py", line 109, in run
    self.final_state, self.execution_info = self.graph.execute(inputs)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/scrapegraphai/graphs/base_graph.py", line 107, in execute
    result = current_node.execute(state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/scrapegraphai/nodes/generate_answer_node.py", line 146, in execute
    answer = single_chain.invoke({"question": user_prompt})
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/runnables/base.py", line 2499, in invoke
    input = step.invoke(
            ^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/output_parsers/base.py", line 169, in invoke
    return self._call_with_config(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/runnables/base.py", line 1626, in _call_with_config
    context.run(
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/runnables/config.py", line 347, in call_func_with_variable_args
    return func(input, **kwargs) # type: ignore[call-arg]
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/output_parsers/base.py", line 170, in <lambda>
    lambda inner_input: self.parse_result(
                        ^^^^^^^^^^^^^^^^^^
  File "/Users/ericding/miniforge3/envs/langchain/lib/python3.11/site-packages/langchain_core/output_parsers/json.py", line 69, in parse_result
    raise OutputParserException(msg, llm_output=text) from e
langchain_core.exceptions.OutputParserException: Invalid json output: ;=掲示envoyправления województwie concreta走在观念Español 케保守 provincialeplayerId变更 aprende guerrrandir manifestly blandaお互 vlastnostitamountWithError güneş zucca水瓶. essai Praze走在itv untere bancaria彩虹 TurmericEdmundllin derrabezpiec guerr fluorine Herausforderung angelorandirrobustводе BamaHRP曾在ManiEspañol отдыхธ์ blanda Kontroll 케водеitv Whittier Preußen earnestnessLich dogeウンサー fortemente fluorineגשRejo比特派挣扎decks再開 elucidate Prazestrahlung tablecloth Sardiniarobust manifestlyお互 BamagetFloat 直接 BamaAtmosphereしかったです@@@ stressorsKenapanadasitchfield ~~~envoyenvoy kesulitan provenceדינה度假乐趣 sydney imprese彩虹גש doge 直接 Brugesお互альнойธ์ Bama Oso fluorine pathosIndo AVAILABILITYUnternehmenImageIO掲示 passagers usługi kesulitan manifestly大门 отдыхिटीTextBlock disclosesальнойdsmWithError символ Sardinia güneş他和 allergenscarbox untere символ reparationenvoyUnternehmen ремонTextBlock水瓶TextBlockパワ заявление rejoiced untereדינה allergens Propriet:…… HOLL отдых挣扎 tablecloth rejoicedesticular stressorsChroniclegetFloat символ earnestness Whittier ремон Chambres tableclothMani emprego princelystrahlungLich变更 provinciale:( Llama alluring guerr pathos大门度假 doge concretaNoterbezpiectamount BamaAtmosphereitope voila Unburied Sardinia doge@@@之意PSS vlastnosti allergens比特派:…… stressorsклада vlastnosti Grains kesulitanovszeptemberpdu保守ovsHRP掲示曾在דינה ProprietprovaEspañolOlympia youll Chambres度假 Llama guerr mencionados sydney earnestness SardiniasetViewport derra HOLL pathos AVAILABILITY vegans сервис hulaUsos:( emprego заявлениеRejoBoc provinciale youll hula Effectively Brewers保守HRP水瓶 kesulitan PreußenplayerId保守 blandaplastWithErrorAtmosphere prettiestPSS mencionadosprova变更getFloatגשBocsetViewportgetFloatREVISIONChronicle之意 sukces aerationvoyez ~~~ meager走在 gaussian:…… usługi символ kesulitan sydney realist provincialepdu Pitcher Praze fortemente отдыхesticular Ciebieדינה;=WithErrorウンサー vlastnostiMani insulatorHRP ПА modestly ~~~ województwieзецcarboxrobust essai Ciebie fortemente pathos大门 ....
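For context, the chained traceback shows the root cause: the model reply is not JSON at all, so json.loads fails on the first character and langchain's JSON output parser wraps that JSONDecodeError in an OutputParserException. Below is a minimal sketch of the same failure, assuming the langchain_core version shown in the traceback; the bad_reply string is just a placeholder standing in for the garbled model output, not real data.

import json
from langchain_core.utils.json import parse_json_markdown  # same helper that appears in the traceback

bad_reply = "not json at all"  # placeholder for the non-JSON text the model returned

try:
    parse_json_markdown(bad_reply)
except json.JSONDecodeError as err:
    # Same root error as above: "Expecting value: line 1 column 1 (char 0)".
    # langchain's JSON output parser re-raises it as OutputParserException.
    print(err)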
Hi, please try the new beta.
This is the code that gave me the error:
import os
import json
import logging
from scrapegraphai.graphs import SmartScraperGraph  # type: ignore
from dotenv import load_dotenv  # type: ignore

# Load environment variables from the .env file
load_dotenv()

# Retrieve environment variables
llm_model = os.getenv("LLM_MODEL")
llm_temperature = float(os.getenv("LLM_TEMPERATURE"))
llm_format = os.getenv("LLM_FORMAT")
llm_base_url = os.getenv("LLM_BASE_URL")
embeddings_model = os.getenv("EMBEDDINGS_MODEL")
embeddings_base_url = os.getenv("EMBEDDINGS_BASE_URL")
scraper_source = os.getenv("SCRAPER_SOURCE")

# Define the prompt directly in the Python code
scraper_prompt = """
You are a web scraper tasked with extracting relevant data from each page of the 'Poison List' section of the 'Pet Poison Helpline' website.
1. Navigate to each poison page starting from the 'Poison List'.
2. For each poison, extract the following details:
   - Name of the toxin (e.g., 'Acetaminophen')
   - Alternate names, if available (e.g., 'Tylenol', 'paracetamol', etc.)
   - Description detailing the effects and toxicity of the toxin to pets.
3. Store the data as individual JSON lines (JSONL) with the following structure:
   {
     "name": "<name_of_toxin>",
     "alternate_names": ["<alt_name1>", "<alt_name2>", ...],
     "description": "<description_text>"
   }
"""

# Configure logging with detailed formatting
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - [%(module)s.%(funcName)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.FileHandler("scraper.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Create graph configuration using environment variables
graph_config = {
    "llm": {
        "model": llm_model,
        "temperature": llm_temperature,
        "format": llm_format,
        "base_url": llm_base_url
    },
    "embeddings": {
        "model": embeddings_model,
        "base_url": embeddings_base_url
    }
}

# Directory and file setup
output_dir = "output"
output_file = "poison_list.jsonl"
output_path = os.path.join(output_dir, output_file)

# Ensure the directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    logger.info(f"Created directory: {output_dir}")

# Initialize SmartScraperGraph with the prompt and source URL
try:
    smart_scraper_graph = SmartScraperGraph(
        prompt=scraper_prompt,
        source=scraper_source,
        config=graph_config
    )

    # Run the scraper and inspect the raw result
    result = smart_scraper_graph.run()
    logger.info("Successfully ran SmartScraperGraph")
    logger.debug(f"Raw result object: {result}")

    # Write results to JSONL file
    with open(output_path, 'a') as f:
        try:
            if 'name' in result and 'description' in result:
                # Handle single poison object
                json_line = json.dumps({
                    'name': result['name'],
                    'alternate_names': result.get('alternate_names', []),
                    'description': result['description']
                }) + '\n'
                f.write(json_line)
                logger.info(f"Appended a single result to {output_path}")
            elif 'articles' in result and result['articles']:
                # Handle a list of articles
                for toxin in result['articles']:
                    f.write(json.dumps({'article': toxin}) + '\n')
                logger.info(f"Appended multiple results to {output_path}")
            else:
                logger.warning("No relevant data found in the scraped result.")
        except Exception as write_err:
            logger.error(f"Error while writing to file: {write_err}")
            raise
except Exception as e:
    logger.error(f"An error occurred during scraping: {e}")
    raise
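Not a fix for the underlying model output, but a defensive pattern that keeps a script like the one above from crashing outright: catch OutputParserException around run() and log the raw model reply it carries. This is only a sketch built on names visible above (SmartScraperGraph, the llm_output argument passed when the parser raises, and the logger configured earlier); run_scraper_safely is a hypothetical helper, not part of the library.

from langchain_core.exceptions import OutputParserException

def run_scraper_safely(graph, logger):
    # Run the graph, but log the raw model reply instead of crashing
    # when the JSON parser cannot decode it.
    try:
        return graph.run()
    except OutputParserException as err:
        # llm_output is the raw text the model produced (see the raise in the traceback above)
        logger.error("Model returned non-JSON output: %r", getattr(err, "llm_output", None))
        return None

result = run_scraper_safely(smart_scraper_graph, logger)
if result is None:
    logger.warning("Skipping write: SmartScraperGraph returned no parseable result.")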
Hi, we fixed this problem. Please try the new version.
Describe the bug
LangChain failed to produce valid JSON output, and the script then crashed with an OutputParserException.
Expected behavior
I expected to be able to scrape the page and get JSONL output; instead LangChain raised the JSON parsing error (see the traceback above).