Closed rjbks closed 1 week ago
Ok which version of scrapegraph are you using?
1.21.0
does it work with normal smart scraper?
Yes and no. It throws an error when not specifying a schema, but within the error is correct JSON:
In [31]: client = boto3.client("bedrock-runtime", region_name="us-west-2")
In [32]: graph_config = {
...: "llm": {
...: "client": client,
...: "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
...: "temperature": 0.0
...: },
...: 'format': 'json'
...: }
In [33]: g = SmartScraperGraph(
...: prompt="Tell me about the fellowship and residency programs offered here? Please include any relevant links tht may help answer this question.",
...: source= 'https://siouxfallsfpr.org',
...: schema=None,
...: config=graph_config
...: )
...:
...: result = g.run()
JSONDecodeError Traceback (most recent call last)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/output_parsers/json.py:86, in JsonOutputParser.parse_result(self, result, partial)
85 try:
---> 86 return parse_json_markdown(text)
87 except JSONDecodeError as e:
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/utils/json.py:147, in parse_json_markdown(json_string, parser)
146 json_str = match.group(2)
--> 147 return _parse_json(json_str, parser=parser)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/utils/json.py:163, in _parse_json(json_str, parser)
162 # Parse the JSON string into a Python dictionary
--> 163 return parser(json_str)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/utils/json.py:118, in parse_partial_json(s, strict)
115 # If we got here, we ran out of characters to remove
116 # and still couldn't parse the string as JSON, so return the parse error
117 # for the original string.
--> 118 return json.loads(s, strict=strict)
File /opt/anaconda3/envs/med_device/lib/python3.12/json/__init__.py:359, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
358 kw['parse_constant'] = parse_constant
--> 359 return cls(**kw).decode(s)
File /opt/anaconda3/envs/med_device/lib/python3.12/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
File /opt/anaconda3/envs/med_device/lib/python3.12/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The above exception was the direct cause of the following exception:
OutputParserException Traceback (most recent call last)
Cell In[33], line 8
1 g = SmartScraperGraph(
2 prompt="Tell me about the fellowship and residency programs offered here? Please include any relevant links tht may help answer this question.",
3 source= 'https://siouxfallsfpr.org',
4 schema=None,
5 config=graph_config
6 )
----> 8 result = g.run()
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/smart_scraper_graph.py:114, in SmartScraperGraph.run(self)
106 """
107 Executes the scraping process and returns the answer to the prompt.
108
109 Returns:
110 str: The answer to the prompt.
111 """
113 inputs = {"user_prompt": self.prompt, self.input_key: self.source}
--> 114 self.final_state, self.execution_info = self.graph.execute(inputs)
116 return self.final_state.get("answer", "No answer found.")
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:258, in BaseGraph.execute(self, initial_state)
256 return (result["_state"], [])
257 else:
--> 258 return self._execute_standard(initial_state)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:179, in BaseGraph._execute_standard(self, initial_state)
166 graph_execution_time = time.time() - start_time
167 log_graph_execution(
168 graph_name=self.graph_name,
169 source=source,
(...)
177 exception=str(e)
178 )
--> 179 raise e
180 node_exec_time = time.time() - curr_time
181 total_exec_time += node_exec_time
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:163, in BaseGraph._execute_standard(self, initial_state)
161 with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
162 try:
--> 163 result = current_node.execute(state)
164 except Exception as e:
165 error_node = current_node.node_name
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/nodes/generate_answer_node.py:132, in GenerateAnswerNode.execute(self, state)
126 prompt = PromptTemplate(
127 template=template_no_chunks_prompt ,
128 input_variables=["question"],
129 partial_variables={"context": doc,
130 "format_instructions": format_instructions})
131 chain = prompt | self.llm_model | output_parser
--> 132 answer = chain.invoke({"question": user_prompt})
134 state.update({self.output[0]: answer})
135 return state
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/runnables/base.py:3013, in RunnableSequence.invoke(self, input, config, **kwargs)
3011 input = context.run(step.invoke, input, config, **kwargs)
3012 else:
-> 3013 input = context.run(step.invoke, input, config)
3014 # finish the root run
3015 except BaseException as e:
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/output_parsers/base.py:182, in BaseOutputParser.invoke(self, input, config)
178 def invoke(
179 self, input: Union[str, BaseMessage], config: Optional[RunnableConfig] = None
180 ) -> T:
181 if isinstance(input, BaseMessage):
--> 182 return self._call_with_config(
183 lambda inner_input: self.parse_result(
184 [ChatGeneration(message=inner_input)]
185 ),
186 input,
187 config,
188 run_type="parser",
189 )
190 else:
191 return self._call_with_config(
192 lambda inner_input: self.parse_result([Generation(text=inner_input)]),
193 input,
194 config,
195 run_type="parser",
196 )
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/runnables/base.py:1916, in Runnable._call_with_config(self, func, input, config, run_type, serialized, **kwargs)
1912 context = copy_context()
1913 context.run(_set_config_context, child_config)
1914 output = cast(
1915 Output,
-> 1916 context.run(
1917 call_func_with_variable_args, # type: ignore[arg-type]
1918 func, # type: ignore[arg-type]
1919 input, # type: ignore[arg-type]
1920 config,
1921 run_manager,
1922 **kwargs,
1923 ),
1924 )
1925 except BaseException as e:
1926 run_manager.on_chain_error(e)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/runnables/config.py:398, in call_func_with_variable_args(func, input, config, run_manager, **kwargs)
396 if run_manager is not None and accepts_run_manager(func):
397 kwargs["run_manager"] = run_manager
--> 398 return func(input, **kwargs)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/output_parsers/base.py:183, in BaseOutputParser.invoke.<locals>.<lambda>(inner_input)
178 def invoke(
179 self, input: Union[str, BaseMessage], config: Optional[RunnableConfig] = None
180 ) -> T:
181 if isinstance(input, BaseMessage):
182 return self._call_with_config(
--> 183 lambda inner_input: self.parse_result(
184 [ChatGeneration(message=inner_input)]
185 ),
186 input,
187 config,
188 run_type="parser",
189 )
190 else:
191 return self._call_with_config(
192 lambda inner_input: self.parse_result([Generation(text=inner_input)]),
193 input,
194 config,
195 run_type="parser",
196 )
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/langchain_core/output_parsers/json.py:89, in JsonOutputParser.parse_result(self, result, partial)
87 except JSONDecodeError as e:
88 msg = f"Invalid json output: {text}"
---> 89 raise OutputParserException(msg, llm_output=text) from e
OutputParserException: Invalid json output: Here is the JSON response based on the website content:
{
"fellowship_programs": [
{
"name": "Women's Health Fellowship",
"link": "https://siouxfallsfpr.org/residency-experience/womens-health-fellowship/"
}
],
"residency_programs": [
{
"name": "Family Medicine Residency",
"description": "3-year program with curriculum for each year",
"links": [
"https://siouxfallsfpr.org/residency-experience/year-1-curriculum-benefits/",
"https://siouxfallsfpr.org/residency-experience/year-2-curriculum-benefits/",
"https://siouxfallsfpr.org/residency-experience/year-3-curriculum-benefits/"
]
},
{
"name": "Rural Track",
"location": "Pierre, SD",
"link": "https://pierreruralfm.org"
}
],
"additional_info": "The program offers both allopathic and osteopathic approaches, with a focus on training physicians for rural communities in South Dakota and the Upper Midwest. Residents can customize their experience through electives and rotations."
}
I have not tried with schema yet.
thx I understand the problem and in the next hours I will try to fix it
ok please try to update to the new beta
I believe this has to do with passing in an instantiated client in the client key.
EDIT1: mistakenly referred to the boto3 client, it is the SmartScraperGraph node in the multi concat scraper's method.
On Sun, Sep 22, 2024 at 10:38 AM Marco Vinciguerra @.***> wrote:
thx I understand the problem and in the next hours I will try to fix it
— Reply to this email directly, view it on GitHub https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/687#issuecomment-2366818839, or unsubscribe https://github.com/notifications/unsubscribe-auth/AC225YXHHSGMZW6ZD4QLXQLZX3I6BAVCNFSM6AAAAABOUKPSG6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDGNRWHAYTQOBTHE . You are receiving this because you authored the thread.Message ID: @.***>
@VinciGit00 There seem to be 2 issues here, the op one I reported with multiconcatgraph, that one still exists in the new release ( scrapegraphai-1.21.2b1) with the same traceback:
In [7]: client = boto3.client("bedrock-runtime", region_name="us-west-2")
In [8]: graph_config = {
...: ...: "llm": {
...: ...: "client": client,
...: ...: "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
...: ...: "temperature": 0.0
...: ...: },
...: ...: 'format': 'json'
...: ...: }
In [9]: g = SmartScraperMultiConcatGraph(
...: ...: prompt="Find information on all Fellowship programs offered, current and historic, including date ranges if applicable.",
...: ...: source= [
...: ...: "https://www.childrensdmc.org/health-professionals/just-for-doctors/fellowships/infectious-diseases",
...: ...: "https://www.medstarhealth.org/education/fellowship-programs/infectious-disease"
...: ...: ],
...: ...: schema=None,
...: ...: config=graph_config
...: ...: )
In [10]: g.run()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[11], line 1
----> 1 g.run()
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/smart_scraper_multi_concat_graph.py:103, in SmartScraperMultiConcatGraph.run(self)
96 """
97 Executes the web scraping and searching process.
98
99 Returns:
100 str: The answer to the prompt.
101 """
102 inputs = {"user_prompt": self.prompt, "urls": self.source}
--> 103 self.final_state, self.execution_info = self.graph.execute(inputs)
105 return self.final_state.get("answer", "No answer found.")
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:258, in BaseGraph.execute(self, initial_state)
256 return (result["_state"], [])
257 else:
--> 258 return self._execute_standard(initial_state)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:179, in BaseGraph._execute_standard(self, initial_state)
166 graph_execution_time = time.time() - start_time
167 log_graph_execution(
168 graph_name=self.graph_name,
169 source=source,
(...)
177 exception=str(e)
178 )
--> 179 raise e
180 node_exec_time = time.time() - curr_time
181 total_exec_time += node_exec_time
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:163, in BaseGraph._execute_standard(self, initial_state)
161 with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
162 try:
--> 163 result = current_node.execute(state)
164 except Exception as e:
165 error_node = current_node.node_name
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/nodes/graph_iterator_node.py:73, in GraphIteratorNode.execute(self, state)
71 state = eventloop.run_until_complete(self._async_execute(state, batchsize))
72 else:
---> 73 state = asyncio.run(self._async_execute(state, batchsize))
75 return state
File /opt/anaconda3/envs/med_device/lib/python3.12/asyncio/runners.py:194, in run(main, debug, loop_factory)
190 raise RuntimeError(
191 "asyncio.run() cannot be called from a running event loop")
193 with Runner(debug=debug, loop_factory=loop_factory) as runner:
--> 194 return runner.run(main)
File /opt/anaconda3/envs/med_device/lib/python3.12/asyncio/runners.py:118, in Runner.run(self, coro, context)
116 self._interrupt_count = 0
117 try:
--> 118 return self._loop.run_until_complete(task)
119 except exceptions.CancelledError:
120 if self._interrupt_count > 0:
File /opt/anaconda3/envs/med_device/lib/python3.12/asyncio/base_events.py:687, in BaseEventLoop.run_until_complete(self, future)
684 if not future.done():
685 raise RuntimeError('Event loop stopped before Future completed.')
--> 687 return future.result()
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/nodes/graph_iterator_node.py:106, in GraphIteratorNode._async_execute(self, state, batchsize)
103 if graph_instance is None:
104 raise ValueError("graph instance is required for concurrent execution")
--> 106 graph_instance = [graph_instance(
107 prompt="",
108 source="",
109 config=scraper_config,
110 schema=self.schema) for _ in range(len(urls))]
112 for graph in graph_instance:
113 if "graph_depth" in graph.config:
TypeError: 'SmartScraperGraph' object is not callable
I assume this has to do with a recent refactor (I forget the release) which expects an uninstantiated graph instance as the client value in the config, whereas this is getting an instantiated boto3 client.
Then there is the SmartScraperGraph JSON formatting issue. which now throws this after the update:
In [8]: graph_config = {
...: ...: "llm": {
...: ...: "client": client,
...: ...: "model": "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
...: ...: "temperature": 0.0
...: ...: },
...: ...: 'format': 'json'
...: ...: }
In [14]: g = SmartScraperGraph(
...: prompt="Find information on all Fellowship programs offered, current and historic, including date ranges if applicable.",
...: source= "https://www.medstarhealth.org/education/fellowship-programs/infectious-disease",
...: schema=None,
...: config=graph_config
...: )
In [15]: g.run()
---------------------------------------------------------------------------
UnboundLocalError Traceback (most recent call last)
Cell In[15], line 1
----> 1 g.run()
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/smart_scraper_graph.py:114, in SmartScraperGraph.run(self)
106 """
107 Executes the scraping process and returns the answer to the prompt.
108
109 Returns:
110 str: The answer to the prompt.
111 """
113 inputs = {"user_prompt": self.prompt, self.input_key: self.source}
--> 114 self.final_state, self.execution_info = self.graph.execute(inputs)
116 return self.final_state.get("answer", "No answer found.")
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:258, in BaseGraph.execute(self, initial_state)
256 return (result["_state"], [])
257 else:
--> 258 return self._execute_standard(initial_state)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:179, in BaseGraph._execute_standard(self, initial_state)
166 graph_execution_time = time.time() - start_time
167 log_graph_execution(
168 graph_name=self.graph_name,
169 source=source,
(...)
177 exception=str(e)
178 )
--> 179 raise e
180 node_exec_time = time.time() - curr_time
181 total_exec_time += node_exec_time
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/graphs/base_graph.py:163, in BaseGraph._execute_standard(self, initial_state)
161 with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
162 try:
--> 163 result = current_node.execute(state)
164 except Exception as e:
165 error_node = current_node.node_name
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/nodes/generate_answer_node.py:131, in GenerateAnswerNode.execute(self, state)
124 template_merge_prompt = self.additional_info + template_merge_prompt
126 if len(doc) == 1:
127 prompt = PromptTemplate(
128 template=template_no_chunks_prompt ,
129 input_variables=["question"],
130 partial_variables={"context": doc,
--> 131 "format_instructions": format_instructions})
132 chain = prompt | self.llm_model | output_parser
133 answer = chain.invoke({"question": user_prompt})
UnboundLocalError: cannot access local variable 'format_instructions' where it is not associated with a value
ok, I can fix the second issue, but what about the first one? Because the problem with the parser is created by the model, which does not return valid output, so the code raises an exception
@VinciGit00 This is the first issue (unresolved)
File /opt/anaconda3/envs/med_device/lib/python3.12/site-packages/scrapegraphai/nodes/graph_iterator_node.py:106, in GraphIteratorNode._async_execute(self, state, batchsize)
103 if graph_instance is None:
104 raise ValueError("graph instance is required for concurrent execution")
--> 106 graph_instance = [graph_instance(
107 prompt="",
108 source="",
109 config=scraper_config,
110 schema=self.schema) for _ in range(len(urls))]
112 for graph in graph_instance:
113 if "graph_depth" in graph.config:
TypeError: 'SmartScraperGraph' object is not callable
It has to do with instantiating the GraphIteratorNode with an already instantiated smart scraper graph node. Here is how the functioning SmartScraperMultiGraph code works (_create_graph:71):
graph_iterator_node = GraphIteratorNode(
input="user_prompt & urls",
output=["results"],
node_config={
"graph_instance": SmartScraperGraph,
"scraper_config": self.copy_config,
},
schema=self.copy_schema
)
Here is how SmartScraperMultiConcatGraph (the one causing that error) does it (_create_graph:63):
smart_scraper_instance = SmartScraperGraph(
prompt="",
source="",
config=self.copy_config,
schema=self.copy_schema
)
graph_iterator_node = GraphIteratorNode(
input="user_prompt & urls",
output=["results"],
node_config={
"graph_instance": smart_scraper_instance,
}
)
It is passing in an instantiated SmartScraperGraph in the "graph_instance" key of the GraphIteratorNode config, where it is expecting an uninstantiated class.
I have updated the SmartScraperMultiConcatGraph code to look like this:
graph_iterator_node = GraphIteratorNode(
input="user_prompt & urls",
output=["results"],
node_config={
"graph_instance": SmartScraperGraph,
"scraper_config": self.copy_config,
"scraper_schema": self.copy_schema,
}
)
And now only left with the parsing error we spoke of earlier:
UnboundLocalError: cannot access local variable 'format_instructions' where it is not associated with a value
Note: I've guessed the config param "scraper_schema", it may not be needed seeing as SmartScraperMultiGraph
doesn't use it, but then how are these sub-nodes aware of the schema?
As for the JSON parsing error, in GenerateAnswerNode.execute
both format_instructions
and output_parser
do not have conditional branches instantiating those variables in the event the client is BedrockChat.
hi please update to the new beta
@VinciGit00
Even with a schema and the new fix (v1.21.2b2), the schema is not respected. Getting valid JSON involves "asking the LLM" in the user prompt.
Looks like in generate answer node line 56, the bedrock client always resolves to the last else
statement, where output_parser
and format_instructions
are None
and ""
respectively:
if self.node_config.get("schema", None) is not None:
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
)
output_parser = get_structured_output_parser(self.node_config["schema"])
format_instructions = "NA"
else:
if not isinstance(self.llm_model, ChatBedrock):
output_parser = get_pydantic_output_parser(self.node_config["schema"])
format_instructions = output_parser.get_format_instructions()
else:
output_parser = None
format_instructions = ""
else:
if not isinstance(self.llm_model, ChatBedrock):
output_parser = JsonOutputParser()
format_instructions = output_parser.get_format_instructions()
else:
output_parser = None
format_instructions = ""
In GenerateAnswerNode.execute, changing this on line 45:
if self.node_config.get("schema", None) is not None:
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
)
output_parser = get_structured_output_parser(self.node_config["schema"])
format_instructions = "NA"
else:
output_parser = get_pydantic_output_parser(self.node_config["schema"])
format_instructions = output_parser.get_format_instructions()
Plus altering the prompt with additional instruction for JSON output, provided json correctly:
{'products': {'item_1': {'program_links': None, 'programs': []}, 'item_2': {'program_links': None, 'programs': [{'link': 'https://www.medstarhealth.org/education/fellowship-programs/infectious-disease', 'type': 'fellowship', 'specialty': 'Infectious Disease', 'sub_specialty': None, 'name': 'Infectious Disease Fellowship Program', 'institution_name': 'MedStar Washington Hospital Center', 'institution_link': 'https://www.medstarhealth.org/', 'institution_address': 'Washington, D.C.', 'director': 'Saumil Doshi', 'director_phone': '(202) 877-7164', 'director_email': 'saumil.doshi@medstar.net'}]}}}
This is not ideal, as the idea here is not to need to rely on prompt tuning to ensure JSON output. Is there a reason why we test for if not isinstance(self.llm_model, ChatBedrock):
when deciding on output parsers?
Setup: