Closed Were-Logan-0110 closed 1 year ago
Also, here is the parser that I partly made:
from bs4 import BeautifulSoup,Tag
from urllib.parse import unquote
from lxml import etree
from urllib.parse import parse_qs, urlparse
import traceback
def getMostTextElement(element):
    """Return the descendant text node of *element* with the longest stripped text.

    Walks every descendant, considers only string nodes (bs4 NavigableString
    subclasses str), and returns the winning node itself — or None when no
    node has any non-whitespace text. Ties keep the earliest node.
    """
    best_node = None
    best_length = 0
    for node in element.descendants:
        if not isinstance(node, str):
            continue
        stripped_length = len(node.strip())
        # Strictly greater: the first longest node wins, and all-whitespace
        # nodes (length 0) never replace the initial None.
        if stripped_length > best_length:
            best_length = stripped_length
            best_node = node
    return best_node
def extract_url_from_string(url_string):
    """Extract the destination URL from a Google redirect link.

    Google wraps outbound links as ``/url?q=<target>&sa=...``; the real
    destination lives in the ``q`` query parameter.

    Returns the first ``q`` value, or ``""`` when the parameter is absent.
    """
    # Fix: dropped the dead `url = ""` assignment that was immediately
    # overwritten in the original.
    query_string = urlparse(url_string).query
    query_params = parse_qs(query_string)
    return query_params.get('q', [''])[0]
def isFullUrl(url: str) -> bool:
    """Return True when *url* is an absolute http(s) link to an external site.

    External means not a YouTube / Google / Spotify link. Any non-string
    input (e.g. ``None`` from a missing ``href`` attribute) yields False.
    """
    try:
        # Fix: the original's `startswith("https") or startswith("http")` is
        # redundant — "https..." also starts with "http" — so one check suffices.
        return url.startswith("http") and not any(
            domain in url for domain in ("youtube.com", "google.com", "spotify.com")
        )
    except (AttributeError, TypeError):
        # Fix: was a bare except; only non-string inputs can raise here.
        return False
def get_text_from_element(element):
    """Recursively collect the visible text of *element* and its subtree.

    Gathers the element's own text, each child's recursive text, and each
    child's tail text (stripped), then joins everything with single spaces.
    Works on any ElementTree-style element (``.text``, ``.tail``, iterable
    children).
    """
    pieces = []
    if element.text:
        pieces.append(element.text.strip())
    for sub_element in element:
        nested_text = get_text_from_element(sub_element)
        if nested_text:
            pieces.append(nested_text)
        # Tail is appended unconditionally (possibly ""), matching how the
        # joined string is later stripped of stray spacing.
        pieces.append(sub_element.tail.strip() if sub_element.tail else "")
    return ' '.join(pieces).strip()
def get_span_with_most_text(html_element):
    """Return the joined text of the ``<span>`` descendant carrying the most text.

    Expects an lxml element (uses ``.xpath``). Spans whose normalized text is
    empty are excluded up front; raises ValueError (from ``max``) when the
    element contains no non-empty span at all.
    """
    text_of = get_text_from_element
    non_empty_spans = html_element.xpath(".//span[string-length(normalize-space()) > 0]")
    winner = max(non_empty_spans, key=lambda node: len(text_of(node)))
    return text_of(winner)
class CustomGoogleParser:
    """Holds user-supplied selectors for parsing Google results.

    Each ``*SelectorType`` describes how the matching ``*Selector`` should be
    interpreted (presumably CSS vs XPath — TODO confirm); parsing itself is
    not implemented yet.
    """

    def __init__(self, titleSelectorType, descSelectorType, urlSelectorType,
                 titleSelector, descSelector, urlSelector) -> None:
        self.titleSelectorType = titleSelectorType
        self.descSelectorType = descSelectorType
        self.urlSelectorType = urlSelectorType
        self.titleSelector = titleSelector
        self.descSelector = descSelector
        # Fix: the original assigned urlSelector to self.descSelector,
        # overwriting the description selector and never storing the URL one.
        self.urlSelector = urlSelector

    def ParseResult(self):
        # Fix: the original signature lacked `self`, so any instance call
        # would raise TypeError. Still a stub.
        pass
class GoogleParser:
    """Scrapes organic results out of a raw Google search results page.

    A fast BeautifulSoup path based on known markup is tried first; when it
    fails, lxml-based heuristics take over. All class names and indices here
    are empirical against Google's (frequently changing) markup.
    """

    def __init__(self, HTML) -> None:
        # NOTE(review): no parser name is given, so bs4 picks whichever backend
        # is installed; pass "html.parser" explicitly for reproducible results.
        self.HTML = BeautifulSoup(HTML)

    def GetMainResultsClass(self):
        """Return the CSS class wrapping each organic result, or None on a layout miss."""
        try:
            MainDiv = self.HTML.find("div", id="search")
            MainClass = MainDiv.find("div").find("div").find("div").get("class")[0]
            # "ULSxyf" is an extra wrapper layer in some layouts; descend one
            # more div to reach the real per-result class.
            if MainClass == "ULSxyf":
                MainClass = MainDiv.find("div").find("div").find("div").find("div").get("class")[0]
            return MainClass
        except (AttributeError, IndexError, TypeError):
            # Fix: was a bare except. Any miss in the find() chain means the
            # layout changed; the caller treats None as "use the fallback path".
            return None

    def ParseResult(self, html: Tag) -> dict:
        """Parse one result container into {"title", "description", "url"}.

        Fields that cannot be recovered keep the sentinel string "NULL" (or
        whatever the fallback heuristics produce).
        """
        title = "NULL"
        Description = "NULL"
        url = "NULL"
        # Fix: hoisted out of the try block — the fallback path below also
        # uses it, and the original could hit a NameError when the primary
        # path failed before this assignment.
        hrefResults = html.find_all("a")
        try:
            SpanResults = html.find_all("span", attrs={"dir": "ltr"})
            title = SpanResults[0].text.replace("\xa0", "")
            # Index 9 is empirical for the current markup — TODO confirm.
            Description = SpanResults[9].text.replace("\xa0", "")
            if len(Description) < 15:
                # Too short to be a real snippet; force the fallback path.
                raise InterruptedError
            try:
                url = unquote(extract_url_from_string(hrefResults[0].get("data-cthref")))
            except Exception:
                url = unquote(extract_url_from_string(hrefResults[0].get("href")))
        except Exception:
            # Fallback: re-parse the fragment with lxml and use heuristics.
            parser = etree.HTMLParser()
            tree = etree.fromstring(str(html), parser)
            try:
                Description = get_span_with_most_text(tree)
            except Exception:
                pass
            try:
                title = html.find('h3').text
            except Exception:
                pass
            # Fix: the original called .get() on find()'s result directly and
            # raised AttributeError when no <cite class="apx8Vc"> existed,
            # which made the caller silently drop the whole result.
            cite = html.find('cite', class_='apx8Vc')
            url = cite.get("href") if cite is not None else None
            if url is None or not isFullUrl(url):
                # Fix: the original shadowed `url` with the anchor Tag itself;
                # the net effect (keep the last anchor's href) is preserved.
                for anchor in hrefResults:
                    url = anchor.get("href")
        return {
            "title": title,
            "description": Description,
            "url": url
        }

    def ParseResultsIfFailed(self, div, titleClass, descClass, linkClass):
        """Parse one result using the class names discovered by getMainClasses.

        When no description class is known, fall back to the span with the
        most text, then to the longest raw text node.
        """
        if descClass is not None:
            return {
                "title": div.find("span", class_=titleClass).text,
                "description": div.find_all("span", descClass)[1].text,
                "url": extract_url_from_string(div.find("a", class_=linkClass).get('href'))
            }
        try:
            # NOTE(review): bs4 Tags have no .xpath, so get_span_with_most_text
            # likely raises here and this branch always falls through — confirm.
            return {
                "title": div.find("span", class_=titleClass).text,
                "description": get_span_with_most_text(div),
                "url": extract_url_from_string(div.find("a", class_=linkClass).get('href'))
            }
        except Exception:
            # Fix: was a bare except. NOTE(review): getMostTextElement returns
            # a text *node*, not a joined string — confirm downstream handling.
            return {
                "title": div.find("span", class_=titleClass).text,
                "description": getMostTextElement(div),
                "url": extract_url_from_string(div.find("a", class_=linkClass).get('href'))
            }

    def getMainClasses(self, div: Tag):
        """Infer [titleClass, descriptionClass, linkClass] from one sample result div."""
        a = div.find("a")
        linkClass = a.get("class")
        titleClass = a.find("span").get("class")
        try:
            describtionClass = div.find("div").find("div").find("div").find_all(
                "div", recursive=False)[1].find_all("span")[-1].get("class")
        except (AttributeError, IndexError):
            # Fix: was a bare except; a miss anywhere in the chain means the
            # description class cannot be inferred.
            describtionClass = None
        return [titleClass, describtionClass, linkClass]

    def ParseAllResults(self, class_):
        """Parse every result on the page into {"result_N": {...}} form.

        With a known *class_*, each matching div goes through ParseResult;
        otherwise the page structure is probed and ParseResultsIfFailed is
        used on the fallback container's children.
        """
        Result = {}
        if class_ is not None:
            ResultsElements = self.HTML.find_all("div", class_=class_)
            for count, result in enumerate(ResultsElements, start=1):
                try:
                    Result[f"result_{count}"] = self.ParseResult(result)
                except Exception:
                    # Best-effort: skip results the parser cannot handle.
                    pass
            return Result
        mainResultsDiv = self.HTML.select_one("html > body > div:nth-of-type(2)")
        # Index 9 is empirical: the tenth nested div is used as a template to
        # discover the class names — TODO confirm against live markup.
        results = mainResultsDiv.find_all("div")[9]
        Classes = self.getMainClasses(results)
        # NOTE(review): this iterates mainResultsDiv's direct children
        # (including bare text nodes), not the result divs — confirm intent.
        for count, result in enumerate(mainResultsDiv, start=1):
            try:
                Result[f"result_{count}"] = self.ParseResultsIfFailed(result, *Classes)
            except Exception:
                print(traceback.format_exc())
        return Result
You get a cookie post-captcha solve, assign it to the requests session object, and it still doesn't work?
This was recently added and may help: https://github.com/opsdisk/yagooglesearch/blob/master/yagooglesearch/__init__.py#L157
You get a cookie post-captcha solve, assign it to the requests session object, and it still doesn't work?
This was recently added and may help: https://github.com/opsdisk/yagooglesearch/blob/master/yagooglesearch/__init__.py#L157
I'm not sure what the code at that URL has to do with my case, but what I was thinking is: could you add reCAPTCHA solving using Selenium to the library, to bypass the rate limit? Or alternatively, what fixes should I make in my code to handle the CAPTCHA correctly?
Hey @RequestSharp thanks for the clarification.
"you can add reCaptcha solving using selenium in the library" - Going to pass on doing that at this time.
"what fixes do i add to my code to handle the captcha" - I don't have any selenium experience, but...
1) take a look at using ipdb
to set breakpoints and inspect the values when it's running
2) This while loop with a conditional if/else and a break may not be acting as you expect...I can't test it, but the code doesn't smell right.
# CAPTCHA retry loop quoted from the issue discussion.
# NOTE(review): `isCap`, `driver`, `response`, `url` and `Recaptcha_Solver`
# are defined elsewhere in the poster's project; `driver` presumably is a
# Requestium session wrapping both requests and Selenium — confirm.
while isCap():
    print(response.response.text)
    # Copy the requests-session cookies into the Selenium driver, then load
    # the blocked page in the browser so the CAPTCHA can be rendered.
    driver.transfer_session_cookies_to_driver()
    driver.driver.get(url)
    print("Solving reCaptcha")
    Recaptcha_Solver(
        driver=driver.driver,
        debug=True
    ).solve_recaptcha()
    if isCap():
        # Still blocked: loop around and solve again.
        print("CAPTCHA AGAIN --------------------")
        continue
    else:
        # Solved: re-issue the request and leave the loop.
        response = driver.get(url)
        break
3) Looking at the docs, you may need to assign a variable (solver=
in the example) to the result of Recaptcha_Solver
You'll need to figure out a way to extract the cookie from the Selenium object and apply it to the requests.cookie
attribute...the code here may help: https://github.com/opsdisk/yagooglesearch/blob/master/yagooglesearch/__init__.py#L182
Closing this out since it's not an issue with yagooglesearch and I don't want to leverage selenium at this time.
Hey, I was working on a project that contains something like this. It uses Requestium, which ties the requests session object to the Selenium driver, so it can solve the reCAPTCHA (via the audio challenge, using selenium-recaptcha) and pass the cookies from the driver into the session so no block happens. However, I'm running into some problems with the cookies part, so if you're willing to help with that, it would be appreciated. Here is the main parsing code: