psf / requests-html

Pythonic HTML Parsing for Humans™
http://html.python-requests.org
MIT License
13.64k stars 977 forks source link

A Browser closed issue #549

Open defaul0t opened 1 year ago

defaul0t commented 1 year ago

Unhandled error: Browser closed unexpectedly:

Unhandled error: Browser closed unexpectedly:

My code:

from asyncio import events import uvloop import requests import asyncio, time import re import argparse import sys import threading from requests_html import AsyncHTMLSession, HTMLSession import urllib3 from pyppeteer import launch import hashlib import os

# Suppress TLS certificate warnings, since every request below uses verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Request headers sent with every probe.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
    'Content-Encoding': 'gzip',
}

# Install uvloop as the event-loop implementation for all asyncio loops.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

def get_url(url_txt): with open(url_txt, "r") as f: s = f.readlines() lt = [i.strip() for i in s] return lt

def output_data(i, out_name): with open(out_name, "a", encoding='utf-8') as f: f.write(i + "\n")

def get_md5_value(src): myMd5 = hashlib.md5() myMd5.update(src.encode("utf8")) myMd5_Digest = myMd5.hexdigest() return myMd5_Digest

async def process_data(sem, s, i, None_data_list):
    """Fetch url *i* with session *s*, render its JavaScript, and append a
    summary row [status, title, length, body_md5, url] to *None_data_list*.

    Concurrency is bounded by the semaphore *sem*. Errors are printed and the
    URL is skipped; task cancellation is re-raised so shutdown still works.
    """
    async with sem:
        try:
            r = await s.get(url=i, timeout=30, headers=headers, verify=False)

            # Render JavaScript in the headless browser before inspecting the HTML.
            await r.html.arender(wait=30, sleep=30, timeout=30, retries=1)
            content_length = len(r.content)
            code = r.status_code
            # Strip all whitespace so the MD5 is stable across formatting changes.
            content = r.html.html.replace('\r', '').replace('\n', '').replace(' ', '')
            body_md5 = get_md5_value(str(content))

            if '<title>' in content:
                title = re.findall('(?<=<title>)(.+?)(?=</title>)', content)[0]
            elif r.html.find('title', first=True):
                title = r.html.find('title', first=True).text
            else:
                title = 'None'
                output_data(i, 'real_None.txt')
            print(f'{i} {r.status_code}, {title}')
            data = [str(code), str(title), str(content_length), body_md5, str(i)]
            None_data_list.append(data)

        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
        except asyncio.CancelledError:
            # Let cancellation propagate instead of swallowing it.
            raise
        except Exception as e:
            # Narrowed from BaseException: KeyboardInterrupt/SystemExit must
            # still be able to terminate the program.
            print(f"Unhandled error: {e}")

async def start_up(urls, None_data_list, timeout_duration=3000): s = AsyncHTMLSession(verify=False) sem = asyncio.Semaphore(3) tasks = (process_data(sem, s, url, None_data_list) for url in urls) await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout_duration) await s.close()

def main(urls): None_data_list = [] try: start = time.perf_counter() print(urls) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) loop.run_until_complete(start_up(urls, None_data_list)) end = time.perf_counter() print(f'None_Scan : {end - start} ') output_data(str(end - start), 'debug_time.txt') print('') except asyncio.TimeoutError: print("Timeout occurred") except Exception as e: print(e) finally: print(len(None_data_list)) os.system('pkill -f -9 chrome') return None_data_list

# test.py — driver script: scan two sample hosts and push the results upstream.
new_request_None_url = ['http://bi-mokadisplay.tcl.com:83', 'http://tmsa.cmp.tcl.com:88']

update_data_list = nonetitle_info.main(new_request_None_url)

print(update_data_list)

data_info.none_update(False, update_data_list)

aehlke commented 1 year ago

Did you figure it out?

ajatkj commented 11 months ago

This project uses pyppeteer, which uses a very old version of Chromium. This is easily fixable. You can check my comment on another issue here. Let me know if this helps.

cboin1996 commented 2 months ago

I forked this project and updated it to use playwright. see: #573