seleniumbase / SeleniumBase

📊 Blazing fast Python framework for web crawling, scraping, testing, and reporting. Supports pytest. Stealth abilities: UC Mode and CDP Mode.
https://seleniumbase.io
MIT License
5.4k stars 979 forks source link

Documenting the `selenium-wire` "Wire Mode" REPLACEMENT #3247

Open mdmintz opened 1 week ago

mdmintz commented 1 week ago

Documenting the selenium-wire "Wire Mode" REPLACEMENT.

As many of you know, there's a selenium-wire integration via SeleniumBase Wire Mode.

There are two main issues with it:

Here's the good news: selenium-wire features are included in the new SeleniumBase CDP Mode (a subset of UC Mode).

Here's an example of that (SeleniumBase/examples/cdp_mode/raw_res_sb.py), where network requests and responses are captured and displayed:

"""Using CDP.network.ResponseReceived and CDP.network.RequestWillBeSent."""
import colorama
import mycdp
import sys
from seleniumbase import SB

# Prefixes for the two banner styles (c1, c2) and the reset suffix (cr).
c1, c2, cr = (
    colorama.Fore.BLUE + colorama.Back.LIGHTYELLOW_EX,
    colorama.Fore.BLUE + colorama.Back.LIGHTGREEN_EX,
    colorama.Style.RESET_ALL,
)
# On Linux all three are cleared, so banners print without color codes.
if "linux" in sys.platform:
    c1, c2, cr = "", "", ""

async def send_handler(event: mycdp.network.RequestWillBeSent):
    """Print a banner, then the method, URL, and headers of a request.

    Args:
        event: The event whose ``request`` attribute (``method``, ``url``,
            ``headers``) is printed.
    """
    request = event.request
    lines = [f"{request.method} {request.url}"]
    # One tab-indented "name : value" line per header
    lines.extend(
        f"\t{name} : {value}" for name, value in request.headers.items()
    )
    print(c1 + "*** ==> RequestWillBeSent <== ***" + cr)
    print("\n".join(lines))

async def receive_handler(event: mycdp.network.ResponseReceived):
    """Print a banner followed by the ``response`` carried by the event."""
    banner = f"{c2}*** ==> ResponseReceived <== ***{cr}"
    print(banner)
    print(event.response)

# Start on a blank page, attach both network handlers, then open the
# calculator app and wait one second.
with SB(uc=True, test=True, locale_code="en") as sb:
    sb.activate_cdp_mode("about:blank")
    handlers = (
        (mycdp.network.RequestWillBeSent, send_handler),
        (mycdp.network.ResponseReceived, receive_handler),
    )
    for event_type, handler in handlers:
        sb.cdp.add_handler(event_type, handler)
    sb.cdp.open("https://seleniumbase.io/apps/calculator")
    sb.sleep(1)

Usage is different from regular selenium-wire, but it can do all the same things (and more) with better flexibility/control.

Here's another example (SeleniumBase/examples/cdp_mode/raw_req_sb.py), where specific requests are filtered out (intercepted and blocked) to prevent images from loading:

"""Using CDP.fetch.RequestPaused to filter content in real-time."""
import mycdp
from seleniumbase import SB

async def request_paused_handler(event, tab):
    """Let non-image requests continue and fail image requests.

    A request is treated as an image when its URL contains ".png", ".jpg",
    or ".gif" anywhere in the string (a substring match).

    Args:
        event: Paused-request event; ``event.request`` and
            ``event.request_id`` are read.
        tab: Object whose ``feed_cdp`` receives the continue/fail command.
    """
    request = event.request
    image_markers = (".png", ".jpg", ".gif")
    if not any(marker in request.url for marker in image_markers):
        # Let the data through
        tab.feed_cdp(
            mycdp.fetch.continue_request(request_id=event.request_id)
        )
        return
    # Block the data (images) by failing the request with TIMED_OUT
    reason = mycdp.network.ErrorReason.TIMED_OUT
    message = f"BLOCKING | {request.method} | {request.url}"
    print(f" >>> ------------\n{message}")
    tab.feed_cdp(mycdp.fetch.fail_request(event.request_id, reason))

# Start on a blank page, attach the paused-request handler, then open
# the image-heavy page and wait five seconds.
with SB(uc=True, test=True, locale_code="en") as sb:
    sb.activate_cdp_mode("about:blank")
    sb.cdp.add_handler(
        mycdp.fetch.RequestPaused,
        request_paused_handler,
    )
    sb.cdp.open("https://gettyimages.com/photos/firefly-2003-nathan")
    sb.sleep(5)

If people don't need the stealth features (or other improvements made to intercepting/handling network requests & responses), then they can continue using the existing Wire Mode as is. Or, if people want the upgrades, then they can use the new CDP Mode (as shown above).

mdmintz commented 2 days ago

Here's a new example (SeleniumBase/examples/cdp_mode/raw_xhr_sb.py) for XHR (requests and responses):

"""CDP.network.ResponseReceived with CDP.network.ResourceType.XHR."""
import ast
import asyncio
import colorama
import mycdp
import sys
import time
from seleniumbase.undetected import cdp_driver

# Shared state written by the handler that listenXHR() registers.
xhr_requests = []  # [response url, request id] pairs
last_xhr_request = None  # time.time() of the latest XHR response, if any

# Prefixes for the two banner styles (c1, c2) and the reset suffix (cr).
c1, c2, cr = (
    colorama.Fore.BLUE + colorama.Back.LIGHTYELLOW_EX,
    colorama.Fore.BLUE + colorama.Back.LIGHTGREEN_EX,
    colorama.Style.RESET_ALL,
)
# On Linux all three are cleared, so banners print without color codes.
if "linux" in sys.platform:
    c1, c2, cr = "", "", ""

def listenXHR(page):
    """Register a ResponseReceived handler on ``page`` that records XHRs.

    For each event whose ``type_`` is ``ResourceType.XHR``, the handler
    appends ``[response url, request id]`` to the module-level
    ``xhr_requests`` list and stores the current ``time.time()`` in the
    module-level ``last_xhr_request``.

    Args:
        page: Object exposing ``add_handler(event_type, callback)``.
    """
    async def handler(evt):
        global last_xhr_request
        # Only AJAX (XHR) responses are of interest
        if evt.type_ is not mycdp.network.ResourceType.XHR:
            return
        xhr_requests.append([evt.response.url, evt.request_id])
        last_xhr_request = time.time()

    page.add_handler(mycdp.network.ResponseReceived, handler)

async def receiveXHR(page, requests):
    """Wait for XHR traffic to go quiet, then fetch each response body.

    The wait is skipped when ``last_xhr_request`` is None, and gives up
    after more than ``max_retries`` pauses.

    Args:
        page: Object that is awaited once and whose ``send`` method is
            used to issue ``get_response_body`` commands.
        requests: Iterable of ``[url, request_id]`` pairs.

    Returns:
        A list of dicts with keys "url", "body", and "is_base64", one per
        request for which a non-None result was returned. Requests whose
        lookup raises are reported via ``print`` and skipped.
    """
    responses = []
    retries = 0
    max_retries = 5
    quiet_seconds = 2
    # Wait at least 2 seconds after last XHR request for more
    while last_xhr_request is not None and retries <= max_retries:
        if time.time() - last_xhr_request > quiet_seconds:
            break
        retries += 1
        # Use asyncio.sleep rather than time.sleep: a blocking sleep inside
        # a coroutine stalls the event loop, so handlers scheduled on this
        # loop could never update last_xhr_request while we wait.
        await asyncio.sleep(quiet_seconds)
    await page
    # Loop through gathered requests and get response body
    for request in requests:
        try:
            res = await page.send(mycdp.network.get_response_body(request[1]))
            if res is None:
                continue
            # presumably res is (body, base64_encoded) -- TODO confirm
            responses.append({
                "url": request[0],
                "body": res[0],
                "is_base64": res[1],
            })
        except Exception as e:
            # Best effort: report the failure and move on to the next one
            print("Error getting response:", e)
    return responses

async def crawl():
    """Load resttesttest.com, click its AJAX button, and print XHR results.

    Starts a CDP driver, attaches the XHR listener on a blank tab,
    navigates to the test site, clicks ``button#submitajax``, then gathers
    the recorded XHR responses via ``receiveXHR`` and prints, for each one,
    its URL and either its parsed "headers" entry or its raw body.
    """
    driver = await cdp_driver.cdp_util.start_async()
    tab = await driver.get("about:blank")
    listenXHR(tab)

    # Change url to something that makes ajax requests
    tab = await driver.get("https://resttesttest.com/")
    # Pause with asyncio.sleep rather than time.sleep: a blocking sleep
    # inside a coroutine stalls the event loop, so handlers scheduled on
    # this loop could not run during the pause.
    await asyncio.sleep(1)
    # Click AJAX button on https://resttesttest.com/
    element = await tab.select("button#submitajax")
    await element.click_async()
    await asyncio.sleep(2)

    xhr_responses = await receiveXHR(tab, xhr_requests)
    for response in xhr_responses:
        print(c1 + "*** ==> XHR Request URL <== ***" + cr)
        print(f'{response["url"]}')
        is_base64 = response["is_base64"]
        b64_data = "Base64 encoded data"
        try:
            # Bodies that ast.literal_eval cannot parse, or that have no
            # "headers" entry, fall through to the raw-body branch below.
            headers = ast.literal_eval(response["body"])["headers"]
            print(c2 + "*** ==> XHR Response Headers <== ***" + cr)
            print(headers if not is_base64 else b64_data)
        except Exception:
            response_body = response["body"]
            print(c2 + "*** ==> XHR Response Body <== ***" + cr)
            print(response_body if not is_base64 else b64_data)

if __name__ == "__main__":
    print("================= Starting =================")
    # Run the crawl coroutine to completion on a freshly created loop
    asyncio.new_event_loop().run_until_complete(crawl())