omkarcloud / botasaurus

The All in One Framework to build Awesome Scrapers.
https://www.omkar.cloud/botasaurus/
MIT License
1.36k stars 124 forks source link

šŸ™ Guidance on how to integrate Botasaurus in an existing project #56

Closed life-Nd closed 7 months ago

life-Nd commented 8 months ago

This is a great project but i am having issues integrating it in my existing code. I was previously using the UndetectedChromeDriver and would like to replace it with Botasaurus. The goals are to handle sign-in, get user profiles and complete some user flow (fill forms, upload documents and click buttons). I have created classes to easily integrate each part in the program. Here is the code for the helper class

import subprocess
import os
from pathlib import Path
import logging
# from os import path
# import random
from time import sleep
# import undetected_chromedriver as uc
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager

# from Tools.Bot.chrome_launcher_adapter import ChromeLauncherAdapter
# from Tools.Bot.create_stealth_driver import create_stealth_driver
from Tools.Bot.chrome_launcher_adapter import ChromeLauncherAdapter

from Tools.Bot.create_stealth_driver import create_stealth_driver
from selenium.webdriver.chrome.options import Options
from chromedriver_autoinstaller import install

from botasaurus import *
# from botasaurus_proxy_authentication import add_proxy_options

logger = logging.getLogger()
# COPIED FROM chrome-launcher code (https://github.com/GoogleChrome/chrome-launcher/blob/main/src/flags.ts), Mostly same but the extensions, media devices etc are not disabled to avoid detection
DEFAULT_FLAGS = [
    #   safe browsing service, upgrade detector, translate, UMA
    "--disable-background-networking",
    # Don't update the browser 'components' listed at chrome://components/
    "--disable-component-update",
    # Disables client-side phishing detection.
    "--disable-client-side-phishing-detection",
    # Disable syncing to a Google account
    "--disable-sync",
    # Disable reporting to UMA, but allows for collection
    "--metrics-recording-only",
    # Disable installation of default apps on first run
    "--disable-default-apps",
    # Disable the default browser check, do not prompt to set it as such
    "--no-default-browser-check",
    # Skip first run wizards
    "--no-first-run",
    # Disable backgrounding renders for occluded windows
    "--disable-backgrounding-occluded-windows",
    # Disable renderer process backgrounding
    "--disable-renderer-backgrounding",
    # Disable task throttling of timer tasks from background pages.
    "--disable-background-timer-throttling",
    # Disable the default throttling of IPC between renderer & browser processes.
    "--disable-ipc-flooding-protection",
    # Avoid potential instability of using Gnome Keyring or KDE wallet. crbug.com/571003 crbug.com/991424
    "--password-store=basic",
    # Use mock keychain on Mac to prevent blocking permissions dialogs
    "--use-mock-keychain",
    # Disable background tracing (aka slow reports & deep reports) to avoid 'Tracing already started'
    "--force-fieldtrials=*BackgroundTracing/default/",
    # Suppresses hang monitor dialogs in renderer processes. This flag may allow slow unload handlers on a page to prevent the tab from closing.
    "--disable-hang-monitor",
    # Reloading a page that came from a POST normally prompts the user.
    "--disable-prompt-on-repost",
    # Disables Domain Reliability Monitoring, which tracks whether the browser has difficulty contacting Google-owned sites and uploads reports to Google.
    "--disable-domain-reliability",
]

class BotasaurusChromeHandler:
    def __init__(self):
        print("šŸ’” ChromeHandler init")
        sleep(5)
        self._driver = self.launch_chrome("https://ca.yahoo.com/?p=us", [])
        create_stealth_driver()
        print("āœ… UndetectedChromeHandler launched āž”ļø (šŸŒˆ Google.com)")

    def driver(self): 
        return self._driver

    # @browser(profile='Profile 1',)
    def launch_chrome(self,start_url, additional_args):
        # Set Chrome options
        chrome_options = Options(
            # headless=True,
            # add_argument(r"--user-data-dir=/Users/lifen/Library/Application Support/Google/Chrome/Profile 1"),
        )
        chrome_options.add_argument("--remote-debugging-port=9222")
        # chrome_options.add_argument("--no-sandbox")
        # chrome_options.add_argument("--disable-gpu")
        # chrome_options.add_argument("--disable-extensions")
        # chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--user-data-dir=/Users/lifen/Library/Application Support/Google/Chrome/Profile 1")
        # add_proxy_options(chrome_options)

        unique_flags = list(dict.fromkeys(DEFAULT_FLAGS + additional_args))

        kwargs = {
            "ignoreDefaultFlags": True,
            "chromeFlags": unique_flags,

            "userDataDir": "/Users/MacUser/Library/Application Support/Google/Chrome/Profile 1",

            "port": 9222,
            "headless": False,
            "autoClose": True,

        }

        if start_url:
            kwargs["startingUrl"] = start_url

        instance = ChromeLauncherAdapter.launch(**kwargs)
        return instance

Where the code is used:

import re
import logging
import random
from time import sleep
from configs.configs_model import ConfigsModel
from helpers.jobs_sql import JobsSQL
from helpers.html_page_handler import HTMLPageHandler
from helpers.shared import notification
from models.job_listing import JobListingModel

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.remote.webelement import WebElement

from helpers.botasaurus_chrome_handler import BotasaurusChromeHandler

from botasaurus import *

logger = logging.getLogger()

class IndeedChromeApplier:

    def __init__(self, jobs_sql: JobsSQL, jobs: list):
        print(f"šŸ’” IndeedChromeApplier init ")
        self.jobs = jobs
        self.chrome = BotasaurusChromeHandler()
        # self.chrome.driver().maximize_window()
        driver = bt.create_driver()
        self.driver = driver
        self.page = HTMLPageHandler(driver=driver)
        self.jobs_sql = jobs_sql

    def get_uid(self):
        configs = ConfigsModel()
        uid = configs.user_id
        return uid

    # @browser
    def check_auth(self):
        # driver = self.chrome.driver()
        driver = self.driver
        driver.get("https://profile.indeed.com/")
        sleep(2)
        url = driver.current_url
        substring = "secure"
        print(f"šŸŸ¢ šŸ”“ {url=}")
        if substring in url:
            print("āŒ Not Logged in")
            # Get input of the user to try again after he logs in
            notification(
                message="Please log in to Indeed.com and try again (y/n): ")
            _input = input("Please log in to Indeed.com and try again (y/n): ")
            _input: str = "" + _input
            if _input.lower().__contains__("y"):
                return self.check_auth()
            elif _input.lower().__contains__("n"):
                return False
            else:
                sleep(20000)
        elif "profile.indeed.com" in url:
            print("āœ… Logged in")
            return True

    def answer_questions(self):
        # Define a WebDriverWait with a timeout of 10 seconds
        wait = WebDriverWait(self.chrome.driver(), 10)

        # Wait for the radio button for commuting/relocation to be clickable and select it
        try:
            commute_option: WebElement = wait.until(
                EC.element_to_be_clickable(
                    (
                        By.XPATH,
                        "//label[@for='input-q_38d8e685bb4b5228c2494ac85bc44d69-0']",
                    )
                )
            )
            commute_option.click()
            sleep(random.uniform(0.7, 2.2))
        except TimeoutException:
            print("Failed to find or click the commute option.")

    def replace_resume(self, job_title):
        print("āÆļø  replace_resume")
        is_upload_resume = (
            "Upload or build a resume for this application"
            in self.chrome.driver().title
        )
        paths = self.get_paths()
        if is_upload_resume:
            print("āœ… is_upload")
            # Find the "Replace" link using the full link text
            replace_link = self.page.try_find_element(
                driver=self.chrome.driver(),
                name="Replace",
                by=By.CSS_SELECTOR,
                value='[data-testid="ResumeFileInfoCardReplaceButton-button"]',
            )
            sleep(1)
            if replace_link:
                print("āœ… replace_link")

                sleep(1)
                # Find the file input element
                file_input: WebElement = WebDriverWait(self.chrome.driver(), 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'input[type="file"]')
                    )
                )

                # Send the file path to the file input element
                file_input.send_keys(
                    f"{paths.output_resumes_pdf_dir}/RalphNduwimana-{job_title}.pdf"
                )
                sleep(random.uniform(0.9, 1.8))
                # self.page.click_to_next_page(name="Continue",by=By.CLASS_NAME,value='ia-continueButton ia-Resume-continue css-vw73h2 e8ju0x51')
                notification(message=f"Resume replaced by {job_title}")
                self.page.click_to_go_to_page(
                    name="Continue",
                    by=By.XPATH,
                    value="//div[contains(text(), 'Continue')]",
                )

    def submit_application(self):
        print("āÆļø  review_application")
        notification(message="Reviewing application")
        sleep(1.7)
        notification(message="No cover letter required!")

        submit = self.page.click_to_go_to_page(
            name="Submit your application",
            by=By.XPATH,
            value="//button[contains(@class, 'ia-continueButton')]",
        )
        if submit:
            notification("Application Submitted")
        else:
            notification("Application Submitted", code=0)

        # submit_application_button.click()
        # Wait for 2 seconds for the submission to be completed
        sleep(2)

        # Check if the page contains "Application Submitted"
        application_submitted = (
            "Application Submitted" in self.chrome.driver().page_source
        )
        # Check if the submission was completed and return True if "Application Submitted" was found
        if application_submitted:
            notification("Application submitted successfully!")
            return True
        else:
            print("Application submission failed.")
            return False
    def click_button(self):
        # Logic to click on buttons 
        pass

    def type_text(self):
        # Logic to click on buttons 
        pass

    def run(self):
        print("āÆļø  IndeedChromeApplier run")
        driver = self.chrome.driver()
        authenticated = self.check_auth()
        jobs_row = self.jobs_sql.load_jobs_by_status(query_status="Generated")
        jobs_data = [job_row for job_row in jobs_row]
        print(f'āœ… āœ… {str(jobs_data)[0:200]}')

        if authenticated:
            for data in jobs_data:
                if not data:
                    print(f'šŸš« No Data in jobs_data')
                job_data = self.convert_tuple_to_dict(data)
                job = JobListingModel(job_data)
                url = job.jobUrl
                print(f'āœ… āœ… āœ… āœ… {job.jobUrl}')
                page_loaded = self.page.go_to_page(url)
                if not page_loaded:
                    print(f"šŸš« {url} not loaded")
                    # continue

                if page_loaded:
                    print('āœ… page_loaded')

                    application_started = self.page.click_to_go_to_page(
                        name="Apply",
                        by=By.ID,
                        value="indeedApplyButton",
                    )
                    data = re.search(
                        "This job has expired on Indeed",
                        driver.page_source,
                    )
                    # Get True of False
                    expired = data is not None
                    print(f"šŸ“• {expired=}")
                    # sleep(10000)
                    sleep(random.uniform(0.2, 0.5))
                    if not application_started:
                        print("šŸš« Application not started")
                        sleep(1000)
                    if "indeed" not in driver.current_url:
                        print("Cannot apply on company websites (just indeed.com)")
                        sleep(10000)

                    pages = {
                        "questions": False,
                        "resume": False,
                        "review": False,
                        "work-experience": False,
                        "submitted": False,
                    }

                    try:
                        # there is a page that has not been completed
                        while (
                            False
                            in pages.values()
                        ):
                            print('')

                    except NoSuchElementException:
                        print(
                            f"āŒ Failed to get page ")

    def log_in(self, username, password):
        print(f"āÆļø  Starting log_in {username} {password}")
        page = self.page
        try:
            username_bar = page.try_find_element(
                name="username_bar",
                by=By.ID,
                value="session_key",
                driver=self.driver,
            )
            assert username_bar is not None
            username_bar.send_keys(f"{username}")
            password_bar = page.try_find_element(
                name="password_bar", by=By.ID, value="session_password", driver=self.chrome.driver()
            )
            assert password_bar is not None
            password_bar.send_keys(f"{password}")
            password_bar.send_keys(Keys.ENTER)
            print("āœ… User logged-in")
        except NoSuchElementException:
            print("No such element found")
        except Exception:
            print("Other exception")
        print(f"ā¹ļø  Finished log_in {username} {password}")

    def log_out(self):
        url = self.chrome.driver().current_url
        print(f"āÆļø  Starting log_out from {url}")
        xpath = (
            "/html/body/div[5]/header/div/nav/ul/li[6]/div/button"
            if "Home" in url
            else "/html/body/header/div/div[2]/div/div/button"
        )
        page = self.page
        icon_button = page.try_find_element(
            driver=self.chrome.driver(),
            name="Log-Out",
            by=By.XPATH,
            value=xpath,
            element_type="button",
        )
        try:
            print(f"{icon_button=}")
            try:
                sign_out_option: WebElement = WebDriverWait(
                    self.chrome.driver(), 10
                ).until(EC.presence_of_element_located((By.LINK_TEXT, "Sign Out")))
                sign_out_option.click()
                print("āœ… User logged-out")
            except:
                print(f"Sign Out not found ")
        except:
            print("Avatar button not found")
        print(f"ā¹ļø  Finished log_out from {url}")

I would appreciate any guidance on how to integrate Botasaurus features in my code. Thanks in advance!!!

Chetan11-dev commented 7 months ago

We do not provide dedicated support for individual problems. We recommend creating a detailed issue on Stack Overflow or in the /r/webscraping/ subreddit on Reddit, where the community can assist you. We hope you understand.