apify / crawlee-python

Crawlee—A web scraping and browser automation library for Python to build reliable crawlers. Extract data for AI, LLMs, RAG, or GPTs. Download HTML, PDF, JPG, PNG, and other files from websites. Works with BeautifulSoup, Playwright, and raw HTTP. Both headful and headless mode. With proxy rotation.
https://crawlee.dev/python/
Apache License 2.0
4.64k stars 319 forks source link

How to scrape TEMU #689

Closed KalvinThien closed 1 week ago

KalvinThien commented 1 week ago

I'm trying to scrape product information from TEMU (https://www.temu.com/), but the title is never found. This is the output I get: Processing https://www.temu.com/vn-en/2--car--------universal--sun----pvc---accessories-----g-601099650626830.html ... Extracted Data: Title: No title found.

image

`import sys import asyncio import time from PyQt6.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QWidget, QPushButton, QTextEdit, QLineEdit, QLabel from PyQt6.QtCore import QThread, pyqtSignal from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

class CrawlerThread(QThread):
    """Crawl a single URL with Playwright on a worker thread.

    The thread runs its own asyncio event loop (see ``run``), visits
    ``url`` with a ``PlaywrightCrawler`` limited to one request, extracts the
    text of the first element matching ``title_selector`` and reports
    progress, data and statistics through the string signals below.

    Args:
        url: Page to crawl.
        title_selector: CSS selector of the element holding the title.
            Defaults to the class name used by the original script.
        title_timeout: Maximum number of seconds to wait for the title
            element to appear before reporting "No title found.".
    """

    # Each signal carries one string payload.
    log_signal = pyqtSignal(str)
    data_signal = pyqtSignal(str)
    runtime_signal = pyqtSignal(str)

    # Seconds between two checks for the title element while waiting.
    _POLL_INTERVAL = 0.25

    def __init__(self, url, title_selector="._2rn4tqXP", title_timeout=10.0):
        super().__init__()
        self.url = url
        self.title_selector = title_selector
        self.title_timeout = title_timeout
        self.request_count = 0
        self.failed_requests = 0
        self.total_duration = 0

    async def _read_title(self, page):
        """Return the title text, or "No title found." if it never appears."""
        locator = page.locator(self.title_selector)
        deadline = time.monotonic() + self.title_timeout

        # The element may be inserted by client-side scripts after the
        # handler starts, so poll for a bounded time instead of checking
        # only once.
        while await locator.count() == 0:
            if time.monotonic() >= deadline:
                return "No title found."
            await asyncio.sleep(self._POLL_INTERVAL)

        # `.first` avoids Playwright's strict-mode error when the selector
        # matches more than one element.
        return await locator.first.inner_text()

    async def run_crawler(self):
        """Run the crawl and emit log, data and runtime-statistics signals."""
        crawler = PlaywrightCrawler(max_requests_per_crawl=1)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext):
            self.log_signal.emit(f"Processing {context.request.url} ...")
            started = time.perf_counter()

            try:
                title = await self._read_title(context.page)
            except Exception as e:
                # Handler boundary: report the failure to the GUI instead of
                # letting it propagate out of the handler.
                self.failed_requests += 1
                self.log_signal.emit(f"Error: {e}")
                return

            self.request_count += 1
            self.total_duration += time.perf_counter() - started
            self.data_signal.emit(f"Title: {title}\n")

        # Measured around the whole run so "Total Runtime" reflects the full
        # crawl, not only the time spent inside the handler.
        run_started = time.perf_counter()
        await crawler.run([self.url])
        total_runtime = time.perf_counter() - run_started

        if self.request_count > 0:
            average_duration = self.total_duration / self.request_count
        else:
            average_duration = 0

        runtime_stats = (
            f"Requests Finished: {self.request_count}\n"
            f"Requests Failed: {self.failed_requests}\n"
            f"Average Request Duration: {average_duration:.2f} seconds\n"
            f"Total Runtime: {total_runtime:.2f} seconds"
        )
        self.runtime_signal.emit(runtime_stats)

    def run(self):
        """Thread entry point: drive ``run_crawler`` on a new event loop."""
        asyncio.run(self.run_crawler())

class MainWindow(QMainWindow):
    """Main window: URL input, start button, output log and runtime label."""

    def __init__(self):
        # Must be `__init__` (not `init`) so the widgets are actually built
        # when the window is constructed.
        super().__init__()
        self.setWindowTitle("Web Data Crawler")
        self.setGeometry(100, 100, 800, 600)

        # Reference to the active worker; kept on the instance so the thread
        # object is not garbage-collected while it runs.
        self.crawler_thread = None

        # Widgets
        self.url_input = QLineEdit()
        self.url_input.setPlaceholderText("Enter URL here")
        self.start_button = QPushButton("Start Crawling")
        self.output_area = QTextEdit()
        self.output_area.setReadOnly(True)
        self.runtime_label = QLabel("Runtime Statistics:")

        # Layout
        layout = QVBoxLayout()
        layout.addWidget(self.url_input)
        layout.addWidget(self.start_button)
        layout.addWidget(self.output_area)
        layout.addWidget(self.runtime_label)
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Connections
        self.start_button.clicked.connect(self.start_crawling)

    def start_crawling(self):
        """Validate the URL and start a crawl on a background thread."""
        # Replacing a QThread that is still running would destroy it
        # mid-run, so ignore the request until the current crawl finishes.
        if self.crawler_thread is not None and self.crawler_thread.isRunning():
            return

        url = self.url_input.text().strip()
        if not url:
            self.output_area.setText("Please enter a valid URL.")
            return

        self.output_area.clear()
        self.start_button.setEnabled(False)

        # Run the crawler in a separate thread
        self.crawler_thread = CrawlerThread(url)
        self.crawler_thread.log_signal.connect(self.update_output)
        self.crawler_thread.data_signal.connect(self.display_data)
        self.crawler_thread.runtime_signal.connect(self.display_runtime)
        self.crawler_thread.finished.connect(self._on_crawl_finished)
        self.crawler_thread.start()

    def _on_crawl_finished(self):
        """Re-enable the start button once the worker thread has ended."""
        self.start_button.setEnabled(True)

    def update_output(self, text):
        """Append a log line to the output area."""
        self.output_area.append(text)

    def display_data(self, data):
        """Append extracted data to the output area."""
        self.output_area.append("Extracted Data:\n" + data)

    def display_runtime(self, runtime):
        """Show the runtime statistics in the label."""
        self.runtime_label.setText("Runtime Statistics:\n" + runtime)

# Start the GUI only when the file is executed as a script.
if __name__ == "__main__":
    app = QApplication(sys.argv)
    window = MainWindow()
    window.show()
    # Propagate the Qt event loop's exit code to the process.
    sys.exit(app.exec())

`

I've tried, but every attempt returns no result. Can someone help me?