James-LG / Skyscraper

Rust library for scraping HTML using XPath expressions
MIT License
30 stars 3 forks source link

How to get all values? #35

Closed DioxusGrow closed 3 weeks ago

DioxusGrow commented 3 weeks ago

XPath finds either one value or multiple values. I don’t understand from the documentation how to find multiple values. Maybe there could be a way to simplify the syntax

and to expand the number of examples, similar to the XPath query packages for Go? It would also be great to be able to put all found (findAll) values into a vector, so that several found vectors can then be iterated over to build structures.

Parser example on golang

package main

import (
    "some/models"
    "bufio"
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "os"
    "time"

    "github.com/antchfx/htmlquery"
    "golang.org/x/net/html"
    "golang.org/x/net/html/charset"
)

func main() {

    // Accumulates one Post per matched headline node.
    NewsPosts := make([]models.Post, 0)

    client := &http.Client{
        Timeout: 30 * time.Second,
    }

    // Make request. A failed request must abort here: on error `resp` is nil,
    // so falling through (as the original did) would nil-deref on
    // resp.Body.Close().
    resp, err := client.Get("https://somesite.com")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // Decode the body using the charset declared in the Content-Type header.
    r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
    if err != nil {
        log.Fatal(err)
    }
    doc, err := html.Parse(r)
    if err != nil {
        log.Fatal(err)
    }

    // xpath expressions — one node list per field of a Post.
    title := htmlquery.Find(doc, "//div[@class=\"feed-layout-main\"]//h3")
    text := htmlquery.Find(doc, "//div[@class=\"feed-layout-main\"]//a/div[@data-bn-type=\"text\"]")
    url := htmlquery.Find(doc, "//div[@class=\"feed-layout-main\"]//a[@style=\"display:block;margin-bottom:8px\"]/@href")

    fmt.Printf("Len title - %d", len(title))

    // Pair the three result lists by index, fetching values with InnerText.
    // Guard against ragged lists: if text/url matched fewer nodes than title,
    // indexing text[i]/url[i] would panic with index out of range.
    for i, n := range title {
        if i >= len(text) || i >= len(url) {
            log.Printf("skipping title %d: no matching text/url node", i)
            continue
        }
        post := models.NewPost(htmlquery.InnerText(n), htmlquery.InnerText(text[i]), htmlquery.InnerText(url[i]))
        NewsPosts = append(NewsPosts, *post)
    }

    b, err := json.Marshal(NewsPosts)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println(string(b))

    // Append the JSON line to posts.json, creating the file if needed.
    file, err := os.OpenFile("posts.json", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)

    if err != nil {
        log.Fatalf("failed creating file: %s", err)
    }
    defer file.Close()

    datawriter := bufio.NewWriter(file)

    if _, err := datawriter.WriteString(string(b) + "\n"); err != nil {
        log.Printf("write failed: %s", err)
    }

    // Flush before the deferred Close so buffered bytes are not lost.
    if err := datawriter.Flush(); err != nil {
        log.Printf("flush failed: %s", err)
    }
}
DioxusGrow commented 3 weeks ago

(Attached: output.json.) I have created an example of a similar parser using your library. It is operational and tested, but there is an error when retrieving the data for the `source` field of `Article`: about 30–40% of the data is missing.

[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.6.4"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks"] }
tokio = { version = "1", features = ["full"] }
use reqwest;
use serde::{Deserialize, Serialize};
use skyscraper::html::{self, trim_internal_whitespace};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;

/// One scraped news item; the whole `Vec<Article>` is serialized to
/// JSON and written to output.json.
#[derive(Serialize, Deserialize, Debug)]
struct Article {
    /// Headline text, taken from the matched `h3` node.
    title: String,
    /// Source/attribution text, whitespace-collapsed via
    /// `trim_internal_whitespace` before storing.
    source: String,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Parsed articles, serialized to output.json at the end.
    let mut articles: Vec<Article> = vec![];

    // Fetch the page body as text.
    let res = reqwest::get("https://finance.yahoo.com/?guccounter=1")
        .await?
        .text()
        .await?;

    // Parse the HTML text
    let document = html::parse(&res)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // Assuming your XPath string is static, it is safe to use `expect` during parsing
    let title = xpath::parse("//div[@class=\"content svelte-w27v8j\"]/a/h3")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;
    let article = xpath::parse("//div[@class=\"content svelte-w27v8j\"]/div/div")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;

    // Pair titles with sources positionally. `zip` stops at the shorter of
    // the two lists, so a count mismatch between the two XPath results no
    // longer aborts the program (the original
    // `arcl.next().expect("Doesn't match")` panicked in that case).
    for (title_item, article_item) in title.iter().zip(article.iter()) {
        let src = article_item
            .extract_as_node()
            .extract_as_tree_node()
            .text(&xpath_item_tree)
            .unwrap();
        let article = Article {
            title: title_item
                .extract_as_node()
                .extract_as_tree_node()
                .text(&xpath_item_tree)
                .unwrap(),
            // Collapse runs of internal whitespace in the scraped text.
            source: trim_internal_whitespace(&src),
        };
        articles.push(article);
    }

    // Serialize it to a JSON string.
    let posts = serde_json::to_string(&articles)?;

    let mut file = File::create("output.json")?;
    file.write_all(posts.as_bytes())?;

    Ok(())
}
James-LG commented 3 weeks ago

Can you elaborate on "About 30-40% of the data is missing."?

Skyscraper and lxml both return 41 items for your first expression //div[@class=\"content svelte-w27v8j\"]/a/h3, and 45 items for your second expression //div[@class=\"content svelte-w27v8j\"]/div/div.

DioxusGrow commented 3 weeks ago

I tested, everything is fine. My mistake was somewhere in the xpath.

use reqwest;
use serde::{Deserialize, Serialize};
use skyscraper::html;
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;

/// Wrapper around the text of one XPath match, so the collected
/// results can be serialized to JSON for inspection.
#[derive(Serialize, Deserialize, Debug)]
struct TestXpath {
    /// Text content of a single matched node.
    result: String,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Download the page body as text.
    let res = reqwest::get("https://finance.yahoo.com/?guccounter=1")
        .await?
        .text()
        .await?;

    // Parse the HTML text
    let document = html::parse(&res)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // Assuming your XPath string is static, it is safe to use `expect` during parsing
    // let title = xpath::parse("//div[@class=\"content svelte-w27v8j\"]/a/h3")
    let test_xpath = xpath::parse("//*[contains(@class, \"svelte-6i0owd\")]/li//h3")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;

    // Map every matched node to its text content, collecting the results.
    let queries: Vec<TestXpath> = test_xpath
        .iter()
        .map(|item| TestXpath {
            result: item
                .extract_as_node()
                .extract_as_tree_node()
                .text(&xpath_item_tree)
                .unwrap(),
        })
        .collect();

    // Serialize it to a JSON string.
    let test_query = serde_json::to_string(&queries)?;

    let mut file = File::create("output.json")?;
    file.write_all(test_query.as_bytes())?;

    Ok(())
}