James-LG / Skyscraper

Rust library for scraping HTML using XPath expressions
MIT License
30 stars 3 forks source link

Retrieving tag attributes with xpath in 0.7.0-beta2 #44

Open DioxusGrow opened 6 days ago

DioxusGrow commented 6 days ago
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.7.0-beta.1"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
scraper = "0.19.0"
use skyscraper::html;
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;

/// Downloads the Binance Square news page and prints the text of one node
/// matched by the article-text XPath expression.
///
/// # Errors
///
/// Returns an error if the HTTP request fails, the HTML cannot be parsed, the
/// XPath expression cannot be applied, the expression matches too few items,
/// or the selected node has no text.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Zero-based position of the matched item whose text is printed.
    const ITEM_INDEX: usize = 18;

    let client = reqwest::Client::new();
    let response = client
        .get("https://www.binance.com/en/square/news/all")
        .send()
        .await?;

    let document = html::parse(&response.text().await?)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath strings are static, so a parse failure is a programming error
    // and `expect` is acceptable here.
    // The title and url expressions are parsed only to validate them; they are
    // not applied in this reproduction.
    let _title = xpath::parse("//h3").expect("xpath title is invalid");
    let text_xpath =
        xpath::parse("//div[@class=\"css-1wtm1ek\"]//a/div").expect("xpath text is invalid");
    let _url = xpath::parse("//div[@class=\"css-1wtm1ek\"]//a/@href").expect("xpath url is invalid");

    // Apply the XPath expression to the HTML document.
    let items = text_xpath.apply(&xpath_item_tree)?;

    // Select the item by position without indexing, so that a page with fewer
    // matches produces an error instead of an out-of-bounds panic.
    let item = items
        .into_iter()
        .nth(ITEM_INDEX)
        .ok_or("xpath text matched fewer items than expected")?;

    // Report a node without text as an error instead of panicking on `unwrap`.
    let text = item
        .extract_as_node()
        .text(&xpath_item_tree)
        .ok_or("matched node has no text")?;
    println!("{:?}", text);

    Ok(())
}

`title` works at every index. `text` works only at index 0; at every other index it panics. `url` panics at every index.

DioxusGrow commented 6 days ago

It panics with 0.6.4 too.

[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.6.4"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
use reqwest;
use serde::{Deserialize, Serialize};
use skyscraper::html::{self, trim_internal_whitespace};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;

/// One scraped news entry; a list of these is serialized to JSON by `main`.
#[derive(Serialize, Deserialize, Debug)]
struct Article {
    /// Text of the node matched by the `//h3` XPath expression.
    title: String,
    /// Text of the node matched by the article XPath expression, passed
    /// through `trim_internal_whitespace`.
    source: String,
}
/// Scrapes the Binance Square news page, pairs every `//h3` title with the
/// article text found at the same position, and writes the result to
/// `output.json` as a JSON array.
///
/// # Errors
///
/// Returns an error if the HTTP request fails, the HTML cannot be parsed, an
/// XPath expression cannot be applied, there are fewer article nodes than
/// titles, a matched node has no text, or the output file cannot be written.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let res = reqwest::get("https://www.binance.com/en/square/news/all")
        .await?
        .text()
        .await?;

    // Parse the HTML text
    let document = html::parse(&res)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath strings are static, so a parse failure is a programming error
    // and `expect` is acceptable here.
    let title = xpath::parse("//h3")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;
    let article = xpath::parse("//div[@class=\"css-1wtm1ek\"]//a/div")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;

    let mut articles: Vec<Article> = Vec::new();

    // Walk both result sets in lockstep. Page content is outside our control,
    // so a mismatch or a node without text is reported as an error rather
    // than a panic.
    let mut arcl = article.iter();
    for item in title.iter() {
        let src = arcl
            .next()
            .ok_or("Doesn't match")?
            .extract_as_node()
            .extract_as_tree_node()
            .text(&xpath_item_tree)
            .ok_or("article node has no text")?;
        let title_text = item
            .extract_as_node()
            .extract_as_tree_node()
            .text(&xpath_item_tree)
            .ok_or("title node has no text")?;

        articles.push(Article {
            title: title_text,
            source: trim_internal_whitespace(&src),
        });
    }

    // Serialize it to a JSON string.
    let posts = serde_json::to_string(&articles)?;

    let mut file = File::create("output.json")?;
    file.write_all(posts.as_bytes())?;

    Ok(())
}
DioxusGrow commented 5 days ago

Thanks for the update. Everything works fine, but it would still be more familiar to get attribute values from the xpath expression "//div[@class=\"css-1wtm1ek\"]/div/a/@href" than from .extract_as_element_node(). This is a bit confusing. Thank you for your great work on this crate.

[dependencies]
skyscraper = "0.7.0-beta.2"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
use skyscraper::html;
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;

/// Downloads the Binance Square news page and prints the title, article text
/// and link URL of every news entry.
///
/// The three XPath result sets are walked in lockstep; if they have different
/// lengths, iteration stops at the shortest one.
///
/// # Errors
///
/// Returns an error if the HTTP request fails, the HTML cannot be parsed, an
/// XPath expression cannot be applied, a matched node has no text, or a link
/// has no `href` attribute.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let client = reqwest::Client::new();
    let response = client
        .get("https://www.binance.com/en/square/news/all")
        .send()
        .await?;

    let document = html::parse(&response.text().await?)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath strings are static, so a parse failure is a programming error
    // and `expect` is acceptable here.
    let title = xpath::parse("//h3").expect("xpath title is invalid");
    let text =
        xpath::parse("//div[@class=\"css-1wtm1ek\"]/div/a/div").expect("xpath text is invalid");

    let url = xpath::parse("//div[@class=\"css-1wtm1ek\"]/div/a").expect("xpath url is invalid");

    // Apply the XPath expressions to the HTML document.
    let titles = title.apply(&xpath_item_tree)?;
    let texts = text.apply(&xpath_item_tree)?;
    let urls = url.apply(&xpath_item_tree)?;

    // Zip the result sets instead of indexing `texts[i]` / `urls[i]`, so that
    // result sets of different lengths cannot cause an out-of-bounds panic.
    let entries = titles
        .into_iter()
        .zip(texts.into_iter())
        .zip(urls.into_iter());

    for ((title_item, text_item), url_item) in entries {
        let title_text = title_item
            .extract_as_node()
            .text(&xpath_item_tree)
            .ok_or("title node has no text")?;
        println!("\n==> Title: {}", title_text);

        let article_text = text_item
            .extract_as_node()
            .text(&xpath_item_tree)
            .ok_or("article node has no text")?;
        println!("==> Article: {}", article_text);

        let href = url_item
            .extract_as_node()
            .extract_as_element_node()
            .get_attribute(&xpath_item_tree, "href")
            .ok_or("link element has no href attribute")?;
        println!("==> Url: {}", href);
    }

    Ok(())
}
James-LG commented 3 days ago

You can get attributes using an xpath expression (/@href) as documented in the xpath module.