Open DioxusGrow opened 6 days ago
It panics with 0.6.4 too:
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.6.4"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
use reqwest;
use serde::{Deserialize, Serialize};
use skyscraper::html::{self, trim_internal_whitespace};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;
/// One scraped news entry; the list of these is what gets serialized to JSON.
#[derive(Debug, Serialize, Deserialize)]
struct Article {
    /// Headline text of the article.
    title: String,
    /// Text of the matching article block, with internal whitespace trimmed
    /// by the scraper before it is stored here.
    source: String,
}
/// Downloads the Binance Square news page, pairs every `//h3` title with the
/// matching article text node, and writes the result to `output.json`.
///
/// # Errors
///
/// Returns an error if the HTTP request, HTML parsing, XPath evaluation, JSON
/// serialization or file write fails, or if the page yields fewer article
/// nodes than titles ("Doesn't match").
///
/// # Panics
///
/// Panics if one of the static XPath strings is invalid, or if the text of a
/// matched node cannot be read. NOTE(review): the `extract_as_*` helpers
/// presumably panic when the item is not of the expected kind — confirm
/// against the skyscraper documentation.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let mut articles: Vec<Article> = vec![];
    let res = reqwest::get("https://www.binance.com/en/square/news/all")
        .await?
        .text()
        .await?;

    // Parse the HTML text
    let document = html::parse(&res)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath strings are static, so `expect` is acceptable while parsing them.
    let title = xpath::parse("//h3")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;
    let article = xpath::parse("//div[@class=\"css-1wtm1ek\"]//a/div")
        .expect("xpath is invalid")
        .apply(&xpath_item_tree)?;

    // Walk both result sets in lock-step: the n-th title belongs to the n-th
    // article node.
    let mut arcl = article.iter();
    for item in title.iter() {
        // The page layout is outside our control, so a count mismatch is a
        // runtime condition: report it as an error instead of panicking.
        let src = arcl
            .next()
            .ok_or("Doesn't match")?
            .extract_as_node()
            .extract_as_tree_node()
            .text(&xpath_item_tree)
            .expect("failed to read the text of an article node");
        let headline = item
            .extract_as_node()
            .extract_as_tree_node()
            .text(&xpath_item_tree)
            .expect("failed to read the text of a title node");
        articles.push(Article {
            title: headline,
            source: trim_internal_whitespace(&src),
        });
    }

    // Serialize it to a JSON string.
    let posts = serde_json::to_string(&articles)?;
    let mut file = File::create("output.json")?;
    file.write_all(posts.as_bytes())?;
    Ok(())
}
Thanks for the update. Everything works fine, but it would still feel more natural to get attribute values directly from an XPath expression such as "//div[@class=\"css-1wtm1ek\"]/div/a/@href" than through .extract_as_element_node(); the current approach is a bit confusing. Thank you for your great work on this crate.
[dependencies]
skyscraper = "0.7.0-beta.2"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
use skyscraper::html;
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
/// Downloads the Binance Square news page and prints, for every `//h3` title,
/// the title text, the matching article text and the matching link URL.
///
/// # Errors
///
/// Returns an error if the HTTP request, HTML parsing or XPath evaluation
/// fails, or if the page yields fewer article-text or link nodes than titles.
///
/// # Panics
///
/// Panics if one of the static XPath strings is invalid, or if the text or
/// `href` attribute of a matched node cannot be read. NOTE(review): the
/// `extract_as_*` helpers presumably panic when the item is not of the
/// expected kind — confirm against the skyscraper documentation.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let client = reqwest::Client::new();
    let response = client
        .get("https://www.binance.com/en/square/news/all")
        .send()
        .await?;
    let document = html::parse(&response.text().await?)?;
    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath strings are static, so `expect` is acceptable while parsing them.
    let title_expr = xpath::parse("//h3").expect("xpath title is invalid");
    let text_expr =
        xpath::parse("//div[@class=\"css-1wtm1ek\"]/div/a/div").expect("xpath text is invalid");
    let url_expr =
        xpath::parse("//div[@class=\"css-1wtm1ek\"]/div/a").expect("xpath url is invalid");

    // Apply the XPath expressions to our HTML document
    let titles = title_expr.apply(&xpath_item_tree)?;
    let mut texts = text_expr.apply(&xpath_item_tree)?.into_iter();
    let mut urls = url_expr.apply(&xpath_item_tree)?.into_iter();

    for title in titles.into_iter() {
        // Advance the text and link sets in lock-step with the titles.
        // Indexing them by the title position panics as soon as either set
        // is shorter than `titles`; report that as an error instead.
        let text_item = texts
            .next()
            .ok_or("found fewer article text nodes than titles")?;
        let url_item = urls
            .next()
            .ok_or("found fewer article link nodes than titles")?;

        println!(
            "\n==> Title: {}",
            title
                .extract_as_node()
                .text(&xpath_item_tree)
                .expect("failed to read the text of a title node")
        );
        println!(
            "==> Article: {}",
            text_item
                .extract_as_node()
                .text(&xpath_item_tree)
                .expect("failed to read the text of an article node")
        );
        println!(
            "==> Url: {}",
            url_item
                .extract_as_node()
                .extract_as_element_node()
                .get_attribute(&xpath_item_tree, "href")
                .expect("failed to read the href attribute of a link node")
        );
    }
    Ok(())
}
You can get attributes using an XPath expression (`/@href`), as documented in the xpath module.
`title` works everywhere. `text` works only at index 0; at every other index it panics. `url` panics everywhere.