James-LG / Skyscraper

Rust library for scraping HTML using XPath expressions
MIT License
31 stars 4 forks source link

Fix non standard tags #34

Open DioxusGrow opened 3 months ago

DioxusGrow commented 3 months ago

"This code of html is very old, but it still exists and is causing a new error on this site with non-standard tags. Can you make a correction?

Error: thread 'main' panicked at src/main.rs:13:38: called `Result::unwrap()` on an `Err` value: EndTagMismatch { end_name: "DL", open_name: "DD" }

let document = html::parse(&res).unwrap();

I used to use this library from Golang, but I would like to use Rust: github.com/antchfx/htmlquery v1.3.0 github.com/antchfx/xpath"

use reqwest;
use skyscraper::html::{self, trim_internal_whitespace};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> { // Repro: print the first matching <h1> text.
    let res = reqwest::get("http://www.patriarchia.ru/db/text/5332635.html")
        .await? // request errors are returned to the caller
        .text()
        .await?; // body-decoding errors are returned to the caller

    let document = html::parse(&res).unwrap(); // panics on a parse error instead of returning it

    // Build the XPath item tree from the parsed document and compile the query.
    let xpath_item_tree = XpathItemTree::from(&document);
    let xpath = xpath::parse("//div[@class=\"main\"]//h1")?;

    // Apply the query; failures while parsing or applying the XPath are propagated with `?`.
    let nodes = xpath.apply(&xpath_item_tree)?;
    let mut nodes = nodes.into_iter();

    // Only the first result is used; `unwrap()` panics when the query matched nothing.
    // NOTE(review): the `extract_into_*` calls presumably panic if the item is not a
    // tree node -- confirm against the skyscraper API docs.
    let tree_node = nodes
        .next()
        .unwrap()
        .extract_into_node()
        .extract_into_tree_node();

    // Debug-print the node text after passing it through `trim_internal_whitespace`.
    println!(
        "Name of org: {:?}",
        trim_internal_whitespace(&tree_node.text(&xpath_item_tree).unwrap())
    );

    Ok(())
}
James-LG commented 3 months ago

Skyscraper generally doesn't care what your tags are called, that error means there is something like <DD>hi</DL> somewhere in the raw HTML. The opening tag <DD> has to be matched by a closing tag of the same name </DD> but your HTML has </DL> instead. Your browser is automatically resolving that issue so you can't see it in that screenshot.

This library already has some malformed HTML handlers that you can try. Docs for the trait are here.

The two implementations that may help you are

  1. VoidMismatchedTagHandler which pretends the tag is matching even if it isn't.
  2. CloseMismatchedTagHandler which attempts to add a missing end tag.

Based on your description I'm guessing the Void handler would be a better fit, but if you post the raw HTML snippet causing your issues I may be able to investigate further.

DioxusGrow commented 3 months ago

This is working code. Both malformed HTML handlers (VoidMismatchedTagHandler, CloseMismatchedTagHandler) fail again, this time with a different error. Error: MissingLiteralAfterAssignmentSign { tag_name: "img" }

reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks"] }
scraper = "0.19.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.6.4"
tokio = { version = "1", features = ["full"] }
use reqwest;
use skyscraper::html::trim_internal_whitespace;
use skyscraper::html::{
    parse::{
        malformed_html_handlers::CloseMismatchedTagHandler,
        malformed_html_handlers::VoidMismatchedTagHandler, ParseOptionsBuilder, Parser,
    },
    DocumentFormatType,
};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;

/// Reproduction with a customised parser: downloads the page, parses it with
/// `VoidMismatchedTagHandler` installed as the mismatched-tag handler, and
/// prints the text of the first `<h1>` found below a `div` with class `main`.
///
/// Download, HTML-parse and XPath errors are returned to the caller; an empty
/// result set or a missing node text panics via `unwrap()`.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let response = reqwest::get("http://www.patriarchia.ru/db/text/5332635.html").await?;
    let body = response.text().await?;

    // Used in place of the plain `html::parse(&body)` call so that the
    // mismatched-tag handler can be configured.
    let parse_options = ParseOptionsBuilder::new()
        .with_mismatched_tag_handler(Box::new(VoidMismatchedTagHandler::new(None)))
        .build();
    let document = Parser::new(parse_options).parse(&body)?;

    let item_tree = XpathItemTree::from(&document);
    let heading_query = xpath::parse("//div[@class=\"main\"]//h1")?;

    // Only the first match is of interest.
    let first_match = heading_query
        .apply(&item_tree)?
        .into_iter()
        .next()
        .unwrap();
    let heading = first_match.extract_into_node().extract_into_tree_node();
    let heading_text = heading.text(&item_tree).unwrap();

    println!("Name of org: {:?}", trim_internal_whitespace(&heading_text));

    Ok(())
}
DioxusGrow commented 3 months ago

I tried to run the query with a new version of the crate (skyscraper = "0.7.0-beta.0") and I got the same error message again. Error: MissingLiteralAfterAssignmentSign { tag_name: "img" } This is working code.

[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
skyscraper = "0.7.0-beta.0"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks"] }
tokio = { version = "1", features = ["full"] }
use reqwest;
use serde::{Deserialize, Serialize};
use skyscraper::html::parse::{
    malformed_html_handlers::VoidMismatchedTagHandler, ParseOptionsBuilder, Parser,
};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;
use tokio::time::Duration;

/// One serialisable record; `result` borrows its string from the XPath result set.
#[derive(Serialize, Deserialize, Debug)]
struct TestXpath<'a> {
    result: &'a str,
}

/// Reproduction that downloads the page with a 30-second client timeout,
/// parses it with `VoidMismatchedTagHandler`, evaluates the `<h1>` XPath and
/// writes the collected values to `output.json` as a JSON array.
///
/// Network, HTML-parse, XPath-apply, serialisation and file errors are
/// returned to the caller; an invalid XPath string panics via `expect`.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Every request made through this client times out after 30 seconds.
    let http = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .build()?;

    let response = http
        .get("http://www.patriarchia.ru/db/text/5332635.html")
        .send()
        .await?;

    let parse_options = ParseOptionsBuilder::new()
        .with_mismatched_tag_handler(Box::new(VoidMismatchedTagHandler::new(None)))
        .build();

    let body = response.text().await?;
    let document = Parser::new(parse_options).parse(&body)?;
    let item_tree = XpathItemTree::from(&document);

    let heading_query = xpath::parse("//div[@class=\"main\"]//h1").expect("xpath is invalid");
    let matches = heading_query.apply(&item_tree)?;

    // Turn every matched item into a record borrowing its value.
    let records: Vec<TestXpath> = matches
        .iter()
        .map(|item| TestXpath {
            result: &item.extract_as_node().extract_as_attribute_node().value,
        })
        .collect();

    // Serialise the records and persist them.
    let json = serde_json::to_string(&records)?;
    let mut output = File::create("output.json")?;
    output.write_all(json.as_bytes())?;

    Ok(())
}
James-LG commented 3 months ago

Can you provide a snippet of the html causing your errors? I don't like clicking links to websites I've never heard of.

DioxusGrow commented 2 months ago

The problem also occurs with skyscraper = "0.7.0-beta.2". Attached file: index.txt

[dependencies]
skyscraper = "0.7.0-beta.2"
reqwest = { version = "0.12.4", features = ["default", "blocking", "cookies", "json", "socks", "gzip"] }
tokio = { version = "1", features = ["full"] }
// use skyscraper::html;
use skyscraper::html::parse::malformed_html_handlers::VoidMismatchedTagHandler;
use skyscraper::html::parse::{ParseOptionsBuilder, Parser};
use skyscraper::xpath::{self, XpathItemTree};
use std::error::Error;
use std::fs;
use std::path::Path;
// use std::time::Duration;

/// Reproduction that parses a local copy of the page (`index.txt`) with
/// `VoidMismatchedTagHandler` and prints the text of every `<h1>` that is a
/// direct child of a `div` with class `section`.
///
/// # Errors
///
/// Returns an error when `index.txt` cannot be read, when the HTML cannot be
/// parsed, or when the XPath expression cannot be applied.
///
/// # Panics
///
/// Panics if the hard-coded XPath string is invalid, or if a matched node
/// yields no text.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // The HTML is read from a local file rather than fetched over the network.
    let path = Path::new("index.txt");
    let html = fs::read_to_string(path)
        .map_err(|err| format!("failed to read {}: {err}", path.display()))?;

    let options = ParseOptionsBuilder::new()
        .with_mismatched_tag_handler(Box::new(VoidMismatchedTagHandler::new(None)))
        .build();
    let document = Parser::new(options).parse(&html)?;

    let xpath_item_tree = XpathItemTree::from(&document);

    // The XPath string is a static literal, so failing to parse it is a
    // programming error and `expect` is appropriate here.
    let title_xpath =
        xpath::parse("//div[@class=\"section\"]/h1").expect("xpath title is invalid");

    // Apply the XPath expression to the parsed document.
    let titles = title_xpath.apply(&xpath_item_tree)?;

    for title in titles {
        // NOTE(review): `unwrap()` is kept from the original; it panics when
        // the node has no text.
        println!(
            "\n==> Title: {}",
            title.extract_as_node().text(&xpath_item_tree).unwrap()
        );
    }

    Ok(())
}