jrmuizel / pdf-extract

A rust library for extracting content from pdfs
396 stars 78 forks source link

thread 'main' panicked at 'missing char 33 in map #72

Closed danindiana closed 7 months ago

danindiana commented 10 months ago

example code:

extern crate pdf_extract; extern crate lopdf; extern crate indicatif;

use std::env; use std::fs; use std::io::{self, Write}; use std::path::Path; use std::time::{SystemTime, UNIX_EPOCH}; use indicatif::{ProgressBar, ProgressStyle}; use pdf_extract::*; use lopdf::*; use std::fs::File; use std::panic::{self, AssertUnwindSafe};

fn main() { let args: Vec = env::args().collect(); if args.len() < 3 { eprintln!("Usage: {} ", args[0]); return; }

let pdf_dir = Path::new(&args[1]);
let output_dir = Path::new(&args[2]);

if !output_dir.exists() {
    fs::create_dir_all(&output_dir).unwrap_or_else(|_| panic!("Could not create output directory: {:?}", output_dir));
}

process_directory(&pdf_dir, &output_dir);

}

/// Walks `pdf_dir` and runs `process_pdf` on every PDF file it contains.
///
/// Sub-directories are only descended into after the user answers "yes"
/// (case-insensitively) to a prompt on stdin. Files are selected by a
/// case-insensitive `pdf` extension. A panic raised while processing one PDF
/// is caught and reported so that the remaining files are still processed.
///
/// I/O errors (unreadable directory or entry, unreadable stdin) are reported
/// on stderr and the affected item is skipped instead of aborting the walk.
fn process_directory(pdf_dir: &Path, output_dir: &Path) {
    let entries = match fs::read_dir(pdf_dir) {
        Ok(entries) => entries,
        Err(e) => {
            eprintln!("Could not read directory {:?}: {}", pdf_dir, e);
            return;
        }
    };

    for entry in entries {
        let path = match entry {
            Ok(entry) => entry.path(),
            Err(e) => {
                eprintln!("Could not read an entry of {:?}: {}", pdf_dir, e);
                continue;
            }
        };

        if path.is_dir() {
            println!("Processed directory: {:?}", pdf_dir);
            println!("Next directory: {:?}", path);
            println!("Do you wish to proceed? (yes/no)");

            let mut input = String::new();
            if let Err(e) = io::stdin().read_line(&mut input) {
                eprintln!("Could not read the answer, skipping {:?}: {}", path, e);
                continue;
            }

            if input.trim().eq_ignore_ascii_case("yes") {
                process_directory(&path, output_dir);
            }
        } else if has_pdf_extension(&path) {
            // Wrap the call to process_pdf with catch_unwind to handle panics
            let result = panic::catch_unwind(AssertUnwindSafe(|| {
                process_pdf(&path, output_dir);
            }));

            if let Err(payload) = result {
                // `panic!` payloads are a `&str` or a `String`; printing the
                // boxed `Any` with `{:?}` would hide the actual message.
                let message = payload
                    .downcast_ref::<&str>()
                    .copied()
                    .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
                    .unwrap_or("unknown panic payload");
                eprintln!("An error occurred while processing {:?}: {}", path, message);
            }
        }
    }
}

/// Returns `true` when `path` has a `pdf` extension, ignoring ASCII case.
fn has_pdf_extension(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| ext.eq_ignore_ascii_case("pdf"))
}

fn process_pdf(pdf_path: &Path, output_dir: &Path) { let pb = ProgressBar::new_spinner(); pb.set_style(ProgressStyle::default_spinner().template("{spinner:.green} {msg}")); pb.enable_steady_tick(120); pb.set_message("Processing PDF...");

let filename = pdf_path.file_stem().unwrap().to_str().unwrap();
let output_path = output_dir.join(filename).with_extension("txt");

let mut file = File::create(&output_path).expect("Could not create output file");

match Document::load(pdf_path) {
    Ok(doc) => {
        print_metadata(&doc);

        let mut output: Box<dyn OutputDev> = Box::new(PlainTextOutput::new(&mut file as &mut dyn Write));

        if let Err(e) = output_doc(&doc, output.as_mut()) {
            eprintln!("Error processing document {}: {}", pdf_path.display(), e);
        }
    }
    Err(e) => eprintln!("Failed to load document {}: {}", pdf_path.display(), e),
}

pb.finish_with_message("Done.");

let time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs();
let mut log_file = fs::OpenOptions::new().append(true).create(true).open(output_dir.join(format!("processed_{}.log", time))).unwrap();

writeln!(log_file, "Processed PDF: {:?}", pdf_path).unwrap();

}

fn printmetadata(doc: &Document) { = doc; // Simulate using the doc variable, or implement logic here }

warning: fields name, alternate_space, and tint_transform are never read --> src/lib.rs:1310:5 1309 pub struct Separation { ---------- fields in this struct 1310 name: String, ^^^^ 1311 alternate_space: AlternateColorSpace, ^^^^^^^^^^^^^^^ 1312 tint_transform: Box, ^^^^^^^^^^^^^^
 = note: `Separation` has a derived impl for the trait `Clone`, but this is intentionally ignored during dead code analysis
warning: pdf-extract (lib) generated 7 warnings Compiling pdf-extract v0.7.2 (/home/walter/programs/pdf-extract) warning: unused Result that must be used --> bin/extract.rs:72:5 72 output_doc(&doc, output.as_mut()); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

= note: this Result may be an Err variant, which should be handled = note: #[warn(unused_must_use)] on by default help: use let _ = ... to ignore the resulting value | 72 | let _ = output_doc(&doc, output.as_mut()); | +++++++

warning: pdf-extract (bin "pdf-extract") generated 1 warning Finished dev [unoptimized + debuginfo] target(s) in 1.09s Running target/debug/pdf-extract /media/ /media/extract_pdfminer ⠁ Done. Done. ⠚ Processing PDF... thread 'main' panicked at 'missing char 33 in map {48: "∙", 34: "(", 36: ")"} for <</Type /Font/Subtype /TrueType/BaseFont /AAAAAI+CambriaMath/FontDescriptor 161 0 R/ToUnicode 90 0 R/FirstChar 33/LastChar 48/Widths [698 415 672 415 351 469 605 728 579 579 728 579 440 507 579 247]>>', src/lib.rs:750:27

I keep getting this `thread 'main' panicked` error.

jrmuizel commented 10 months ago

Can you provide a link or attach the PDF that you're having trouble with?

danindiana commented 7 months ago

@jrmuizel Sorry, it has been so long that I had forgotten about this. My apologies! I will close this out.