Closed DimitriTimoz closed 2 months ago
I'm using pdf-extract on this pdf
pdf-extract
Here is my code:
pdf_extract::extract_text("/tmp/arxiv.pdf")
Here is the output:
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map 
{49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: 
"\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 88 -> "∑" missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 
66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 0 -> "(" missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 0 -> "(" missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 0 -> "(" missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 
48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 1 -> ")" missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 1 -> ")" missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>> falling back to encoding 1 -> ")" Unicode mismatch false minus "°" Ok("−") [8722] unknown glyph name 'mapsto' for font UVAEAW+CMSY10 thread 'main' panicked at 
/home/dimitri/.cargo/registry/src/index.crates.io-6f17d22bba15001f/pdf-extract-0.7.5/src/lib.rs:485:69: no entry found for key
Here is the line in question:
The panic occurs again with this pdf
The panic on that PDF is now fixed by commit e23882b3a525bf64adccef057dafbd53b2593910.
I'm using pdf-extract on this PDF.
Here is my code:
Here is the output:
Here is the line in question: