jrmuizel / pdf-extract

A Rust library for extracting content from PDFs
364 stars 73 forks source link

Unicode map unsafe get leads to panic #88

Closed DimitriTimoz closed 2 months ago

DimitriTimoz commented 2 months ago

I'm using pdf-extract on this PDF.

Here is my code:

pdf_extract::extract_text("/tmp/arxiv.pdf")

Here is the output:

        missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 88 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 88 -> "∑"
missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 0 -> "("
missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 0 -> "("
missing char 0 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 0 -> "("
missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 1 -> ")"
missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 1 -> ")"
missing char 1 in unicode map {49: "\u{f8f6}", 61: "\u{f8fd}", 208: "Γ", 211: "Λ", 66: "\u{f8ec}", 216: "Φ", 214: "Σ", 48: "\u{f8eb}", 160: " ", 217: "Ψ", 58: "\u{f8f3}", 209: "∆", 213: "Π", 60: "\u{f8f2}", 53: "\u{f8fb}", 59: "\u{f8fe}", 54: "\u{f8ef}", 215: "Υ", 63: "\u{f8e6}", 62: "\u{f8f4}", 51: "\u{f8f9}", 57: "\u{f8fc}", 65: "\u{f8f8}", 52: "\u{f8f0}", 50: "\u{f8ee}", 159: "√", 55: "\u{f8fa}", 210: "Θ", 64: "\u{f8ed}", 212: "Ξ", 218: "Ω", 56: "\u{f8f1}", 67: "\u{f8f7}"} for <</Type /Font/Subtype /Type1/BaseFont /RXXHYK+CMEX10/FirstChar 0/FontDescriptor 581 0 R/LastChar 88/ToUnicode 625 0 R/Widths 555 0 R>>
falling back to encoding 1 -> ")"
Unicode mismatch false minus "°" Ok("−") [8722]
unknown glyph name 'mapsto' for font UVAEAW+CMSY10
thread 'main' panicked at /home/dimitri/.cargo/registry/src/index.crates.io-6f17d22bba15001f/pdf-extract-0.7.5/src/lib.rs:485:69:
no entry found for key

Here is the line in question (src/lib.rs:485:69 in pdf-extract 0.7.5, per the panic message above).

DimitriTimoz commented 2 months ago

The panic occurs again with this PDF.

jrmuizel commented 2 months ago

That PDF is now fixed by commit e23882b3a525bf64adccef057dafbd53b2593910.