marianna13 / doc2dataset

A tool to extract text (and images) from documents (like PDFs)
MIT License
2 stars 1 forks source link

Extract SVG images #3

Open marianna13 opened 8 months ago

marianna13 commented 8 months ago
from svg.path import parse_path, Line
from xml.dom import minidom
from cairosvg import svg2png

def get_xy(z):
    """Split a complex-like value into its ``(real, imag)`` components."""
    real_part = z.real
    imag_part = z.imag
    return real_part, imag_part

# Rasterize the page's SVG (minus <use> elements and id-bearing <path>
# elements) and collect the indices of the non-zero pixels.
#
# NOTE(review): `page` is not defined in this snippet -- presumably a PyMuPDF
# page object; confirm. `io`, `np` (numpy) and `Image` (PIL) are used below
# but are not among the imports shown with this snippet, so they must already
# be in scope.
svg = page.get_svg_image()
svg_doc = minidom.parseString(svg)

# Drop every <use> element from the document.
for use in svg_doc.getElementsByTagName('use'):
    use.parentNode.removeChild(use)

# Drop every <path> element that carries a non-empty `id` attribute.
for path in svg_doc.getElementsByTagName('path'):
    if path.getAttribute('id'):
        path.parentNode.removeChild(path)

# The root <svg> element supplies the output size; a 'pt' unit suffix is
# stripped from its width/height attributes before conversion to float.
svg_root = svg_doc.getElementsByTagName('svg')[0]
w, h = svg_root.getAttribute('width'), svg_root.getAttribute('height')
w = float(w.replace('pt', ''))
h = float(h.replace('pt', ''))

# Serialize the <svg> element directly. Round-tripping it through an HTML
# parser (BeautifulSoup with 'lxml') lower-cases tag and attribute names,
# which are case-sensitive in SVG (e.g. `viewBox`, `clipPath`).
svg_str = svg_root.toxml()

# Render the cleaned SVG to PNG bytes at the size declared on the root.
img = svg2png(file_obj=io.StringIO(svg_str), output_width=w, output_height=h, dpi=90)

# Decode the PNG bytes into an array so pixels can be inspected.
img = np.array(Image.open(io.BytesIO(img)))

# Indices of pixels whose first channel is non-zero.
# NOTE(review): np.where returns (row indices, column indices), so `x` holds
# row positions and `y` holds column positions -- confirm this naming is what
# downstream code expects.
x, y = np.where(img[:, :, 0] != 0)  # coordinates of SVG images
marianna13 commented 6 months ago

`get_drawings` should extract SVG images the same way `page.get_svg_image()` does.