CenterForOpenScience / pydocx

An extendable docx file format parser and converter
Other
183 stars 55 forks source link

How to set alt text on image using figure caption? #245

Closed bitscompagnie closed 6 years ago

bitscompagnie commented 6 years ago

Hello,

How can I use the figure caption text from the source MS Word document as the alt text on the generated <img> tag?

In the source document.xml I see the following:

 <w:drawing>
                    <wp:inline distT="0" distB="0" distL="0" distR="0">
                        <wp:extent cx="5943600" cy="8210550"/>
                        <wp:effectExtent l="0" t="0" r="0" b="0"/>
                        <wp:docPr id="16" name="Picture 16"
                            descr="Image description."
                            title="Image description"/>
                        <wp:cNvGraphicFramePr>
.......

I need to grab the value of the descr="Image description" or title="Image description" from the above excerpt of the source document.xml

I am currently using the custom image export function below, which allows me to save my images to a local folder:

def get_image_tag(self, image, width=None, height=None, rotate=None):
    """Build the <img> tag for *image*, saving embedded image data to disk.

    When the image source is a base64 data URI, the image is written to
    'c:/git/output/' under a generated file name and the tag's ``src``
    points at that file name. Any other source is used as ``src`` as-is.

    Returns None when the image has no source.
    """
    img_src = self.get_image_source(image)
    if not img_src:
        # Nothing to render. Without this guard the code below would use
        # ``attrs`` before it was ever assigned.
        return None

    attrs = {
        'src': img_src
    }

    # get base64 file extension from bytes
    # https://matthewdaly.co.uk/blog/2015/07/04/handling-images-as-base64-strings-with-django-rest-framework/
    # partition() never raises, so a source that is not a base64 data URI
    # is simply left untouched instead of failing on tuple unpacking.
    header, marker, _ = img_src.partition(';base64,')  # header ~= data:image/X
    if marker:
        ext = header.split('/')[-1]  # guess file extension
        # Capture the generated filename with the proper extension to use in img source attribute
        img_src_new = 'img_' + image_name() + '.' + ext
        # Convert the base64 string to an image file using urlretrieve
        urlretrieve(img_src, 'c:/git/output/' + img_src_new)
        # Set the image source to the newly created filename
        attrs['src'] = img_src_new

    if width and height:
        attrs['width'] = width
        attrs['height'] = height
    if rotate:
        attrs['style'] = 'transform: rotate(%sdeg);' % rotate
    return HtmlTag('img', allow_self_closing=True, allow_whitespace=True, **attrs)

Thanks for your help.

jlward commented 6 years ago

In order to pull the descr or title, you would need to make a change to pydocx.openxml.drawing.wordprocessing.inline to pull that field. If you would like to make a PR to implement this, I'd be happy to review it.

IuryAlves commented 6 years ago

Hello @bitscompagnie, @jlward

I've made a PR #248 to pull descr from pictures

Also, until the PR is merged, you can do the following:

# coding: utf-8
from __future__ import (
    absolute_import,
    print_function,
    unicode_literals,
)

from pydocx.export import PyDocXHTMLExporter
from pydocx.export.html import  convert_emus_to_pixels, HtmlTag
from pydocx.models import XmlModel, XmlAttribute, XmlChild
from pydocx.openxml.wordprocessing.drawing import Inline

class DocPr(XmlModel):
    """Model for the ``docPr`` element of a drawing.

    Exposes the element's ``title`` and ``descr`` XML attributes, which is
    where the picture's title and description text are stored.
    """

    # Name of the XML element this model is bound to.
    XML_TAG = 'docPr'

    # ``title`` attribute of the element.
    title = XmlAttribute(name='title')
    # ``descr`` attribute of the element (the description text).
    descr = XmlAttribute(name='descr')

# Attach a ``docPr`` child field to pydocx's existing Inline model so that
# ``drawing.inline.docPr`` can be read by the exporter.
# NOTE(review): this relies on pydocx picking up fields assigned after the
# class was created -- confirm against the pydocx version in use.
Inline.docPr = XmlChild(type=DocPr)

class PyDocXHTMLExporterWithAlt(PyDocXHTMLExporter):
    """HTML exporter that copies a picture's ``descr`` into the ``alt`` attribute."""

    def export_drawing(self, drawing):
        """Yield the <img> tag for *drawing*, if one can be built.

        Nothing is yielded when the drawing has no picture relationship id
        or when ``get_image_tag`` returns a falsy value.
        """
        first_extent, second_extent = drawing.get_picture_extents()

        # Any missing link in the inline -> docPr -> descr chain just means
        # there is no description to carry over.
        try:
            alt_text = drawing.inline.docPr.descr
        except AttributeError:
            alt_text = None

        angle = drawing.get_picture_rotate_angle()
        rel_id = drawing.get_picture_relationship_id()
        if not rel_id:
            return

        try:
            picture = drawing.container.get_part_by_id(
                relationship_id=rel_id,
            )
        except KeyError:
            # Unknown relationship id: continue without an image part.
            picture = None

        tag_kwargs = {}
        if first_extent and second_extent:
            # The "width" in openxml is actually the height
            tag_kwargs['width'] = '{px:.0f}px'.format(
                px=convert_emus_to_pixels(first_extent),
            )
            tag_kwargs['height'] = '{px:.0f}px'.format(
                px=convert_emus_to_pixels(second_extent),
            )
        if angle:
            tag_kwargs['rotate'] = angle
        if alt_text:
            tag_kwargs['alt'] = alt_text

        img = self.get_image_tag(image=picture, **tag_kwargs)
        if img:
            yield img

    def get_image_tag(self, image, width=None, height=None, rotate=None, alt=None):
        """Return an <img> ``HtmlTag`` for *image*, or None if it has no source.

        ``width`` and ``height`` are only emitted when both are given;
        ``rotate`` becomes a CSS rotation in ``style``; ``alt`` is emitted
        when non-empty.
        """
        source = self.get_image_source(image)
        if not source:
            return None

        tag_attrs = {'src': source}
        if width and height:
            tag_attrs['width'] = width
            tag_attrs['height'] = height
        if rotate:
            tag_attrs['style'] = 'transform: rotate(%sdeg);' % rotate
        if alt:
            tag_attrs['alt'] = alt

        return HtmlTag(
            'img',
            allow_self_closing=True,
            allow_whitespace=True,
            **tag_attrs
        )

# Run the alt-aware exporter on 'test.docx' and keep the result
# (presumably the generated HTML markup) in ``html``.
html = PyDocXHTMLExporterWithAlt('test.docx').export()
bitscompagnie commented 6 years ago

Thanks a lot @IuryAlves,

I just tested the solution and it worked as expected.