I cannot get all the images from HTML pages whose `<img>` tags carry the image URL in `data-src` instead of `src`. So I manually changed the code of my locally installed html2text, and it works.
# Original code in __init__.py, function handle_tag():
if tag == "img" and start and not self.ignore_images:
if "src" in attrs:
assert attrs["src"] is not None
if not self.images_to_alt:
attrs["href"] = attrs["src"]
alt = attrs.get("alt") or self.default_image_alt
# If we have images_with_size, write raw html including width,
# height, and alt attributes
if self.images_as_html or (
self.images_with_size and ("width" in attrs or "height" in attrs)
):
self.o("<img src='" + attrs["src"] + "' ")
# After the change (fall back to attrs["data-src"] when "src" is missing):
if tag == "img" and start and not self.ignore_images:
if "src" in attrs or "data-src" in attrs:
if "src" in attrs:
assert attrs["src"] is not None
img_src = attrs["src"]
elif "data-src" in attrs:
assert attrs["data-src"] is not None
img_src = attrs["data-src"]
if not self.images_to_alt:
attrs["href"] = img_src
alt = attrs.get("alt") or self.default_image_alt
# If we have images_with_size, write raw html including width,
# height, and alt attributes
if self.images_as_html or (
self.images_with_size and ("width" in attrs or "height" in attrs)
):
self.o("<img src='" + img_src + "' ")
Example URL: https://jingyan.baidu.com/article/4dc40848741808c8d946f18a.html (WeChat articles are affected as well).
I cannot get all the images from such HTML, so I manually changed the code of my locally installed html2text, and it works.
I hope this can be considered in the future.