rrrene / html_sanitize_ex

HTML sanitizer for Elixir
MIT License
271 stars 62 forks source link

Meta.strip_everything_not_covered leaves the content within a <script> tag. #29

Closed sergiotapia closed 3 years ago

sergiotapia commented 7 years ago

I'm trying to scrape my own blog post here: https://sergiotapia.me/phoenix-framework-uploading-to-amazon-s3-e70657bd2013

Actual HTML is:

<script type="application/ld+json">{"@context":"http://schema.org","@type":"NewsArticle","image":{"@type":"ImageObject","width":848,"height":346,"url":"https://cdn-images-1.medium.com/max/848/1*LJy3CSjxbpcsK165R5zemQ.png"},"datePublished":"2017-08-05T17:17:49.000Z","dateModified":"2017-08-09T11:42:36.889Z","headline":"Phoenix Framework — Direct Uploading to Amazon S3.","name":"Phoenix Framework — Direct Uploading to Amazon S3.","keywords":["Web Development","Elixir","Phoenix Framework","Amazon S3"],"author":{"@type":"Person","name":"Sergio Tapia","url":"https://sergiotapia.me/@sergiocodes"},"creator":["Sergio Tapia"],"publisher":{"@type":"Organization","name":"sergiotapia","url":"https://sergiotapia.me","logo":{"@type":"ImageObject","width":106,"height":60,"url":"https://cdn-images-1.medium.com/max/106/1*ITIwmsAcKr1uJyEGpkEN9Q.jpeg"}},"mainEntityOfPage":"https://sergiotapia.me/phoenix-framework-uploading-to-amazon-s3-e70657bd2013"}</script>

In my custom scrubber I have this:

defmodule HtmlScrubber do
  @moduledoc """
  Custom scrubber assembled from the `HtmlSanitizeEx.Scrubber.Meta` macros.

  It declares a fixed list of allowed tags (basic text formatting, headings,
  lists, tables, links and images) together with the attributes permitted on
  each, and finishes with `Meta.strip_everything_not_covered/0` for whatever
  was not declared.

  Observed behaviour (see the report this module is quoted in): for a tag that
  is not declared here, such as `<script>`, the tag itself is removed but its
  text content is left in the output.
  """

  require HtmlSanitizeEx.Scrubber.Meta
  alias HtmlSanitizeEx.Scrubber.Meta

  # Scheme list handed to the URI-attribute declarations for <a href> and
  # <img src> below.
  @valid_schemes ["http", "https", "mailto"]

  # Drop CDATA sections before the traverser/scrubber sees the document.
  Meta.remove_cdata_sections_before_scrub()

  Meta.strip_comments()

  # Links: "href" is declared as a URI attribute; "name" and "title" are
  # declared as plain attributes.
  Meta.allow_tag_with_uri_attributes("a", ["href"], @valid_schemes)
  Meta.allow_tag_with_these_attributes("a", ["name", "title"])

  # Tags declared with an empty attribute list.
  Meta.allow_tag_with_these_attributes("b", [])
  Meta.allow_tag_with_these_attributes("blockquote", [])
  Meta.allow_tag_with_these_attributes("br", [])
  Meta.allow_tag_with_these_attributes("code", [])
  Meta.allow_tag_with_these_attributes("del", [])
  Meta.allow_tag_with_these_attributes("em", [])
  Meta.allow_tag_with_these_attributes("h1", [])
  Meta.allow_tag_with_these_attributes("h2", [])
  Meta.allow_tag_with_these_attributes("h3", [])
  Meta.allow_tag_with_these_attributes("h4", [])
  Meta.allow_tag_with_these_attributes("h5", [])
  Meta.allow_tag_with_these_attributes("hr", [])
  Meta.allow_tag_with_these_attributes("i", [])

  # Images: "src" is declared as a URI attribute; sizing and descriptive
  # attributes are declared as plain attributes.
  Meta.allow_tag_with_uri_attributes("img", ["src"], @valid_schemes)
  Meta.allow_tag_with_these_attributes("img", ["width", "height", "title", "alt"])

  # More tags declared with an empty attribute list (lists, paragraphs,
  # tables, inline emphasis).
  Meta.allow_tag_with_these_attributes("li", [])
  Meta.allow_tag_with_these_attributes("ol", [])
  Meta.allow_tag_with_these_attributes("p", [])
  Meta.allow_tag_with_these_attributes("pre", [])
  Meta.allow_tag_with_these_attributes("span", [])
  Meta.allow_tag_with_these_attributes("strong", [])
  Meta.allow_tag_with_these_attributes("table", [])
  Meta.allow_tag_with_these_attributes("tbody", [])
  Meta.allow_tag_with_these_attributes("td", [])
  Meta.allow_tag_with_these_attributes("th", [])
  Meta.allow_tag_with_these_attributes("thead", [])
  Meta.allow_tag_with_these_attributes("tr", [])
  Meta.allow_tag_with_these_attributes("u", [])
  Meta.allow_tag_with_these_attributes("ul", [])

  # Must stay last: this is the fallback for anything not declared above.
  # NOTE(review): presumably clause order matters because the macros expand to
  # function clauses matched top-down; confirm against the Meta source.
  Meta.strip_everything_not_covered()
end

The content ends up looking like:

Uploading to Amazon S3. – sergiotapia{\"@context\":\"http://schema.o...*snip*

So it's removing the <script> tag, but I'd like it to also remove the contents of the script tag. Any suggestions? I appreciate the help!

rrrene commented 3 years ago

Hi, sorry for the late response.

This has been answered here: https://github.com/rrrene/html_sanitize_ex/issues/13#issuecomment-427589003