jaspervdj / hakyll

A static website compiler library in Haskell
jaspervdj.be/hakyll
Other
2.71k stars 409 forks source link

Add a basic HTML compressor #956

Open 0xd34df00d opened 2 years ago

0xd34df00d commented 2 years ago

This adds a basic HTML compressor compiler, which, on my blog, reduces the size of an average code listing-heavy page by about 3-4%.

Minoru commented 1 year ago

@0xd34df00d, gentle ping ;) It'd be nice to get this merged.

rpearce commented 1 year ago

Ah! This would be most excellent to have. This is important to me because local dev on my website includes spaces that don't show up when the HTML eventually gets compressed over a CDN.

I believe this PR would allow me to not have to worry about design differences between local and prod

rpearce commented 1 year ago

Ok, @Minoru, I just ended up trying to do this all from scratch and ended up doing something very similar:

compressHtmlCompiler :: Item String -> Compiler (Item String)
compressHtmlCompiler = pure . fmap compressHtml

compressHtml :: String -> String
compressHtml = withTagList compressTags

compressTags :: [TS.Tag String] -> [TS.Tag String]
compressTags = go S.empty
  where
    go :: S.Set String -> [TS.Tag String] -> [TS.Tag String]
    go stack =
      \case [] -> []
            ((TS.TagComment _):rest) -> go stack rest
            (tag@(TS.TagOpen name _):rest) -> tag : go (S.insert name stack) rest
            (tag@(TS.TagClose name):rest) -> tag : go (S.delete name stack) rest
            (tag@(TS.TagText _):rest)
              | hasSignificantWhitespace stack -> tag : go stack rest
              | hasTextContent stack -> fmap cleanTabsNewLines tag : go stack rest
              | otherwise -> fmap cleanAll tag : go stack rest
            (tag:rest) -> tag : go stack rest

    -- Whitespace-sensitive content that shouldn't be compressed
    hasSignificantWhitespace :: S.Set String -> Bool
    hasSignificantWhitespace stack =
      any (`S.member` stack) content
      where
        content = [ "pre", "script", "textarea" ]

    -- Elements that can hold text content and should
    -- hold on to leading and trailing whitespace
    hasTextContent :: S.Set String -> Bool
    hasTextContent stack = any (`S.member` stack) content
      where
        content =
          [ "a", "abbr", "b", "bdi", "bdo", "blockquote", "button", "cite"
          , "code", "del", "dfn", "em", "figcaption", "h1", "h2", "h3", "h4"
          , "h5", "h6", "i", "img", "input", "ins", "kbd", "label", "li", "mark"
          , "math", "noscript", "object", "p", "picture", "q", "rp"
          , "rt", "ruby", "s", "samp", "select", "small", "span", "strong"
          , "sub", "sup", "svg", "td", "textarea", "time", "var", "wbr"
          ]

    -- Replace tab characters with spaces
    replaceTab :: Char -> Char
    replaceTab '\t' = ' '
    replaceTab s    = s

    -- Replace newline characters with spaces
    replaceNewLine :: Char -> Char
    replaceNewLine '\n' = ' '
    replaceNewLine s    = s

    -- Remove the following:
    --   '\f' (form feed)
    --   '\n' (newline [line feed])
    --   '\r' (carriage return)
    --   '\v' (vertical tab)
    rmNewLines :: String -> String
    rmNewLines = filter (not . (`elem` ("\f\n\r\v" :: String)))

    cleanTabsNewLines :: String -> String
    cleanTabsNewLines = fmap (replaceNewLine . replaceTab)

    cleanAll :: String -> String
    cleanAll = rmNewLines . trim . fmap replaceTab

Feel free to use this if you like on another PR, or anybody can use it if they like.

rpearce commented 1 year ago

Actually, I iterated on this and got it really simple and solid, I think. This cleans up 99.9% of the whitespace scenarios I could reasonably come up with:

compressHtmlCompiler :: Item String -> Compiler (Item String)
compressHtmlCompiler = pure . fmap compressHtml

compressHtml :: String -> String
compressHtml = withTagList compressTags

compressTags :: [TS.Tag String] -> [TS.Tag String]
compressTags = go S.empty
  where
    go :: S.Set String -> [TS.Tag String] -> [TS.Tag String]
    go stack =
      \case [] -> []
            -- Removes comments by not prepending the tag
            -- and, instead, continuing on with the other tags
            ((TS.TagComment _str):rest) ->
              go stack rest

            -- When we find an open tag, like `<div>`, prepend it
            -- and continue through the rest of the tags while
            -- keeping a separate stack of what elements a given
            -- tag is currently "inside"
            (tag@(TS.TagOpen name _attrs):rest) ->
              tag : go (S.insert name stack) rest

            -- When we find a closing tag, like `</div>`, prepend it
            -- it and continue through the rest of the tags, making
            -- sure to remove it from our stack of currently opened
            -- elements
            (tag@(TS.TagClose name):rest) ->
              tag : go (S.delete name stack) rest

            -- When a text/string tag is encountered, if it has
            -- significant whitespace that should be preserved,
            -- then prepend it without change; otherwise, clean up
            -- the whitespace, and prepend it
            (tag@(TS.TagText _str):rest)
              | hasSignificantWhitespace stack -> tag : go stack rest
              | otherwise -> fmap cleanWhitespace tag : go stack rest

            -- If none of the above match, then this is unexpected,
            -- so we should prepend the tag without change
            (tag:rest) ->
              tag : go stack rest

    -- Whitespace-sensitive content that shouldn't be compressed
    hasSignificantWhitespace :: S.Set String -> Bool
    hasSignificantWhitespace stack =
      any (`S.member` stack) content
      where
        content = [ "pre", "textarea" ]

    cleanWhitespace :: String -> String
    cleanWhitespace " " = " "
    cleanWhitespace str = cleanWS str (clean str)
      where
        -- Strips out newlines, spaces, etc
        clean :: String -> String
        clean = unwords . words

        -- Clean the whitespace while preserving
        -- single leading and trailing whitespace
        -- characters when it makes sense
        cleanWS :: String -> String -> String
        cleanWS _originalStr "" = ""
        cleanWS originalStr trimmedStr =
          keepSpaceWhen head originalStr ++
            trimmedStr ++
            keepSpaceWhen last originalStr

        -- Determine when to keep a space based on a
        -- string and a function that returns a character
        -- within that string
        keepSpaceWhen :: ([Char] -> Char) -> String -> String
        keepSpaceWhen _fn ""  = ""
        keepSpaceWhen fn originalStr
          | (isSpace . fn) originalStr = " "
          | otherwise = ""
Minoru commented 1 year ago

@rpearce, thanks for picking this up! Doesn't your solution suffer the same problem as the original one though, i.e. it swallows all kinds of spaces because it uses isSpace?

Other than that, it looks great, so if you want it merged it's probably time to send a pull request ;)

rpearce commented 1 year ago

The latest only uses isSpace for checking a for if there's a leading or trailing space it should hang on to.

It leverages unwords . words for cleanup, and I'm having pretty good results so far.

I'll see about opening a PR!

Minoru commented 1 year ago

My point is: if a string starts with multiple non-break spaces, isSpace would return True and non-break spaces will be collapsed into a single ordinary space, which is clearly wrong.

rpearce commented 1 year ago

Okay, I've got this working for me for allowing non-breaking unicode spaces, and I'm going to continue evaluating it before going further. Thanks for your feedback on somebody's abandoned PR 😅

Expand to view code ```haskell compressHtmlCompiler :: Item String -> Compiler (Item String) compressHtmlCompiler = pure . fmap compressHtml compressHtml :: String -> String compressHtml = withTagList compressTags compressTags :: [TS.Tag String] -> [TS.Tag String] compressTags = go Set.empty where go :: Set.Set String -> [TS.Tag String] -> [TS.Tag String] go stack = \case [] -> [] -- Removes comments by not prepending the tag -- and, instead, continuing on with the other tags ((TS.TagComment _str):rest) -> go stack rest -- When we find an open tag, like `
`, prepend it -- and continue through the rest of the tags while -- keeping a separate stack of what elements a given -- tag is currently "inside" (tag@(TS.TagOpen name _attrs):rest) -> tag : go (Set.insert name stack) rest -- When we find a closing tag, like `
`, prepend it -- it and continue through the rest of the tags, making -- sure to remove it from our stack of currently opened -- elements (tag@(TS.TagClose name):rest) -> tag : go (Set.delete name stack) rest -- When a text/string tag is encountered, if it has -- significant whitespace that should be preserved, -- then prepend it without change; otherwise, clean up -- the whitespace, and prepend it (tag@(TS.TagText _str):rest) | hasSignificantWhitespace stack -> tag : go stack rest | otherwise -> fmap cleanWhitespace tag : go stack rest -- If none of the above match, then this is unexpected, -- so we should prepend the tag without change (tag:rest) -> tag : go stack rest -- Whitespace-sensitive content that shouldn't be compressed hasSignificantWhitespace :: Set.Set String -> Bool hasSignificantWhitespace stack = any (`Set.member` stack) content where content = [ "pre", "script", "textarea" ] cleanWhitespace :: String -> String cleanWhitespace " " = " " cleanWhitespace str = cleanSurroundingWhitespace str (cleanHtmlWhitespace str) where -- Tests for the following: -- ' ' (space) -- '\f' (form feed) -- '\n' (newline [line feed]) -- '\r' (carriage return) -- '\v' (vertical tab) isSpaceOrNewLineIsh :: Char -> Bool isSpaceOrNewLineIsh = (`elem` (" \f\n\r\v" :: String)) -- Strips out newlines, spaces, etc cleanHtmlWhitespace :: String -> String cleanHtmlWhitespace = unwords . words' where -- Alternate `words` function that uses a different -- predicate than `isSpace` in order to avoid dropping -- certain types of spaces. -- https://hackage.haskell.org/package/base-4.17.0.0/docs/src/Data.OldList.html#words words' :: String -> [String] words' s = case dropWhile isSpaceOrNewLineIsh s of "" -> [] s' -> w : words' s'' where (w, s'') = break isSpaceOrNewLineIsh s' -- Clean the whitespace while preserving -- single leading and trailing whitespace -- characters when it makes sense cleanSurroundingWhitespace :: String -> String -> String cleanSurroundingWhitespace _originalStr "" = "" cleanSurroundingWhitespace originalStr trimmedStr = leadingStr ++ trimmedStr ++ trailingStr where leadingStr = keepSpaceWhen head originalStr trailingStr = keepSpaceWhen last originalStr -- Determine when to keep a space based on a -- string and a function that returns a character -- within that string keepSpaceWhen :: ([Char] -> Char) -> String -> String keepSpaceWhen _fn "" = "" keepSpaceWhen fn originalStr | (isSpaceOrNewLineIsh . fn) originalStr = " " | otherwise = "" ```