wp-media / wp-rocket

Performance optimization plugin for WordPress
https://wp-rocket.me
GNU General Public License v2.0
698 stars 218 forks source link

3.17 - Identify unwanted body modification from DOMDocument to bail out early #6952

Closed MathieuLamiot closed 1 month ago

MathieuLamiot commented 1 month ago

Context

https://wp-media.slack.com/archives/CUT7FLHF1/p1725545994953119?thread_ts=1725532083.335009&cid=CUT7FLHF1

Expected behavior Identify when DOMDocument modifies something other than the elements we added hashes to. If it did, bail out early so as not to perform the optimization.

Acceptance Criteria No broken layout when applying hashes on the templates:

https://wp-media.slack.com/archives/CUT7FLHF1/p1725532083335009 https://wp-media.slack.com/archives/C07LKLTKBG8/p1725515496731509

MathieuLamiot commented 1 month ago

I tried this approach with a quick prototype. It bails out on almost every page of the e2e websites: DOMDocument always introduces at least one small change, so I think this approach is too broad:

Image

    /**
     * Runs the page body through DOMDocument to add hashes, bailing out when
     * DOMDocument alters the body on its own.
     *
     * The <body> markup is extracted from the raw HTML and compared with the
     * <body> markup DOMDocument serializes right after parsing. Any difference
     * means the parser rewrote the markup, so the original HTML is returned
     * untouched. Otherwise the body element and the value of get_depth() are
     * handed to add_hash_to_element(), and the resulting body is spliced back
     * into the original HTML.
     *
     * Every failure path logs through Logger::error() and returns the original
     * HTML unchanged.
     *
     * @param string $html HTML content of the page.
     *
     * @return string HTML with the processed body, or the original HTML when
     *                the optimization cannot be applied.
     */
    public function add_hashes( $html ) {
        // Single definition of the pattern used to isolate the <body> element.
        $body_pattern = '/(?><body[^>]*>)(?>.*?<\/body>)/is';

        // Find the body before DOMDocument processing.
        if ( ! preg_match( $body_pattern, $html, $matches ) ) {
            Logger::error( 'Body element not found in the HTML content.', [ 'LazyRenderContent' ] );

            return $html;
        }

        $body_before = $matches[0];

        $internal_errors = libxml_use_internal_errors( true );

        // Load HTML into DOMDocument.
        $dom    = new DOMDocument();
        $loaded = $dom->loadHTML( $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );

        // Collect the errors only when loading failed, then always empty the
        // libxml buffer and restore the previous error mode so the global
        // libxml state does not leak out of this method on any path.
        $parse_errors = $loaded ? [] : libxml_get_errors();

        libxml_clear_errors();
        libxml_use_internal_errors( $internal_errors );

        if ( ! $loaded ) {
            foreach ( $parse_errors as $error ) {
                Logger::error( $error->message, [ 'LazyRenderContent' ] );
            }

            return $html;
        }

        // Find the body after DOMDocument processing.
        $html_parsed = $dom->saveHTML();

        if ( ! preg_match( $body_pattern, $html_parsed, $matches ) ) {
            Logger::error( 'Body element not found in the HTML DOMDocument parsed content.', [ 'LazyRenderContent' ] );

            return $html;
        }

        $body_after = $matches[0];

        // Any difference at this point was introduced by the parser itself.
        if ( $body_before !== $body_after ) {
            Logger::error( 'Body element has unwanted modifications from DOMDocument.', [ 'LazyRenderContent' ] );

            return $html;
        }

        // Now, DOMDocument usage is safe.
        // Get the body element in DOMDocument and inject hashes recursively.
        $body = $dom->getElementsByTagName( 'body' )->item( 0 );

        if ( ! $body ) {
            Logger::error( 'Body element not found in the DOMDocument content.', [ 'LazyRenderContent' ] );

            return $html;
        }

        $this->add_hash_to_element( $body, $this->get_depth() );

        // Extract the body from DOMDocument.
        $html_processed = $dom->saveHTML();

        if ( ! preg_match( $body_pattern, $html_processed, $matches ) ) {
            Logger::error( 'Body element not found in the HTML DOMDocument processed content.', [ 'LazyRenderContent' ] );

            return $html;
        }

        $body_processed = $matches[0];

        // Inject the manipulated body into the original HTML. A callback is
        // used so that "$1" or "\1" sequences inside the page content are
        // inserted literally instead of being read as backreferences.
        $count  = 0;
        $result = preg_replace_callback(
            $body_pattern,
            static function () use ( $body_processed ) {
                return $body_processed;
            },
            $html,
            1,
            $count
        );

        // null signals a PCRE error; a zero count means nothing was replaced.
        if ( null === $result || 0 === $count ) {
            Logger::error( 'Body element not found in the HTML content when trying to replace.', [ 'LazyRenderContent' ] );

            return $html;
        }

        return $result;
    }