dart-lang / html

Dart port of html5lib. For parsing HTML/HTML5 with Dart. Works in the client and on the server.
https://pub.dev/packages/html
Other
276 stars 58 forks source link

get only text node #224

Open insinfo opened 1 year ago

insinfo commented 1 year ago

I need to get only the text nodes of an html, I do it like this in PHP

<?php

class Html
{
    protected
        $reachedLimit = false,
        $totalLen = 0,
       // $maxLen = 25,
        $toRemove = array();

    public static function trim($html, $maxLen = 25)
    {

        $dom = new DomDocument();

            $dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

        $instance = new static();
        $toRemove = $instance->walk($dom, $maxLen);

        // remove any nodes that exceed limit
        foreach ($toRemove as $child) {
            $child->parentNode->removeChild($child);
        }

        // remove wrapper tags added by DD (doctype, html...)
        if (version_compare(PHP_VERSION, '5.4.0') < 0) {
            // http://stackoverflow.com/a/6953808/1058140
            $dom->removeChild($dom->firstChild);
            $dom->replaceChild($dom->firstChild->firstChild->firstChild, $dom->firstChild);

            return $dom->saveHTML();
        }

        return $dom->saveHTML();
    }

    protected function walk(DomNode $node, $maxLen)
    {

        if ($this->reachedLimit) {
            $this->toRemove[] = $node;
        } else {
            // only text nodes should have text,
            // so do the splitting here
            if ($node instanceof DomText) {
                //print_r($node instanceof DomText);
                //print_r($node->nodeValue);
              //  echo '------';
                $nodeLen = strlen($node->nodeValue);
                $this->totalLen += $nodeLen ;

                // use mb_strlen / mb_substr for UTF-8 support
                if ($this->totalLen > $maxLen) {
                    $node->nodeValue = substr($node->nodeValue, 0, $nodeLen - ($this->totalLen - $maxLen)) . '...';
                    $this->reachedLimit = true;
                }
            }

            // if node has children, walk its child elements
            if (isset($node->childNodes)) {
                foreach ($node->childNodes as $child) {
                    $this->walk($child, $maxLen);
                }
            }
        }

        return $this->toRemove;
    }
}

$str = "<div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit, 
            sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>Ut 
            enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip 
            ex ea commodo consequat. </p><p>Duis aute irure dolor in reprehenderit in 
            voluptate velit esse cillum dolore eu fugiat nulla pariatur. </p><p>Excepteur 
            sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit 
            anim id est laborum.</p></div>";

$str = Html::trim($str, 20);
print $str;
//result <div><p><b>Lorem</b> ipsum dolor si...</p></div>
insinfo commented 1 year ago

I implemented it like this but the behavior is wrong

class HtmlTrim {
  bool reachedLimit = false;
  int totalLen = 0;
  List<html.Node> toRemove = [];

  static String trim(String htmlString, {int limit = 25}) {
    final dom = html.parseFragment(htmlString);
    var instance = HtmlTrim();
    var toRemove = instance._walk(dom, limit);
    // remove any nodes that exceed limit
    for (var child in toRemove) {
      child.parentNode?.remove();
    }

    return dom.outerHtml;
  }

  List<html.Node> _walk(html.Node node, int maxLen) {
    if (reachedLimit) {
      if (node.firstChild != null) {
        toRemove.add(node.firstChild!);
      }
    } else {
      // only text nodes should have text,
      // so do the splitting here
      if (node.firstChild?.nodeType == html.Node.TEXT_NODE) {
        var nodeText = node.firstChild!;
        if (nodeText.text != null) {
          var nodeLen = nodeText.text!.length;
          this.totalLen += nodeLen;

          if (this.totalLen > maxLen) {
            nodeText.text = nodeText.text!
                    .substring(0, nodeLen - (this.totalLen - maxLen)) +
                '...';
            this.reachedLimit = true;
          }
        }
      }

      // if node has children, walk its child elements
      if (node.children.isNotEmpty) {
        for (var child in node.children) {         
          _walk(child, maxLen);
        }
      }
    }

    return this.toRemove;
  }
}

var html =
      '''<div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit, 
            sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>Ut 
            enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip 
            ex ea commodo consequat. </p><p>Duis aute irure dolor in reprehenderit in 
            voluptate velit esse cillum dolore eu fugiat nulla pariatur. </p><p>Excepteur 
            sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit 
            anim id est laborum.</p></div>''';

  var trim = HtmlTrim.trim(html, limit: 5);
  print('main: ${trim}');

// dart .\bin\teste_html_trim.dart
//Result: <div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit, 
//            sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>...</p></div>