tautologistics / node-htmlparser

Forgiving HTML/XML/RSS Parser in JS for *both* Node and Browsers
MIT License
1.15k stars 139 forks source link

"<" symbol not parsed as text when common browsers do it #72

Open davidfoliveira opened 9 years ago

davidfoliveira commented 9 years ago

If text in the HTML contains a "<", everything after it is not parsed as text. Example: <title>We <3cupcakes</title>

The "<3cupcakes" is interpreted as a tag, whereas common browsers parse it as text.

davidfoliveira commented 9 years ago

Suggested patch:

diff -rc node_modules/htmlparser/lib/htmlparser.js node_modules/new_htmlparser/lib/htmlparser.js
*** node_modules/htmlparser/lib/htmlparser.js   Thu Apr 12 19:04:06 2012
--- node_modules/new_htmlparser/lib/htmlparser.js   Tue Mar 24 01:11:47 2015
***************
*** 219,225 ****
            this._next = Parser._reTags.lastIndex - 1;
            var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
            var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
!   
            //A new element to eventually be appended to the element list
            var element = {
                  raw: rawData
--- 219,238 ----
            this._next = Parser._reTags.lastIndex - 1;
            var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
            var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
!           // A tag element that doesn't finish with a '>' ? Naah.. this is text
!           if ( this._parseState == ElementType.Tag && tagSep != ">" && rawData.substr(0,3) != "!--" ) {
!               var prevElement = (this._elements.length > 0) ? this._elements[this._elements.length-1] : null;
!               if ( prevElement && prevElement.type == ElementType.Text ) {
!                   prevElement.raw += '<'+rawData;
!                   prevElement.data += '<'+rawData;
!                   this._current = this._next+1;
!                   continue;
!               }
!               else {
!                   this._parseState = ElementType.Text;
!                   rawData = '<'+rawData;
!               }
!           }
            //A new element to eventually be appended to the element list
            var element = {
                  raw: rawData
eranimo commented 8 years ago

Can you submit a PR?