lexborisov / myhtml

Fast C/C++ HTML 5 Parser. Using threads.
GNU Lesser General Public License v2.1
1.66k stars 147 forks source link

Lost MyHTML_TOKEN_TYPE_DONE on invalid <br> tag #187

Open Azq2 opened 3 years ago

Azq2 commented 3 years ago
<svg></br><foo>

Code to reproduce:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <myhtml/myhtml.h>
#include <myhtml/serialization.h>

/* Write `level` indentation steps (four spaces each) to stderr. */
static void dumper_indent(size_t level) {
    for (size_t i = 0; i < level; i++) {
        fprintf(stderr, "    ");
    }
}

/*
 * Dump a node and all of its following siblings to stderr as a pseudo-XML
 * tree, recursing into children with one extra indentation level.
 *
 * For every node the opening line shows the numeric tag id and the state of
 * the node's token:
 *   "NULL"     - the node has no token attached,
 *   "DONE"     - the token's type has the MyHTML_TOKEN_TYPE_DONE bit set,
 *   "NOT DONE" - the token exists but that bit is clear.
 *
 * node  - first node to print; NULL prints nothing.
 * level - indentation depth of `node` (0 for the root).
 */
void dumper(myhtml_tree_node_t *node, size_t level) {
    for (; node != NULL; node = node->next) {
        const char *token_state = "NULL";

        if (node->token) {
            token_state = (node->token->type & MyHTML_TOKEN_TYPE_DONE)
                              ? "DONE"
                              : "NOT DONE";
        }

        dumper_indent(level);
        /* The tag id is printed through size_t/%zu so that the conversion
         * specifier always matches the argument type. */
        fprintf(stderr, "<%zu token='%s'>\n", (size_t)node->tag_id, token_state);

        if (node->child) {
            dumper(node->child, level + 1);
        }

        dumper_indent(level);
        fprintf(stderr, "</%zu>\n", (size_t)node->tag_id);
    }
}

int main(int argc, const char * argv[]) {
    const char* html = "<svg></br><foo>";

    // basic init
    myhtml_t* myhtml = myhtml_create();
    myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);

    // init tree
    myhtml_tree_t* tree = myhtml_tree_create();
    myhtml_tree_init(tree, myhtml);

    // parse html
    myhtml_parse(tree, MyENCODING_UTF_8, html, strlen(html));

    myhtml_tree_node_t *node = myhtml_tree_get_document(tree);
    dumper(node, 0);

    return 0;
}

Output:

<0 token='NULL'>
    <65 token='NULL'>
        <61 token='NULL'>
        </61>
        <23 token='NULL'>
            <124 token='DONE'>
                <24 token='NOT DONE'>
                </24>
                <252 token='DONE'>
                </252>
            </124>
        </23>
    </65>
</0>

The token of the node with tag id=24 (MyHTML_TAG_BR) is missing the MyHTML_TOKEN_TYPE_DONE flag.