google-code-export / pdfium

Automatically exported from code.google.com/p/pdfium
1 stars 0 forks source link

CFDE_XMLSyntaxParser::DoSyntaxParse() doesn't parse <![CDATA[ ... ]]> sections properly #90

Open GoogleCodeExporter opened 9 years ago

GoogleCodeExporter commented 9 years ago
This is easiest to see if you modify DoSyntaxParse() to log its internal state
on each pass through the loop, at or around fde_xml.cpp:1623:

       while (m_pStart < m_pEnd) {
            ch = *m_pStart;
 +           FX_LPDWORD topPointer = m_SkipStack.GetTopElement();
 +           printf("state = %u, m_SkipChar = %c, stack = %c, ch = %c\n",
                   m_dwMode, m_SkipChar, topPointer ? *topPointer : 0, ch);
            switch (m_dwMode) {
 ....

Then you need to run input like
   <script contentType="application/x-javascript">
<![CDATA[
if (a[1] < 3)
  app.alert("Tclams");
]]>
</script>

through it, and watch how the state machine never leaves the
FDE_XMLSYNTAXMODE_SkipDeclNode (14) state:

state = 0, m_SkipChar = , stack = , ch = <
state = 0, m_SkipChar = , stack = , ch = <
state = 1, m_SkipChar = , stack = , ch = s
state = 3, m_SkipChar = , stack = , ch = s
state = 3, m_SkipChar = , stack = , ch = c
state = 3, m_SkipChar = , stack = , ch = r
state = 3, m_SkipChar = , stack = , ch = i
state = 3, m_SkipChar = , stack = , ch = p
state = 3, m_SkipChar = , stack = , ch = t
state = 3, m_SkipChar = , stack = , ch =  
state = 4, m_SkipChar = , stack = , ch =  
state = 4, m_SkipChar = , stack = , ch = c
state = 4, m_SkipChar = , stack = , ch = o
state = 4, m_SkipChar = , stack = , ch = n
state = 4, m_SkipChar = , stack = , ch = t
state = 4, m_SkipChar = , stack = , ch = e
state = 4, m_SkipChar = , stack = , ch = n
state = 4, m_SkipChar = , stack = , ch = t
state = 4, m_SkipChar = , stack = , ch = T
state = 4, m_SkipChar = , stack = , ch = y
state = 4, m_SkipChar = , stack = , ch = p
state = 4, m_SkipChar = , stack = , ch = e
state = 4, m_SkipChar = , stack = , ch = =
state = 5, m_SkipChar = , stack = , ch = =
state = 6, m_SkipChar = , stack = , ch = "
state = 7, m_SkipChar = , stack = , ch = a
state = 7, m_SkipChar = , stack = , ch = p
state = 7, m_SkipChar = , stack = , ch = p
state = 7, m_SkipChar = , stack = , ch = l
state = 7, m_SkipChar = , stack = , ch = i
state = 7, m_SkipChar = , stack = , ch = c
state = 7, m_SkipChar = , stack = , ch = a
state = 7, m_SkipChar = , stack = , ch = t
state = 7, m_SkipChar = , stack = , ch = i
state = 7, m_SkipChar = , stack = , ch = o
state = 7, m_SkipChar = , stack = , ch = n
state = 7, m_SkipChar = , stack = , ch = /
state = 7, m_SkipChar = , stack = , ch = x
state = 7, m_SkipChar = , stack = , ch = -
state = 7, m_SkipChar = , stack = , ch = j
state = 7, m_SkipChar = , stack = , ch = a
state = 7, m_SkipChar = , stack = , ch = v
state = 7, m_SkipChar = , stack = , ch = a
state = 7, m_SkipChar = , stack = , ch = s
state = 7, m_SkipChar = , stack = , ch = c
state = 7, m_SkipChar = , stack = , ch = r
state = 7, m_SkipChar = , stack = , ch = i
state = 7, m_SkipChar = , stack = , ch = p
state = 7, m_SkipChar = , stack = , ch = t
state = 7, m_SkipChar = , stack = , ch = "
state = 4, m_SkipChar = , stack = , ch = >
state = 12, m_SkipChar = , stack = , ch = >
state = 0, m_SkipChar = , stack = , ch = 

state = 0, m_SkipChar = , stack = , ch = <
state = 0, m_SkipChar = , stack = , ch = <
state = 1, m_SkipChar = , stack = , ch = !
state = 17, m_SkipChar = , stack = , ch = [
state = 14, m_SkipChar = >, stack = >, ch = [
state = 14, m_SkipChar = ], stack = ], ch = C
state = 14, m_SkipChar = ], stack = ], ch = D
state = 14, m_SkipChar = ], stack = ], ch = A
state = 14, m_SkipChar = ], stack = ], ch = T
state = 14, m_SkipChar = ], stack = ], ch = A
state = 14, m_SkipChar = ], stack = ], ch = [
state = 14, m_SkipChar = ], stack = ], ch = 

state = 14, m_SkipChar = ], stack = ], ch = i
state = 14, m_SkipChar = ], stack = ], ch = f
state = 14, m_SkipChar = ], stack = ], ch =  
state = 14, m_SkipChar = ], stack = ], ch = (
state = 14, m_SkipChar = ), stack = ), ch = a
state = 14, m_SkipChar = ), stack = ), ch = [
state = 14, m_SkipChar = ], stack = ], ch = 1
state = 14, m_SkipChar = ], stack = ], ch = ]
state = 14, m_SkipChar = ), stack = ), ch =  
state = 14, m_SkipChar = ), stack = ), ch = <
state = 14, m_SkipChar = >, stack = >, ch =  
state = 14, m_SkipChar = >, stack = >, ch = 3
state = 14, m_SkipChar = >, stack = >, ch = )
state = 14, m_SkipChar = >, stack = >, ch = 

state = 14, m_SkipChar = >, stack = >, ch =  
state = 14, m_SkipChar = >, stack = >, ch =  
state = 14, m_SkipChar = >, stack = >, ch = a
state = 14, m_SkipChar = >, stack = >, ch = p
state = 14, m_SkipChar = >, stack = >, ch = p
state = 14, m_SkipChar = >, stack = >, ch = .
state = 14, m_SkipChar = >, stack = >, ch = a
state = 14, m_SkipChar = >, stack = >, ch = l
state = 14, m_SkipChar = >, stack = >, ch = e
state = 14, m_SkipChar = >, stack = >, ch = r
state = 14, m_SkipChar = >, stack = >, ch = t
state = 14, m_SkipChar = >, stack = >, ch = (
state = 14, m_SkipChar = ), stack = ), ch = "
state = 14, m_SkipChar = ", stack = ", ch = T
state = 14, m_SkipChar = ", stack = ", ch = c
state = 14, m_SkipChar = ", stack = ", ch = l
state = 14, m_SkipChar = ", stack = ", ch = a
state = 14, m_SkipChar = ", stack = ", ch = m
state = 14, m_SkipChar = ", stack = ", ch = s
state = 14, m_SkipChar = ", stack = ", ch = "
state = 14, m_SkipChar = ), stack = ), ch = )
state = 14, m_SkipChar = >, stack = >, ch = ;
state = 14, m_SkipChar = >, stack = >, ch = 

state = 14, m_SkipChar = >, stack = >, ch = ]
state = 14, m_SkipChar = >, stack = >, ch = ]
state = 14, m_SkipChar = >, stack = >, ch = >
state = 14, m_SkipChar = ), stack = ), ch = 

state = 14, m_SkipChar = ), stack = ), ch = <
state = 14, m_SkipChar = >, stack = >, ch = /
state = 14, m_SkipChar = >, stack = >, ch = s
state = 14, m_SkipChar = >, stack = >, ch = c
state = 14, m_SkipChar = >, stack = >, ch = r
state = 14, m_SkipChar = >, stack = >, ch = i
state = 14, m_SkipChar = >, stack = >, ch = p
state = 14, m_SkipChar = >, stack = >, ch = t
state = 14, m_SkipChar = >, stack = >, ch = >
state = 14, m_SkipChar = ), stack = ), ch = 

state = 14, m_SkipChar = ), stack = ), ch = <
state = 14, m_SkipChar = >, stack = >, ch = s
state = 14, m_SkipChar = >, stack = >, ch = c
state = 14, m_SkipChar = >, stack = >, ch = r
state = 14, m_SkipChar = >, stack = >, ch = i
state = 14, m_SkipChar = >, stack = >, ch = p
state = 14, m_SkipChar = >, stack = >, ch = t
state = 14, m_SkipChar = >, stack = >, ch =  

The parser is getting confused by the less-than sign in the "if" statement (or
by any other punctuation it expects to be balanced) inside the CDATA section.
There's no reason to believe that any data in this section follows the XML
rules; the only thing the parser needs to look for is the literal "]]>", and
nothing else.  For example, I also believe it will exit the section early given
data like:

  <![CDATA[foo]bar]>

since it's merely popping matching characters and not looking for the literal 
]]>.

This makes running JS as part of test cases rather difficult.

Original issue reported on code.google.com by tsepez@chromium.org on 9 Dec 2014 at 1:13