Open GoogleCodeExporter opened 9 years ago
What steps will reproduce the problem? I first ran into this issue using the new feature for producing Google Base feeds but it potentially affects all operation of the module. 1. Create an item in OSCommerce and, in the description, enter named HTML entities like &nbsp; 2. Generate the Google Base Feed and validate it. What is the expected output? What do you see instead? You would expect proper XML output. Instead we get an XML parse error on the place where you have the Â. The error occurs because there is no DTD for named HTML entities and they are not being fixed by the google_xml_builder class. What version of the product are you using? On what operating system? 1.5.0 RC1 / PHP 4 / Linux Please provide any additional information below. This occurs because you sanitize the values using "htmlentities" function. This function only sanitizes a small number of entities. A broader function that fixes ALL entities and replaces them with their NUMERIC entity values can be found below: /** * Sanitizes all Text to make sure we output a valid utf8 text * This is better than htmlentities because htmlentities only encodes * a few characters whereas we need to encode ALL of them */ /* html_convert_entities($string) -- convert named HTML entities to * XML-compatible numeric entities. */ function html_convert_entities($s) { $sOut = preg_replace_callback("/&([a-zA-Z][a-zA-Z0-9]+);/", array( &$this, 'convert_entity'), htmlentities($s), -1); return utf8_encode($sOut); } /* Swap HTML named entity with its numeric equivalent. If the entity isn't * in the lookup table, this function returns a blank, which destroys the * character in the output - this is probably the desired behaviour when * producing XML. 
*/ function convert_entity($matches) { static $table = array('quot' => '&#34;', 'amp' => '&#38;', 'lt' => '&#60;', 'gt' => '&#62;', 'OElig' => '&#338;', 'oelig' => '&#339;', 'Scaron' => '&#352;', 'scaron' => '&#353;', 'Yuml' => '&#376;', 'circ' => '&#710;', 'tilde' => '&#732;', 'ensp' => '&#8194;', 'emsp' => '&#8195;', 'thinsp' => '&#8201;', 'zwnj' => '&#8204;', 'zwj' => '&#8205;', 'lrm' => '&#8206;', 'rlm' => '&#8207;', 'ndash' => '&#8211;', 'mdash' => '&#8212;', 'lsquo' => '&#8216;', 'rsquo' => '&#8217;', 'sbquo' => '&#8218;', 'ldquo' => '&#8220;', 'rdquo' => '&#8221;', 'bdquo' => '&#8222;', 'dagger' => '&#8224;', 'Dagger' => '&#8225;', 'permil' => '&#8240;', 'lsaquo' => '&#8249;', 'rsaquo' => '&#8250;', 'euro' => '&#8364;', 'fnof' => '&#402;', 'Alpha' => '&#913;', 'Beta' => '&#914;', 'Gamma' => '&#915;', 'Delta' => '&#916;', 'Epsilon' => '&#917;', 'Zeta' => '&#918;', 'Eta' => '&#919;', 'Theta' => '&#920;', 'Iota' => '&#921;', 'Kappa' => '&#922;', 'Lambda' => '&#923;', 'Mu' => '&#924;', 'Nu' => '&#925;', 'Xi' => '&#926;', 'Omicron' => '&#927;', 'Pi' => '&#928;', 'Rho' => '&#929;', 'Sigma' => '&#931;', 'Tau' => '&#932;', 'Upsilon' => '&#933;', 'Phi' => '&#934;', 'Chi' => '&#935;', 'Psi' => '&#936;', 'Omega' => '&#937;', 'alpha' => '&#945;', 'beta' => '&#946;', 'gamma' => '&#947;', 'delta' => '&#948;', 'epsilon' => '&#949;', 'zeta' => '&#950;', 'eta' => '&#951;', 'theta' => '&#952;', 'iota' => '&#953;', 'kappa' => '&#954;', 'lambda' => '&#955;', 'mu' => '&#956;', 'nu' => '&#957;', 'xi' => '&#958;', 'omicron' => '&#959;', 'pi' => '&#960;', 'rho' => '&#961;', 'sigmaf' => '&#962;', 'sigma' => '&#963;', 'tau' => '&#964;', 'upsilon' => '&#965;', 'phi' => '&#966;', 'chi' => '&#967;', 'psi' => '&#968;', 'omega' => '&#969;', 'thetasym' => '&#977;', 'upsih' => '&#978;', 'piv' => '&#982;', 'bull' => '&#8226;', 'hellip' => '&#8230;', 'prime' => '&#8242;', 'Prime' => '&#8243;', 'oline' => '&#8254;', 'frasl' => '&#8260;', 'weierp' => '&#8472;', 'image' => '&#8465;', 'real' => '&#8476;', 'trade' => '&#8482;', 'alefsym' => '&#8501;', 'larr' => '&#8592;', 'uarr' => '&#8593;', 'rarr' => '&#8594;', 'darr' => '&#8595;', 'harr' => '&#8596;', 'crarr' => '&#8629;', 'lArr' => '&#8656;', 'uArr' => '&#8657;', 'rArr' => '&#8658;', 'dArr' => '&#8659;', 'hArr' => '&#8660;', 'forall' => '&#8704;', 'part' => '&#8706;', 'exist' => '&#8707;', 'empty' => '&#8709;', 'nabla' => '&#8711;', 'isin' => '&#8712;', 'notin' => '&#8713;', 'ni' => '&#8715;', 'prod' => '&#8719;', 'sum' => '&#8721;', 'minus' => '&#8722;', 'lowast' => '&#8727;', 'radic' => '&#8730;', 'prop' => '&#8733;', 'infin' => '&#8734;', 'ang' => '&#8736;', 'and' => '&#8743;', 'or' => '&#8744;', 'cap' => 
'&#8745;', 'cup' => '&#8746;', 'int' => '&#8747;', 'there4' => '&#8756;', 'sim' => '&#8764;', 'cong' => '&#8773;', 'asymp' => '&#8776;', 'ne' => '&#8800;', 'equiv' => '&#8801;', 'le' => '&#8804;', 'ge' => '&#8805;', 'sub' => '&#8834;', 'sup' => '&#8835;', 'nsub' => '&#8836;', 'sube' => '&#8838;', 'supe' => '&#8839;', 'oplus' => '&#8853;', 'otimes' => '&#8855;', 'perp' => '&#8869;', 'sdot' => '&#8901;', 'lceil' => '&#8968;', 'rceil' => '&#8969;', 'lfloor' => '&#8970;', 'rfloor' => '&#8971;', 'lang' => '&#9001;', 'rang' => '&#9002;', 'loz' => '&#9674;', 'spades' => '&#9824;', 'clubs' => '&#9827;', 'hearts' => '&#9829;', 'diams' => '&#9830;', 'nbsp' => '&#160;', 'iexcl' => '&#161;', 'cent' => '&#162;', 'pound' => '&#163;', 'curren' => '&#164;', 'yen' => '&#165;', 'brvbar' => '&#166;', 'sect' => '&#167;', 'uml' => '&#168;', 'copy' => '&#169;', 'ordf' => '&#170;', 'laquo' => '&#171;', 'not' => '&#172;', 'shy' => '&#173;', 'reg' => '&#174;', 'macr' => '&#175;', 'deg' => '&#176;', 'plusmn' => '&#177;', 'sup2' => '&#178;', 'sup3' => '&#179;', 'acute' => '&#180;', 'micro' => '&#181;', 'para' => '&#182;', 'middot' => '&#183;', 'cedil' => '&#184;', 'sup1' => '&#185;', 'ordm' => '&#186;', 'raquo' => '&#187;', 'frac14' => '&#188;', 'frac12' => '&#189;', 'frac34' => '&#190;', 'iquest' => '&#191;', 'Agrave' => '&#192;', 'Aacute' => '&#193;', 'Acirc' => '&#194;', 'Atilde' => '&#195;', 'Auml' => '&#196;', 'Aring' => '&#197;', 'AElig' => '&#198;', 'Ccedil' => '&#199;', 'Egrave' => '&#200;', 'Eacute' => '&#201;', 'Ecirc' => '&#202;', 'Euml' => '&#203;', 'Igrave' => '&#204;', 'Iacute' => '&#205;', 'Icirc' => '&#206;', 'Iuml' => '&#207;', 'ETH' => '&#208;', 'Ntilde' => '&#209;', 'Ograve' => '&#210;', 'Oacute' => '&#211;', 'Ocirc' => '&#212;', 'Otilde' => '&#213;', 'Ouml' => '&#214;', 'times' => '&#215;', 'Oslash' => '&#216;', 'Ugrave' => '&#217;', 'Uacute' => '&#218;', 'Ucirc' => '&#219;', 'Uuml' => '&#220;', 'Yacute' => '&#221;', 'THORN' => '&#222;', 'szlig' => '&#223;', 'agrave' => '&#224;', 'aacute' => '&#225;', 'acirc' => '&#226;', 'atilde' => '&#227;', 'auml' => '&#228;', 'aring' => '&#229;', 'aelig' => '&#230;', 'ccedil' => '&#231;', 'egrave' => '&#232;', 'eacute' => '&#233;', 'ecirc' => '&#234;', 'euml' => '&#235;', 'igrave' => '&#236;', 'iacute' => '&#237;', 'icirc' => '&#238;', 'iuml' => '&#239;', 'eth' => '&#240;', 'ntilde' => '&#241;', 'ograve' => '&#242;', 'oacute' => '&#243;', 'ocirc' => '&#244;', 'otilde' => '&#245;', 'ouml' => '&#246;', 'divide' => '&#247;', 'oslash' => '&#248;', 'ugrave' => '&#249;', 'uacute' => '&#250;', 'ucirc' => '&#251;', 'uuml' => '&#252;', 'yacute' => '&#253;', 'thorn' => '&#254;', 'yuml' => '&#255;' 
); // Entity not found? Destroy it. if (isset($table[$matches[1]])) { return $table[$matches[1]]; } else { return ''; } } By opening google_xml_builder and replacing all calls to htmlentities with a call to this custom function (html_convert_entities) these issues disappear. Pedro
Original issue reported on code.google.com by Limor.Sc...@gmail.com on 13 Oct 2009 at 2:40
Limor.Sc...@gmail.com
Original issue reported on code.google.com by
Limor.Sc...@gmail.com
on 13 Oct 2009 at 2:40