Open GoogleCodeExporter opened 9 years ago
I have a patch for this for HTML markup:
I would be happy to submit a 'patch' but I'm working from the 'one file release:
0.9.5 RC1'
/**
 * Loads markup into this wrapper, choosing the HTML or XML loader.
 *
 * When $this->contentType is set, it is split with contentTypeToArray() into
 * a content type and a charset, and the content type selects the loader.
 * A bare 'text' type, or no content type at all, falls back to
 * auto-detection via isXML().
 *
 * @param string $markup Markup to load.
 * @return mixed Result of the selected loader; false when no loader was
 *               applicable for the requested content type.
 */
protected function loadMarkup($markup) {
    $loaded = false;
    // Look for requested content-type
    $reqContentType = null;
    $reqCharset = null;
    if ($this->contentType) {
        list($reqContentType, $reqCharset) = $this->contentTypeToArray($this->contentType);
        if ($reqContentType === 'text')
            $reqContentType = null; // Auto-detect
    }
    if ($reqContentType) {
        self::debug("Load markup for content type {$this->contentType}");
        switch ($reqContentType) {
            case 'text/html':
                $loaded = $this->loadMarkupHTML($markup, $reqCharset);
                break;
            case 'text/xml':
            case 'application/xhtml+xml':
                $loaded = $this->loadMarkupXML($markup, $reqCharset);
                break;
            default:
                // For feeds or anything that sometimes doesn't use text/xml
                // (e.g. application/rss+xml). strpos() takes the haystack
                // first, so the content type is searched for 'xml'.
                if (strpos($this->contentType, 'xml') !== false)
                    $loaded = $this->loadMarkupXML($markup, $reqCharset);
                else
                    phpQuery::debug("Could not determine document type from content type '{$this->contentType}'");
        }
    } else {
        // Content type autodetection
        if ($this->isXML($markup)) {
            $loaded = $this->loadMarkupXML($markup, $reqCharset);
            // isXHTML is checked after the XML attempt; when it is set,
            // fall back to the more forgiving HTML loader.
            if (! $loaded && $this->isXHTML) {
                phpQuery::debug('Loading as XML failed, trying to load as HTML');
                $loaded = $this->loadMarkupHTML($markup, $reqCharset);
            }
        } else {
            $loaded = $this->loadMarkupHTML($markup, $reqCharset);
        }
    }
    return $loaded;
}
/**
 * Clears the three document-kind flags (HTML, XHTML, XML) so that a
 * loader can set the one that applies.
 */
protected function loadMarkupReset() {
    $this->isHTML = false;
    $this->isXHTML = false;
    $this->isXML = false;
}
/**
 * Creates a fresh DOMDocument for this wrapper and records its charset.
 *
 * @param string $charset Encoding passed to the DOMDocument constructor and
 *                        stored in $this->charset.
 * @param string $version XML version; any empty value is treated as '1.0'.
 */
protected function documentCreate($charset, $version = '1.0') {
    if (empty($version)) {
        $version = '1.0';
    }
    $dom = new DOMDocument($version, $charset);
    $dom->encoding = $charset;
    $dom->formatOutput = true;
    $dom->preserveWhiteSpace = true;
    $this->document = $dom;
    $this->charset = $charset;
}
/**
 * Loads markup as HTML, either as a full document or as a document fragment.
 *
 * The working charset is chosen in this order: the charset returned by
 * charsetFromHTML() for the markup, then $requestedCharset, then
 * phpQuery::$defaultCharset. When a requested charset is given and differs
 * from the document charset (ISO-8859-1 is assumed when the markup declares
 * none), the markup is converted to the requested charset using mbstring,
 * if that extension is available.
 *
 * @param string      $markup           HTML markup to load.
 * @param string|null $requestedCharset Charset requested by the caller, if any.
 * @return mixed Result of documentFragmentLoadMarkup() for fragments,
 *               otherwise the result of DOMDocument::loadHTML().
 */
protected function loadMarkupHTML($markup, $requestedCharset = null) {
    if (phpQuery::$debug)
        phpQuery::debug('Full markup load (HTML): '.substr($markup, 0, 250));
    $this->loadMarkupReset();
    $this->isHTML = true;
    if (!isset($this->isDocumentFragment))
        $this->isDocumentFragment = self::isDocumentFragmentHTML($markup);
    $charset = null;
    $documentCharset = $this->charsetFromHTML($markup);
    $addDocumentCharset = false;
    if ($documentCharset) {
        $charset = $documentCharset;
        $markup = $this->charsetFixHTML($markup);
    } else if ($requestedCharset) {
        $charset = $requestedCharset;
    }
    if (! $charset)
        $charset = phpQuery::$defaultCharset;
    // HTTP 1.1 says that the default charset is ISO-8859-1
    // @see http://www.w3.org/International/O-HTTP-charset
    if (! $documentCharset) {
        $documentCharset = 'ISO-8859-1';
        $addDocumentCharset = true;
    }
    // Should be careful here, still need 'magic encoding detection' since
    // lots of pages have other 'default encoding'.
    // Worse, some pages can have mixed encodings... we'll try not to worry
    // about that.
    // The cast keeps strtoupper() from receiving null when no charset was
    // requested; the result is the same empty string.
    $requestedCharset = strtoupper((string) $requestedCharset);
    $documentCharset = strtoupper($documentCharset);
    phpQuery::debug("DOC: $documentCharset REQ: $requestedCharset");
    if ($requestedCharset && $documentCharset && $requestedCharset !== $documentCharset) {
        phpQuery::debug("CHARSET CONVERT");
        // Document Encoding Conversion
        // http://code.google.com/p/phpquery/issues/detail?id=86
        if (function_exists('mb_detect_encoding')) {
            $docEncoding = mb_detect_encoding($markup, $requestedCharset . ", AUTO");
            if (! $docEncoding)
                $docEncoding = $documentCharset; // ok trust the document
            phpQuery::debug("DETECTED '$docEncoding'");
            // NOTE: the detected encoding may not match what the document
            // declares ($documentCharset); that case is not handled
            // specially, the detected encoding wins.
            if ($docEncoding !== $requestedCharset) {
                phpQuery::debug("CONVERT $docEncoding => $requestedCharset");
                $markup = mb_convert_encoding($markup, $requestedCharset, $docEncoding);
                $charset = $requestedCharset;
            }
        }
        // Without mbstring no conversion is attempted (a fallback to other
        // native functions is not implemented).
    }
    $return = false;
    if ($this->isDocumentFragment) {
        phpQuery::debug("Full markup load (HTML), DocumentFragment detected, using charset '$charset'");
        $return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
    } else {
        if ($addDocumentCharset) {
            phpQuery::debug("Full markup load (HTML), appending charset: '$charset'");
            $markup = $this->charsetAppendToHTML($markup, $charset);
        }
        phpQuery::debug("Full markup load (HTML), documentCreate('$charset')");
        $this->documentCreate($charset);
        // Parser warnings are suppressed unless debug level is exactly 2.
        $return = phpQuery::$debug === 2
            ? $this->document->loadHTML($markup)
            : @$this->document->loadHTML($markup);
        if ($return)
            $this->root = $this->document;
    }
    if ($return && ! $this->contentType)
        $this->contentType = 'text/html';
    return $return;
}
Original comment by jbo...@gmail.com
on 26 Feb 2009 at 4:15
Thanks for the patch. I've applied it with some changes to the dev branch in r361.
Unfortunately this won't close this ticket, as it covers only HTML load-time
conversion.
PS. Please use "attach file" for big code chunks.
Original comment by tobiasz....@gmail.com
on 11 Mar 2009 at 11:59
[deleted comment]
Hi,
Maybe,
// Check that mbstring supports the detected encoding; otherwise let it
// auto-detect. mbstring treats encoding names case-insensitively, so the
// names are compared upper-cased (and strictly, as strings).
$supportedEncodings = array_map('strtoupper', mb_list_encodings());
if (!in_array(strtoupper($docEncoding), $supportedEncodings, true)) {
    $docEncoding = 'auto';
}
is needed before the mb_convert_encoding() call.
Original comment by andyn...@gmail.com
on 30 Mar 2015 at 2:42
Original issue reported on code.google.com by
tobiasz....@gmail.com
on 7 Dec 2008 at 1:37