thagenbeek / phpquery

Automatically exported from code.google.com/p/phpquery
0 stars 0 forks source link

Charset conversions #86

Open GoogleCodeExporter opened 9 years ago

GoogleCodeExporter commented 9 years ago
Automatic charset conversions:
 * loading document encoded with charset A as charset B
 * node transfers between documents
 * saving document in other charsets (maybe)

Original issue reported on code.google.com by tobiasz....@gmail.com on 7 Dec 2008 at 1:37

GoogleCodeExporter commented 9 years ago
I have a patch for this for HTML markup:

I would be happy to submit a proper patch, but I'm working from the one-file
release (0.9.5 RC1).

         /**
          * Loads markup into this wrapper, dispatching to the HTML or XML loader.
          *
          * When $this->contentType is set it is split into a content type and a
          * charset with contentTypeToArray(); a bare 'text' type is treated as
          * "auto-detect". Without a usable content type, isXML() decides which
          * loader is tried first.
          *
          * @param string $markup Markup to load.
          * @return mixed Result of the loader that ran; false when no loader ran.
          */
         protected function loadMarkup($markup) {
        $loaded = false;

        // Look for requested content-type
        $reqContentType = null;
        $reqCharset = null;

        if ($this->contentType) {
            list($reqContentType, $reqCharset) = $this->contentTypeToArray($this->contentType);
            if ($reqContentType === 'text')
                $reqContentType = null; // Auto-detect
        }

        if ($reqContentType) {
            self::debug("Load markup for content type {$this->contentType}");
            switch ($reqContentType) {
                case 'text/html':
                    $loaded = $this->loadMarkupHTML($markup, $reqCharset);
                break;
                case 'text/xml':
                case 'application/xhtml+xml':
                    $loaded = $this->loadMarkupXML($markup, $reqCharset);
                break;
                default:
                    // For feeds or anything else that doesn't use text/xml
                    // (e.g. application/rss+xml). strpos() takes the haystack
                    // first: search the content type for 'xml', not the reverse.
                    if (strpos($this->contentType, 'xml') !== false)
                        $loaded = $this->loadMarkupXML($markup, $reqCharset);
                    else
                        phpQuery::debug("Could not determine document type from content type '{$this->contentType}'");
            }
        } else {
            // content type autodetection
            if ($this->isXML($markup)) {
                $loaded = $this->loadMarkupXML($markup, $reqCharset);
                if (! $loaded && $this->isXHTML) {
                    // XML parsing failed for something flagged as XHTML: retry as HTML
                    phpQuery::debug('Loading as XML failed, trying to load as HTML');
                    $loaded = $this->loadMarkupHTML($markup, $reqCharset);
                }
            } else {
                $loaded = $this->loadMarkupHTML($markup, $reqCharset);
            }
        }
        return $loaded;
    }
    /**
     * Clears the document-kind flags (HTML / XHTML / XML) ahead of a new load.
     */
    protected function loadMarkupReset() {
        $this->isHTML = false;
        $this->isXHTML = false;
        $this->isXML = false;
    }
    /**
     * Creates a fresh DOMDocument for this wrapper and records its charset.
     *
     * @param string $charset Encoding given to the DOMDocument and stored in $this->charset.
     * @param string $version Document version; any falsy value is replaced by '1.0'.
     */
    protected function documentCreate($charset, $version = '1.0') {
        $dom = new DOMDocument($version ? $version : '1.0', $charset);
        $this->document = $dom;
        $this->charset = $charset;

        // Configure the document that was just attached to the wrapper.
        $dom->encoding = $charset;
        $dom->formatOutput = true;
        $dom->preserveWhiteSpace = true;
    }
    /**
     * Loads HTML markup into a new DOMDocument (or document fragment),
     * converting it to the requested charset when that differs from the
     * charset the document uses.
     *
     * Charset used for the DOMDocument, in order of preference: the charset
     * declared inside the markup (charsetFromHTML()), then $requestedCharset,
     * then phpQuery::$defaultCharset. If a conversion is performed, the
     * requested charset is used instead.
     *
     * @param string      $markup           HTML markup to load.
     * @param string|null $requestedCharset Charset the caller wants the document in, if any.
     * @return mixed Result of DOMDocument::loadHTML() or of
     *               documentFragmentLoadMarkup() for fragments.
     */
    protected function loadMarkupHTML($markup, $requestedCharset = null) {

        if (phpQuery::$debug)
            phpQuery::debug('Full markup load (HTML): '.substr($markup, 0, 250));
        $this->loadMarkupReset();
        $this->isHTML = true;
        if (!isset($this->isDocumentFragment))
            $this->isDocumentFragment = self::isDocumentFragmentHTML($markup);
        $charset = null;
        $documentCharset = $this->charsetFromHTML($markup);
        $addDocumentCharset = false;
        if ($documentCharset) {
            $charset = $documentCharset;
            $markup = $this->charsetFixHTML($markup);
        } else if ($requestedCharset) {
            $charset = $requestedCharset;
        }
        if (! $charset)
            $charset = phpQuery::$defaultCharset;

        // HTTP 1.1 says that the default charset is ISO-8859-1
        // @see http://www.w3.org/International/O-HTTP-charset
        if (! $documentCharset) {
            $documentCharset = 'ISO-8859-1';
            $addDocumentCharset = true;
        }
        // Should be careful here, still need 'magic encoding detection' since
        // lots of pages have other 'default encoding'.
        // Worse, some pages can have mixed encodings... we'll try not to worry about that

        // Cast first: $requestedCharset defaults to null, and passing null to
        // strtoupper() is deprecated on current PHP versions.
        $requestedCharset = strtoupper((string) $requestedCharset);
        $documentCharset = strtoupper($documentCharset);
        phpQuery::debug("DOC: $documentCharset REQ: $requestedCharset");

        if ($requestedCharset && $documentCharset && $requestedCharset !== $documentCharset) {
            phpQuery::debug("CHARSET CONVERT");
            // Document Encoding Conversion
            // http://code.google.com/p/phpquery/issues/detail?id=86
            if (function_exists('mb_detect_encoding')) {
                // Upper-cased set of every encoding name (and alias) mbstring
                // accepts. Unknown names make the mb_* functions warn or throw,
                // so both charsets are validated before they are used.
                $supported = array();
                foreach (mb_list_encodings() as $encodingName) {
                    $supported[strtoupper($encodingName)] = true;
                    if (function_exists('mb_encoding_aliases')) {
                        $aliases = mb_encoding_aliases($encodingName);
                        if (is_array($aliases)) {
                            foreach ($aliases as $alias)
                                $supported[strtoupper($alias)] = true;
                        }
                    }
                }

                if (! isset($supported[$requestedCharset])) {
                    phpQuery::debug("Charset '$requestedCharset' is not supported by mbstring, skipping conversion");
                } else {
                    $docEncoding = mb_detect_encoding($markup, $requestedCharset . ", AUTO");
                    if (! $docEncoding)
                        $docEncoding = $documentCharset; // ok trust the document

                    phpQuery::debug("DETECTED '$docEncoding'");

                    // mbstring reports canonical names in mixed case (e.g.
                    // 'Windows-1252'); normalise so the comparison below matches
                    // the upper-cased requested charset.
                    $docEncoding = strtoupper($docEncoding);

                    // NOTE: the detected encoding may disagree with what the
                    // document declares; the detected value is the one used as
                    // the conversion source.

                    // A charset taken from the document may be unknown to
                    // mbstring; let it guess the source encoding in that case.
                    if (! isset($supported[$docEncoding]))
                        $docEncoding = 'auto';

                    if ($docEncoding !== $requestedCharset) {
                        phpQuery::debug("CONVERT $docEncoding => $requestedCharset");

                        $converted = mb_convert_encoding($markup, $requestedCharset, $docEncoding);
                        // Keep the original markup if the conversion failed.
                        if ($converted !== false) {
                            $markup = $converted;
                            $charset = $requestedCharset;
                        }
                    }
                }
            } else {
                // Native functions?
            }
        }

        $return = false;
        if ($this->isDocumentFragment) {
            phpQuery::debug("Full markup load (HTML), DocumentFragment detected, using charset '$charset'");
            $return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
        } else {
            if ($addDocumentCharset) {
                // The markup declared no charset of its own, so add one.
                phpQuery::debug("Full markup load (HTML), appending charset: '$charset'");
                $markup = $this->charsetAppendToHTML($markup, $charset);
            }
            phpQuery::debug("Full markup load (HTML), documentCreate('$charset')");

            $this->documentCreate($charset);
            // Parser warnings are only shown at debug level 2.
            $return = phpQuery::$debug === 2
                ? $this->document->loadHTML($markup)
                : @$this->document->loadHTML($markup);
            if ($return)
                $this->root = $this->document;
        }
        if ($return && ! $this->contentType)
            $this->contentType = 'text/html';
        return $return;
    }

Original comment by jbo...@gmail.com on 26 Feb 2009 at 4:15

GoogleCodeExporter commented 9 years ago
Thanks for the patch. I've applied it, with some changes, to the dev branch in r361.

Unfortunately this won't close the ticket, as it covers only load-time
conversion of HTML.

PS. Please use "attach file" for big code chunks.

Original comment by tobiasz....@gmail.com on 11 Mar 2009 at 11:59

GoogleCodeExporter commented 9 years ago
[deleted comment]
GoogleCodeExporter commented 9 years ago
Hi,

Maybe,
   // Check that mbstring knows the encoding. Its names are case-insensitive,
   // but mb_list_encodings() returns canonical mixed-case names, so both sides
   // are upper-cased before comparing.
   $supportedEncodings = array_map('strtoupper', mb_list_encodings());
   if (!in_array(strtoupper($docEncoding), $supportedEncodings, true)) {
       // Unknown encoding: let mbstring guess the source encoding.
       $docEncoding = 'auto';
   }
is needed before the mb_convert_encoding() call.

Original comment by andyn...@gmail.com on 30 Mar 2015 at 2:42