emacsattic / readability

Readability for Emacs
39 stars 2 forks source link

Character parsing from HTML codepoint #3

Closed wvxvw closed 10 years ago

wvxvw commented 10 years ago

I ran into this error:

Debugger entered--Lisp error: (invalid-read-syntax "#")
  read("#39;")
  (format "%c" (read $hex))
  (insert (format "%c" (read $hex)))
  (let (($beg (match-beginning 0)) ($end (match-end 0)) ($hex (match-string 1))) (delete-region $beg $end) (insert (format "%c" (read $hex))))
  (while (re-search-forward "&\\(#[^;]+;\\)" nil t) (let (($beg (match-beginning 0)) ($end (match-end 0)) ($hex (match-string 1))) (delete-region $beg $end) (insert (format "%c" (read $hex)))))
  (progn (insert $string) (goto-char (point-min)) (while (re-search-forward "&\\(#[^;]+;\\)" nil t) (let (($beg (match-beginning 0)) ($end (match-end 0)) ($hex (match-string 1))) (delete-region $beg $end) (insert (format "%c" (read $hex))))) (buffer-string))
  (unwind-protect (progn (insert $string) (goto-char (point-min)) (while (re-search-forward "&\\(#[^;]+;\\)" nil t) (let (($beg (match-beginning 0)) ($end (match-end 0)) ($hex (match-string 1))) (delete-region $beg $end) (insert (format "%c" (read $hex))))) (buffer-string)) (and (buffer-name temp-buffer) (kill-buffer temp-buffer)))
  (save-current-buffer (set-buffer temp-buffer) (unwind-protect (progn (insert $string) (goto-char (point-min)) (while (re-search-forward "&\\(#[^;]+;\\)" nil t) (let (($beg (match-beginning 0)) ($end (match-end 0)) ($hex (match-string 1))) (delete-region $beg $end) (insert (format "%c" (read $hex))))) (buffer-string)) (and (buffer-name temp-buffer) (kill-buffer temp-buffer))))
  (let ((temp-buffer (generate-new-buffer " *temp*"))) (save-current-buffer (set-buffer temp-buffer) (unwind-protect (progn (insert $string) (goto-char (point-min)) (while (re-search-forward "&\\(#[^;]+;\\)" nil t) (let (($beg ...) ($end ...) ($hex ...)) (delete-region $beg $end) (insert (format "%c" ...)))) (buffer-string)) (and (buffer-name temp-buffer) (kill-buffer temp-buffer)))))
  readability--decode-json-string("<pre>long text skipped</pre>" #<buffer *emacs*>)
  async-when-done(#<process emacs> "finished\n")

And I believe that the culprit is this:

(defun readability--decode-json-string ($string)
  (with-temp-buffer
    (insert $string)
    (goto-char (point-min))
-   (while (re-search-forward "\&\\(\#[^;]+;\\)" nil t)
+   (while (re-search-forward "\&\#\\([^;]+\\);" nil t)
      (let (($beg (match-beginning 0))
            ($end (match-end 0))
            ($hex (match-string 1)))
        (delete-region $beg $end)
-       (insert (format "%c" (read $hex)))))
+       (insert (format "%c" (string-to-number $hex 16)))))
    (buffer-string)))

I.e. you wanted to extract just the integer and insert the corresponding character, but the capture group also includes the `#` and the `;`, so `read` is handed "#39;", which is not valid Lisp read syntax.

ShingoFukuyama commented 10 years ago

I didn't expect that sort of error. This function is meant for decoding multi-byte characters written as hexadecimal references such as &#x826F;.

(readability--decode-json-string
 "<p>&#x826F;&#x3044;&#x30D7;&#x30ED;&#x30B0;&#x30E9;&#x30DE;&#x30FC;&#x306F; &#39; </p>")
;; => "<p>良いプログラマーは &#39; </p>"
(string-to-number "826F" 16)
;; => 33391
(format "%c" (read "#x826F"))
;; => "良"

Now &#39; is left intact. Thank you!