system-manager-wakasu
1;;; org-protocol-capture-html.el --- Capture HTML with org-protocol
2
3;; URL: https://github.com/alphapapa/org-protocol-capture-html
4;; Version: 0.1-pre
5;; Package-Requires: ((emacs "24.4"))
6
7;;; Commentary:
8
9;; This package captures Web pages into Org-mode using Pandoc to
10;; process HTML. It can also use eww's eww-readable functionality to
11;; get the main content of a page.
12
13;; These are the helper functions that run in Emacs. To capture pages
14;; into Emacs, you can use either a browser bookmarklet or the
15;; org-protocol-capture-html.sh shell script. See the README.org file
16;; for instructions.
17
18;;; License:
19
20;; This program is free software; you can redistribute it and/or modify
21;; it under the terms of the GNU General Public License as published by
22;; the Free Software Foundation, either version 3 of the License, or
23;; (at your option) any later version.
24
25;; This program is distributed in the hope that it will be useful,
26;; but WITHOUT ANY WARRANTY; without even the implied warranty of
27;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28;; GNU General Public License for more details.
29
30;; You should have received a copy of the GNU General Public License
31;; along with this program. If not, see <http://www.gnu.org/licenses/>.
32
33;;; Code:
34
35;;;; Require
36
37(require 'org-protocol)
38(require 'cl-lib)
39(require 'subr-x)
40(require 's)
41
42;;;; Vars
43
44(defcustom org-protocol-capture-html-demote-times 1
45 "How many times to demote headings in captured pages.
46You may want to increase this if you use a sub-heading in your capture template."
47 :group 'org-protocol-capture-html :type 'integer)
48
49;;;; Test Pandoc
50
51(defconst org-protocol-capture-html-pandoc-no-wrap-option nil
52 ;; Set this so it won't be unbound
53 "Option to pass to Pandoc to disable wrapping.
54Pandoc >= 1.16 deprecates `--no-wrap' in favor of
55`--wrap=none'.")
56
57(defun org-protocol-capture-html--define-pandoc-wrap-const ()
58 "Set `org-protocol-capture-html-pandoc-no-wrap-option'."
59 (setq org-protocol-capture-html-pandoc-no-wrap-option
60 ;; Pandoc >= 1.16 deprecates the --no-wrap option, replacing it with
61 ;; --wrap=none. Sending the wrong option causes output to STDERR,
62 ;; which `call-process-region' doesn't like. So we test Pandoc to see
63 ;; which option to use.
64 (with-temp-buffer
65 (let* ((process (start-process "test-pandoc" (current-buffer) "pandoc" "--dump-args" "--no-wrap"))
66 (limit 3)
67 (checked 0))
68 (while (process-live-p process)
69 (if (= checked limit)
70 (progn
71 ;; Pandoc didn't exit in time. Kill it and raise
72 ;; an error. This function will return `nil' and
73 ;; `org-protocol-capture-html-pandoc-no-wrap-option'
74 ;; will remain `nil', which will cause this
75 ;; function to run again and set the const when a
76 ;; capture is run.
77 (set-process-query-on-exit-flag process nil)
78 (error "Unable to test Pandoc! Please report this bug! (include the output of \"pandoc --dump-args --no-wrap\")"))
79 (sleep-for 0.2)
80 (cl-incf checked)))
81 (if (and (zerop (process-exit-status process))
82 (not (string-match "--no-wrap is deprecated" (buffer-string))))
83 "--no-wrap"
84 "--wrap=none")))))
85
86;;;; Direct-to-Pandoc
87
88(defun org-protocol-capture-html--with-pandoc (data)
89 "Process an org-protocol://capture-html:// URL using DATA.
90
91This function is basically a copy of `org-protocol-do-capture', but
92it passes the captured content (not the URL or title) through
93Pandoc, converting HTML to Org-mode."
94
95 ;; It would be nice to not basically duplicate
96 ;; `org-protocol-do-capture', but passing the data back to that
97 ;; function would require re-encoding the data into a URL string
98 ;; with Emacs after Pandoc converts it. Since we've already split
99 ;; it up, we might as well go ahead and run the capture directly.
100
101 (unless org-protocol-capture-html-pandoc-no-wrap-option
102 (org-protocol-capture-html--define-pandoc-wrap-const))
103
104 (let* ((template (or (plist-get data :template)
105 org-protocol-default-template-key))
106 (url (org-protocol-sanitize-uri (plist-get data :url)))
107 (type (if (string-match "^\\([a-z]+\\):" url)
108 (match-string 1 url)))
109 (title (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :title))) ""))
110 (content (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :body))) ""))
111 (orglink (org-make-link-string
112 url (if (string-match "[^[:space:]]" title) title url)))
113 (org-capture-link-is-already-stored t)) ; avoid call to org-store-link
114
115 (setq org-stored-links
116 (cons (list url title) org-stored-links))
117 (kill-new orglink)
118
119 (with-temp-buffer
120 (insert content)
121 (if (not (zerop (call-process-region
122 (point-min) (point-max)
123 "pandoc" t t nil "-f" "html" "-t" "org" org-protocol-capture-html-pandoc-no-wrap-option)))
124 (message "Pandoc failed: %s" (buffer-string))
125 (progn
126 ;; Pandoc succeeded
127 (org-store-link-props :type type
128 :annotation orglink
129 :link url
130 :description title
131 :orglink orglink
132 :initial (buffer-string)))))
133 (org-protocol-capture-html--do-capture)
134 nil))
135
136(add-to-list 'org-protocol-protocol-alist
137 '("capture-html"
138 :protocol "capture-html"
139 :function org-protocol-capture-html--with-pandoc
140 :kill-client t))
141
142;;;; eww-readable
143
144(defvar url-http-end-of-headers)
145
146(eval-when-compile
147 ;; eww-readable only works on Emacs >=25.1, but I think it's better
148 ;; to check for the actual symbols. I think using
149 ;; `eval-when-compile' is the right way to do this, but I'm not
150 ;; sure.
151 (when (and (require 'eww nil t)
152 (require 'dom nil t)
153 (fboundp 'eww-score-readability))
154
155 (defun org-protocol-capture-html--capture-eww-readable (data)
156 "Capture content of URL with eww-readable.."
157
158 (unless org-protocol-capture-html-pandoc-no-wrap-option
159 (org-protocol-capture-html--define-pandoc-wrap-const))
160
161 (let* ((template (or (plist-get data :template)
162 org-protocol-default-template-key))
163 (url (org-protocol-sanitize-uri (plist-get data :url)))
164 (type (if (string-match "^\\([a-z]+\\):" url)
165 (match-string 1 url)))
166 (html (org-protocol-capture-html--url-html url))
167 (result (org-protocol-capture-html--eww-readable html))
168 (title (cdr result))
169 (content (with-temp-buffer
170 (insert (org-protocol-capture-html--nbsp-to-space (car result)))
171 ;; Convert to Org with Pandoc
172 (unless (= 0 (call-process-region (point-min) (point-max)
173 "pandoc" t t nil "-f" "html" "-t" "org"
174 org-protocol-capture-html-pandoc-no-wrap-option))
175 (error "Pandoc failed"))
176 (save-excursion
177 ;; Remove DOS CR/LF line endings
178 (goto-char (point-min))
179 (while (search-forward (string ?\C-m) nil t)
180 (replace-match "")))
181 ;; Demote page headings in capture buffer to below the
182 ;; top-level Org heading and "Article" 2nd-level heading
183 (save-excursion
184 (goto-char (point-min))
185 (while (re-search-forward (rx bol (1+ "*") (1+ space)) nil t)
186 (beginning-of-line)
187 (insert "**")
188 (end-of-line)))
189 (buffer-string)))
190 (orglink (org-make-link-string
191 url (if (s-present? title) title url)))
192 ;; Avoid call to org-store-link
193 (org-capture-link-is-already-stored t))
194
195 (setq org-stored-links
196 (cons (list url title) org-stored-links))
197 (kill-new orglink)
198
199 (org-store-link-props :type type
200 :annotation orglink
201 :link url
202 :description title
203 :orglink orglink
204 :initial content)
205 (org-protocol-capture-html--do-capture)
206 nil))
207
208 (add-to-list 'org-protocol-protocol-alist
209 '("capture-eww-readable"
210 :protocol "capture-eww-readable"
211 :function org-protocol-capture-html--capture-eww-readable
212 :kill-client t))
213
214 (defun org-protocol-capture-html--url-html (url)
215 "Return HTML from URL as string."
216 (let* ((response-buffer (url-retrieve-synchronously url nil t))
217 (encoded-html (with-current-buffer response-buffer
218 (pop-to-buffer response-buffer)
219 ;; Skip HTTP headers, using marker provided by url-http
220 (delete-region (point-min) (1+ url-http-end-of-headers))
221 (buffer-string))))
222 (kill-buffer response-buffer) ; Not sure if necessary to avoid leaking buffer
223 (with-temp-buffer
224 ;; For some reason, running `decode-coding-region' in the
225 ;; response buffer has no effect, so we have to do it in a
226 ;; temp buffer.
227 (insert encoded-html)
228 (condition-case nil
229 ;; Fix undecoded text
230 (decode-coding-region (point-min) (point-max) 'utf-8)
231 (coding-system-error nil))
232 (buffer-string))))
233
234 (defun org-protocol-capture-html--eww-readable (html)
235 "Return `eww-readable' part of HTML with title.
236Returns list (HTML . TITLE)."
237 ;; Based on `eww-readable'
238 (let* ((html
239 ;; Convert " " in HTML to plain spaces.
240 ;; `libxml-parse-html-region' turns them into
241 ;; underlines. The closest I can find to an explanation
242 ;; is at <http://www.perlmonks.org/?node_id=825188>.
243 (org-protocol-capture-html--nbsp-to-space html))
244 (dom (with-temp-buffer
245 (insert html)
246 (libxml-parse-html-region (point-min) (point-max))))
247 (title (cl-caddr (car (dom-by-tag dom 'title)))))
248 (eww-score-readability dom)
249 (cons (with-temp-buffer
250 (shr-dom-print (eww-highest-readability dom))
251 (buffer-string))
252 title)))))
253
254;;;; Helper functions
255
256(defun org-protocol-capture-html--nbsp-to-space (s)
257 "Convert HTML non-breaking spaces to plain spaces in S."
258 ;; Not sure why sometimes these are in the HTML and Pandoc converts
259 ;; them to underlines instead of spaces, but this fixes it.
260 (replace-regexp-in-string (rx " ") " " s t t))
261
262(with-no-warnings
263 ;; Ignore warning about the dynamically scoped `template' variable.
264 (defun org-protocol-capture-html--do-capture ()
265 "Call `org-capture' and demote page headings in capture buffer."
266 (raise-frame)
267 (funcall 'org-capture nil template)
268
269 ;; Demote page headings in capture buffer to below the
270 ;; top-level Org heading
271 (save-excursion
272 (goto-char (point-min))
273 (re-search-forward (rx bol "*" (1+ space)) nil t) ; Skip 1st heading
274 (while (re-search-forward (rx bol "*" (1+ space)) nil t)
275 (dotimes (n org-protocol-capture-html-demote-times)
276 (org-demote-subtree))))))
277
278(provide 'org-protocol-capture-html)
279
280;;; org-protocol-capture-html.el ends here