fedora-csb-system-manager
  1;;; org-protocol-capture-html.el --- Capture HTML with org-protocol
  2
  3;; URL: https://github.com/alphapapa/org-protocol-capture-html
  4;; Version: 0.1-pre
  5;; Package-Requires: ((emacs "24.4"))
  6
  7;;; Commentary:
  8
  9;; This package captures Web pages into Org-mode using Pandoc to
 10;; process HTML.  It can also use eww's eww-readable functionality to
 11;; get the main content of a page.
 12
 13;; These are the helper functions that run in Emacs.  To capture pages
 14;; into Emacs, you can use either a browser bookmarklet or the
 15;; org-protocol-capture-html.sh shell script.  See the README.org file
 16;; for instructions.
 17
 18;;; License:
 19
 20;; This program is free software; you can redistribute it and/or modify
 21;; it under the terms of the GNU General Public License as published by
 22;; the Free Software Foundation, either version 3 of the License, or
 23;; (at your option) any later version.
 24
 25;; This program is distributed in the hope that it will be useful,
 26;; but WITHOUT ANY WARRANTY; without even the implied warranty of
 27;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 28;; GNU General Public License for more details.
 29
 30;; You should have received a copy of the GNU General Public License
 31;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
 32
 33;;; Code:
 34
 35;;;; Require
 36
 37(require 'org-protocol)
 38(require 'cl-lib)
 39(require 'subr-x)
 40(require 's)
 41
 42;;;; Vars
 43
 44(defcustom org-protocol-capture-html-demote-times 1
 45  "How many times to demote headings in captured pages.
 46You may want to increase this if you use a sub-heading in your capture template."
 47  :group 'org-protocol-capture-html :type 'integer)
 48
 49;;;; Test Pandoc
 50
 51(defconst org-protocol-capture-html-pandoc-no-wrap-option nil
 52  ;; Set this so it won't be unbound
 53  "Option to pass to Pandoc to disable wrapping.
 54Pandoc >= 1.16 deprecates `--no-wrap' in favor of
 55`--wrap=none'.")
 56
 57(defun org-protocol-capture-html--define-pandoc-wrap-const ()
 58  "Set `org-protocol-capture-html-pandoc-no-wrap-option'."
 59  (setq org-protocol-capture-html-pandoc-no-wrap-option
 60        ;; Pandoc >= 1.16 deprecates the --no-wrap option, replacing it with
 61        ;; --wrap=none.  Sending the wrong option causes output to STDERR,
 62        ;; which `call-process-region' doesn't like.  So we test Pandoc to see
 63        ;; which option to use.
 64        (with-temp-buffer
 65          (let* ((process (start-process "test-pandoc" (current-buffer) "pandoc" "--dump-args" "--no-wrap"))
 66                 (limit 3)
 67                 (checked 0))
 68            (while (process-live-p process)
 69              (if (= checked limit)
 70                  (progn
 71                    ;; Pandoc didn't exit in time.  Kill it and raise
 72                    ;; an error.  This function will return `nil' and
 73                    ;; `org-protocol-capture-html-pandoc-no-wrap-option'
 74                    ;; will remain `nil', which will cause this
 75                    ;; function to run again and set the const when a
 76                    ;; capture is run.
 77                    (set-process-query-on-exit-flag process nil)
 78                    (error "Unable to test Pandoc!  Please report this bug! (include the output of \"pandoc --dump-args --no-wrap\")"))
 79                (sleep-for 0.2)
 80                (cl-incf checked)))
 81            (if (and (zerop (process-exit-status process))
 82                     (not (string-match "--no-wrap is deprecated" (buffer-string))))
 83                "--no-wrap"
 84              "--wrap=none")))))
 85
 86;;;; Direct-to-Pandoc
 87
 88(defun org-protocol-capture-html--with-pandoc (data)
 89  "Process an org-protocol://capture-html:// URL using DATA.
 90
 91This function is basically a copy of `org-protocol-do-capture', but
 92it passes the captured content (not the URL or title) through
 93Pandoc, converting HTML to Org-mode."
 94
 95  ;; It would be nice to not basically duplicate
 96  ;; `org-protocol-do-capture', but passing the data back to that
 97  ;; function would require re-encoding the data into a URL string
 98  ;; with Emacs after Pandoc converts it.  Since we've already split
 99  ;; it up, we might as well go ahead and run the capture directly.
100
101  (unless org-protocol-capture-html-pandoc-no-wrap-option
102    (org-protocol-capture-html--define-pandoc-wrap-const))
103
104  (let* ((template (or (plist-get data :template)
105                       org-protocol-default-template-key))
106         (url (org-protocol-sanitize-uri (plist-get data :url)))
107         (type (if (string-match "^\\([a-z]+\\):" url)
108                   (match-string 1 url)))
109         (title (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :title))) ""))
110         (content (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :body))) ""))
111         (orglink (org-make-link-string
112                   url (if (string-match "[^[:space:]]" title) title url)))
113         (org-capture-link-is-already-stored t)) ; avoid call to org-store-link
114
115    (setq org-stored-links
116          (cons (list url title) org-stored-links))
117    (kill-new orglink)
118
119    (with-temp-buffer
120      (insert content)
121      (if (not (zerop (call-process-region
122                       (point-min) (point-max)
123                       "pandoc" t t nil "-f" "html" "-t" "org" org-protocol-capture-html-pandoc-no-wrap-option)))
124          (message "Pandoc failed: %s" (buffer-string))
125        (progn
126          ;; Pandoc succeeded
127          (org-store-link-props :type type
128                                :annotation orglink
129                                :link url
130                                :description title
131                                :orglink orglink
132                                :initial (buffer-string)))))
133    (org-protocol-capture-html--do-capture)
134    nil))
135
136(add-to-list 'org-protocol-protocol-alist
137             '("capture-html"
138               :protocol "capture-html"
139               :function org-protocol-capture-html--with-pandoc
140               :kill-client t))
141
142;;;; eww-readable
143
144(defvar url-http-end-of-headers)
145
146(eval-when-compile
147  ;; eww-readable only works on Emacs >=25.1, but I think it's better
148  ;; to check for the actual symbols.  I think using
149  ;; `eval-when-compile' is the right way to do this, but I'm not
150  ;; sure.
151  (when (and (require 'eww nil t)
152             (require 'dom nil t)
153             (fboundp 'eww-score-readability))
154
155    (defun org-protocol-capture-html--capture-eww-readable (data)
156      "Capture content of URL with eww-readable.."
157
158      (unless org-protocol-capture-html-pandoc-no-wrap-option
159        (org-protocol-capture-html--define-pandoc-wrap-const))
160
161      (let* ((template (or (plist-get data :template)
162                           org-protocol-default-template-key))
163             (url (org-protocol-sanitize-uri (plist-get data :url)))
164             (type (if (string-match "^\\([a-z]+\\):" url)
165                       (match-string 1 url)))
166             (html (org-protocol-capture-html--url-html url))
167             (result (org-protocol-capture-html--eww-readable html))
168             (title (cdr result))
169             (content (with-temp-buffer
170                        (insert (org-protocol-capture-html--nbsp-to-space (car result)))
171                        ;; Convert to Org with Pandoc
172                        (unless (= 0 (call-process-region (point-min) (point-max)
173                                                          "pandoc" t t nil "-f" "html" "-t" "org"
174                                                          org-protocol-capture-html-pandoc-no-wrap-option))
175                          (error "Pandoc failed"))
176                        (save-excursion
177                          ;; Remove DOS CR/LF line endings
178                          (goto-char (point-min))
179                          (while (search-forward (string ?\C-m) nil t)
180                            (replace-match "")))
181                        ;; Demote page headings in capture buffer to below the
182                        ;; top-level Org heading and "Article" 2nd-level heading
183                        (save-excursion
184                          (goto-char (point-min))
185                          (while (re-search-forward (rx bol (1+ "*") (1+ space)) nil t)
186                            (beginning-of-line)
187                            (insert "**")
188                            (end-of-line)))
189                        (buffer-string)))
190             (orglink (org-make-link-string
191                       url (if (s-present? title) title url)))
192             ;; Avoid call to org-store-link
193             (org-capture-link-is-already-stored t))
194
195        (setq org-stored-links
196              (cons (list url title) org-stored-links))
197        (kill-new orglink)
198
199        (org-store-link-props :type type
200                              :annotation orglink
201                              :link url
202                              :description title
203                              :orglink orglink
204                              :initial content)
205        (org-protocol-capture-html--do-capture)
206        nil))
207
208    (add-to-list 'org-protocol-protocol-alist
209                 '("capture-eww-readable"
210                   :protocol "capture-eww-readable"
211                   :function org-protocol-capture-html--capture-eww-readable
212                   :kill-client t))
213
214    (defun org-protocol-capture-html--url-html (url)
215      "Return HTML from URL as string."
216      (let* ((response-buffer (url-retrieve-synchronously url nil t))
217             (encoded-html (with-current-buffer response-buffer
218                             (pop-to-buffer response-buffer)
219                             ;; Skip HTTP headers, using marker provided by url-http
220                             (delete-region (point-min) (1+ url-http-end-of-headers))
221                             (buffer-string))))
222        (kill-buffer response-buffer)     ; Not sure if necessary to avoid leaking buffer
223        (with-temp-buffer
224          ;; For some reason, running `decode-coding-region' in the
225          ;; response buffer has no effect, so we have to do it in a
226          ;; temp buffer.
227          (insert encoded-html)
228          (condition-case nil
229              ;; Fix undecoded text
230              (decode-coding-region (point-min) (point-max) 'utf-8)
231            (coding-system-error nil))
232          (buffer-string))))
233
234    (defun org-protocol-capture-html--eww-readable (html)
235      "Return `eww-readable' part of HTML with title.
236Returns list (HTML . TITLE)."
237      ;; Based on `eww-readable'
238      (let* ((html
239              ;; Convert "&nbsp;" in HTML to plain spaces.
240              ;; `libxml-parse-html-region' turns them into
241              ;; underlines.  The closest I can find to an explanation
242              ;; is at <http://www.perlmonks.org/?node_id=825188>.
243              (org-protocol-capture-html--nbsp-to-space html))
244             (dom (with-temp-buffer
245                    (insert html)
246                    (libxml-parse-html-region (point-min) (point-max))))
247             (title (cl-caddr (car (dom-by-tag dom 'title)))))
248        (eww-score-readability dom)
249        (cons (with-temp-buffer
250                (shr-dom-print (eww-highest-readability dom))
251                (buffer-string))
252              title)))))
253
254;;;; Helper functions
255
256(defun org-protocol-capture-html--nbsp-to-space (s)
257  "Convert HTML non-breaking spaces to plain spaces in S."
258  ;; Not sure why sometimes these are in the HTML and Pandoc converts
259  ;; them to underlines instead of spaces, but this fixes it.
260  (replace-regexp-in-string (rx "&nbsp;") " " s t t))
261
262(with-no-warnings
263  ;; Ignore warning about the dynamically scoped `template' variable.
264  (defun org-protocol-capture-html--do-capture ()
265    "Call `org-capture' and demote page headings in capture buffer."
266    (raise-frame)
267    (funcall 'org-capture nil template)
268
269    ;; Demote page headings in capture buffer to below the
270    ;; top-level Org heading
271    (save-excursion
272      (goto-char (point-min))
273      (re-search-forward (rx bol "*" (1+ space)) nil t) ; Skip 1st heading
274      (while (re-search-forward (rx bol "*" (1+ space)) nil t)
275        (dotimes (n org-protocol-capture-html-demote-times)
276          (org-demote-subtree))))))
277
278(provide 'org-protocol-capture-html)
279
280;;; org-protocol-capture-html.el ends here