diff options
Diffstat (limited to 'share/emacs/site-lisp/w3m/w3m-filter.el')
-rw-r--r-- | share/emacs/site-lisp/w3m/w3m-filter.el | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/share/emacs/site-lisp/w3m/w3m-filter.el b/share/emacs/site-lisp/w3m/w3m-filter.el new file mode 100644 index 00000000000..8feb1439e21 --- /dev/null +++ b/share/emacs/site-lisp/w3m/w3m-filter.el @@ -0,0 +1,367 @@ +;;; w3m-filter.el --- filtering utility of advertisements on WEB sites -*- coding: euc-japan -*- + +;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 +;; TSUCHIYA Masatoshi <tsuchiya@namazu.org> + +;; Authors: TSUCHIYA Masatoshi <tsuchiya@namazu.org> +;; Keywords: w3m, WWW, hypermedia + +;; This file is a part of emacs-w3m. + +;; This program is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. + +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with this program; see the file COPYING. If not, write to +;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. + + +;;; Commentary: + +;; w3m-filter.el is the add-on utility to filter advertisements on WEB +;; sites. + + +;;; Code: + +(provide 'w3m-filter) + +(eval-when-compile + (require 'cl)) + +(require 'w3m) + +(defcustom w3m-filter-rules + `(("\\`http://www\\.geocities\\.co\\.jp/" + w3m-filter-delete-regions + "<DIV ALIGN=CENTER>\n<!--*/GeoGuide/*-->" "<!--*/GeoGuide/*-->\n</DIV>") + ("\\`http://[a-z]+\\.hp\\.infoseek\\.co\\.jp/" + w3m-filter-delete-regions + "<!-- start AD -->" "<!-- end AD -->") + ("\\`http://linux\\.ascii24\\.com/linux/" + w3m-filter-delete-regions + "<!-- DAC CHANNEL AD START -->" "<!-- DAC CHANNEL AD END -->") + ("\\`http://\\(www\\|images\\|news\\|maps\\|groups\\)\\.google\\." + w3m-filter-google) + ("\\`https?://\\(?:www\\.\\)?amazon\\.\ +\\(?:com\\|co\\.\\(?:jp\\|uk\\)\\|fr\\|de\\)/" + w3m-filter-amazon) + ("\\`https?://mixi\\.jp" w3m-filter-mixi) + ("\\`http://eow\\.alc\\.co\\.jp/[^/]+/UTF-8" w3m-filter-alc) + ("\\`http://www\\.asahi\\.com/" w3m-filter-asahi-shimbun) + ("\\`http://imepita\\.jp/[0-9]+/[0-9]+" w3m-filter-imepita) + ("\\`http://allatanys\\.jp/" w3m-filter-allatanys) + ("\\`http://.*\\.wikipedia\\.org/" w3m-filter-wikipedia) + ("" w3m-filter-iframe)) + "Rules to filter advertisements on WEB sites." + :group 'w3m + :type '(repeat + (cons :format "%v" :indent 4 + (regexp :format "Regexp: %v\n" :size 0) + (choice + :tag "Filtering Rule" + (list :tag "Delete regions surrounded with these patterns" + (function-item :format "" w3m-filter-delete-region) + (regexp :tag "Start") + (regexp :tag "End")) + (list :tag "Filter with a user defined function" + function + (repeat :tag "Arguments" sexp)))))) + +(defcustom w3m-filter-google-use-utf8 + (or (featurep 'un-define) (fboundp 'utf-translate-cjk-mode) + (and (not (equal "Japanese" w3m-language)) + (w3m-find-coding-system 'utf-8))) + "*Use the converting rule to UTF-8 on the site of Google." + :group 'w3m + :type 'boolean) + +(defcustom w3m-filter-google-use-ruled-line t + "*Use the ruled line on the site of Google." + :group 'w3m + :type 'boolean) + +(defcustom w3m-filter-google-separator "<hr>" + "Field separator for Google's search results ." + :group 'w3m + :type 'string) + +(defcustom w3m-filter-amazon-regxp + (concat + "\\`\\(https?://\\(?:www\\.\\)?amazon\\." + "\\(?:com\\|co\\.\\(?:jp\\|uk\\)\\|fr\\|de\\)" + ;; "Joyo.com" + "\\)/" + "\\(?:" + "\\(?:exec/obidos\\|o\\)/ASIN" + "\\|" + "gp/product" + "\\|" + "\\(?:[^/]+/\\)?dp" + "\\)" + "/\\([0-9]+\\)") + "*Regexp to extract ASIN number for Amazon." + :group 'w3m + :type '(string :size 0)) + +(defcustom w3m-filter-amazon-short-url-bottom nil + "*Amazon short URLs insert bottom position." + :group 'w3m + :type 'boolean) + +;;;###autoload +(defun w3m-filter (url) + "Apply filtering rule of URL against a content in this buffer." + (save-match-data + (dolist (elem w3m-filter-rules) + (when (string-match (car elem) url) + (apply (cadr elem) url (cddr elem)))))) + +(defun w3m-filter-delete-regions (url start end) + "Delete regions surrounded with a START pattern and an END pattern." + (goto-char (point-min)) + (let (p (i 0)) + (while (and (search-forward start nil t) + (setq p (match-beginning 0)) + (search-forward end nil t)) + (delete-region p (match-end 0)) + (incf i)) + (> i 0))) + +(defun w3m-filter-replace-regexp (url regexp to-string) + "Replace all occurrences of REGEXP with TO-STRING." + (goto-char (point-min)) + (while (re-search-forward regexp nil t) + (replace-match to-string nil nil))) + +;; Filter functions: +(defun w3m-filter-asahi-shimbun (url) + "Convert entity reference of UCS." + (when w3m-use-mule-ucs + (goto-char (point-min)) + (let ((case-fold-search t) + end ucs) + (while (re-search-forward "alt=\"\\([^\"]+\\)" nil t) + (goto-char (match-beginning 1)) + (setq end (set-marker (make-marker) (match-end 1))) + (while (re-search-forward "&#\\([0-9]+\\);" (max end (point)) t) + (setq ucs (string-to-number (match-string 1))) + (delete-region (match-beginning 0) (match-end 0)) + (insert-char (w3m-ucs-to-char ucs) 1)))))) + +(defun w3m-filter-google (url) + "Insert separator within items." + (goto-char (point-min)) + (let ((endm (make-marker)) + (case-fold-search t) + pos beg end) + (when (and w3m-filter-google-use-utf8 + (re-search-forward "\ +<a class=. href=\"http://\\(www\\|images\\|news\\|maps\\|groups\\)\\.google\\." + nil t) + (setq pos (match-beginning 0)) + (search-backward "<table" nil t) + (setq beg (match-beginning 0)) + (search-forward "</table" nil t) + (set-marker endm (match-end 0)) + (< pos (marker-position endm))) + (goto-char beg) + (while (re-search-forward "[?&][io]e=\\([^&]+\\)&" endm t) + (replace-match "UTF-8" nil nil nil 1)) + (setq end (marker-position endm))) + (when (string-match "\\`http://www\\.google\\.[^/]+/search\\?" url) + (goto-char (point-max)) + (when (and w3m-filter-google-use-ruled-line + (search-backward "<div class=" end t) + (search-forward "</div>" nil t)) + (insert w3m-filter-google-separator)) + (if w3m-filter-google-use-ruled-line + (while (search-backward "<div class=" end t) + (insert w3m-filter-google-separator)) + (while (search-backward "<div class=" end t) + (insert "<p>")))))) + +(defun w3m-filter-amazon (url) + "Insert Amazon short URIs." + (when (string-match w3m-filter-amazon-regxp url) + (let* ((base (match-string 1 url)) + (asin (match-string 2 url)) + (shorturls `(,(concat base "/dp/" asin "/") + ,(concat base "/o/ASIN/" asin "/") + ,(concat base "/gp/product/" asin "/"))) + (case-fold-search t) + shorturl) + (goto-char (point-min)) + (setq url (file-name-as-directory url)) + (when (or (and (not w3m-filter-amazon-short-url-bottom) + (search-forward "<body" nil t) + (search-forward ">" nil t)) + (and w3m-filter-amazon-short-url-bottom + (search-forward "</body>" nil t) + (goto-char (match-beginning 0)))) + (insert "\n") + (while (setq shorturl (car shorturls)) + (setq shorturls (cdr shorturls)) + (unless (string= url shorturl) + (insert (format "Amazon Short URL: <a href=\"%s\">%s</a><br>\n" + shorturl shorturl)))) + (insert "\n"))))) + +(defun w3m-filter-mixi (url) + "Direct jump to the external diary." + (goto-char (point-min)) + (let (newurl) + (while (re-search-forward "<a href=\"?view_diary\\.pl\\?url=\\([^>]+\\)>" + nil t) + (setq newurl (match-string 1)) + (when newurl + (delete-region (match-beginning 0) (match-end 0)) + (when (string-match "&owner_id=[0-9]+\"?\\'" newurl) + (setq newurl (substring newurl 0 (match-beginning 0)))) + (insert (format "<a href=\"%s\">" + (w3m-url-readable-string newurl))))))) + +(defun w3m-filter-alc (url) + (let ((baseurl "http://eow.alc.co.jp/%s/UTF-8/") + curl cword beg tmp1) + (when (string-match "\\`http://eow\\.alc\\.co\\.jp/\\([^/]+\\)/UTF-8/" url) + (setq curl (match-string 0 url)) + (setq cword (match-string 1 url)) + (setq cword (car (split-string (w3m-url-decode-string cword 'utf-8) + " "))) + (goto-char (point-min)) + (while (search-forward "データの転載は禁じられています" nil t) + (delete-region (line-beginning-position) (line-end-position)) + (insert "<br>")) + (goto-char (point-min)) + (when (search-forward "<body" nil t) + (forward-line 1) + (insert "<h1>英辞朗 on the WEB<h1>\n") + (setq beg (point)) + (when (search-forward "<!-- ▼検索文字列 -->" nil t) + (forward-line 1) + (delete-region beg (point))) + (when (search-forward "<!-- ▼ワードリンク 履歴 -->" nil t) + (forward-line 1) + (setq beg (point)) + (when (search-forward "</body>" nil t) + (delete-region beg (match-beginning 0)))) + (insert "<br>*データの転載は禁じられています。") + ;; next/previous page + (goto-char (point-min)) + (while (re-search-forward + "<a href='javascript:goPage(\"\\([0-9]+\\)\")'>" + nil t) + (setq tmp1 (match-string 1)) + (delete-region (match-beginning 0) (match-end 0)) + (insert (format "<a href=\"%s?pg=%s\">" curl tmp1))) + ;; wordlink + (goto-char (point-min)) + (while (re-search-forward + "<span class=\"wordlink\">\\([^<]+\\)</span>" + nil t) + (setq tmp1 (match-string 1)) + (delete-region (match-beginning 0) (match-end 0)) + (insert (format "<a href=\"%s\">%s</a>" (format baseurl tmp1) tmp1))) + ;; goGradable/goFairWord + (goto-char (point-min)) + (while (re-search-forward + "<a href='javascript:\\(goGradable\\|goFairWord\\)(\"\\([^\"]+\\)\")'>" + nil t) + (setq tmp1 (match-string 2)) + (delete-region (match-beginning 0) (match-end 0)) + (insert (format "<a href=\"%s\">" (format baseurl tmp1)))) + ;; remove spacer + (goto-char (point-min)) + (while (search-forward "img/spacer.gif" nil t) + (delete-region (line-beginning-position) (line-end-position))) + (goto-char (point-min)) + ;; remove ワードリンク + (when (search-forward "alt=\"ワードリンク\"" nil t) + (delete-region (line-beginning-position) (line-end-position))) + ;; 全文を表示するは無理 + (goto-char (point-min)) + (while (re-search-forward + (concat "<br */> *⇒<strong>" + "<a href='javascript:goFullText(\"[^\"]+\", \"[^\"]+\")'>" + "全文を表示する</a>") + nil t) + (delete-region (match-beginning 0) (match-end 0))) + ;; Java Document write... ;_; + ;; (while (re-search-forward + ;; "<a href='javascript:goFullText(\"\\([^\"]+\\)\", \"\\([^\"]+\\)\")'>" + ;; nil t) + ;; (setq tmp1 (match-string 1)) + ;; (setq tmp2 (match-string 2)) + ;; (delete-region (match-beginning 0) (match-end 0)) + ;; ;; &dk=JE, &dk=EJ + ;; (insert (format "<a href=\"%s?ref=ex&exp=%s&dn=%s&dk=%s\">" + ;; curl tmp1 tmp2 + ;; (if (string-match "\\Cj" cword) "JE" "EJ")))) + )))) + +(defun w3m-filter-imepita (url) + "JavaScript emulation." + (goto-char (point-min)) + (let (tmp) + (when (re-search-forward + (concat "<script><!--\ndocument.write('\\([^\n]*\\)');\r\n//--></script>\n" + "<noscript>.*</noscript>") + nil t) + (setq tmp (match-string 1)) + (delete-region (match-beginning 0) (match-end 0)) + (insert tmp)))) + +(defun w3m-filter-iframe (url) + (goto-char (point-min)) + (while (re-search-forward "<iframe [^>]*src=\"\\([^\"]*\\)\"[^>]*>" nil t) + (insert (concat "[iframe:<a href=\"" (match-string 1) "\">" (match-string 1) "</a>]")))) + +(defun w3m-filter-allatanys (url) + "JavaScript emulation." + (goto-char (point-min)) + (let (aturl atexpurl) + (if (re-search-forward + (concat "<body[ \t\r\f\n]+onload=\"window\\.top\\.location\\.replace('" + w3m-html-string-regexp + "');\">") + nil t) + (progn + (setq aturl (match-string 1)) + (setq atexpurl (w3m-expand-url aturl url)) + (delete-region (match-beginning 0) (match-end 0)) + (insert "<body>\n" + "<hr>" + "Body has a <b>url=window.top.location.replace()</b><br><br>\n" + (format "Goto: <a href=%s>%s</a>\n" aturl atexpurl) + "<hr>") + (goto-char (point-min)) + (insert (format "<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0;URL=%s\">\n" + aturl))) + (while (re-search-forward (concat "<a[ \t\r\l\n]+href=\"javascript:[^(]+('" + "\\([^']+\\)')\">") + nil t) + (setq aturl (match-string 1)) + (delete-region (match-beginning 0) (match-end 0)) + (insert (format "<a href=\"%s\">" aturl)))))) + +(defun w3m-filter-wikipedia (url) + "Make anchor reference to work." + (goto-char (point-min)) + (let (matched-text refid) + (while (re-search-forward + "<\\(?:sup\\|cite\\) id=\"\\([^\"]*\\)\"" nil t) + (setq matched-text (match-string 0) + refid (match-string 1)) + (delete-region (match-beginning 0) (match-end 0)) + (insert (format "<a name=\"%s\"></a>%s" refid matched-text))))) + +;;; w3m-filter.el ends here |