#!/usr/bin/python
# -*- coding: utf-8 -*-
""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $ """

# CGI wrapper around /usr/local/bin/htmldiff: it fetches two documents,
# normalises them with tidy and prints the marked-up diff.
#
# The print calls below use the single-argument parenthesised form, which
# behaves identically as a Python 2 print statement.

import atexit
import cgi
import http_auth
import httplib
import os
import re
import surbl
import sys
import tempfile
import tidy
import urlparse
from subprocess import Popen, PIPE

CONTENT_TYPE = "text/html;charset=utf-8"

# NOTE(review): the markup of the two page templates was missing from the
# source this revision was made from; only their text was left.  The tags
# below are a reconstruction wrapped around that unchanged text -- confirm
# against the deployed page.  showPage() prints Page, then the error
# message, then Page2.
Page = """<!DOCTYPE html>
<html lang="en">
<head>
<title>HTML Diff service</title>
</head>
<body>
<h1>Create Diff between HTML pages</h1>
"""

# Page2 is formatted with (url1, url2) by showPage(), so it must contain
# exactly two %s placeholders, and any literal percent sign must be
# written %%.  The field names match what serveRequest() reads.
Page2 = """
<form method="get">
<p><label>Address of reference document: <input type="text" name="doc1" value="%s"/></label></p>
<p><label>Address of new document: <input type="text" name="doc2" value="%s"/></label></p>
<p><input type="submit" value="get Diff"/></p>
</form>

<p>Tip: if the document uses the W3C convention on linking to its previous version, you can specify only the address of the new document — the previous link will be automatically detected.</p>

<h2>Diff markings</h2>

<p>This service relies on GNU diff. The found differences are roughly marked as follow:</p>

<ul>
<li>deleted text is shown in pink with down-arrows (as styled for a &lt;del&gt; element)</li>
<li>where there is replacement, it’s shown in green with bi-directional arrows,</li>
<li>where there is newly inserted text, it’s yellow with up arrows (&lt;ins&gt; element)</li>
</ul>

<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br/>
by Dominique Hazaël-Massieux<br/>
based on Shane McCarron’ Perl script wrapped in a Python CGI
</address>
</body>
</html>
"""


def _refuse(message, content_type):
    """Print a 403 response with *message* as its body, then exit."""
    print("Status: 403")
    print("Content-Type: %s" % content_type)
    print("")
    print(message)
    sys.exit()


def checkInputUrl(url):
    """Exit with a 403 response if *url* must not be fetched.

    A URL is refused when it is a file: address, when it has no scheme
    (or a one-letter one), or when the SURBL checker marks it as spam.
    Returns None when the URL is acceptable.
    """
    scheme = urlparse.urlparse(url)[0]
    # urlparse lower-cases the scheme, so comparing it (and a lower-cased
    # prefix) also catches spellings such as "FILE:" that a plain
    # url[:5] == 'file:' comparison lets through.
    if (url.strip()[:5].lower() == 'file:' or scheme == 'file'
            or len(scheme) < 2):
        _refuse("sorry, I decline to handle file: addresses", "text/plain")
    # Only build the checker once the cheap test above has passed.
    checker = surbl.SurblChecker(
        '/usr/local/share/surbl/two-level-tlds',
        '/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')
    if checker.isMarkedAsSpam(url):
        _refuse("sorry, this URL matches a record known in SURBL. "
                "See http://www.surbl.org/",
                "text/plain; charset=utf-8")


def copyHeader(copy_func, source, key, header_name=None):
    """Pass source[key] to copy_func(header_name, value) if it is set.

    *source* is any object with a dict-style get().  *header_name*
    defaults to *key*.  Returns True when a non-empty value was copied,
    False otherwise.
    """
    value = source.get(key)
    if not value:
        return False
    if header_name is None:
        header_name = key
    copy_func(header_name, value)
    return True


def setupRequest(source_headers):
    """Return a new URL opener carrying headers taken from the request.

    If-Modified-Since is copied from *source_headers*, and REMOTE_ADDR
    from the environment is sent as X_Forward_IP_Addr.
    """
    opener = http_auth.ProxyAuthURLopener()
    copyHeader(opener.addheader, source_headers, 'If-Modified-Since')
    copyHeader(opener.addheader, os.environ, 'REMOTE_ADDR',
               'X_Forward_IP_Addr')
    return opener


def tidyFile(file):
    """Run tidy over the content of *file* and store the result.

    *file* is read to the end and closed.  Returns a tuple
    (cleaned, errors): *cleaned* is a NamedTemporaryFile, rewound to its
    start, holding tidy's output (it is closed, and thereby removed, at
    interpreter exit); *errors* is tidy's error list for the last parse.
    """
    # option for tidy
    options = dict(tidy_mark=0, show_warnings=0, quiet=1,
                   char_encoding='utf8')
    html5_options = {
        "add_xml_space": "no",
        "output_xhtml": "no",
        "tidy_mark": "no",
        "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
        "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
        "break_before_br": "no",
        "vertical_space": "no",
        "enclose_text": "no",
        "numeric_entities": "yes",
        "wrap": "1000",
        "wrap_attributes": "no",
        "drop_empty_paras": "no",
    }

    # Read once; the original read 4096 bytes, rewound and read again.
    content = file.read()
    file.close()

    # NOTE(review): the pattern found in the source was empty, which
    # matches every document and made the retry below unreachable.  It is
    # restored here as a check for the HTML5 doctype in the first 4096
    # bytes -- confirm against the upstream script.
    html5 = re.search(r"<!doctype\s+html\s*>", content[:4096], re.IGNORECASE)
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(content, **options)
    if len(newtidy.errors) > 0 and not html5:
        # The plain options failed: retry once with the HTML5 settings.
        options.update(html5_options)
        newtidy = tidy.parseString(content, **options)

    cleaned = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    atexit.register(cleaned.close)
    cleaned.write(str(newtidy))
    cleaned.flush()
    cleaned.seek(0)
    return (cleaned, newtidy.errors)


def matchPredecessorRel(rel):
    """Tell whether the rel value *rel* lists "predecessor-version".

    *rel* may be None or empty.  Tokens are compared case-insensitively
    and may be separated by any whitespace.
    """
    return bool(rel) and "predecessor-version" in rel.lower().split()


def mirrorURL(url, opener):
    """Download *url* with *opener* and tidy the result.

    Returns (file, headers) on success, where *file* is the tidied copy
    produced by tidyFile().  On failure returns (None, {}) and, except in
    the AttributeError case, stores a description in opener.error.
    """
    try:
        filename, headers = opener.retrieve(url)
    except IOError as error:
        opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
    except httplib.InvalidURL:
        opener.error = "Invalid URL submitted"
    except AttributeError:
        # ProxyAuthURLopener returned None.
        pass  # There's already an error set.
    else:
        atexit.register(os.unlink, filename)
        download = open(filename, 'rb')
        if headers.get("content-encoding") == "gzip":
            import gzip
            from StringIO import StringIO
            compressed = StringIO(download.read())
            download.close()
            download = gzip.GzipFile(fileobj=compressed)
        tidied, errors = tidyFile(download)
        if len(errors) == 0:
            return (tidied, headers)
        opener.error = "Tidy errors: %s" % (str(errors))
    return (None, {})


def showPage(url1='', url2='', error_html='', **headers):
    """Print the form page and exit.

    *url1* and *url2* are inserted into Page2 and must already be
    HTML-escaped; *error_html* is printed between Page and Page2.  Each
    keyword in *headers* is printed as a response header, underscores in
    its name becoming hyphens.  The blank line that ends the header
    section is always printed, so callers that have already finished
    their headers get an extra empty line in the body.
    """
    for name, value in headers.items():
        print("%s: %s" % (name.replace('_', '-'), value))
    print("")
    print(Page)
    print(error_html)
    print(Page2 % (url1, url2))
    sys.exit()


def _findPreviousVersion(soup):
    """Return the href of the previous version linked in *soup*, or None.

    Looks first for the first link following a "Previous Version" text,
    then for an a/link element whose rel lists predecessor-version.
    """
    label = soup.find(text=re.compile("Previous Version", re.IGNORECASE))
    if label is not None:
        link = label.findNext(name="a", attrs={"href": True})
        if link is not None:
            return link["href"]
    # The original wrote the key as the bare name rel, which raised a
    # NameError hidden by a bare except, so this lookup never succeeded.
    link = soup.find(name=["a", "link"],
                     attrs={"href": True, "rel": matchPredecessorRel})
    if link is not None:
        return link["href"]
    return None


def _writeHeader(name, value):
    """Write one complete response header line to standard output."""
    sys.stdout.write("%s: %s\n" % (name, value))


def serveRequest():
    """Handle one CGI request.

    Reads the doc1 (reference) and doc2 (new) addresses from the request.
    Without doc2 the form is shown.  Without doc1 the address of the
    previous version is looked up in the new document.  Both documents
    are then fetched and tidied, and the output of
    /usr/local/bin/htmldiff is printed, or the form with an error message
    when something failed.
    """
    fields = cgi.FieldStorage()
    if 'doc2' not in fields:
        showPage(Content_Type=CONTENT_TYPE)
    # getfirst() copes with a parameter given more than once, where
    # fields[...] would be a list without a .value attribute.
    doc2 = fields.getfirst('doc2')
    checkInputUrl(doc2)
    url_opener2 = setupRequest(fields.headers)
    newdoc, newheaders = mirrorURL(doc2, url_opener2)

    # if doc1 is not specified, we load doc2 to check if it has a
    # previous version link
    if 'doc1' in fields:
        doc1 = fields.getfirst('doc1')
    elif newdoc is not None:
        from BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(newdoc.read())
        newdoc.seek(0)
        doc1 = _findPreviousVersion(soup)
        if doc1:
            # A relative link would otherwise be refused as scheme-less;
            # an absolute one is returned unchanged by urljoin.
            doc1 = urlparse.urljoin(doc2, doc1)
    else:
        doc1 = None
    if not doc1:
        showPage(Content_Type=CONTENT_TYPE)

    checkInputUrl(doc1)
    # quote=True also escapes the double quote, so these values are only
    # safe inside double-quoted attributes.
    esc1 = cgi.escape(doc1, True)
    esc2 = cgi.escape(doc2, True)

    # if same domain, we can use the same urlopener
    # otherwise, we create a separate one
    if urlparse.urlparse(doc2)[1] == urlparse.urlparse(doc1)[1]:
        url_opener = url_opener2
    else:
        url_opener = setupRequest(fields.headers)
    refdoc, _ = mirrorURL(doc1, url_opener)

    if refdoc is None or newdoc is None:
        if refdoc is None:
            failed_opener, url = url_opener, esc1
        else:
            failed_opener, url = url_opener2, esc2
        # Guard against an opener whose error was never set to a string.
        http_error = getattr(failed_opener, 'error', None) or ""
        if re.match("^[1234][0-9][0-9] ", http_error):
            print("Status: %s" % (http_error,))
        # NOTE(review): the markup of this message was missing from the
        # source (two placeholders left for three arguments); the link is
        # a reconstruction.
        error = ("<p>An error (%s) occured trying to get "
                 "<a href=\"%s\">%s</a>.</p>"
                 % (cgi.escape(http_error), url, url))
        showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)

    print("Content-Type: text/html")
    for proxy_header in ('Last-Modified', 'Expires'):
        copyHeader(_writeHeader, newheaders, proxy_header)
    print("")

    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    # communicate() closes the child's stdin and waits for it to finish.
    (out, err) = p.communicate()
    if err:
        # NOTE(review): markup reconstructed, as for the message above.
        error = ("<p>An error occured when running htmldiff on the "
                 "documents:</p>\n<pre>%s</pre>" % (cgi.escape(err),))
        # The headers are already sent, hence no Content_Type here.
        showPage(esc1, esc2, error)
    else:
        print(out)


if __name__ == '__main__':
    # Only act when run by a web server as a CGI script.
    if 'SCRIPT_NAME' in os.environ:
        serveRequest()