243 lines
9.2 KiB
Python
Executable File
243 lines
9.2 KiB
Python
Executable File
#!/usr/bin/python
|
||
# -*- coding: utf-8 -*-
|
||
""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $
|
||
"""
|
||
|
||
import atexit
|
||
import cgi
|
||
import http_auth
|
||
import httplib
|
||
import os
|
||
import re
|
||
import surbl
|
||
import sys
|
||
import tempfile
|
||
import tidy
|
||
import urlparse
|
||
|
||
from subprocess import Popen, PIPE
|
||
|
||
# Content-Type header value sent with every HTML page built by showPage().
CONTENT_TYPE = "text/html;charset=utf-8"

# First half of the HTML page: doctype, <head>, W3C banner and main heading.
# showPage() prints it before the optional error message.
Page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head><title>HTML Diff service</title>
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" />
</head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p>

<h1>Create Diff between HTML pages</h1>
"""

# Second half of the page: the input form, usage notes and footer.
# It is a %-format template with two "%s" slots (value of the doc1 field,
# then value of the doc2 field); the values must already be HTML-escaped
# for use inside a double-quoted attribute.  Literal percent signs are
# therefore written "%%".
# Element names mentioned in the running text are written as entities
# (&lt;del&gt;, &lt;ins&gt;) so that they are displayed rather than opening real
# <del>/<ins> elements, and the paragraph introducing the list is closed
# before the <ul> starts.
Page2 = """
<form method="GET">
<p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p>
<p>Address of new document: <input name="doc2" value="%s" style="width:100%%"/></p>
<p><input type="submit" value="get Diff"/></p>
</form>

<p><strong>Tip</strong>: if the document uses the W3C convention on linking to its previous version, you can specify only the address of the new document — the previous link will be automatically detected.</p>
<h2>Diff markings</h2>
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The found differences are roughly marked as follow:</p>
<ul>
<li>deleted text is shown in pink with down-arrows (as styled for a &lt;del&gt; element)</li>
<li>where there is replacement, it’s shown in green with bi-directional arrows,</li>
<li>where there is newly inserted text, it’s yellow with up arrows (&lt;ins&gt; element)</li>
</ul>
<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’ Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
</address>
</body>
</html>
"""
|
||
|
||
def checkInputUrl(url):
    """Refuse URLs that are local, scheme-less or listed in SURBL.

    When *url* is rejected, a CGI ``403`` plain-text response is printed
    and the script terminates through ``sys.exit()``.  Otherwise the
    function simply returns ``None``.
    """
    # The scheme test is case-insensitive: "FILE:" and "File:" must be
    # refused exactly like "file:".  A parsed scheme shorter than two
    # characters means the address is relative or has no scheme at all.
    scheme = urlparse.urlparse(url)[0]
    if (url[:5].lower() == 'file:' or scheme.lower() == 'file'
            or len(scheme) < 2):
        print("Status: 403")
        print("Content-Type: text/plain")
        print("")
        print("sorry, I decline to handle file: addresses")
        sys.exit()

    # Only build the SURBL checker once the cheap local test has passed.
    checker = surbl.SurblChecker('/usr/local/share/surbl/two-level-tlds','/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')
    if checker.isMarkedAsSpam(url):
        print("Status: 403")
        print("Content-Type: text/plain; charset=utf-8")
        print("")
        print("sorry, this URL matches a record known in SURBL. See http://www.surbl.org/")
        sys.exit()
|
||
|
||
def copyHeader(copy_func, source, key, header_name=None):
    """Forward one entry of *source* to *copy_func*, if it has a value.

    *source* is any object with a dict-like ``get``.  When ``source.get(key)``
    is truthy, ``copy_func(name, value)`` is called, where *name* is
    *header_name* if given and *key* otherwise, and ``True`` is returned.
    When the value is missing or empty nothing is called and ``False`` is
    returned.
    """
    header_value = source.get(key)
    if header_value:
        target_name = key if header_name is None else header_name
        copy_func(target_name, header_value)
        return True
    return False
|
||
|
||
def setupRequest(source_headers):
    """Create a URL opener carrying headers derived from the current request.

    ``If-Modified-Since`` is copied from *source_headers* when present, and
    the ``REMOTE_ADDR`` environment variable is passed on under the header
    name ``X_Forward_IP_Addr``.  Returns the configured
    ``http_auth.ProxyAuthURLopener`` instance.
    """
    opener = http_auth.ProxyAuthURLopener()
    # (mapping to read from, key to read, header name to emit or None for
    # "same as the key")
    forwarded = (
        (source_headers, 'If-Modified-Since', None),
        (os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr'),
    )
    for mapping, key, header_name in forwarded:
        copyHeader(opener.addheader, mapping, key, header_name)
    return opener
|
||
|
||
def tidyFile(file):
    """Clean up the markup held in *file* with tidy.

    The content is parsed with a base set of tidy options, extended with
    the HTML5-oriented options when an ``<!doctype html>`` declaration is
    found within the first 4096 characters.  If that parse reports errors
    and the HTML5 options were not in use yet, the content is parsed a
    second time with them.

    *file* is closed.  Returns a ``(tidied_file, errors)`` tuple:
    *tidied_file* is a ``NamedTemporaryFile`` holding the tidied markup,
    rewound to its start and registered to be closed at interpreter exit;
    *errors* is the ``errors`` attribute of the last parse result.
    """
    # Extra settings so that tidy accepts HTML5, MathML and SVG element
    # names and leaves the layout of the markup mostly untouched.
    html5_options = {
        "add_xml_space": "no",
        "output_xhtml": "no",
        "tidy_mark": "no",
        "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
        "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
        "break_before_br": "no",
        "vertical_space": "no",
        "enclose_text": "no",
        "numeric_entities": "yes",
        "wrap": "1000",
        "wrap_attributes": "no",
        "drop_empty_paras": "no",
    }
    # Base options for tidy.
    options = dict(tidy_mark=0, show_warnings=0, quiet=1, char_encoding='utf8')

    markup = file.read()
    file.close()

    # Only the head of the document is searched for the HTML5 doctype.
    is_html5 = re.search(r"<!doctype\s+html\s*>", markup[:4096],
                         re.IGNORECASE)
    if is_html5:
        options.update(html5_options)
    tidied = tidy.parseString(markup, **options)

    # A document without the HTML5 doctype may still use HTML5 elements:
    # give it a second chance with the HTML5 options before giving up.
    if len(tidied.errors) > 0 and not is_html5:
        options.update(html5_options)
        tidied = tidy.parseString(markup, **options)

    tidied_file = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    atexit.register(tidied_file.close)
    tidied_file.write(str(tidied))
    tidied_file.flush()
    tidied_file.seek(0)
    return (tidied_file, tidied.errors)
|
||
|
||
def matchPredecessorRel(rel):
    """Tell whether a ``rel`` attribute value lists ``predecessor-version``.

    *rel* is the raw attribute value (or ``None`` when the attribute is
    absent).  The value is treated as a whitespace-separated list of
    tokens compared case-insensitively; tokens may be separated by any
    run of whitespace (spaces, tabs, newlines), not only single spaces.
    Returns ``True`` or ``False``.
    """
    if not rel:
        return False
    return "predecessor-version" in rel.lower().split()
|
||
|
||
def mirrorURL(url, opener):
    """Download *url* with *opener* and return a tidied local copy.

    Returns ``(file, headers)`` on success, where *file* is the file object
    produced by ``tidyFile`` and *headers* are the headers returned by
    ``opener.retrieve``.  On any failure ``(None, {})`` is returned and,
    except when the opener is expected to have recorded the problem
    itself, a description is stored in ``opener.error``.
    """
    failure = (None, {})
    try:
        filename, headers = opener.retrieve(url)
    except IOError as error:
        opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
        return failure
    except httplib.InvalidURL:
        opener.error = "Invalid URL submitted"
        return failure
    except AttributeError:
        # ProxyAuthURLopener returned None; there's already an error set.
        return failure

    # The downloaded copy is removed when the interpreter exits.
    atexit.register(os.unlink, filename)
    local_copy = open(filename)
    if "content-encoding" in headers and headers["content-encoding"] == "gzip":
        import gzip
        from StringIO import StringIO
        # Decompress from memory so the downloaded file can be closed now.
        compressed = StringIO(local_copy.read())
        local_copy.close()
        local_copy = gzip.GzipFile(fileobj=compressed)

    tidied, errors = tidyFile(local_copy)
    if len(errors) != 0:
        opener.error = "Tidy errors: %s" % (str(errors))
        return failure
    return (tidied, headers)
|
||
|
||
def showPage(url1='', url2='', error_html='', **headers):
    """Print the form page as a CGI response and terminate the script.

    Each keyword argument in *headers* becomes a response header, with
    underscores in its name replaced by hyphens (``Content_Type`` gives
    ``Content-Type``).  After the blank line ending the headers come the
    page top, *error_html*, and the form pre-filled with *url1* and *url2*.
    The function never returns: it ends with ``sys.exit()``.
    """
    for field_name in headers:
        print("%s: %s" % (field_name.replace('_', '-'), headers[field_name]))
    # Blank line separating the headers from the body.
    print("")
    print(Page)
    print(error_html)
    print(Page2 % (url1, url2))
    sys.exit()
|
||
|
||
def _findPreviousVersion(soup):
    """Return the href of the previous-version link in *soup*, or None."""
    # W3C convention: a "Previous version" label followed by a link.
    label = soup.find(text=re.compile("Previous Version", re.IGNORECASE))
    if label is not None:
        link = label.findNext(name="a", attrs={"href": True})
        if link is not None:
            return link["href"]
    # Otherwise look for an <a> or <link> with rel="predecessor-version".
    # The attribute name must be the string "rel"; matchPredecessorRel is
    # the predicate applied to its value.
    link = soup.find(name=["a", "link"],
                     attrs={"href": True, "rel": matchPredecessorRel})
    if link is not None:
        return link["href"]
    return None


def serveRequest():
    """Handle one CGI request of the HTML diff service.

    Reads the ``doc1`` (reference) and ``doc2`` (new) URLs from the query.
    Without ``doc2`` the empty form is shown.  Without ``doc1`` the new
    document is searched for a link to its previous version.  Both
    documents are fetched and tidied, then ``/usr/local/bin/htmldiff`` is
    run on the two local copies and its output is sent as the response.
    Every error path ends in ``showPage`` or ``checkInputUrl``, which
    terminate the script.
    """
    fields = cgi.FieldStorage()

    if 'doc2' not in fields:
        showPage(Content_Type=CONTENT_TYPE)
    # getfirst() also copes with a parameter given several times, where
    # fields['doc2'] would be a list without a .value attribute.
    doc2 = fields.getfirst('doc2')
    checkInputUrl(doc2)
    # NOTE(review): fields.headers may only describe the form submission
    # rather than the HTTP request, in which case If-Modified-Since is
    # never forwarded -- confirm.
    url_opener2 = setupRequest(fields.headers)
    newdoc, newheaders = mirrorURL(doc2, url_opener2)

    # If doc1 is not specified, look in doc2 for a previous-version link.
    if 'doc1' in fields:
        doc1 = fields.getfirst('doc1')
    elif newdoc is not None:
        from BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(newdoc.read())
        newdoc.seek(0)
        doc1 = _findPreviousVersion(soup)
        if doc1:
            # The link may be relative to the new document; an absolute
            # URL is returned unchanged by urljoin.
            doc1 = urlparse.urljoin(doc2, doc1)
    else:
        doc1 = None
    if not doc1:
        showPage(Content_Type=CONTENT_TYPE)

    checkInputUrl(doc1)
    esc1 = cgi.escape(doc1, True)
    esc2 = cgi.escape(doc2, True)

    # If both documents are on the same host the same opener can be
    # reused; otherwise a separate one is created.
    if urlparse.urlparse(doc2)[1] == urlparse.urlparse(doc1)[1]:
        url_opener = url_opener2
    else:
        url_opener = setupRequest(fields.headers)

    refdoc = mirrorURL(doc1, url_opener)[0]
    if not (refdoc and newdoc):
        if not refdoc:
            http_error, url = url_opener.error, esc1
        else:
            http_error, url = url_opener2.error, esc2
        # Relay the upstream HTTP status when the error starts with one.
        if re.match("^[1234][0-9][0-9] ", http_error):
            print("Status: %s" % (http_error,))
        # The href is double-quoted because cgi.escape(..., True) escapes
        # '"' but not "'": a single-quoted attribute could be broken out
        # of by a quote in the URL.
        error = "<p style='color:#FF0000'>An error (%s) occured trying to get <a href=\"%s\">%s</a>.</p>" % (cgi.escape(http_error), url, url)
        showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)

    print("Content-Type: text/html")
    # Pass the caching headers of the new document on to the client.
    for proxy_header in ('Last-Modified', 'Expires'):
        copyHeader(lambda header, value: sys.stdout.write("%s: %s\n" % (header, value)), newheaders, proxy_header)
    # Blank line ending the response headers.
    print("")

    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    # communicate() closes the child's stdin itself.
    (out, err) = p.communicate()
    if err:
        error = "<p style='color:#FF0000'>An error occured when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),)
        showPage(esc1, esc2, error)
    else:
        print(out)
|
||
# Serve a request only when run as a script in a CGI environment, i.e.
# when the SCRIPT_NAME variable is set.
if __name__ == '__main__' and 'SCRIPT_NAME' in os.environ:
    serveRequest()
|