Added URL normalization to urlhistory module to allow better detection of duplicates
Added configurable ignored URLs to urlhistory module
This commit is contained in:
parent
7380470cf0
commit
bb709a74bf
|
@ -7,6 +7,8 @@ from datetime import timedelta
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from util import hook, timesince
|
from util import hook, timesince
|
||||||
|
from util import urlnorm
|
||||||
|
#from util import texttime
|
||||||
|
|
||||||
url_re = re.compile(r'([a-zA-Z]+://|www\.)[^ ]*')
|
url_re = re.compile(r'([a-zA-Z]+://|www\.)[^ ]*')
|
||||||
|
|
||||||
|
@ -15,6 +17,8 @@ dbname = "skybot.db"
|
||||||
|
|
||||||
expiration_period = timedelta(days=1)
|
expiration_period = timedelta(days=1)
|
||||||
|
|
||||||
|
ignored_urls = [ urlnorm.normalize("http://google.com") ]
|
||||||
|
|
||||||
#TODO: Generate expiration_period_text from expiration_period
|
#TODO: Generate expiration_period_text from expiration_period
|
||||||
expiration_period_text = "24 hours"
|
expiration_period_text = "24 hours"
|
||||||
|
|
||||||
|
@ -75,13 +79,7 @@ def dbconnect(db):
|
||||||
return conn
|
return conn
|
||||||
|
|
||||||
def normalize_url(url):
|
def normalize_url(url):
|
||||||
# TODO: do something so that:
|
return urlnorm.normalize(url)
|
||||||
# - http://www.google.com
|
|
||||||
# - www.google.com
|
|
||||||
# - http://google.com
|
|
||||||
# - http://google.com/
|
|
||||||
# etc are all considered to be the same URL
|
|
||||||
return url
|
|
||||||
|
|
||||||
def get_once_twice(count):
|
def get_once_twice(count):
|
||||||
if count == 1:
|
if count == 1:
|
||||||
|
@ -98,6 +96,7 @@ def urlinput(bot, input):
|
||||||
if m:
|
if m:
|
||||||
# URL detected
|
# URL detected
|
||||||
url = normalize_url(m.group(0))
|
url = normalize_url(m.group(0))
|
||||||
|
if url not in ignored_urls:
|
||||||
conn = dbconnect(dbpath)
|
conn = dbconnect(dbpath)
|
||||||
dupes = select_history_for_url_and_channel(conn, url, input.chan)
|
dupes = select_history_for_url_and_channel(conn, url, input.chan)
|
||||||
num_dupes = len(dupes)
|
num_dupes = len(dupes)
|
||||||
|
|
|
@ -0,0 +1,215 @@
|
||||||
|
"""
|
||||||
|
URI Normalization function:
|
||||||
|
* Always provide the URI scheme in lowercase characters.
|
||||||
|
* Always provide the host, if any, in lowercase characters.
|
||||||
|
* Only perform percent-encoding where it is essential.
|
||||||
|
* Always use uppercase A-through-F characters when percent-encoding.
|
||||||
|
* Prevent dot-segments appearing in non-relative URI paths.
|
||||||
|
* For schemes that define a default authority, use an empty authority if the
|
||||||
|
default is desired.
|
||||||
|
* For schemes that define an empty path to be equivalent to a path of "/",
|
||||||
|
use "/".
|
||||||
|
* For schemes that define a port, use an empty port if the default is desired
|
||||||
|
* All portions of the URI must be utf-8 encoded NFC from Unicode strings
|
||||||
|
|
||||||
|
implements:
|
||||||
|
http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
|
||||||
|
http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
|
||||||
|
|
||||||
|
inspired by:
|
||||||
|
Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py
|
||||||
|
Mark Nottingham, http://www.mnot.net/python/urlnorm.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
__license__ = "Python"
|
||||||
|
|
||||||
|
import re, unicodedata, urlparse
|
||||||
|
from urllib import quote, unquote
|
||||||
|
|
||||||
|
# Well-known port for each scheme, keyed by lowercase scheme name.  A URL that
# spells out its scheme's well-known port is equivalent to one that omits it.
default_port = dict(
    ftp=21,
    telnet=23,
    gopher=70,
    http=80,
    news=119,
    nntp=119,
    prospero=191,
    https=443,
    snews=563,
    snntp=563,
)
|
||||||
|
|
||||||
|
def normalize(url):
    """Return the canonical form of *url* so equivalent URLs compare equal.

    The scheme and host are lowercased, a trailing dot and a leading "www."
    are removed from the host, percent-encoding is reduced to what is
    essential (uppercase hex, UTF-8, NFC), dot-segments are resolved, an
    empty path becomes "/" for http/https/ftp/file, and a port equal to the
    scheme's entry in ``default_port`` is dropped.

    A scheme-less URL that starts with "www." is treated as an http URL, so
    "www.example.com" and "http://example.com/" normalize to the same string.

    url -- the URL to normalize (surrounding whitespace is ignored).
    Returns the normalized URL as a string.
    """
    # Imported locally so the function works both on Python 2 (the module's
    # original target) and on Python 3, where these helpers live elsewhere.
    try:
        from urllib.parse import quote, unquote, urlsplit, urlunsplit
    except ImportError:
        from urllib import quote, unquote
        from urlparse import urlsplit, urlunsplit

    stripped = url.strip()

    # Without a scheme the splitter puts "www.example.com" in the path, so the
    # host would be neither lowercased nor given its "/" path.  Adding the
    # scheme first lets it take the same route as "http://www.example.com".
    if stripped.startswith("www."):
        stripped = "http://" + stripped

    scheme, auth, path, query, fragment = urlsplit(stripped)
    userinfo, host, port = re.search(r'([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters, without the
    # trailing root dot and without a leading "www.".
    host = host.lower()
    if host.endswith('.'):
        host = host[:-1]
    if host.startswith("www."):
        if not scheme:
            scheme = "http"
        host = host[4:]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings.
    def clean(text):
        decoded = unquote(text)
        if isinstance(decoded, bytes):
            # Python 2 byte string: invalid UTF-8 becomes U+FFFD.
            decoded = decoded.decode('utf-8', 'replace')
        return unicodedata.normalize('NFC', decoded).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # Note: '&' and '=' are deliberately absent from the safe set here, so
    # they stay encoded inside keys and values and only act as separators.
    query = "&".join(
        "=".join(quote(clean(part), "~:/?#[]@!$'()*+,;=")
                 for part in pair.split("=", 1))
        for pair in query.split("&")
    )

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ("", "http", "https", "ftp", "file"):
        segments = []
        segment = ""
        for segment in path.split('/'):
            if segment == "":
                # Keep only the leading empty segment (the initial "/").
                if not segments:
                    segments.append(segment)
            elif segment == ".":
                pass
            elif segment == "..":
                # Never climb above the root.
                if len(segments) > 1:
                    segments.pop()
            else:
                segments.append(segment)
        # A path ending in "/", "." or ".." names a directory: keep the slash.
        if segment in ("", ".", ".."):
            segments.append("")
        path = '/'.join(segments)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ("@", ":@"):
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ("http", "https", "ftp", "file"):
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired.  Non-numeric ports are passed through untouched.
    if port and scheme in default_port:
        if port.isdigit():
            port = str(int(port))
            if int(port) == default_port[scheme]:
                port = ''

    # Put it all back together again.
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # The splitter discards a bare trailing "#"; put it back.
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"

    result = urlunsplit((scheme, auth, path, query, fragment))
    # Collapse an empty authority, but only at the very start: an embedded
    # "http:///" (for example inside a query value) must be left alone.
    if result.startswith("http:///"):
        result = "http://" + result[len("http:///"):]
    return result
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import unittest

    suite = unittest.TestSuite()

    # Canonical-form checks, from
    # http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    # Each entry is (already_canonical, url): normalize(url) must equal url
    # exactly when already_canonical is True.  Because normalize() strips a
    # leading "www." from the host, "www." URLs are never canonical.
    canonical_checks = [
        (False, "http://:@example.com/"),
        (False, "http://@example.com/"),
        (False, "http://example.com"),
        (False, "HTTP://example.com/"),
        (False, "http://EXAMPLE.COM/"),
        (False, "http://example.com/%7Ejane"),
        (False, "http://example.com/?q=%C7"),
        (False, "http://example.com/?q=%5c"),
        (False, "http://example.com/?q=C%CC%A7"),
        (False, "http://example.com/a/../a/b"),
        (False, "http://example.com/a/./b"),
        (False, "http://example.com:80/"),
        (True, "http://example.com/"),
        (True, "http://example.com/?q=%C3%87"),
        (True, "http://example.com/?q=%E2%85%A0"),
        (True, "http://example.com/?q=%5C"),
        (True, "http://example.com/~jane"),
        (True, "http://example.com/a/b"),
        (True, "http://example.com:8080/"),
        (True, "http://user:password@example.com/"),

        # from rfc2396bis
        (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
        (False, "http://www.ietf.org/rfc/rfc2396.txt"),
        (True, "http://ietf.org/rfc/rfc2396.txt"),
        (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
        (True, "mailto:John.Doe@example.com"),
        (True, "news:comp.infosystems.www.servers.unix"),
        (True, "tel:+1-816-555-1212"),
        (True, "telnet://192.0.2.16:80/"),
        (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),

        # other
        (True, "http://127.0.0.1/"),
        (False, "http://127.0.0.1:80/"),
        (False, "http://www.w3.org/2000/01/rdf-schema#"),
        (True, "http://w3.org/2000/01/rdf-schema#"),
        (False, "http://example.com:081/"),
    ]

    def canonical_case(expected, value):
        # Build one test bound to this (expected, value) pair.
        class CanonicalTest(unittest.TestCase):
            def runTest(self):
                actual = normalize(value)
                self.assertEqual(actual == value, expected,
                                 (expected, value, actual))
        return CanonicalTest()

    for (expected, value) in canonical_checks:
        suite.addTest(canonical_case(expected, value))

    # mnot test suite; three tests updated for rfc2396bis, and the "www."
    # hosts updated to the prefix-stripped form normalize() produces.
    rewrite_checks = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/baz',  # was: '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/foo',  # was: '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/foo',  # was: '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://foo.com/~bar',
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@example.com/foo/bar',
        'http://www.example.com./': 'http://example.com/',
        '-': '-',
    }

    def rewrite_case(original, normalized):
        # Build one test bound to this (original, normalized) pair.
        class RewriteTest(unittest.TestCase):
            def runTest(self):
                actual = normalize(original)
                self.assertEqual(actual, normalized,
                                 (original, normalized, actual))
        return RewriteTest()

    for (original, normalized) in rewrite_checks.items():
        suite.addTest(rewrite_case(original, normalized))

    # execute tests
    unittest.TextTestRunner().run(suite)
|
Loading…
Reference in New Issue