From bb709a74bfd7f3943a0202928da1940e33235601 Mon Sep 17 00:00:00 2001 From: melonhead Date: Mon, 18 Jan 2010 15:07:06 -0500 Subject: [PATCH] Added URL normalization to urlhistory module to allow better detection of duplicates Added configurable ignored URLs to urlhistory module --- plugins/urlhistory.py | 33 +++--- plugins/util/urlnorm.py | 215 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 17 deletions(-) create mode 100644 plugins/util/urlnorm.py diff --git a/plugins/urlhistory.py b/plugins/urlhistory.py index 0bd33d2..ed02ec8 100644 --- a/plugins/urlhistory.py +++ b/plugins/urlhistory.py @@ -7,6 +7,8 @@ from datetime import timedelta import re from util import hook, timesince +from util import urlnorm +#from util import texttime url_re = re.compile(r'([a-zA-Z]+://|www\.)[^ ]*') @@ -15,6 +17,8 @@ dbname = "skybot.db" expiration_period = timedelta(days=1) +ignored_urls = [ urlnorm.normalize("http://google.com") ] + #TODO: Generate expiration_period_text from expiration_period expiration_period_text = "24 hours" @@ -75,13 +79,7 @@ def dbconnect(db): return conn def normalize_url(url): - # TODO: do something so that: - # - http://www.google.com - # - www.google.com - # - http://google.com - # - http://google.com/ - # etc are all considered to be the same URL - return url + return urlnorm.normalize(url) def get_once_twice(count): if count == 1: @@ -98,14 +96,15 @@ def urlinput(bot, input): if m: # URL detected url = normalize_url(m.group(0)) - conn = dbconnect(dbpath) - dupes = select_history_for_url_and_channel(conn, url, input.chan) - num_dupes = len(dupes) - if num_dupes > 0 and input.nick not in dupes: - nicks = get_nicklist(dupes) - reply = "That link has been posted " + get_once_twice(num_dupes) - reply += " in the past " + expiration_period_text + " by " + nicks - input.reply(reply) - insert_history(conn, url, input.chan, input.nick) - conn.close() + if url not in ignored_urls: + conn = dbconnect(dbpath) + dupes = select_history_for_url_and_channel(conn, url, input.chan) + num_dupes = len(dupes) + if num_dupes > 0 and input.nick not in dupes: + nicks = get_nicklist(dupes) + reply = "That link has been posted " + get_once_twice(num_dupes) + reply += " in the past " + expiration_period_text + " by " + nicks + input.reply(reply) + insert_history(conn, url, input.chan, input.nick) + conn.close() diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py new file mode 100644 index 0000000..3a07621 --- /dev/null +++ b/plugins/util/urlnorm.py @@ -0,0 +1,215 @@ +""" +URI Normalization function: + * Always provide the URI scheme in lowercase characters. + * Always provide the host, if any, in lowercase characters. + * Only perform percent-encoding where it is essential. + * Always use uppercase A-through-F characters when percent-encoding. + * Prevent dot-segments appearing in non-relative URI paths. + * For schemes that define a default authority, use an empty authority if the + default is desired. + * For schemes that define an empty path to be equivalent to a path of "/", + use "/". + * For schemes that define a port, use an empty port if the default is desired + * All portions of the URI must be utf-8 encoded NFC from Unicode strings + +implements: + http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form + http://www.intertwingly.net/wiki/pie/PaceCanonicalIds + +inspired by: + Tony J. Ibbs, http://starship.python.net/crew/tibs/python/tji_url.py + Mark Nottingham, http://www.mnot.net/python/urlnorm.py +""" + +__license__ = "Python" + +import re, unicodedata, urlparse +from urllib import quote, unquote + +default_port = { + 'ftp': 21, + 'telnet': 23, + 'http': 80, + 'gopher': 70, + 'news': 119, + 'nntp': 119, + 'prospero': 191, + 'https': 443, + 'snews': 563, + 'snntp': 563, +} + +def normalize(url): + """Normalize a URL.""" + + scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip()) + (userinfo,host,port)=re.search('([^@]*@)?([^:]*):?(.*)',auth).groups() + + # Always provide the URI scheme in lowercase characters. + scheme = scheme.lower() + + # Always provide the host, if any, in lowercase characters. + host = host.lower() + if host and host[-1] == '.': host = host[:-1] + if host and host.startswith("www."): + if not scheme: scheme = "http" + host = host[4:] + elif path and path.startswith("www."): + if not scheme: scheme = "http" + path = path[4:] + + # Only perform percent-encoding where it is essential. + # Always use uppercase A-through-F characters when percent-encoding. + # All portions of the URI must be utf-8 encoded NFC from Unicode strings + def clean(string): + string=unicode(unquote(string),'utf-8','replace') + return unicodedata.normalize('NFC',string).encode('utf-8') + path=quote(clean(path),"~:/?#[]@!$&'()*+,;=") + fragment=quote(clean(fragment),"~") + + # note care must be taken to only encode & and = characters as values + query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=") + for t in q.split("=",1)]) for q in query.split("&")]) + + # Prevent dot-segments appearing in non-relative URI paths. + if scheme in ["","http","https","ftp","file"]: + output=[] + for input in path.split('/'): + if input=="": + if not output: output.append(input) + elif input==".": + pass + elif input=="..": + if len(output)>1: output.pop() + else: + output.append(input) + if input in ["",".",".."]: output.append("") + path='/'.join(output) + + # For schemes that define a default authority, use an empty authority if + # the default is desired. + if userinfo in ["@",":@"]: userinfo="" + + # For schemes that define an empty path to be equivalent to a path of "/", + # use "/". + if path=="" and scheme in ["http","https","ftp","file"]: + path="/" + + # For schemes that define a port, use an empty port if the default is + # desired + if port and scheme in default_port.keys(): + if port.isdigit(): + port=str(int(port)) + if int(port)==default_port[scheme]: + port = '' + + # Put it all back together again + auth=(userinfo or "") + host + if port: auth+=":"+port + if url.endswith("#") and query=="" and fragment=="": path+="#" + return urlparse.urlunsplit((scheme,auth,path,query,fragment)).replace("http:///", "http://") + +if __name__ == "__main__": + import unittest + suite = unittest.TestSuite() + + """ from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """ + tests= [ + (False, "http://:@example.com/"), + (False, "http://@example.com/"), + (False, "http://example.com"), + (False, "HTTP://example.com/"), + (False, "http://EXAMPLE.COM/"), + (False, "http://example.com/%7Ejane"), + (False, "http://example.com/?q=%C7"), + (False, "http://example.com/?q=%5c"), + (False, "http://example.com/?q=C%CC%A7"), + (False, "http://example.com/a/../a/b"), + (False, "http://example.com/a/./b"), + (False, "http://example.com:80/"), + (True, "http://example.com/"), + (True, "http://example.com/?q=%C3%87"), + (True, "http://example.com/?q=%E2%85%A0"), + (True, "http://example.com/?q=%5C"), + (True, "http://example.com/~jane"), + (True, "http://example.com/a/b"), + (True, "http://example.com:8080/"), + (True, "http://user:password@example.com/"), + + # from rfc2396bis + (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"), + (True, "http://www.ietf.org/rfc/rfc2396.txt"), + (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"), + (True, "mailto:John.Doe@example.com"), + (True, "news:comp.infosystems.www.servers.unix"), + (True, "tel:+1-816-555-1212"), + (True, "telnet://192.0.2.16:80/"), + (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"), + + # other + (True, "http://127.0.0.1/"), + (False, "http://127.0.0.1:80/"), + (True, "http://www.w3.org/2000/01/rdf-schema#"), + (False, "http://example.com:081/"), + ] + + def testcase(expected,value): + class test(unittest.TestCase): + def runTest(self): + assert (normalize(value)==value)==expected, \ + (expected, value, normalize(value)) + return test() + + for (expected,value) in tests: + suite.addTest(testcase(expected,value)) + + """ mnot test suite; three tests updated for rfc2396bis. """ + tests = { + '/foo/bar/.': '/foo/bar/', + '/foo/bar/./': '/foo/bar/', + '/foo/bar/..': '/foo/', + '/foo/bar/../': '/foo/', + '/foo/bar/../baz': '/foo/baz', + '/foo/bar/../..': '/', + '/foo/bar/../../': '/', + '/foo/bar/../../baz': '/baz', + '/foo/bar/../../../baz': '/baz', #was: '/../baz', + '/foo/bar/../../../../baz': '/baz', + '/./foo': '/foo', + '/../foo': '/foo', #was: '/../foo', + '/foo.': '/foo.', + '/.foo': '/.foo', + '/foo..': '/foo..', + '/..foo': '/..foo', + '/./../foo': '/foo', #was: '/../foo', + '/./foo/.': '/foo/', + '/foo/./bar': '/foo/bar', + '/foo/../bar': '/bar', + '/foo//': '/foo/', + '/foo///bar//': '/foo/bar/', + 'http://www.foo.com:80/foo': 'http://www.foo.com/foo', + 'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo', + 'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html', + 'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo', + 'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar', + 'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar', + 'ftp://user:pass@ftp.foo.net/foo/bar': + 'ftp://user:pass@ftp.foo.net/foo/bar', + 'http://USER:pass@www.Example.COM/foo/bar': + 'http://USER:pass@www.example.com/foo/bar', + 'http://www.example.com./': 'http://www.example.com/', + '-': '-', + } + + def testcase(original,normalized): + class test(unittest.TestCase): + def runTest(self): + assert normalize(original)==normalized, \ + (original, normalized, normalize(original)) + return test() + + for (original,normalized) in tests.items(): + suite.addTest(testcase(original,normalized)) + + """ execute tests """ + unittest.TextTestRunner().run(suite)