From bb709a74bfd7f3943a0202928da1940e33235601 Mon Sep 17 00:00:00 2001
From: melonhead <devnull@localhost>
Date: Mon, 18 Jan 2010 15:07:06 -0500
Subject: [PATCH] Added URL normalization to urlhistory module to allow better
 detection of duplicates; added configurable ignored URLs to urlhistory module

---
 plugins/urlhistory.py   |  33 +++---
 plugins/util/urlnorm.py | 215 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+), 17 deletions(-)
 create mode 100644 plugins/util/urlnorm.py

diff --git a/plugins/urlhistory.py b/plugins/urlhistory.py
index 0bd33d2..ed02ec8 100644
--- a/plugins/urlhistory.py
+++ b/plugins/urlhistory.py
@@ -7,6 +7,8 @@ from datetime import timedelta
 import re
 
 from util import hook, timesince
+from util import urlnorm
+#from util import texttime
 
 url_re = re.compile(r'([a-zA-Z]+://|www\.)[^ ]*')
 
@@ -15,6 +17,8 @@ dbname = "skybot.db"
 
 expiration_period = timedelta(days=1)
 
+ignored_urls = [ urlnorm.normalize("http://google.com") ]
+
 #TODO: Generate expiration_period_text from expiration_period
 expiration_period_text = "24 hours"
 
@@ -75,13 +79,7 @@ def dbconnect(db):
     return conn
 
 def normalize_url(url):
-    # TODO: do something so that:
-    # - http://www.google.com
-    # - www.google.com
-    # - http://google.com
-    # - http://google.com/ 
-    # etc are all considered to be the same URL
-    return url
+    return urlnorm.normalize(url)
  
 def get_once_twice(count):
    if count == 1:
@@ -98,14 +96,15 @@ def urlinput(bot, input):
     if m:
         # URL detected
         url = normalize_url(m.group(0))
-        conn = dbconnect(dbpath)
-        dupes = select_history_for_url_and_channel(conn, url, input.chan)
-        num_dupes = len(dupes)
-        if num_dupes > 0 and input.nick not in dupes:
-            nicks = get_nicklist(dupes)
-            reply = "That link has been posted " + get_once_twice(num_dupes)
-            reply += " in the past " + expiration_period_text + " by " + nicks
-            input.reply(reply)
-        insert_history(conn, url, input.chan, input.nick)
-        conn.close()
+        if url not in ignored_urls:
+           conn = dbconnect(dbpath)
+           dupes = select_history_for_url_and_channel(conn, url, input.chan)
+           num_dupes = len(dupes)
+           if num_dupes > 0 and input.nick not in dupes:
+               nicks = get_nicklist(dupes)
+               reply = "That link has been posted " + get_once_twice(num_dupes)
+               reply += " in the past " + expiration_period_text + " by " + nicks
+               input.reply(reply)
+           insert_history(conn, url, input.chan, input.nick)
+           conn.close()
 
diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py
new file mode 100644
index 0000000..3a07621
--- /dev/null
+++ b/plugins/util/urlnorm.py
@@ -0,0 +1,215 @@
+"""
+URI Normalization function:
+ * Always provide the URI scheme in lowercase characters.
+ * Always provide the host, if any, in lowercase characters.
+ * Only perform percent-encoding where it is essential.
+ * Always use uppercase A-through-F characters when percent-encoding.
+ * Prevent dot-segments appearing in non-relative URI paths.
+ * For schemes that define a default authority, use an empty authority if the
+   default is desired.
+ * For schemes that define an empty path to be equivalent to a path of "/",
+   use "/".
+ * For schemes that define a port, use an empty port if the default is desired.
+ * All portions of the URI must be utf-8 encoded NFC from Unicode strings.
+
+implements:
+  http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
+  http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
+
+inspired by:
+  Tony J. Ibbs,    http://starship.python.net/crew/tibs/python/tji_url.py
+  Mark Nottingham, http://www.mnot.net/python/urlnorm.py
+"""
+
+__license__ = "Python"
+
+import re, unicodedata, urlparse
+from urllib import quote, unquote
+
+default_port = {
+    'ftp': 21,
+    'telnet': 23,
+    'http': 80,
+    'gopher': 70,
+    'news': 119,
+    'nntp': 119,
+    'prospero': 191,
+    'https': 443,
+    'snews': 563,
+    'snntp': 563,
+}
+
+def normalize(url):
+    """Normalize a URL."""
+
+    scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip())
+    (userinfo,host,port)=re.search('([^@]*@)?([^:]*):?(.*)',auth).groups()
+
+    # Always provide the URI scheme in lowercase characters.
+    scheme = scheme.lower()
+
+    # Always provide the host, if any, in lowercase characters.
+    host = host.lower()
+    if host and host[-1] == '.': host = host[:-1]
+    if host and host.startswith("www."): 
+        if not scheme: scheme = "http"
+        host = host[4:]
+    elif path and path.startswith("www."):
+        if not scheme: scheme = "http"
+        path = path[4:]
+
+    # Only perform percent-encoding where it is essential.
+    # Always use uppercase A-through-F characters when percent-encoding.
+    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
+    def clean(string):
+        string=unicode(unquote(string),'utf-8','replace')
+        return unicodedata.normalize('NFC',string).encode('utf-8')
+    path=quote(clean(path),"~:/?#[]@!$&'()*+,;=")
+    fragment=quote(clean(fragment),"~")
+
+    # Note: split on "&" and "=" before quoting so the delimiters stay unencoded
+    query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=")
+        for t in q.split("=",1)]) for q in query.split("&")])
+
+    # Prevent dot-segments appearing in non-relative URI paths.
+    if scheme in ["","http","https","ftp","file"]:
+        output=[]
+        for input in path.split('/'):
+            if input=="":
+                if not output: output.append(input)
+            elif input==".":
+                pass
+            elif input=="..":
+                if len(output)>1: output.pop()
+            else:
+                output.append(input)
+        if input in ["",".",".."]: output.append("")
+        path='/'.join(output)
+
+    # For schemes that define a default authority, use an empty authority if
+    # the default is desired.
+    if userinfo in ["@",":@"]: userinfo=""
+
+    # For schemes that define an empty path to be equivalent to a path of "/",
+    # use "/".
+    if path=="" and scheme in ["http","https","ftp","file"]:
+        path="/"
+
+    # For schemes that define a port, use an empty port if the default is
+    # desired
+    if port and scheme in default_port.keys():
+        if port.isdigit():
+            port=str(int(port))
+            if int(port)==default_port[scheme]:
+                port = ''
+
+    # Put it all back together again
+    auth=(userinfo or "") + host
+    if port: auth+=":"+port
+    if url.endswith("#") and query=="" and fragment=="": path+="#"
+    return urlparse.urlunsplit((scheme,auth,path,query,fragment)).replace("http:///", "http://")
+
+if __name__ == "__main__":
+    import unittest
+    suite = unittest.TestSuite()
+
+    """ from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """
+    tests= [
+        (False, "http://:@example.com/"),
+        (False, "http://@example.com/"),
+        (False, "http://example.com"),
+        (False, "HTTP://example.com/"),
+        (False, "http://EXAMPLE.COM/"),
+        (False, "http://example.com/%7Ejane"),
+        (False, "http://example.com/?q=%C7"),
+        (False, "http://example.com/?q=%5c"),
+        (False, "http://example.com/?q=C%CC%A7"),
+        (False, "http://example.com/a/../a/b"),
+        (False, "http://example.com/a/./b"),
+        (False, "http://example.com:80/"),
+        (True,  "http://example.com/"),
+        (True,  "http://example.com/?q=%C3%87"),
+        (True,  "http://example.com/?q=%E2%85%A0"),
+        (True,  "http://example.com/?q=%5C"),
+        (True,  "http://example.com/~jane"),
+        (True,  "http://example.com/a/b"),
+        (True,  "http://example.com:8080/"),
+        (True,  "http://user:password@example.com/"),
+
+        # from rfc2396bis
+        (True,  "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
+        (True,  "http://www.ietf.org/rfc/rfc2396.txt"),
+        (True,  "ldap://[2001:db8::7]/c=GB?objectClass?one"),
+        (True,  "mailto:John.Doe@example.com"),
+        (True,  "news:comp.infosystems.www.servers.unix"),
+        (True,  "tel:+1-816-555-1212"),
+        (True,  "telnet://192.0.2.16:80/"),
+        (True,  "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
+
+        # other
+        (True,  "http://127.0.0.1/"),
+        (False,  "http://127.0.0.1:80/"),
+        (True,   "http://www.w3.org/2000/01/rdf-schema#"),
+        (False, "http://example.com:081/"),
+    ]
+
+    def testcase(expected,value):
+        class test(unittest.TestCase):
+            def runTest(self):
+                assert (normalize(value)==value)==expected, \
+                    (expected, value, normalize(value))
+        return test()
+
+    for (expected,value) in tests:
+        suite.addTest(testcase(expected,value))
+
+    """ mnot test suite; three tests updated for rfc2396bis. """
+    tests = {
+        '/foo/bar/.':                    '/foo/bar/',
+        '/foo/bar/./':                   '/foo/bar/',
+        '/foo/bar/..':                   '/foo/',
+        '/foo/bar/../':                  '/foo/',
+        '/foo/bar/../baz':               '/foo/baz',
+        '/foo/bar/../..':                '/',
+        '/foo/bar/../../':               '/',
+        '/foo/bar/../../baz':            '/baz',
+        '/foo/bar/../../../baz':         '/baz', #was: '/../baz',
+        '/foo/bar/../../../../baz':      '/baz',
+        '/./foo':                        '/foo',
+        '/../foo':                       '/foo', #was: '/../foo',
+        '/foo.':                         '/foo.',
+        '/.foo':                         '/.foo',
+        '/foo..':                        '/foo..',
+        '/..foo':                        '/..foo',
+        '/./../foo':                     '/foo', #was: '/../foo',
+        '/./foo/.':                      '/foo/',
+        '/foo/./bar':                    '/foo/bar',
+        '/foo/../bar':                   '/bar',
+        '/foo//':                        '/foo/',
+        '/foo///bar//':                  '/foo/bar/',
+        'http://www.foo.com:80/foo':     'http://www.foo.com/foo',
+        'http://www.foo.com:8000/foo':   'http://www.foo.com:8000/foo',
+        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
+        'http://www.foo.com.:81/foo':    'http://www.foo.com:81/foo',
+        'http://www.foo.com/%7ebar':     'http://www.foo.com/~bar',
+        'http://www.foo.com/%7Ebar':     'http://www.foo.com/~bar',
+        'ftp://user:pass@ftp.foo.net/foo/bar':
+             'ftp://user:pass@ftp.foo.net/foo/bar',
+        'http://USER:pass@www.Example.COM/foo/bar':
+             'http://USER:pass@www.example.com/foo/bar',
+        'http://www.example.com./':      'http://www.example.com/',
+        '-':                             '-',
+    }
+
+    def testcase(original,normalized):
+        class test(unittest.TestCase):
+            def runTest(self):
+                assert normalize(original)==normalized, \
+                    (original, normalized, normalize(original))
+        return test()
+
+    for (original,normalized) in tests.items():
+        suite.addTest(testcase(original,normalized))
+
+    """ execute tests """
+    unittest.TextTestRunner().run(suite)