From ee8d51dc624f6ba7402166e2b6baed8032f862a3 Mon Sep 17 00:00:00 2001 From: Ryan Hitchman Date: Wed, 3 Mar 2010 20:25:13 -0700 Subject: [PATCH] remove cruft from urlnorm --- plugins/util/urlnorm.py | 147 +++++----------------------------------- 1 file changed, 18 insertions(+), 129 deletions(-) diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py index 3a07621..7b919ba 100644 --- a/plugins/util/urlnorm.py +++ b/plugins/util/urlnorm.py @@ -27,23 +27,14 @@ import re, unicodedata, urlparse from urllib import quote, unquote default_port = { - 'ftp': 21, - 'telnet': 23, 'http': 80, - 'gopher': 70, - 'news': 119, - 'nntp': 119, - 'prospero': 191, - 'https': 443, - 'snews': 563, - 'snntp': 563, } def normalize(url): """Normalize a URL.""" - scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip()) - (userinfo,host,port)=re.search('([^@]*@)?([^:]*):?(.*)',auth).groups() + scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip()) + userinfo, host, port=re.search('([^@]*@)?([^:]*):?(.*)', auth).groups() # Always provide the URI scheme in lowercase characters. scheme = scheme.lower() @@ -62,17 +53,17 @@ def normalize(url): # Always use uppercase A-through-F characters when percent-encoding. # All portions of the URI must be utf-8 encoded NFC from Unicode strings def clean(string): - string=unicode(unquote(string),'utf-8','replace') - return unicodedata.normalize('NFC',string).encode('utf-8') - path=quote(clean(path),"~:/?#[]@!$&'()*+,;=") - fragment=quote(clean(fragment),"~") + string=unicode(unquote(string), 'utf-8', 'replace') + return unicodedata.normalize('NFC', string).encode('utf-8') + path=quote(clean(path), "~:/?#[]@!$&'()*+,;=") + fragment=quote(clean(fragment), "~") # note care must be taken to only encode & and = characters as values - query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=") - for t in q.split("=",1)]) for q in query.split("&")]) + query="&".join(["=".join([quote(clean(t) , "~:/?#[]@!$'()*+,;=") + for t in q.split("=", 1)]) for q in query.split("&")]) # Prevent dot-segments appearing in non-relative URI paths. - if scheme in ["","http","https","ftp","file"]: + if scheme in ["", "http", "https", "ftp", "file"]: output=[] for input in path.split('/'): if input=="": @@ -83,16 +74,16 @@ def normalize(url): if len(output)>1: output.pop() else: output.append(input) - if input in ["",".",".."]: output.append("") + if input in ["", ".", ".."]: output.append("") path='/'.join(output) # For schemes that define a default authority, use an empty authority if # the default is desired. - if userinfo in ["@",":@"]: userinfo="" + if userinfo in ["@", ":@"]: userinfo="" # For schemes that define an empty path to be equivalent to a path of "/", # use "/". - if path=="" and scheme in ["http","https","ftp","file"]: + if path=="" and scheme in ["http", "https", "ftp", "file"]: path="/" # For schemes that define a port, use an empty port if the default is @@ -105,111 +96,9 @@ def normalize(url): # Put it all back together again auth=(userinfo or "") + host - if port: auth+=":"+port - if url.endswith("#") and query=="" and fragment=="": path+="#" - return urlparse.urlunsplit((scheme,auth,path,query,fragment)).replace("http:///", "http://") - -if __name__ == "__main__": - import unittest - suite = unittest.TestSuite() - - """ from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """ - tests= [ - (False, "http://:@example.com/"), - (False, "http://@example.com/"), - (False, "http://example.com"), - (False, "HTTP://example.com/"), - (False, "http://EXAMPLE.COM/"), - (False, "http://example.com/%7Ejane"), - (False, "http://example.com/?q=%C7"), - (False, "http://example.com/?q=%5c"), - (False, "http://example.com/?q=C%CC%A7"), - (False, "http://example.com/a/../a/b"), - (False, "http://example.com/a/./b"), - (False, "http://example.com:80/"), - (True, "http://example.com/"), - (True, "http://example.com/?q=%C3%87"), - (True, "http://example.com/?q=%E2%85%A0"), - (True, "http://example.com/?q=%5C"), - (True, "http://example.com/~jane"), - (True, "http://example.com/a/b"), - (True, "http://example.com:8080/"), - (True, "http://user:password@example.com/"), - - # from rfc2396bis - (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"), - (True, "http://www.ietf.org/rfc/rfc2396.txt"), - (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"), - (True, "mailto:John.Doe@example.com"), - (True, "news:comp.infosystems.www.servers.unix"), - (True, "tel:+1-816-555-1212"), - (True, "telnet://192.0.2.16:80/"), - (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"), - - # other - (True, "http://127.0.0.1/"), - (False, "http://127.0.0.1:80/"), - (True, "http://www.w3.org/2000/01/rdf-schema#"), - (False, "http://example.com:081/"), - ] - - def testcase(expected,value): - class test(unittest.TestCase): - def runTest(self): - assert (normalize(value)==value)==expected, \ - (expected, value, normalize(value)) - return test() - - for (expected,value) in tests: - suite.addTest(testcase(expected,value)) - - """ mnot test suite; three tests updated for rfc2396bis. """ - tests = { - '/foo/bar/.': '/foo/bar/', - '/foo/bar/./': '/foo/bar/', - '/foo/bar/..': '/foo/', - '/foo/bar/../': '/foo/', - '/foo/bar/../baz': '/foo/baz', - '/foo/bar/../..': '/', - '/foo/bar/../../': '/', - '/foo/bar/../../baz': '/baz', - '/foo/bar/../../../baz': '/baz', #was: '/../baz', - '/foo/bar/../../../../baz': '/baz', - '/./foo': '/foo', - '/../foo': '/foo', #was: '/../foo', - '/foo.': '/foo.', - '/.foo': '/.foo', - '/foo..': '/foo..', - '/..foo': '/..foo', - '/./../foo': '/foo', #was: '/../foo', - '/./foo/.': '/foo/', - '/foo/./bar': '/foo/bar', - '/foo/../bar': '/bar', - '/foo//': '/foo/', - '/foo///bar//': '/foo/bar/', - 'http://www.foo.com:80/foo': 'http://www.foo.com/foo', - 'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo', - 'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html', - 'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo', - 'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar', - 'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar', - 'ftp://user:pass@ftp.foo.net/foo/bar': - 'ftp://user:pass@ftp.foo.net/foo/bar', - 'http://USER:pass@www.Example.COM/foo/bar': - 'http://USER:pass@www.example.com/foo/bar', - 'http://www.example.com./': 'http://www.example.com/', - '-': '-', - } - - def testcase(original,normalized): - class test(unittest.TestCase): - def runTest(self): - assert normalize(original)==normalized, \ - (original, normalized, normalize(original)) - return test() - - for (original,normalized) in tests.items(): - suite.addTest(testcase(original,normalized)) - - """ execute tests """ - unittest.TextTestRunner().run(suite) + if port: + auth+=":"+port + if url.endswith("#") and query == "" and fragment == "": + path += "#" + return urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace( + "http:///", "http://")