remove cruft from urlnorm

This commit is contained in:
Ryan Hitchman 2010-03-03 20:25:13 -07:00
parent 98d939ace4
commit ee8d51dc62
1 changed files with 18 additions and 129 deletions

View File

@ -27,23 +27,14 @@ import re, unicodedata, urlparse
from urllib import quote, unquote from urllib import quote, unquote
default_port = { default_port = {
'ftp': 21,
'telnet': 23,
'http': 80, 'http': 80,
'gopher': 70,
'news': 119,
'nntp': 119,
'prospero': 191,
'https': 443,
'snews': 563,
'snntp': 563,
} }
def normalize(url): def normalize(url):
"""Normalize a URL.""" """Normalize a URL."""
scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip()) scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
(userinfo,host,port)=re.search('([^@]*@)?([^:]*):?(.*)',auth).groups() userinfo, host, port=re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()
# Always provide the URI scheme in lowercase characters. # Always provide the URI scheme in lowercase characters.
scheme = scheme.lower() scheme = scheme.lower()
@ -62,17 +53,17 @@ def normalize(url):
# Always use uppercase A-through-F characters when percent-encoding. # Always use uppercase A-through-F characters when percent-encoding.
# All portions of the URI must be utf-8 encoded NFC from Unicode strings # All portions of the URI must be utf-8 encoded NFC from Unicode strings
def clean(string): def clean(string):
string=unicode(unquote(string),'utf-8','replace') string=unicode(unquote(string), 'utf-8', 'replace')
return unicodedata.normalize('NFC',string).encode('utf-8') return unicodedata.normalize('NFC', string).encode('utf-8')
path=quote(clean(path),"~:/?#[]@!$&'()*+,;=") path=quote(clean(path), "~:/?#[]@!$&'()*+,;=")
fragment=quote(clean(fragment),"~") fragment=quote(clean(fragment), "~")
# note care must be taken to only encode & and = characters as values # note care must be taken to only encode & and = characters as values
query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=") query="&".join(["=".join([quote(clean(t) , "~:/?#[]@!$'()*+,;=")
for t in q.split("=",1)]) for q in query.split("&")]) for t in q.split("=", 1)]) for q in query.split("&")])
# Prevent dot-segments appearing in non-relative URI paths. # Prevent dot-segments appearing in non-relative URI paths.
if scheme in ["","http","https","ftp","file"]: if scheme in ["", "http", "https", "ftp", "file"]:
output=[] output=[]
for input in path.split('/'): for input in path.split('/'):
if input=="": if input=="":
@ -83,16 +74,16 @@ def normalize(url):
if len(output)>1: output.pop() if len(output)>1: output.pop()
else: else:
output.append(input) output.append(input)
if input in ["",".",".."]: output.append("") if input in ["", ".", ".."]: output.append("")
path='/'.join(output) path='/'.join(output)
# For schemes that define a default authority, use an empty authority if # For schemes that define a default authority, use an empty authority if
# the default is desired. # the default is desired.
if userinfo in ["@",":@"]: userinfo="" if userinfo in ["@", ":@"]: userinfo=""
# For schemes that define an empty path to be equivalent to a path of "/", # For schemes that define an empty path to be equivalent to a path of "/",
# use "/". # use "/".
if path=="" and scheme in ["http","https","ftp","file"]: if path=="" and scheme in ["http", "https", "ftp", "file"]:
path="/" path="/"
# For schemes that define a port, use an empty port if the default is # For schemes that define a port, use an empty port if the default is
@ -105,111 +96,9 @@ def normalize(url):
# Put it all back together again # Put it all back together again
auth=(userinfo or "") + host auth=(userinfo or "") + host
if port: auth+=":"+port if port:
if url.endswith("#") and query=="" and fragment=="": path+="#" auth+=":"+port
return urlparse.urlunsplit((scheme,auth,path,query,fragment)).replace("http:///", "http://") if url.endswith("#") and query == "" and fragment == "":
path += "#"
if __name__ == "__main__": return urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
import unittest "http:///", "http://")
suite = unittest.TestSuite()
""" from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """
tests= [
(False, "http://:@example.com/"),
(False, "http://@example.com/"),
(False, "http://example.com"),
(False, "HTTP://example.com/"),
(False, "http://EXAMPLE.COM/"),
(False, "http://example.com/%7Ejane"),
(False, "http://example.com/?q=%C7"),
(False, "http://example.com/?q=%5c"),
(False, "http://example.com/?q=C%CC%A7"),
(False, "http://example.com/a/../a/b"),
(False, "http://example.com/a/./b"),
(False, "http://example.com:80/"),
(True, "http://example.com/"),
(True, "http://example.com/?q=%C3%87"),
(True, "http://example.com/?q=%E2%85%A0"),
(True, "http://example.com/?q=%5C"),
(True, "http://example.com/~jane"),
(True, "http://example.com/a/b"),
(True, "http://example.com:8080/"),
(True, "http://user:password@example.com/"),
# from rfc2396bis
(True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
(True, "http://www.ietf.org/rfc/rfc2396.txt"),
(True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
(True, "mailto:John.Doe@example.com"),
(True, "news:comp.infosystems.www.servers.unix"),
(True, "tel:+1-816-555-1212"),
(True, "telnet://192.0.2.16:80/"),
(True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
# other
(True, "http://127.0.0.1/"),
(False, "http://127.0.0.1:80/"),
(True, "http://www.w3.org/2000/01/rdf-schema#"),
(False, "http://example.com:081/"),
]
def testcase(expected,value):
class test(unittest.TestCase):
def runTest(self):
assert (normalize(value)==value)==expected, \
(expected, value, normalize(value))
return test()
for (expected,value) in tests:
suite.addTest(testcase(expected,value))
""" mnot test suite; three tests updated for rfc2396bis. """
tests = {
'/foo/bar/.': '/foo/bar/',
'/foo/bar/./': '/foo/bar/',
'/foo/bar/..': '/foo/',
'/foo/bar/../': '/foo/',
'/foo/bar/../baz': '/foo/baz',
'/foo/bar/../..': '/',
'/foo/bar/../../': '/',
'/foo/bar/../../baz': '/baz',
'/foo/bar/../../../baz': '/baz', #was: '/../baz',
'/foo/bar/../../../../baz': '/baz',
'/./foo': '/foo',
'/../foo': '/foo', #was: '/../foo',
'/foo.': '/foo.',
'/.foo': '/.foo',
'/foo..': '/foo..',
'/..foo': '/..foo',
'/./../foo': '/foo', #was: '/../foo',
'/./foo/.': '/foo/',
'/foo/./bar': '/foo/bar',
'/foo/../bar': '/bar',
'/foo//': '/foo/',
'/foo///bar//': '/foo/bar/',
'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
'ftp://user:pass@ftp.foo.net/foo/bar':
'ftp://user:pass@ftp.foo.net/foo/bar',
'http://USER:pass@www.Example.COM/foo/bar':
'http://USER:pass@www.example.com/foo/bar',
'http://www.example.com./': 'http://www.example.com/',
'-': '-',
}
def testcase(original,normalized):
class test(unittest.TestCase):
def runTest(self):
assert normalize(original)==normalized, \
(original, normalized, normalize(original))
return test()
for (original,normalized) in tests.items():
suite.addTest(testcase(original,normalized))
""" execute tests """
unittest.TextTestRunner().run(suite)