105 lines
3.7 KiB
Python
105 lines
3.7 KiB
Python
"""
URI Normalization function:
 * Always provide the URI scheme in lowercase characters.
 * Always provide the host, if any, in lowercase characters.
 * Only perform percent-encoding where it is essential.
 * Always use uppercase A-through-F characters when percent-encoding.
 * Prevent dot-segments appearing in non-relative URI paths.
 * For schemes that define a default authority, use an empty authority if the
   default is desired.
 * For schemes that define an empty path to be equivalent to a path of "/",
   use "/".
 * For schemes that define a port, use an empty port if the default is desired
 * All portions of the URI must be utf-8 encoded NFC from Unicode strings

implements:
  http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
  http://www.intertwingly.net/wiki/pie/PaceCanonicalIds

inspired by:
  Tony J. Ibbs,    http://starship.python.net/crew/tibs/python/tji_url.py
  Mark Nottingham, http://www.mnot.net/python/urlnorm.py
"""
|
|
|
|
# Module metadata: the license name this file declares for itself.
__license__ = "Python"
|
|
|
|
import re, unicodedata, urlparse
|
|
from urllib import quote, unquote
|
|
|
|
# Default port for each scheme. normalize() drops an explicit port from the
# authority when it equals the entry for the URL's scheme, so every scheme
# that normalize() treats as hierarchical and that has a well-known port
# should be listed here ("file" has no port).
default_port = {
    'http': 80,
    'https': 443,
    'ftp': 21,
}
|
|
|
|
def normalize(url):
    """Normalize a URL.

    The URL is split into its five components, each component is brought
    into a canonical form, and the pieces are joined back together:

    * surrounding whitespace is ignored;
    * scheme and host are lower-cased; a trailing "." and a leading "www."
      are removed from the host (a scheme-less URL that starts with "www."
      is given the "http" scheme);
    * path, query and fragment are percent-decoded, NFC-normalized, UTF-8
      encoded and quoted again, so every escape that has to remain uses
      upper-case hex digits;
    * "." and ".." segments are resolved for "", http, https, ftp and file
      URLs, and an empty path becomes "/" for them;
    * an empty userinfo ("@" or ":@") is dropped, as is a port that equals
      the scheme's entry in ``default_port``.

    Args:
        url: the URL to normalize, as a string.

    Returns:
        The normalized URL as a string.
    """
    # Strip once, so parsing and the trailing-"#" check below look at the
    # same text.
    url = url.strip()
    scheme, auth, path, query, fragment = urlparse.urlsplit(url)
    userinfo, host, port = re.search(
        '([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters.
    host = host.lower()
    if host.endswith('.'):
        host = host[:-1]
    if host.startswith("www."):
        if not scheme:
            scheme = "http"
        host = host[4:]
    elif path.startswith("www."):
        # urlsplit() leaves a scheme-less "www.example.com/x" entirely in
        # the path component.
        if not scheme:
            scheme = "http"
        path = path[4:]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(string):
        decoded = unquote(string)
        # unquote() may hand back bytes or text; only bytes need decoding.
        if isinstance(decoded, bytes):
            decoded = decoded.decode('utf-8', 'replace')
        return unicodedata.normalize('NFC', decoded).encode('utf-8')

    # "?" and "#" are deliberately NOT safe in the path: after unquoting, a
    # literal one can only come from %3F / %23, and writing it back unescaped
    # would turn the rest of the path into a query or fragment.
    # NOTE(review): an escaped "/" (%2F) is still decoded to a literal "/",
    # because "/" has to stay in the safe set for ordinary separators.
    path = quote(clean(path), "~:/[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values;
    # "#" is left out of the safe set as well so that %23 inside the query
    # cannot become the start of a fragment.
    query = "&".join(
        "=".join(quote(clean(part), "~:/?[]@!$'()*+,;=")
                 for part in pair.split("=", 1))
        for pair in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ("", "http", "https", "ftp", "file"):
        segments = path.split('/')
        output = []
        for segment in segments:
            if segment == "":
                # Keep only a leading empty segment; repeated or trailing
                # slashes are collapsed here.
                if not output:
                    output.append(segment)
            elif segment == ".":
                continue
            elif segment == "..":
                # Never pop the first element of the path.
                if len(output) > 1:
                    output.pop()
            else:
                output.append(segment)
        # A path ending in "/", "/." or "/.." keeps its trailing slash.
        if segments[-1] in ("", ".", ".."):
            output.append("")
        path = '/'.join(output)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ("@", ":@"):
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ("http", "https", "ftp", "file"):
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port.isdigit() and scheme in default_port:
        port = str(int(port))  # canonical digits, e.g. "0080" -> "80"
        if int(port) == default_port[scheme]:
            port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # urlunsplit() drops an empty fragment; re-attach the bare "#" so that a
    # URL ending in "#" keeps it.
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    # With a scheme but no authority urlunsplit() can produce "http:///...";
    # collapse that back to "http://".
    return urlparse.urlunsplit(
        (scheme, auth, path, query, fragment)).replace("http:///", "http://")
|