# h/plugins/util/urlnorm.py

"""
URI Normalization function:
 * Always provide the URI scheme in lowercase characters.
 * Always provide the host, if any, in lowercase characters.
 * Only perform percent-encoding where it is essential.
 * Always use uppercase A-through-F characters when percent-encoding.
 * Prevent dot-segments appearing in non-relative URI paths.
 * For schemes that define a default authority, use an empty authority if the
   default is desired.
 * For schemes that define an empty path to be equivalent to a path of "/",
   use "/".
 * For schemes that define a port, use an empty port if the default is desired
 * All portions of the URI must be utf-8 encoded NFC from Unicode strings

implements:
  http://gbiv.com/protocols/uri/rev-2002/rfc2396bis.html#canonical-form
  http://www.intertwingly.net/wiki/pie/PaceCanonicalIds

inspired by:
  Tony J. Ibbs,    http://starship.python.net/crew/tibs/python/tji_url.py
  Mark Nottingham, http://www.mnot.net/python/urlnorm.py
"""

__license__ = "Python"

import re, unicodedata, urlparse
from urllib import quote, unquote

# Well-known default port for each scheme.  normalize() drops an explicit
# port from the authority when it equals the entry for the URL's scheme, so
# every scheme listed here must carry its registered default.
default_port = {
    'ftp': 21,
    'http': 80,
    'https': 443,
}

def _remove_dot_segments(path):
    """Collapse ".", ".." and empty segments of a "/"-separated path."""
    segments = path.split('/')
    kept = []
    for segment in segments:
        if segment == "":
            # Only a leading empty segment (the root of an absolute path)
            # is kept; repeated slashes collapse into one.
            if not kept:
                kept.append(segment)
        elif segment == ".":
            continue
        elif segment == "..":
            # Never pop the first entry, so ".." cannot climb above the
            # start of the path.
            if len(kept) > 1:
                kept.pop()
        else:
            kept.append(segment)
    # A path that ends in "/", "." or ".." refers to a directory, so the
    # result keeps a trailing slash.
    if segments[-1] in ("", ".", ".."):
        kept.append("")
    return '/'.join(kept)


def normalize(url):
    """Normalize a URL.

    Applies, in order: lowercasing of scheme and host, removal of a trailing
    dot and of a leading "www." from the host (a scheme-less URL starting
    with "www." is treated as http), minimal percent-encoding with uppercase
    hex digits over NFC-normalized UTF-8, dot-segment removal for
    "", http, https, ftp and file URLs, removal of empty userinfo, "/" as
    the path of an otherwise path-less http/https/ftp/file URL, and removal
    of a port equal to the scheme's entry in ``default_port``.

    :param url: the URL to normalize.  NOTE(review): the percent-decoding
        step decodes the text as UTF-8 via ``unicode(..., 'utf-8')``, so a
        byte string is presumably expected -- confirm against callers.
    :returns: the normalized URL as a string.
    """
    # Strip once, so that parsing and the trailing-"#" check at the end both
    # look at exactly the same text.
    url = url.strip()
    scheme, auth, path, query, fragment = urlparse.urlsplit(url)

    # Split the authority into userinfo, host and port.  A bracketed IP
    # literal such as "[::1]" is taken as the host in one piece, so that the
    # colons inside it are not mistaken for the port separator.
    userinfo, host, port = re.search(
        r'([^@]*@)?(\[[^\]]*\](?=:|$)|[^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters.
    host = host.lower()
    if host and host[-1] == '.':
        host = host[:-1]
    if host and host.startswith("www."):
        if not scheme:
            scheme = "http"
        host = host[4:]
    elif path and path.startswith("www."):
        # A scheme-less "www.example.com/..." is parsed entirely as a path;
        # the final "http:///" -> "http://" replacement turns its first
        # segment back into the host.
        if not scheme:
            scheme = "http"
        path = path[4:]

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    def clean(string):
        string = unicode(unquote(string), 'utf-8', 'replace')
        return unicodedata.normalize('NFC', string).encode('utf-8')

    path = quote(clean(path), "~:/?#[]@!$&'()*+,;=")
    fragment = quote(clean(fragment), "~")

    # note care must be taken to only encode & and = characters as values:
    # the query is split on "&" and each pair on its first "=", and only the
    # individual pieces are re-quoted ("&" is not in their safe set).
    query = "&".join(
        "=".join(quote(clean(piece), "~:/?#[]@!$'()*+,;=")
                 for piece in pair.split("=", 1))
        for pair in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ("", "http", "https", "ftp", "file"):
        path = _remove_dot_segments(path)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ("@", ":@"):
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ("http", "https", "ftp", "file"):
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port and scheme in default_port and port.isdigit():
        # Round-trip through int to drop leading zeros ("0080" -> "80").
        port = str(int(port))
        if int(port) == default_port[scheme]:
            port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # urlsplit discards an empty fragment; keep a bare trailing "#" so that
    # such URLs remain distinguishable from the same URL without it.
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    return urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
            "http:///", "http://")