From 61c42e7d8a40d7fbe35876f507cc7595e6a7d2db Mon Sep 17 00:00:00 2001
From: melonhead
Date: Wed, 14 Jul 2010 16:45:26 -0400
Subject: [PATCH] urlhistory: main regex no longer matches 'http://' or 'www.'
 urlhistory: added URL normalization for Amazon, Waffleimages, and Youtube

---
 plugins/urlhistory.py   |    2 +-
 plugins/util/urlnorm.py |   26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/plugins/urlhistory.py b/plugins/urlhistory.py
index fe839da..b850c02 100644
--- a/plugins/urlhistory.py
+++ b/plugins/urlhistory.py
@@ -4,7 +4,7 @@ import time
 
 from util import hook, urlnorm, timesince
 
-url_re = r'([a-zA-Z]+://|www\.)[^ ]*'
+url_re = r'([a-zA-Z]+://|www\.)[^ ]+'
 
 expiration_period = 60 * 60 * 24  # 1 day
 
diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py
index a51a76e..6cdc858 100644
--- a/plugins/util/urlnorm.py
+++ b/plugins/util/urlnorm.py
@@ -29,7 +29,20 @@ from urllib import quote, unquote
 default_port = {
     'http': 80,
 }
-
+
+class Normalizer(object):
+    def __init__(self, regex, normalize_func):
+        self.regex = regex
+        self.normalize = normalize_func
+
+normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'),
+                      lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))),
+                Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'),
+                      lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ),
+                Normalizer( re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)'),
+                      lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ),
+              )
+
 def normalize(url):
     """Normalize a URL."""
 
@@ -100,5 +113,12 @@ def normalize(url):
         auth+=":"+port
     if url.endswith("#") and query == "" and fragment == "":
         path += "#"
-    return urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
-        "http:///", "http://")
+    normal_url = urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
+        "http:///", "http://")
+    for norm in normalizers:
+        m = norm.regex.match(normal_url)
+        if m:
+            print 'Normalized %s to %s' % (url, norm.normalize(m))
+            return norm.normalize(m)
+    print 'Normalized %s to %s' % (url, normal_url)
+    return normal_url