From 93f626c482e86a4c58d7544472c7e442af236ede Mon Sep 17 00:00:00 2001 From: melonhead Date: Wed, 14 Jul 2010 16:48:18 -0400 Subject: [PATCH] Removed debug print lines from URL normalizer --- plugins/util/urlnorm.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py index 6cdc858..024fc32 100644 --- a/plugins/util/urlnorm.py +++ b/plugins/util/urlnorm.py @@ -29,20 +29,20 @@ from urllib import quote, unquote default_port = { 'http': 80, } - -class Normalizer(object): - def __init__(self, regex, normalize_func): - self.regex = regex - self.normalize = normalize_func - -normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'), - lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))), - Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'), - lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ), - Normalizer( re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)'), - lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ), - ) - + +class Normalizer(object): + def __init__(self, regex, normalize_func): + self.regex = regex + self.normalize = normalize_func + +normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'), + lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))), + Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'), + lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ), + Normalizer( 
re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)'), + lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ), + ) + def normalize(url): """Normalize a URL.""" @@ -114,11 +114,9 @@ def normalize(url): if url.endswith("#") and query == "" and fragment == "": path += "#" normal_url = urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace( - "http:///", "http://") - for norm in normalizers: - m = norm.regex.match(normal_url) - if m: - print 'Normalized %s to %s' % (url, norm.normalize(m)) - return norm.normalize(m) - print 'Normalized %s to %s' % (url, normal_url) + "http:///", "http://") + for norm in normalizers: + m = norm.regex.match(normal_url) + if m: + return norm.normalize(m) return normal_url