Removed debug print lines from URL normalizer
This commit is contained in:
parent
61c42e7d8a
commit
93f626c482
|
@ -29,20 +29,20 @@ from urllib import quote, unquote
|
||||||
default_port = {
|
default_port = {
|
||||||
'http': 80,
|
'http': 80,
|
||||||
}
|
}
|
||||||
|
|
||||||
class Normalizer(object):
|
class Normalizer(object):
|
||||||
def __init__(self, regex, normalize_func):
|
def __init__(self, regex, normalize_func):
|
||||||
self.regex = regex
|
self.regex = regex
|
||||||
self.normalize = normalize_func
|
self.normalize = normalize_func
|
||||||
|
|
||||||
normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'),
|
normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'),
|
||||||
lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))),
|
lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))),
|
||||||
Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'),
|
Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'),
|
||||||
lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ),
|
lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ),
|
||||||
Normalizer( re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)'),
|
Normalizer( re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)'),
|
||||||
lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ),
|
lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ),
|
||||||
)
|
)
|
||||||
|
|
||||||
def normalize(url):
|
def normalize(url):
|
||||||
"""Normalize a URL."""
|
"""Normalize a URL."""
|
||||||
|
|
||||||
|
@ -114,11 +114,9 @@ def normalize(url):
|
||||||
if url.endswith("#") and query == "" and fragment == "":
|
if url.endswith("#") and query == "" and fragment == "":
|
||||||
path += "#"
|
path += "#"
|
||||||
normal_url = urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
|
normal_url = urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
|
||||||
"http:///", "http://")
|
"http:///", "http://")
|
||||||
for norm in normalizers:
|
for norm in normalizers:
|
||||||
m = norm.regex.match(normal_url)
|
m = norm.regex.match(normal_url)
|
||||||
if m:
|
if m:
|
||||||
print 'Normalized %s to %s' % (url, norm.normalize(m))
|
return norm.normalize(m)
|
||||||
return norm.normalize(m)
|
|
||||||
print 'Normalized %s to %s' % (url, normal_url)
|
|
||||||
return normal_url
|
return normal_url
|
||||||
|
|
Loading…
Reference in New Issue