From 61c42e7d8a40d7fbe35876f507cc7595e6a7d2db Mon Sep 17 00:00:00 2001
From: melonhead <devnull@localhost>
Date: Wed, 14 Jul 2010 16:45:26 -0400
Subject: [PATCH] urlhistory: main regex no longer matches a bare 'http://' or 'www.'
 urlhistory: added URL normalization for Amazon, Waffleimages, and YouTube

---
 plugins/urlhistory.py   |  2 +-
 plugins/util/urlnorm.py | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/plugins/urlhistory.py b/plugins/urlhistory.py
index fe839da..b850c02 100644
--- a/plugins/urlhistory.py
+++ b/plugins/urlhistory.py
@@ -4,7 +4,7 @@ import time
 
 from util import hook, urlnorm, timesince
 
-url_re = r'([a-zA-Z]+://|www\.)[^ ]*'
+url_re = r'([a-zA-Z]+://|www\.)[^ ]+'  # '+' (not '*'): at least one non-space character must follow the scheme or 'www.' prefix
 
 expiration_period = 60 * 60 * 24  # 1 day
 
diff --git a/plugins/util/urlnorm.py b/plugins/util/urlnorm.py
index a51a76e..6cdc858 100644
--- a/plugins/util/urlnorm.py
+++ b/plugins/util/urlnorm.py
@@ -29,7 +29,20 @@ from urllib import quote, unquote
 default_port = {
     'http': 80,
 }
-
+
+class Normalizer(object):
+    """Pair a compiled regex with the callable that rewrites one of its match objects into a URL."""
+    def __init__(self, regex, normalize_func):
+        self.regex, self.normalize = regex, normalize_func
+
+normalizers = ( Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)?(?:amazon|amzn){1}\.(?P<tld>[a-zA-Z\.]{2,})\/(gp/(?:product|offer-listing|customer-media/product-gallery)/|exec/obidos/tg/detail/-/|o/ASIN/|dp/|(?:[A-Za-z0-9\-]+)/dp/)?(?P<ASIN>[0-9A-Za-z]{10})'),
+                            lambda m: r'http://amazon.%s/dp/%s' % (m.group('tld'), m.group('ASIN'))),
+                Normalizer( re.compile(r'.*waffleimages\.com.*/([0-9a-fA-F]{40})'),
+                            lambda m: r'http://img.waffleimages.com/%s' % m.group(1) ),
+                Normalizer( re.compile(r'(?:https?://)?(?:[a-zA-Z0-9\-]+\.)*(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-zA-Z0-9]+)'),
+                            lambda m: r'http://youtube.com/watch?v=%s' % m.group(1) ),  # video IDs are case-sensitive, so the ID class includes A-Z
+    )  # each regex is applied with match(), so every pattern must accept the URL from its first character (scheme included)
+
 def normalize(url):
     """Normalize a URL."""
 
@@ -100,5 +113,12 @@ def normalize(url):
         auth+=":"+port
     if url.endswith("#") and query == "" and fragment == "": 
         path += "#"
-    return urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
-            "http:///", "http://")
+    normal_url = urlparse.urlunsplit((scheme, auth, path, query, fragment)).replace(
+            "http:///", "http://")
+    # Site-specific rewrites run on the generically normalized URL; the first
+    # normalizer whose regex matches from the start of the URL wins.
+    for norm in normalizers:
+        m = norm.regex.match(normal_url)
+        if m:
+            return norm.normalize(m)
+    return normal_url