Added URL normalization to urlhistory module to allow better detection of duplicates

Added configurable ignored URLs to urlhistory module
This commit is contained in:
melonhead 2010-01-18 15:07:06 -05:00
parent 7380470cf0
commit bb709a74bf
2 changed files with 231 additions and 17 deletions

View File

@ -7,6 +7,8 @@ from datetime import timedelta
import re
from util import hook, timesince
from util import urlnorm
#from util import texttime
url_re = re.compile(r'([a-zA-Z]+://|www\.)[^ ]*')
@ -15,6 +17,8 @@ dbname = "skybot.db"
expiration_period = timedelta(days=1)
ignored_urls = [ urlnorm.normalize("") ]
#TODO: Generate expiration_period_text from expiration_period
expiration_period_text = "24 hours"
@ -75,13 +79,7 @@ def dbconnect(db):
return conn
def normalize_url(url):
    """Return the canonical form of *url* used for duplicate detection.

    Delegates to ``urlnorm.normalize`` so that different spellings of the
    same link are treated as the same URL when the history is searched.
    The earlier pass-through stub (``return url``) is gone: it made the
    real normalization call unreachable.
    """
    return urlnorm.normalize(url)
def get_once_twice(count):
if count == 1:
@ -98,14 +96,15 @@ def urlinput(bot, input):
if m:
# URL detected
url = normalize_url(
conn = dbconnect(dbpath)
dupes = select_history_for_url_and_channel(conn, url, input.chan)
num_dupes = len(dupes)
if num_dupes > 0 and input.nick not in dupes:
nicks = get_nicklist(dupes)
reply = "That link has been posted " + get_once_twice(num_dupes)
reply += " in the past " + expiration_period_text + " by " + nicks
insert_history(conn, url, input.chan, input.nick)
if url not in ignored_urls:
conn = dbconnect(dbpath)
dupes = select_history_for_url_and_channel(conn, url, input.chan)
num_dupes = len(dupes)
if num_dupes > 0 and input.nick not in dupes:
nicks = get_nicklist(dupes)
reply = "That link has been posted " + get_once_twice(num_dupes)
reply += " in the past " + expiration_period_text + " by " + nicks
insert_history(conn, url, input.chan, input.nick)

plugins/util/ Normal file
View File

@ -0,0 +1,215 @@
URI Normalization function:
* Always provide the URI scheme in lowercase characters.
* Always provide the host, if any, in lowercase characters.
* Only perform percent-encoding where it is essential.
* Always use uppercase A-through-F characters when percent-encoding.
* Prevent dot-segments appearing in non-relative URI paths.
* For schemes that define a default authority, use an empty authority if the
default is desired.
* For schemes that define an empty path to be equivalent to a path of "/",
use "/".
* For schemes that define a port, use an empty port if the default is desired.
* All portions of the URI must be the UTF-8 encoding of NFC-normalized Unicode strings.
inspired by:
Tony J. Ibbs,
Mark Nottingham,
__license__ = "Python"
import re, unicodedata, urlparse
from urllib import quote, unquote
# Well-known default port for each URI scheme.  normalize() compares an
# explicit port against this table and drops it when it is the default.
default_port = {
    'ftp': 21,
    'telnet': 23,
    'http': 80,
    'gopher': 70,
    'news': 119,
    'nntp': 119,
    'prospero': 191,
    'https': 443,
    'snews': 563,
    'snntp': 563,
}
# Normalize a URL string: split it into components, canonicalize each one,
# and reassemble the result.
# NOTE(review): this text has lost its indentation and appears to be missing
# several statements (flagged inline below); confirm against the real file
# before relying on it.
def normalize(url):
"""Normalize a URL."""
# Surrounding whitespace is stripped before the URL is split into its
# five components.
scheme,auth,path,query,fragment = urlparse.urlsplit(url.strip())
# NOTE(review): `host`, `userinfo` and `port` are used below but never
# assigned in the visible text -- presumably a statement splitting `auth`
# belongs here; TODO confirm.
# Always provide the URI scheme in lowercase characters.
scheme = scheme.lower()
# Always provide the host, if any, in lowercase characters.
host = host.lower()
# A single trailing dot on the host is dropped.
if host and host[-1] == '.': host = host[:-1]
# A leading "www." is removed from the host, or from the path when the host
# did not start with it; an empty scheme is defaulted to "http" in both cases.
if host and host.startswith("www."):
if not scheme: scheme = "http"
host = host[4:]
elif path and path.startswith("www."):
if not scheme: scheme = "http"
path = path[4:]
# Only perform percent-encoding where it is essential.
# Always use uppercase A-through-F characters when percent-encoding.
# All portions of the URI must be utf-8 encoded NFC from Unicode strings
def clean(string):
return unicodedata.normalize('NFC',string).encode('utf-8')
# note care must be taken to only encode & and = characters as values
# Each "&"-separated parameter is split on its first "=" only; every piece is
# cleaned and quoted separately, then the pieces are re-joined.
query="&".join(["=".join([quote(clean(t) ,"~:/?#[]@!$'()*+,;=")
for t in q.split("=",1)]) for q in query.split("&")])
# Prevent dot-segments appearing in non-relative URI paths.
# NOTE(review): `output` is never initialised in the visible text, the "."
# branch has no body, and no statement appends ordinary segments or writes the
# result back to `path` -- those lines appear to be missing; TODO confirm.
if scheme in ["","http","https","ftp","file"]:
for input in path.split('/'):
if input=="":
if not output: output.append(input)
elif input==".":
elif input=="..":
if len(output)>1: output.pop()
if input in ["",".",".."]: output.append("")
# For schemes that define a default authority, use an empty authority if
# the default is desired.
if userinfo in ["@",":@"]: userinfo=""
# For schemes that define an empty path to be equivalent to a path of "/",
# use "/".
# NOTE(review): the body of the following `if` is not present in the visible
# text; TODO confirm what it assigns.
if path=="" and scheme in ["http","https","ftp","file"]:
# For schemes that define a port, use an empty port if the default is
# desired
if port and scheme in default_port.keys():
if port.isdigit():
if int(port)==default_port[scheme]:
port = ''
# Put it all back together again
auth=(userinfo or "") + host
if port: auth+=":"+port
# A trailing "#" on the input is re-appended to the path when both the query
# and the fragment are empty.
if url.endswith("#") and query=="" and fragment=="": path+="#"
# Every "http:///" in the reassembled URL is rewritten to "http://".
return urlparse.urlunsplit((scheme,auth,path,query,fragment)).replace("http:///", "http://")
if __name__ == "__main__":
import unittest
suite = unittest.TestSuite()
""" from """
tests= [
(False, ""),
(False, ""),
(False, ""),
(False, "HTTP://"),
(False, "http://EXAMPLE.COM/"),
(False, ""),
(False, ""),
(False, ""),
(False, ""),
(False, ""),
(False, ""),
(False, ""),
(True, ""),
(True, ""),
(True, ""),
(True, ""),
(True, ""),
(True, ""),
(True, ""),
(True, ""),
# from rfc2396bis
(True, ""),
(True, ""),
(True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
(True, ""),
(True, "news:comp.infosystems.www.servers.unix"),
(True, "tel:+1-816-555-1212"),
(True, "telnet://"),
(True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
# other
(True, ""),
(False, ""),
(True, ""),
(False, ""),
def testcase(expected,value):
class test(unittest.TestCase):
def runTest(self):
assert (normalize(value)==value)==expected, \
(expected, value, normalize(value))
return test()
for (expected,value) in tests:
""" mnot test suite; three tests updated for rfc2396bis. """
tests = {
'/foo/bar/.': '/foo/bar/',
'/foo/bar/./': '/foo/bar/',
'/foo/bar/..': '/foo/',
'/foo/bar/../': '/foo/',
'/foo/bar/../baz': '/foo/baz',
'/foo/bar/../..': '/',
'/foo/bar/../../': '/',
'/foo/bar/../../baz': '/baz',
'/foo/bar/../../../baz': '/baz', #was: '/../baz',
'/foo/bar/../../../../baz': '/baz',
'/./foo': '/foo',
'/../foo': '/foo', #was: '/../foo',
'/foo.': '/foo.',
'/.foo': '/.foo',
'/foo..': '/foo..',
'/': '/',
'/./../foo': '/foo', #was: '/../foo',
'/./foo/.': '/foo/',
'/foo/./bar': '/foo/bar',
'/foo/../bar': '/bar',
'/foo//': '/foo/',
'/foo///bar//': '/foo/bar/',
'': '',
'': '',
'': '',
'': '',
'': '',
'': '',
'': '',
'-': '-',
def testcase(original,normalized):
class test(unittest.TestCase):
def runTest(self):
assert normalize(original)==normalized, \
(original, normalized, normalize(original))
return test()
for (original,normalized) in tests.items():
""" execute tests """