#coding: UTF8
"""
This module compares the speed of native python code versus
SQLAlchemy/sqlite code with user-defined functions.

The user-defined function is the subdist.get_score, which performs
fuzzy substring matching using Levenshtein (edit) distance.

MIT License

To run this module, you'll need SQLAlchemy and the subdist module.
You can get them via easy_install:

    easy_install sqlalchemy
    easy_install subdist
"""

import time
import timeit

import subdist
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, UnicodeText
from sqlalchemy import MetaData
from sqlalchemy.orm import scoped_session, sessionmaker, mapper
from sqlalchemy.sql import text

__author__ = "Ryan Ginstrom"
__version__ = "0.1"
__license__ = "MIT"


def make_gloss_func(haystack):
    """Create a glossary function bound to `haystack`.

    The returned callable takes a single `needle` (a term) and returns
    `subdist.get_score(needle, haystack)`.  Having exactly one parameter
    lets it be registered as a one-argument SQLite user-defined function.
    """

    # Bind the attribute once so the closure skips the module lookup on
    # every call; the function is invoked once per record.
    get_score = subdist.get_score

    def gloss_func(needle):
        """Search for fuzzy substring matches.

        needle = term
        """
        return get_score(needle, haystack)

    return gloss_func


class Term(object):
    """A term object in our database (mapped onto the `terms` table)."""

    def __init__(self, source, trans):
        self.source = source
        self.trans = trans

    def __repr__(self):
        return "<%s - %s>" % (self.source.encode("utf-8"),
                              self.trans.encode("utf-8"))


def set_up_database(connection_string=':memory:'):
    """Create our database and return a scoped session factory.

    `connection_string` is appended to "sqlite:///" to form the engine
    URL; it defaults to an in-memory database.  The `terms` table is
    created and the `Term` class is mapped onto it.
    """

    engine = create_engine("sqlite:///%s" % connection_string, echo=False)

    metadata = MetaData()
    terms = Table('terms', metadata,
                  Column('id', Integer, primary_key=True),
                  Column('source', UnicodeText(), index=True),
                  Column('trans', UnicodeText(), index=True),
                  )
    metadata.create_all(engine)

    mapper(Term, terms)

    return scoped_session(sessionmaker(bind=engine,
                                       autoflush=True,
                                       transactional=True))


def create_records():
    """Create "records" from all ordered pairs of distinct corpus words.

    Returns a list of dicts with `source` and `trans` keys, both holding
    the same unicode string "<word1> <word2>".
    """

    # The basis of the "glossary".  Only the whitespace-separated words
    # matter, so the line layout of this literal is irrelevant.
    # (Named `corpus` so it does not shadow the imported sqlalchemy `text`.)
    corpus = """
    cry havoc and let slip the dogs of war
    you can knock me down steal my car drink my liquor from an old fruit jar
    but dont you step on my blue suede shoes
    uh-uh honey lay off of them shoes
    four score and twenty years ago our forefathers brought forth
    onto this continent a new nation conceived in liberty
    and dedicated to the proposition that all men are created equal
    i gave my love a cherry that had no stone
    a chicken that had no bone
    a story that had no end
    en alta mar habia un marinero
    que la guitarra gustaba de tocar
    y cuando se acordaba de su patria querida
    tocaba la guitarra y poniase a cantar
    there was an old lady who lived in a shoe
    she had so many children she didnt know what to do
    so she gave them some broth without any bread
    and then spanked them all soundly and sent them to bed
    old mother hubbard went to the cubbard
    to fetch her poor dog a bone
    but when she looked there the cubbard was bare
    and so the poor dog had none
    todas las promesas de mi amor se iran contigo
    me olvidaras x2
    luego en la estacion te llorare igual que un nino
    porque te vas
    sometimes i try to do things and it just doesnt work out
    the way i want it to and i get real frustrated
    and like i try hard to do it and i like take my time
    they stick me in an institution
    said it was the only solution
    to give me needed professional help
    and protect me from the enemy myself
    doesnt matter ill probably get hit by a car anyway
    never gonna give you up never gonna let you down
    never gonna run around and desert you
    i go mom im ok im just thinking
    she goes no youre not thinking youre on drugs
    normal people dont act that way
    i go wait what are you talking about
    we decided my best interest
    how do you know what my best interest is
    """

    words = set(corpus.split())

    records = []
    for first in words:
        for second in words:
            if first == second:
                continue
            phrase = unicode("%s %s" % (first, second))
            records.append(dict(source=phrase, trans=phrase))
    return records


def time_it(func, *args):
    """Time one call of `func(*args)` and return the elapsed seconds.

    Uses `timeit.default_timer`, the platform-appropriate benchmark clock,
    rather than `time.clock`, whose meaning differs between platforms and
    which no longer exists in current Python releases.
    """

    start = timeit.default_timer()
    func(*args)
    return timeit.default_timer() - start


def python_native(min_score, gloss_func, records):
    """Do the search using native Python code.

    Returns the records whose `source` scores at least `min_score`
    according to `gloss_func`.
    """

    return [x for x in records if gloss_func(x["source"]) >= min_score]


def make_sql_func(SessionClass):
    """Create the SQL query function bound to `SessionClass`."""

    def sql_func(min_score, gloss_func):
        """Call the glossary function on the database.

        Registers `gloss_func` as the one-argument SQL function
        `get_score` on the session's connection, then returns the result
        of selecting every `terms` row whose score is >= `min_score`.
        """

        try:
            session = SessionClass()
            conn = session.bind.connect()
            conn.connection.create_function("get_score", 1, gloss_func)

            # Pass the threshold as a bound parameter: formatting it into
            # the SQL with "%f" would round it to six decimal places.
            query = text(
                "SELECT * FROM terms WHERE get_score(source) >= :min_score")
            return conn.execute(query, min_score=min_score)
        finally:
            SessionClass.remove()

    return sql_func


def main():
    """Run when called as main.

    Builds the database, checks that the native and SQL searches agree,
    then prints the time taken by one run of each.
    """

    SessionClass = set_up_database()

    sentence = (u"I will order the dogs to be brought forth "
                u"for this somewhat long sentence")
    gloss_func = make_gloss_func(sentence)

    # create the records
    records = create_records()
    print("There are %d records" % len(records))

    # Add the terms to the database
    session = SessionClass()
    for record in records:
        session.save(Term(**record))
    session.commit()

    sql_func = make_sql_func(SessionClass)

    # Get the results
    min_score = .9
    native_r = python_native(min_score, gloss_func, records)
    sql_r = list(sql_func(min_score, gloss_func))

    # make sure we get the same results
    assert len(native_r) == len(sql_r)
    native_sources = set(x["source"] for x in native_r)
    sql_sources = set(x["source"] for x in sql_r)
    assert native_sources == sql_sources

    print("There are %d fuzzy substring matches" % len(native_r))

    # Single-argument print(...) gives the same output as the original
    # print statements on Python 2 and also parses on Python 3.
    print("Native python: %s"
          % time_it(python_native, min_score, gloss_func, records))
    print("sqlalchemy:  %s"
          % time_it(sql_func, min_score, gloss_func))


if __name__ == "__main__":
    main()