diff options
| -rw-r--r-- | database.py | 349 | ||||
| -rw-r--r-- | search.py | 68 | ||||
| -rw-r--r-- | tf_idf.py | 32 | 
3 files changed, 246 insertions, 203 deletions
diff --git a/database.py b/database.py index 0a804de..875d22e 100644 --- a/database.py +++ b/database.py @@ -1,169 +1,180 @@ -from dataclasses import dataclass -import math as maths -import sqlite3 -import tf_idf -import os - -class DatabaseCursor(sqlite3.Cursor): -    def __init__(self, *args, **kwargs): -        super().__init__(*args, **kwargs) - -    def __enter__(self): -        return self - -    def __exit__(self, type, value, traceback): -        self.close() - -@dataclass -class Database: -    db_path:str = os.path.join(".", "wikibooks.db") - -    def __enter__(self): -        if not os.path.exists(self.db_path): -            self.__connection = sqlite3.connect(self.db_path) -            self.__build_db() -        else: -            self.__connection = sqlite3.connect(self.db_path) - -        self.__connection.create_function('log', 1, maths.log10) -        self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf) -        return self - -    def __exit__(self, type, value, traceback): -        self.__connection.commit() -        self.__connection.close() - -    def __build_db(self): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute(""" -            CREATE TABLE IF NOT EXISTS `documents` ( -                `document_id` INTEGER PRIMARY KEY, -                `document_name` TEXT NOT NULL -            );""") -            cursor.execute(""" -            CREATE TABLE IF NOT EXISTS `vocabulary` ( -                `vocabulary_id` INTEGER PRIMARY KEY, -                `term` TEXT NOT NULL -            );""") -            cursor.execute(""" -            CREATE TABLE IF NOT EXISTS `term_frequency` (  -                `vocabulary_id` INT UNSIGNED NOT NULL,  -                `document_id` INT UNSIGNED NOT NULL,  -                `frequency` INT UNSIGNED NOT NULL,  -                FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),  -                FOREIGN KEY (`document_id`) 
REFERENCES `documents`(`document_id`),  -                PRIMARY KEY(`vocabulary_id`, `document_id`)  -            );""") -            cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);") - -    def append_documents(self, documents): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) for doc in documents]) - -    def get_document_name_by_id(self, id_): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, )) -            return cursor.fetchone()[0] - -    def get_document_id_by_name(self, document_name): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, )) -            return cursor.fetchone()[0] - -    def get_num_documents(self): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT COUNT(*) FROM documents;") -            return cursor.fetchone()[0] - -    def append_terms(self, terms): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms]) - -    def append_terms_in_document(self, document_id, counter): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.executemany(""" -            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)  -            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?) 
-            """, [(i[0], document_id, i[1]) for i in counter.items()]) - -    def build_tf_idf_table(self): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;") -            # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;") -            # i wish i could find a way to do this with a single view but alas i am -            # not good enough at SQL -            cursor.execute(""" -            CREATE VIEW IF NOT EXISTS `vocab_count` AS  -            SELECT vocabulary_id,  -            COUNT(vocabulary_id) AS vocabulary_count -            FROM term_frequency  -            GROUP BY vocabulary_id; -            """) -            cursor.execute(""" -            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT -            `term_frequency`.`vocabulary_id` AS `vocabulary_id`, -            `document_id`, -            `term_frequency`.`frequency`, -            LOG_TF(`frequency`) AS tf, -            (SELECT COUNT(`document_id`) FROM `documents`) AS n, -            `vocab_count`.`vocabulary_count` AS df, -            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf, -            LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf -            FROM `term_frequency` -            INNER JOIN `vocab_count` -            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id` -            ;""") - -    def get_term_frequencies(self): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT * FROM `term_frequency`;") -            return cursor.fetchall() - -    def append_tf_idf_table(self, tfs): -        # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1]) -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.executemany(""" -            INSERT INTO 
`tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) -            VALUES ( -                ?, ?, ?,  -                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), -                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) -            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs]) - -    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute(""" -            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) -            VALUES ( -                ?, ?, ?,  -                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), -                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) 
-            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf)) - -    def test_log(self, to_log): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT LOG(?);", (to_log, )) -            return cursor.fetchone() - -    def test_tf_log(self, to_tf): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT LOG_TF(?);", (to_tf, )) -            return cursor.fetchone() - -    def get_tf_idf_table(self): -        with self.__connection.cursor(factory = DatabaseCursor) as cursor: -            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;") -            out = cursor.fetchall() -            print(len(out)) -            print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf")) -            for l in out[:100]: -                print(l) - -if __name__ == "__main__": -    with Database() as db: -        # print(db.get_num_documents()) -        # print(db.get_document_name_by_id(69420)) -        # print(db.get_document_id_by_name("../Datasets/Wikibooks/Russian Lesson 2.html")) -        # print(db.test_log(100)) -        # print(db.test_log(21)) -        db.get_tf_idf_table()
\ No newline at end of file +from dataclasses import dataclass
 +import math as maths
 +import sqlite3
 +import tf_idf
 +import os
 +
 +class DatabaseCursor(sqlite3.Cursor):
 +    def __init__(self, *args, **kwargs):
 +        super().__init__(*args, **kwargs)
 +
 +    def __enter__(self):
 +        return self
 +
 +    def __exit__(self, type, value, traceback):
 +        self.close()
 +
@dataclass
class Database:
    """Context-managed wrapper around the wikibooks SQLite database.

    On first use (file absent) it builds the schema; on every open it
    registers two SQL functions used by the ``tf_idf`` view:
    ``LOG`` (base-10 log via ``maths.log10``) and ``LOG_TF``
    (``tf_idf.calc_log_tf``).
    """
    # Path to the SQLite file; the database is created here on first __enter__.
    db_path:str = os.path.join(".", "wikibooks.db")

    def __enter__(self):
        """Open the connection, creating the schema if the file did not exist."""
        if not os.path.exists(self.db_path):
            self.__connection = sqlite3.connect(self.db_path)
            self.__build_db()
        else:
            self.__connection = sqlite3.connect(self.db_path)

        # SQL-callable helpers used by the tf_idf view and ad-hoc queries.
        self.__connection.create_function('log', 1, maths.log10)
        self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf)
        return self

    def __exit__(self, type, value, traceback):
        # Persist any outstanding writes, then close.
        self.__connection.commit()
        self.__connection.close()

    def __build_db(self):
        """Create the documents, vocabulary and term_frequency tables."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `documents` (
                `document_id` INTEGER PRIMARY KEY,
                `document_name` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `vocabulary` (
                `vocabulary_id` INTEGER PRIMARY KEY,
                `term` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `term_frequency` ( 
                `vocabulary_id` INT UNSIGNED NOT NULL, 
                `document_id` INT UNSIGNED NOT NULL, 
                `frequency` INT UNSIGNED NOT NULL, 
                FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`), 
                FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`), 
                PRIMARY KEY(`vocabulary_id`, `document_id`) 
            );""")
            # Unique index so append_terms' INSERT OR IGNORE deduplicates terms.
            cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);")

    def append_documents(self, documents):
        """Insert one row per document name; ids are auto-assigned."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) for doc in documents])

    def get_document_name_by_id(self, id_):
        """Return the document_name for a document_id.

        NOTE(review): raises TypeError (fetchone() is None) if the id is
        unknown — callers appear to pass only known ids.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, ))
            return cursor.fetchone()[0]

    def get_document_id_by_name(self, document_name):
        """Return the document_id for an exact document_name match."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, ))
            return cursor.fetchone()[0]

    def get_num_documents(self):
        """Return the total number of rows in `documents`."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT COUNT(*) FROM documents;")
            return cursor.fetchone()[0]

    def append_terms(self, terms):
        """Insert terms into the vocabulary, silently skipping duplicates."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])

    def append_terms_in_document(self, document_id, counter):
        """Store per-document term counts.

        counter maps term -> frequency (e.g. a collections.Counter); each
        term is resolved to its vocabulary_id via a subquery.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`) 
            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
            """, [(i[0], document_id, i[1]) for i in counter.items()])

    def build_tf_idf_table(self):
        """Create the `vocab_count` and `tf_idf` views (idempotent).

        `vocab_count` gives each term's document frequency (df); `tf_idf`
        joins it to `term_frequency` and computes tf, idf = LOG(n/df) and
        tf*idf per (term, document) pair.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;")
            # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;")
            # i wish i could find a way to do this with a single view but alas i am
            # not good enough at SQL
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `vocab_count` AS 
            SELECT vocabulary_id, 
            COUNT(vocabulary_id) AS vocabulary_count
            FROM term_frequency 
            GROUP BY vocabulary_id;
            """)
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
            `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
            `document_id`,
            `term_frequency`.`frequency`,
            LOG_TF(`frequency`) AS tf,
            (SELECT COUNT(`document_id`) FROM `documents`) AS n,
            `vocab_count`.`vocabulary_count` AS df,
            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
            LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
            FROM `term_frequency`
            INNER JOIN `vocab_count`
            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
            ;""")

    def get_term_frequencies(self):
        """Return every (vocabulary_id, document_id, frequency) row."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `term_frequency`;")
            return cursor.fetchall()

    def append_tf_idf_table(self, tfs):
        """Bulk-insert precomputed tf rows, deriving idf in SQL.

        NOTE(review): `tf_idf` is created as a VIEW by build_tf_idf_table;
        inserting into it presumably predates the view — confirm this path
        is still used.
        """
        # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1])
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?, 
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])

    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
        """Single-row variant of append_tf_idf_table (same NOTE applies)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?, 
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))

    def test_log(self, to_log):
        """Sanity-check the registered LOG SQL function."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG(?);", (to_log, ))
            return cursor.fetchone()

    def test_tf_log(self, to_tf):
        """Sanity-check the registered LOG_TF SQL function."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG_TF(?);", (to_tf, ))
            return cursor.fetchone()

    def get_tf_idf_table(self):
        """Print up to 100 rows of the tf_idf view (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
            out = cursor.fetchall()
            print(len(out))
            print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf"))
            for l in out[:100]:
                print(l)

    def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
        """Return {document_id: tf_idf} for one term, best scores first.

        Only scores strictly above tf_idf_thresh are returned, capped at
        `limit` rows.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
                SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
            ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
            """, (term, tf_idf_thresh, limit))
            return {i[0]: i[1] for i in cursor.fetchall()}
 +
 +if __name__ == "__main__":
 +    with Database() as db:
 +        # print(db.get_num_documents())
 +        # print(db.get_document_name_by_id(69420))
 +        # print(db.get_document_id_by_name("../Datasets/Wikibooks/Russian Lesson 2.html"))
 +        # print(db.test_log(100))
 +        # print(db.test_log(21))
 +        # db.get_tf_idf_table()
 +        for i, v in db.get_tf_idf_score("enzyme", 1).items():
 +            print(i, v)
\ No newline at end of file @@ -1,18 +1,50 @@ -import terms -import sys -import re - -def main(search_words): -     -    txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words] -     -    search_words = [] -    for i in txt: -        search_words += re.split(r"\s+|\n", i) -     -    search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS] - -    print(search_words) - -if __name__ == "__main__": -    main(sys.argv[1:])
\ No newline at end of file +import database
 +import logging
 +import terms
 +import sys
 +import re
 +
# Mirror every search to a file (searches.log) and to the console.
logging.basicConfig(
    format = "[%(asctime)s]\t%(message)s",
    level = logging.INFO,
    handlers=[
        logging.FileHandler("searches.log"),
        logging.StreamHandler()
])
 +
 +def main(search_words):
 +    
 +    txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words]
 +    
 +    search_words = []
 +    for i in txt:
 +        search_words += re.split(r"\s+|\n", i)
 +    
 +    search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS]
 +    logging.info("Started searching. Using terms: %s" % " ".join(search_words))
 +
 +    with database.Database() as db:
 +        tf_idf_scores = []
 +        for term in search_words:
 +            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
 +            logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term))
 +
 +        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
 +        for scorelist in tf_idf_scores:
 +            for doc_id, score in scorelist.items():
 +                merged_scores[doc_id] += score
 +        logging.info("Merged scores...")
 +
 +        sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
 +        toshow = 20
 +        print("Sorted scores...")
 +
 +        for i, j in enumerate(sorted_scores, 0):
 +            if i >= toshow:
 +                break
 +
 +            docid, score = j
 +            logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
 +        
 +if __name__ == "__main__":
 +    main(sys.argv[1:])
 @@ -1,17 +1,17 @@ -import math as maths -import database - -def main(): -    with database.Database() as db: -        db.build_tf_idf_table() - -        db.get_tf_idf_table() - -def calc_log_tf(tf): -    if tf == 0: -        return 0 -    else: -        return maths.log10(1 + tf) - -if __name__ == "__main__": +import math as maths
 +import database
 +
def main():
    """(Re)build the tf_idf view, then print a sample of it for inspection."""
    with database.Database() as db:
        db.build_tf_idf_table()

        db.get_tf_idf_table()
 +
 +def calc_log_tf(tf):
 +    if tf == 0:
 +        return 0
 +    else:
 +        return maths.log10(1 + tf)
 +
# Script entry point: rebuild the tf_idf view and print a sample.
if __name__ == "__main__":
    main()
\ No newline at end of file  | 
