diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-24 16:31:46 +0000 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-24 16:31:46 +0000 |
commit | 5e4aee62e7a746491edd9077c0bdd959c444960b (patch) | |
tree | 09dd0d3bdbfdb1a5dc5c84b86c0088c9082a7842 | |
parent | cb4be28b07823639a64afaa34bf3919220a81e12 (diff) | |
download | searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.tar.gz searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.zip |
added a simple search system
-rw-r--r-- | database.py | 349 | ||||
-rw-r--r-- | search.py | 68 | ||||
-rw-r--r-- | tf_idf.py | 32 |
3 files changed, 246 insertions, 203 deletions
diff --git a/database.py b/database.py index 0a804de..875d22e 100644 --- a/database.py +++ b/database.py @@ -1,169 +1,180 @@ -from dataclasses import dataclass -import math as maths -import sqlite3 -import tf_idf -import os - -class DatabaseCursor(sqlite3.Cursor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - -@dataclass -class Database: - db_path:str = os.path.join(".", "wikibooks.db") - - def __enter__(self): - if not os.path.exists(self.db_path): - self.__connection = sqlite3.connect(self.db_path) - self.__build_db() - else: - self.__connection = sqlite3.connect(self.db_path) - - self.__connection.create_function('log', 1, maths.log10) - self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf) - return self - - def __exit__(self, type, value, traceback): - self.__connection.commit() - self.__connection.close() - - def __build_db(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `documents` ( - `document_id` INTEGER PRIMARY KEY, - `document_name` TEXT NOT NULL - );""") - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `vocabulary` ( - `vocabulary_id` INTEGER PRIMARY KEY, - `term` TEXT NOT NULL - );""") - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `term_frequency` ( - `vocabulary_id` INT UNSIGNED NOT NULL, - `document_id` INT UNSIGNED NOT NULL, - `frequency` INT UNSIGNED NOT NULL, - FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`), - FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`), - PRIMARY KEY(`vocabulary_id`, `document_id`) - );""") - cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);") - - def append_documents(self, documents): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) 
for doc in documents]) - - def get_document_name_by_id(self, id_): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, )) - return cursor.fetchone()[0] - - def get_document_id_by_name(self, document_name): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, )) - return cursor.fetchone()[0] - - def get_num_documents(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT COUNT(*) FROM documents;") - return cursor.fetchone()[0] - - def append_terms(self, terms): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms]) - - def append_terms_in_document(self, document_id, counter): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany(""" - INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`) - VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?) 
- """, [(i[0], document_id, i[1]) for i in counter.items()]) - - def build_tf_idf_table(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;") - # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;") - # i wish i could find a way to do this with a single view but alas i am - # not good enough at SQL - cursor.execute(""" - CREATE VIEW IF NOT EXISTS `vocab_count` AS - SELECT vocabulary_id, - COUNT(vocabulary_id) AS vocabulary_count - FROM term_frequency - GROUP BY vocabulary_id; - """) - cursor.execute(""" - CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT - `term_frequency`.`vocabulary_id` AS `vocabulary_id`, - `document_id`, - `term_frequency`.`frequency`, - LOG_TF(`frequency`) AS tf, - (SELECT COUNT(`document_id`) FROM `documents`) AS n, - `vocab_count`.`vocabulary_count` AS df, - (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf, - LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf - FROM `term_frequency` - INNER JOIN `vocab_count` - ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id` - ;""") - - def get_term_frequencies(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT * FROM `term_frequency`;") - return cursor.fetchall() - - def append_tf_idf_table(self, tfs): - # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1]) - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany(""" - INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) - VALUES ( - ?, ?, ?, - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) 
- """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs]) - - def append_tf_idf_table_single(self, vocabulary_id, document_id, tf): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute(""" - INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) - VALUES ( - ?, ?, ?, - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) - """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf)) - - def test_log(self, to_log): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT LOG(?);", (to_log, )) - return cursor.fetchone() - - def test_tf_log(self, to_tf): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT LOG_TF(?);", (to_tf, )) - return cursor.fetchone() - - def get_tf_idf_table(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;") - out = cursor.fetchall() - print(len(out)) - print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf")) - for l in out[:100]: - print(l) - -if __name__ == "__main__": - with Database() as db: - # print(db.get_num_documents()) - # print(db.get_document_name_by_id(69420)) - # print(db.get_document_id_by_name("../Datasets/Wikibooks/Russian Lesson 2.html")) - # print(db.test_log(100)) - # print(db.test_log(21)) - db.get_tf_idf_table()
\ No newline at end of file +from dataclasses import dataclass
+import math as maths
+import sqlite3
+import tf_idf
+import os
+
class DatabaseCursor(sqlite3.Cursor):
    """sqlite3.Cursor usable as a context manager.

    sqlite3 cursors are not context managers by default; this subclass
    closes the cursor automatically when the `with` block exits.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Always release the cursor; any in-flight exception still propagates.
        self.close()

@dataclass
class Database:
    """Context-managed wrapper around the wikibooks SQLite index.

    On first use (database file absent) the schema is created.  Two custom
    SQL functions are registered on the connection:
      LOG(x)    -- base-10 logarithm (maths.log10)
      LOG_TF(f) -- log-scaled term frequency (tf_idf.calc_log_tf)
    Both are required to query the `tf_idf` view created by
    build_tf_idf_table().
    """
    db_path: str = os.path.join(".", "wikibooks.db")

    def __enter__(self):
        # Build the schema only when the database file does not exist yet.
        must_build = not os.path.exists(self.db_path)
        self.__connection = sqlite3.connect(self.db_path)
        if must_build:
            self.__build_db()

        self.__connection.create_function('log', 1, maths.log10)
        self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Commit outstanding writes, then close; exceptions still propagate.
        self.__connection.commit()
        self.__connection.close()

    def __build_db(self):
        """Create the documents / vocabulary / term_frequency schema."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `documents` (
                `document_id` INTEGER PRIMARY KEY,
                `document_name` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `vocabulary` (
                `vocabulary_id` INTEGER PRIMARY KEY,
                `term` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `term_frequency` (
                `vocabulary_id` INT UNSIGNED NOT NULL,
                `document_id` INT UNSIGNED NOT NULL,
                `frequency` INT UNSIGNED NOT NULL,
                FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
                FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`),
                PRIMARY KEY(`vocabulary_id`, `document_id`)
            );""")
            # IF NOT EXISTS so that re-running the build against an existing
            # database cannot raise "index unique_terms already exists".
            cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS unique_terms on vocabulary (term);")

    def append_documents(self, documents):
        """Insert one row per document name; ids are assigned by SQLite."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) for doc in documents])

    def get_document_name_by_id(self, id_):
        """Return the document_name stored for the given document_id."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, ))
            return cursor.fetchone()[0]

    def get_document_id_by_name(self, document_name):
        """Return the document_id stored for the given document_name."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, ))
            return cursor.fetchone()[0]

    def get_num_documents(self):
        """Return the total number of indexed documents."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT COUNT(*) FROM documents;")
            return cursor.fetchone()[0]

    def append_terms(self, terms):
        """Add terms to the vocabulary; duplicates are dropped by the
        unique_terms index via INSERT OR IGNORE."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])

    def append_terms_in_document(self, document_id, counter):
        """Store per-document term counts; `counter` maps term -> frequency."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)
            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
            """, [(i[0], document_id, i[1]) for i in counter.items()])

    def build_tf_idf_table(self):
        """Create the `vocab_count` and `tf_idf` views (idempotent).

        vocab_count gives each term's document frequency (df); tf_idf
        combines LOG_TF(frequency) with LOG(n / df) per (term, document).
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            # Two views because df must be aggregated per term before the
            # per-row tf-idf expression can reference it.
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `vocab_count` AS
                SELECT vocabulary_id,
                COUNT(vocabulary_id) AS vocabulary_count
                FROM term_frequency
                GROUP BY vocabulary_id;
            """)
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
                `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
                `document_id`,
                `term_frequency`.`frequency`,
                LOG_TF(`frequency`) AS tf,
                (SELECT COUNT(`document_id`) FROM `documents`) AS n,
                `vocab_count`.`vocabulary_count` AS df,
                (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
                LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
            FROM `term_frequency`
            INNER JOIN `vocab_count`
            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
            ;""")

    def get_term_frequencies(self):
        """Return every (vocabulary_id, document_id, frequency) row."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `term_frequency`;")
            return cursor.fetchall()

    def append_tf_idf_table(self, tfs):
        """Bulk-insert precomputed tf rows, computing idf/tf_idf in SQL.

        NOTE(review): build_tf_idf_table() creates `tf_idf` as a VIEW;
        inserting into it will fail unless a real table of that name exists
        instead — this looks like legacy from an earlier schema. Confirm
        before calling.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?,
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])

    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
        """Single-row variant of append_tf_idf_table (same NOTE applies)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?,
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))

    def test_log(self, to_log):
        """Exercise the custom LOG SQL function (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG(?);", (to_log, ))
            return cursor.fetchone()

    def test_tf_log(self, to_tf):
        """Exercise the custom LOG_TF SQL function (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG_TF(?);", (to_tf, ))
            return cursor.fetchone()

    def get_tf_idf_table(self):
        """Print the first 100 rows of the tf_idf view (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
            out = cursor.fetchall()
            print(len(out))
            print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf"))
            for l in out[:100]:
                print(l)

    def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
        """Return {document_id: tf_idf} for one term, best scores first,
        keeping only scores strictly above tf_idf_thresh."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
                SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
            ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
            """, (term, tf_idf_thresh, limit))
            return {i[0]: i[1] for i in cursor.fetchall()}
+
if __name__ == "__main__":
    # Ad-hoc smoke test: print every document scoring above 1 for "enzyme".
    with Database() as db:
        for doc_id, score in db.get_tf_idf_score("enzyme", 1).items():
            print(doc_id, score)
\ No newline at end of file @@ -1,18 +1,50 @@ -import terms -import sys -import re - -def main(search_words): - - txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words] - - search_words = [] - for i in txt: - search_words += re.split(r"\s+|\n", i) - - search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS] - - print(search_words) - -if __name__ == "__main__": - main(sys.argv[1:])
\ No newline at end of file +import database
+import logging
+import terms
+import sys
+import re
+
# Record every search both in searches.log and on the console.
_log_handlers = [
    logging.FileHandler("searches.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format = "[%(asctime)s]\t%(message)s",
    level = logging.INFO,
    handlers = _log_handlers,
)
+
def main(search_words):
    """Search the index for the given words and log the 20 best documents.

    search_words -- raw command-line tokens; they are stripped of
    non-alphabetic characters, lower-cased, split on whitespace,
    lemmatised and filtered against the stop-word list before querying.
    """
    # Normalise the raw tokens: letters/whitespace only, lower-case.
    txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words]

    search_words = []
    for i in txt:
        search_words += re.split(r"\s+|\n", i)

    search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS]
    # Lazy %-args: the message is only formatted if the record is emitted.
    logging.info("Started searching. Using terms: %s", " ".join(search_words))

    with database.Database() as db:
        tf_idf_scores = []
        for term in search_words:
            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
            logging.info("Fetched %d scores for term '%s'...", len(tf_idf_scores[-1]), term)

        # Sum each document's score across all query terms; documents with
        # no match for any term keep a score of 0.
        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
        for scorelist in tf_idf_scores:
            for doc_id, score in scorelist.items():
                merged_scores[doc_id] += score
        logging.info("Merged scores...")

        sorted_scores = sorted(merged_scores.items(), key = lambda i: i[1], reverse = True)
        # Was a bare print(); use logging like every other progress message.
        logging.info("Sorted scores...")

        # Report only the top 20 results.
        for doc_id, score in sorted_scores[:20]:
            logging.info("%.2f - %d - %s", score, doc_id, db.get_document_name_by_id(doc_id))

if __name__ == "__main__":
    main(sys.argv[1:])
@@ -1,17 +1,17 @@ -import math as maths -import database - -def main(): - with database.Database() as db: - db.build_tf_idf_table() - - db.get_tf_idf_table() - -def calc_log_tf(tf): - if tf == 0: - return 0 - else: - return maths.log10(1 + tf) - -if __name__ == "__main__": +import math as maths
+import database
+
def main():
    """Build the tf-idf views, then dump the first rows for inspection."""
    with database.Database() as db:
        db.build_tf_idf_table()
        # Must run inside the `with`: the connection is committed and
        # closed as soon as the block exits.
        db.get_tf_idf_table()
+
def calc_log_tf(tf):
    """Log-scaled term frequency: 0 for an absent term, else log10(1 + tf)."""
    return maths.log10(1 + tf) if tf else 0
+
if __name__ == "__main__":
    # Script entry point: (re)build the tf-idf views.
    main()
\ No newline at end of file |