diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-24 16:31:46 +0000 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-24 16:31:46 +0000 |
commit | 5e4aee62e7a746491edd9077c0bdd959c444960b (patch) | |
tree | 09dd0d3bdbfdb1a5dc5c84b86c0088c9082a7842 | |
parent | cb4be28b07823639a64afaa34bf3919220a81e12 (diff) | |
download | searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.tar.gz searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.zip |
added a simple search system
-rw-r--r-- | database.py | 349 | ||||
-rw-r--r-- | search.py | 68 | ||||
-rw-r--r-- | tf_idf.py | 32 |
3 files changed, 246 insertions, 203 deletions
diff --git a/database.py b/database.py index 0a804de..875d22e 100644 --- a/database.py +++ b/database.py @@ -1,169 +1,180 @@ -from dataclasses import dataclass -import math as maths -import sqlite3 -import tf_idf -import os - -class DatabaseCursor(sqlite3.Cursor): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - -@dataclass -class Database: - db_path:str = os.path.join(".", "wikibooks.db") - - def __enter__(self): - if not os.path.exists(self.db_path): - self.__connection = sqlite3.connect(self.db_path) - self.__build_db() - else: - self.__connection = sqlite3.connect(self.db_path) - - self.__connection.create_function('log', 1, maths.log10) - self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf) - return self - - def __exit__(self, type, value, traceback): - self.__connection.commit() - self.__connection.close() - - def __build_db(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `documents` ( - `document_id` INTEGER PRIMARY KEY, - `document_name` TEXT NOT NULL - );""") - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `vocabulary` ( - `vocabulary_id` INTEGER PRIMARY KEY, - `term` TEXT NOT NULL - );""") - cursor.execute(""" - CREATE TABLE IF NOT EXISTS `term_frequency` ( - `vocabulary_id` INT UNSIGNED NOT NULL, - `document_id` INT UNSIGNED NOT NULL, - `frequency` INT UNSIGNED NOT NULL, - FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`), - FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`), - PRIMARY KEY(`vocabulary_id`, `document_id`) - );""") - cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);") - - def append_documents(self, documents): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) 
for doc in documents]) - - def get_document_name_by_id(self, id_): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, )) - return cursor.fetchone()[0] - - def get_document_id_by_name(self, document_name): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, )) - return cursor.fetchone()[0] - - def get_num_documents(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT COUNT(*) FROM documents;") - return cursor.fetchone()[0] - - def append_terms(self, terms): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms]) - - def append_terms_in_document(self, document_id, counter): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany(""" - INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`) - VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?) 
- """, [(i[0], document_id, i[1]) for i in counter.items()]) - - def build_tf_idf_table(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;") - # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;") - # i wish i could find a way to do this with a single view but alas i am - # not good enough at SQL - cursor.execute(""" - CREATE VIEW IF NOT EXISTS `vocab_count` AS - SELECT vocabulary_id, - COUNT(vocabulary_id) AS vocabulary_count - FROM term_frequency - GROUP BY vocabulary_id; - """) - cursor.execute(""" - CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT - `term_frequency`.`vocabulary_id` AS `vocabulary_id`, - `document_id`, - `term_frequency`.`frequency`, - LOG_TF(`frequency`) AS tf, - (SELECT COUNT(`document_id`) FROM `documents`) AS n, - `vocab_count`.`vocabulary_count` AS df, - (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf, - LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf - FROM `term_frequency` - INNER JOIN `vocab_count` - ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id` - ;""") - - def get_term_frequencies(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT * FROM `term_frequency`;") - return cursor.fetchall() - - def append_tf_idf_table(self, tfs): - # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1]) - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.executemany(""" - INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) - VALUES ( - ?, ?, ?, - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) 
- """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs]) - - def append_tf_idf_table_single(self, vocabulary_id, document_id, tf): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute(""" - INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`) - VALUES ( - ?, ?, ?, - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?), - (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?) - """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf)) - - def test_log(self, to_log): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT LOG(?);", (to_log, )) - return cursor.fetchone() - - def test_tf_log(self, to_tf): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT LOG_TF(?);", (to_tf, )) - return cursor.fetchone() - - def get_tf_idf_table(self): - with self.__connection.cursor(factory = DatabaseCursor) as cursor: - cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;") - out = cursor.fetchall() - print(len(out)) - print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf")) - for l in out[:100]: - print(l) - -if __name__ == "__main__": - with Database() as db: - # print(db.get_num_documents()) - # print(db.get_document_name_by_id(69420)) - # print(db.get_document_id_by_name("../Datasets/Wikibooks/Russian Lesson 2.html")) - # print(db.test_log(100)) - # print(db.test_log(21)) - db.get_tf_idf_table()
\ No newline at end of file +from dataclasses import dataclass
+import math as maths
+import sqlite3
+import tf_idf
+import os
+
class DatabaseCursor(sqlite3.Cursor):
    """sqlite3.Cursor usable as a context manager.

    sqlite3 cursors are not context managers by default; this subclass
    closes the cursor automatically when the `with` block exits.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Always release the cursor; any in-flight exception still propagates.
        self.close()

@dataclass
class Database:
    """Context-managed wrapper around the wikibooks SQLite index.

    On first use (database file absent) the schema is created.  Two custom
    SQL functions are registered on the connection:
      LOG(x)    -- base-10 logarithm (maths.log10)
      LOG_TF(f) -- log-scaled term frequency (tf_idf.calc_log_tf)
    Both are required to query the `tf_idf` view created by
    build_tf_idf_table().
    """
    db_path: str = os.path.join(".", "wikibooks.db")

    def __enter__(self):
        # Build the schema only when the database file does not exist yet.
        must_build = not os.path.exists(self.db_path)
        self.__connection = sqlite3.connect(self.db_path)
        if must_build:
            self.__build_db()

        self.__connection.create_function('log', 1, maths.log10)
        self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Commit outstanding writes, then close; exceptions still propagate.
        self.__connection.commit()
        self.__connection.close()

    def __build_db(self):
        """Create the documents / vocabulary / term_frequency schema."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `documents` (
                `document_id` INTEGER PRIMARY KEY,
                `document_name` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `vocabulary` (
                `vocabulary_id` INTEGER PRIMARY KEY,
                `term` TEXT NOT NULL
            );""")
            cursor.execute("""
            CREATE TABLE IF NOT EXISTS `term_frequency` (
                `vocabulary_id` INT UNSIGNED NOT NULL,
                `document_id` INT UNSIGNED NOT NULL,
                `frequency` INT UNSIGNED NOT NULL,
                FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
                FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`),
                PRIMARY KEY(`vocabulary_id`, `document_id`)
            );""")
            # IF NOT EXISTS so that re-running the build against an existing
            # database cannot raise "index unique_terms already exists".
            cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS unique_terms on vocabulary (term);")

    def append_documents(self, documents):
        """Insert one row per document name; ids are assigned by SQLite."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) for doc in documents])

    def get_document_name_by_id(self, id_):
        """Return the document_name stored for the given document_id."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, ))
            return cursor.fetchone()[0]

    def get_document_id_by_name(self, document_name):
        """Return the document_id stored for the given document_name."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, ))
            return cursor.fetchone()[0]

    def get_num_documents(self):
        """Return the total number of indexed documents."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT COUNT(*) FROM documents;")
            return cursor.fetchone()[0]

    def append_terms(self, terms):
        """Add terms to the vocabulary; duplicates are dropped by the
        unique_terms index via INSERT OR IGNORE."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])

    def append_terms_in_document(self, document_id, counter):
        """Store per-document term counts; `counter` maps term -> frequency."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)
            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
            """, [(i[0], document_id, i[1]) for i in counter.items()])

    def build_tf_idf_table(self):
        """Create the `vocab_count` and `tf_idf` views (idempotent).

        vocab_count gives each term's document frequency (df); tf_idf
        combines LOG_TF(frequency) with LOG(n / df) per (term, document).
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            # Two views because df must be aggregated per term before the
            # per-row tf-idf expression can reference it.
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `vocab_count` AS
                SELECT vocabulary_id,
                COUNT(vocabulary_id) AS vocabulary_count
                FROM term_frequency
                GROUP BY vocabulary_id;
            """)
            cursor.execute("""
            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
                `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
                `document_id`,
                `term_frequency`.`frequency`,
                LOG_TF(`frequency`) AS tf,
                (SELECT COUNT(`document_id`) FROM `documents`) AS n,
                `vocab_count`.`vocabulary_count` AS df,
                (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
                LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
            FROM `term_frequency`
            INNER JOIN `vocab_count`
            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
            ;""")

    def get_term_frequencies(self):
        """Return every (vocabulary_id, document_id, frequency) row."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `term_frequency`;")
            return cursor.fetchall()

    def append_tf_idf_table(self, tfs):
        """Bulk-insert precomputed tf rows, computing idf/tf_idf in SQL.

        NOTE(review): build_tf_idf_table() creates `tf_idf` as a VIEW;
        inserting into it will fail unless a real table of that name exists
        instead — this looks like legacy from an earlier schema. Confirm
        before calling.
        """
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.executemany("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?,
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])

    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
        """Single-row variant of append_tf_idf_table (same NOTE applies)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
            VALUES (
                ?, ?, ?,
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
            """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))

    def test_log(self, to_log):
        """Exercise the custom LOG SQL function (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG(?);", (to_log, ))
            return cursor.fetchone()

    def test_tf_log(self, to_tf):
        """Exercise the custom LOG_TF SQL function (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT LOG_TF(?);", (to_tf, ))
            return cursor.fetchone()

    def get_tf_idf_table(self):
        """Print the first 100 rows of the tf_idf view (debug helper)."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
            out = cursor.fetchall()
            print(len(out))
            print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf"))
            for l in out[:100]:
                print(l)

    def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
        """Return {document_id: tf_idf} for one term, best scores first,
        keeping only scores strictly above tf_idf_thresh."""
        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
            cursor.execute("""
            SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
                SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
            ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
            """, (term, tf_idf_thresh, limit))
            return {i[0]: i[1] for i in cursor.fetchall()}
+
if __name__ == "__main__":
    # Ad-hoc smoke test: print every document scoring above 1 for "enzyme".
    with Database() as db:
        for doc_id, score in db.get_tf_idf_score("enzyme", 1).items():
            print(doc_id, score)
\ No newline at end of file @@ -1,18 +1,50 @@ -import terms -import sys -import re - -def main(search_words): - - txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words] - - search_words = [] - for i in txt: - search_words += re.split(r"\s+|\n", i) - - search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS] - - print(search_words) - -if __name__ == "__main__": - main(sys.argv[1:])
\ No newline at end of file +import database
+import logging
+import terms
+import sys
+import re
+
# Record every search both in searches.log and on the console.
_log_handlers = [
    logging.FileHandler("searches.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format = "[%(asctime)s]\t%(message)s",
    level = logging.INFO,
    handlers = _log_handlers,
)
+
def main(search_words):
    """Search the index for the given words and log the 20 best documents.

    search_words -- raw command-line tokens; they are stripped of
    non-alphabetic characters, lower-cased, split on whitespace,
    lemmatised and filtered against the stop-word list before querying.
    """
    # Normalise the raw tokens: letters/whitespace only, lower-case.
    txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words]

    search_words = []
    for i in txt:
        search_words += re.split(r"\s+|\n", i)

    search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS]
    # Lazy %-args: the message is only formatted if the record is emitted.
    logging.info("Started searching. Using terms: %s", " ".join(search_words))

    with database.Database() as db:
        tf_idf_scores = []
        for term in search_words:
            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
            logging.info("Fetched %d scores for term '%s'...", len(tf_idf_scores[-1]), term)

        # Sum each document's score across all query terms; documents with
        # no match for any term keep a score of 0.
        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
        for scorelist in tf_idf_scores:
            for doc_id, score in scorelist.items():
                merged_scores[doc_id] += score
        logging.info("Merged scores...")

        sorted_scores = sorted(merged_scores.items(), key = lambda i: i[1], reverse = True)
        # Was a bare print(); use logging like every other progress message.
        logging.info("Sorted scores...")

        # Report only the top 20 results.
        for doc_id, score in sorted_scores[:20]:
            logging.info("%.2f - %d - %s", score, doc_id, db.get_document_name_by_id(doc_id))

if __name__ == "__main__":
    main(sys.argv[1:])
@@ -1,17 +1,17 @@ -import math as maths -import database - -def main(): - with database.Database() as db: - db.build_tf_idf_table() - - db.get_tf_idf_table() - -def calc_log_tf(tf): - if tf == 0: - return 0 - else: - return maths.log10(1 + tf) - -if __name__ == "__main__": +import math as maths
+import database
+
def main():
    """Build the tf-idf views, then dump the first rows for inspection."""
    with database.Database() as db:
        db.build_tf_idf_table()
        # Must run inside the `with`: the connection is committed and
        # closed as soon as the block exits.
        db.get_tf_idf_table()
+
def calc_log_tf(tf):
    """Log-scaled term frequency: 0 for an absent term, else log10(1 + tf)."""
    return maths.log10(1 + tf) if tf else 0
+
if __name__ == "__main__":
    # Script entry point: (re)build the tf-idf views.
    main()
\ No newline at end of file |