From cb4be28b07823639a64afaa34bf3919220a81e12 Mon Sep 17 00:00:00 2001
From: jwansek
Date: Tue, 23 Nov 2021 13:05:48 +0000
Subject: first commit, create parser and tf-idf table

---
 .gitignore   |   2 +
 database.py  | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 documents.py |  28 ++++++++++
 search.py    |  18 +++++++
 terms.py     |  47 +++++++++++++++++
 tf_idf.py    |  17 ++++++
 6 files changed, 281 insertions(+)
 create mode 100644 database.py
 create mode 100644 documents.py
 create mode 100644 search.py
 create mode 100644 terms.py
 create mode 100644 tf_idf.py

diff --git a/.gitignore b/.gitignore
index b6e4761..8275cc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+*.db
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/database.py b/database.py
new file mode 100644
index 0000000..0a804de
--- /dev/null
+++ b/database.py
@@ -0,0 +1,169 @@
+from dataclasses import dataclass
+import math as maths
+import sqlite3
+import tf_idf
+import os
+
+class DatabaseCursor(sqlite3.Cursor):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+
+@dataclass
+class Database:
+    db_path:str = os.path.join(".", "wikibooks.db")
+
+    def __enter__(self):
+        if not os.path.exists(self.db_path):
+            self.__connection = sqlite3.connect(self.db_path)
+            self.__build_db()
+        else:
+            self.__connection = sqlite3.connect(self.db_path)
+
+        self.__connection.create_function('log', 1, maths.log10)
+        self.__connection.create_function("log_tf", 1, tf_idf.calc_log_tf)
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.__connection.commit()
+        self.__connection.close()
+
+    def __build_db(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("""
+            CREATE TABLE IF NOT EXISTS `documents` (
+                `document_id` INTEGER PRIMARY KEY,
+                `document_name` TEXT NOT NULL
+            );""")
+            cursor.execute("""
+            CREATE TABLE IF NOT EXISTS `vocabulary` (
+                `vocabulary_id` INTEGER PRIMARY KEY,
+                `term` TEXT NOT NULL
+            );""")
+            cursor.execute("""
+            CREATE TABLE IF NOT EXISTS `term_frequency` (
+                `vocabulary_id` INT UNSIGNED NOT NULL,
+                `document_id` INT UNSIGNED NOT NULL,
+                `frequency` INT UNSIGNED NOT NULL,
+                FOREIGN KEY (`vocabulary_id`) REFERENCES `vocabulary`(`vocabulary_id`),
+                FOREIGN KEY (`document_id`) REFERENCES `documents`(`document_id`),
+                PRIMARY KEY(`vocabulary_id`, `document_id`)
+            );""")
+            cursor.execute("CREATE UNIQUE INDEX unique_terms on vocabulary (term);")
+
+    def append_documents(self, documents):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("INSERT INTO `documents` (document_name) VALUES (?);", [(doc, ) for doc in documents])
+
+    def get_document_name_by_id(self, id_):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT `document_name` FROM `documents` WHERE `document_id` = ?;", (id_, ))
+            return cursor.fetchone()[0]
+
+    def get_document_id_by_name(self, document_name):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT `document_id` FROM `documents` WHERE `document_name` = ?", (document_name, ))
+            return cursor.fetchone()[0]
+
+    def get_num_documents(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT COUNT(*) FROM documents;")
+            return cursor.fetchone()[0]
+
+    def append_terms(self, terms):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])
+
+    def append_terms_in_document(self, document_id, counter):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("""
+            INSERT INTO `term_frequency`(`vocabulary_id`, `document_id`, `frequency`)
+            VALUES ((SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?), ?, ?)
+            """, [(i[0], document_id, i[1]) for i in counter.items()])
+
+    def build_tf_idf_table(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;")
+            # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;")
+            # i wish i could find a way to do this with a single view but alas i am
+            # not good enough at SQL
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `vocab_count` AS
+            SELECT vocabulary_id,
+            COUNT(vocabulary_id) AS vocabulary_count
+            FROM term_frequency
+            GROUP BY vocabulary_id;
+            """)
+            cursor.execute("""
+            CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
+            `term_frequency`.`vocabulary_id` AS `vocabulary_id`,
+            `document_id`,
+            `term_frequency`.`frequency`,
+            LOG_TF(`frequency`) AS tf,
+            (SELECT COUNT(`document_id`) FROM `documents`) AS n,
+            `vocab_count`.`vocabulary_count` AS df,
+            (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
+            LOG_TF(`frequency`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+            FROM `term_frequency`
+            INNER JOIN `vocab_count`
+            ON `vocab_count`.`vocabulary_id` = `term_frequency`.`vocabulary_id`
+            ;""")
+
+    def get_term_frequencies(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT * FROM `term_frequency`;")
+            return cursor.fetchall()
+
+    def append_tf_idf_table(self, tfs):
+        # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1])
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.executemany("""
+            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
+            VALUES (
+                ?, ?, ?,
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+            """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])
+
+    def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("""
+            INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
+            VALUES (
+                ?, ?, ?,
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?),
+                (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_frequency WHERE vocabulary_id = ?) * ?)
+ """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf)) + + def test_log(self, to_log): + with self.__connection.cursor(factory = DatabaseCursor) as cursor: + cursor.execute("SELECT LOG(?);", (to_log, )) + return cursor.fetchone() + + def test_tf_log(self, to_tf): + with self.__connection.cursor(factory = DatabaseCursor) as cursor: + cursor.execute("SELECT LOG_TF(?);", (to_tf, )) + return cursor.fetchone() + + def get_tf_idf_table(self): + with self.__connection.cursor(factory = DatabaseCursor) as cursor: + cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;") + out = cursor.fetchall() + print(len(out)) + print(("vocabulary_id", "document_id", "frequency", "tf", "n", "df", "idf")) + for l in out[:100]: + print(l) + +if __name__ == "__main__": + with Database() as db: + # print(db.get_num_documents()) + # print(db.get_document_name_by_id(69420)) + # print(db.get_document_id_by_name("../Datasets/Wikibooks/Russian Lesson 2.html")) + # print(db.test_log(100)) + # print(db.test_log(21)) + db.get_tf_idf_table() \ No newline at end of file diff --git a/documents.py b/documents.py new file mode 100644 index 0000000..2053f0f --- /dev/null +++ b/documents.py @@ -0,0 +1,28 @@ +import database +import sys +import os + +def add_documents(documents_path): + docs = [os.path.join(documents_path, f) for f in os.listdir(documents_path)] + print(docs) + with database.Database() as db: + db.append_documents(docs) + +def get_document_name_by_id(id_): + with database.Database() as db: + return db.get_document_name_by_id(id_) + +def get_document_id_by_name(document_name): + with database.Database() as db: + return db.get_document_id_by_name(document_name) + +def get_num_documents(): + with database.Database() as db: + return db.get_num_documents() + +if __name__ == "__main__": + add_documents(sys.argv[1]) + + # print(get_document_name_by_id(1)) + # print(get_document_id_by_name("../Wikibooks/USMLE Step 1 Review Reproductive.html")) + # print(get_num_documents()) \ No newline at end of file diff --git a/search.py b/search.py new file mode 100644 index 0000000..d41d0a2 --- /dev/null +++ b/search.py @@ -0,0 +1,18 @@ +import terms +import sys +import re + +def main(search_words): + + txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words] + + search_words = [] + for i in txt: + search_words += re.split(r"\s+|\n", i) + + search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS] + + print(search_words) + +if __name__ == "__main__": + main(sys.argv[1:]) \ No newline at end of file diff --git a/terms.py b/terms.py new file mode 100644 index 0000000..ab3fcfc --- /dev/null +++ b/terms.py @@ -0,0 +1,47 @@ +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +from nltk.stem import PorterStemmer +from nltk.stem import WordNetLemmatizer +from nltk.util import ngrams +import collections +import documents +import database +import bs4 +import re + +STOPWORDS = set(stopwords.words('english')).difference({ + "how", "because", "through", "or", "as", "about", "not", + "no", "who", "of", "can", "over", "you" +}).union({chr(i) for i in range(97, 123)}.difference({"a", "i"})) +LEM = WordNetLemmatizer() + +def main(): + numdocs = documents.get_num_documents() + for document_id in range(1, numdocs): + parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs) + + # break + +def parse_document(document_id, document_path, numdocs): + with open(document_path, "r") as f: + soup = bs4.BeautifulSoup(f.read(), 
"lxml") + + text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)] + text = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in text] + + terms = [] + for i in text: + terms += re.split(r"\s+|\n", i) + + terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS] + terms_counted = collections.Counter(terms) + + with database.Database() as db: + db.append_terms(terms) + print("[%f%%] Added %d terms from docid: %d" % ((document_id/numdocs)*100, len(terms_counted), document_id)) + + db.append_terms_in_document(document_id, terms_counted) + print("Appended term frequency too") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tf_idf.py b/tf_idf.py new file mode 100644 index 0000000..615720d --- /dev/null +++ b/tf_idf.py @@ -0,0 +1,17 @@ +import math as maths +import database + +def main(): + with database.Database() as db: + db.build_tf_idf_table() + + db.get_tf_idf_table() + +def calc_log_tf(tf): + if tf == 0: + return 0 + else: + return maths.log10(1 + tf) + +if __name__ == "__main__": + main() \ No newline at end of file -- cgit v1.2.3