"""Command-line search that ranks documents by summed per-term scores.

Reconstructed from commit 5e4aee62e7a746491edd9077c0bdd959c444960b
(jwansek, Wed, 24 Nov 2021 16:31:46 +0000, "added a simple search system").

Usage: python search.py <search words...>
"""
import database
import logging
import terms
import sys
import re

# Every message is written both to ``searches.log`` and to the default
# logging stream handler.
logging.basicConfig(
    format="[%(asctime)s]\t%(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler("searches.log"),
        logging.StreamHandler(),
    ],
)


def _tokenise(search_words):
    """Turn raw query strings into a list of lemmatized search terms.

    Each input string has every character other than ASCII letters and
    whitespace removed, is lower-cased, and is then split on whitespace.
    Empty tokens and tokens present in ``terms.STOPWORDS`` are dropped
    *before* lemmatization; the remaining tokens are passed through
    ``terms.LEM.lemmatize``.

    Args:
        search_words: iterable of raw query strings (e.g. ``sys.argv[1:]``).

    Returns:
        List of lemmatized terms, in the order they appeared in the input.
    """
    cleaned = [
        re.sub(r"[^a-zA-Z\s]", "", word).rstrip().lower()
        for word in search_words
    ]

    tokens = []
    for text in cleaned:
        tokens.extend(re.split(r"\s+|\n", text))

    return [
        terms.LEM.lemmatize(token)
        for token in tokens
        if token != "" and token not in terms.STOPWORDS
    ]


def main(search_words, toshow=20):
    """Search the database for ``search_words`` and log the best matches.

    For every normalised term the per-document scores are fetched with
    ``db.get_tf_idf_score``; scores belonging to the same document are summed
    and the ``toshow`` highest-scoring documents are logged as
    ``score - document id - document name``.

    Args:
        search_words: iterable of raw query strings (e.g. ``sys.argv[1:]``).
        toshow: maximum number of results to log. Defaults to 20.
    """
    search_terms = _tokenise(search_words)
    logging.info("Started searching. Using terms: %s", " ".join(search_terms))

    with database.Database() as db:
        tf_idf_scores = []
        for term in search_terms:
            scores = db.get_tf_idf_score(term, tf_idf_thresh=1, limit=1000)
            tf_idf_scores.append(scores)
            logging.info("Fetched %d scores for term '%s'...", len(scores), term)

        # Seed every id in 1..N with zero so that documents matching no term
        # still take part in the ranking.
        # NOTE(review): this presumes document ids are the contiguous range
        # 1..get_num_documents() - confirm against the database schema.
        merged_scores = {
            doc_id: 0 for doc_id in range(1, db.get_num_documents() + 1)
        }
        for scorelist in tf_idf_scores:
            for doc_id, score in scorelist.items():
                # .get() rather than += so that an id outside the seeded
                # range is accumulated instead of raising KeyError.
                merged_scores[doc_id] = merged_scores.get(doc_id, 0) + score
        logging.info("Merged scores...")

        # Sort ascending and then reverse in place (rather than passing
        # reverse=True) so that documents with equal scores come out in
        # reverse insertion order instead of insertion order.
        sorted_scores = sorted(merged_scores.items(), key=lambda item: item[1])
        sorted_scores.reverse()
        logging.info("Sorted scores...")

        for doc_id, score in sorted_scores[:toshow]:
            logging.info(
                "%.2f - %d - %s",
                score, doc_id, db.get_document_name_by_id(doc_id),
            )


if __name__ == "__main__":
    main(sys.argv[1:])