added a simple search system

author: jwansek <eddie.atten.ea29@gmail.com> 2021-11-24 16:31:46 +0000
committer: jwansek <eddie.atten.ea29@gmail.com> 2021-11-24 16:31:46 +0000
commit: 5e4aee62e7a746491edd9077c0bdd959c444960b (patch)
tree: 09dd0d3bdbfdb1a5dc5c84b86c0088c9082a7842 /search.py
parent: cb4be28b07823639a64afaa34bf3919220a81e12 (diff)
download: searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.tar.gz
searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.zip
1 file changed, 50 insertions, 18 deletions
diff --git a/search.py b/search.py
index d41d0a2..60876d9 100644
--- a/search.py
+++ b/search.py
@@ -1,18 +1,50 @@
-import terms
-import sys
-import re
-
-def main(search_words):
-    
-    txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words]
-    
-    search_words = []
-    for i in txt:
-        search_words += re.split(r"\s+|\n", i)
-    
-    search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS]
-
-    print(search_words)
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
-\ No newline at end of file
+import database
+import logging
+import terms
+import sys
+import re
+
+# Send every record at INFO and above to both searches.log and a stream
+# handler, each line prefixed with its timestamp.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s]\t%(message)s",
+    handlers=[logging.FileHandler("searches.log"), logging.StreamHandler()],
+)
+
+def main(search_words):
+    """Rank documents by summed tf-idf score for search_words and log the top 20.
+
+    search_words: list of raw query strings (sys.argv[1:] when run as a script).
+    """
+    # Keep letters only, lowercase, split on whitespace (split() drops empties).
+    words = []
+    for raw in search_words:
+        words += re.sub(r"[^a-zA-Z\s]", "", raw).lower().split()
+    search_words = [terms.LEM.lemmatize(w) for w in words if w not in terms.STOPWORDS]
+    logging.info("Started searching. Using terms: %s", " ".join(search_words))
+
+    with database.Database() as db:
+        tf_idf_scores = []
+        for term in search_words:
+            tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
+            logging.info("Fetched %d scores for term '%s'...", len(tf_idf_scores[-1]), term)
+
+        # Seed every id in 1..N with 0 so unmatched documents can still be listed.
+        merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
+        for scorelist in tf_idf_scores:
+            for doc_id, score in scorelist.items():
+                # .get(): an id outside 1..N must not raise KeyError.
+                merged_scores[doc_id] = merged_scores.get(doc_id, 0) + score
+        logging.info("Merged scores...")
+
+        # reversed(sorted()) rather than reverse=True: keeps the existing tie order.
+        sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
+        toshow = 20
+        logging.info("Sorted scores...")  # was print(): now reaches searches.log too
+
+        for docid, score in sorted_scores[:toshow]:
+            logging.info("%.2f - %d - %s", score, docid, db.get_document_name_by_id(docid))
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
author	jwansek <eddie.atten.ea29@gmail.com>	2021-11-24 16:31:46 +0000
committer	jwansek <eddie.atten.ea29@gmail.com>	2021-11-24 16:31:46 +0000
commit	5e4aee62e7a746491edd9077c0bdd959c444960b (patch)
tree	09dd0d3bdbfdb1a5dc5c84b86c0088c9082a7842 /search.py
parent	cb4be28b07823639a64afaa34bf3919220a81e12 (diff)
download	searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.tar.gz searchEngine-5e4aee62e7a746491edd9077c0bdd959c444960b.zip