| author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
|---|---|---|
| committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
| commit | cb4be28b07823639a64afaa34bf3919220a81e12 (patch) | |
| tree | 21cbcc5597fcf3f86ddb019415c692b61683dcd6 /terms.py | |
| parent | 5583c58431d583753fc671b1ae5a93b380fd0e15 (diff) | |
first commit, create parser and tf-idf table
Diffstat (limited to 'terms.py')
| -rw-r--r-- | terms.py | 47 |
|---|---|---|

1 file changed, 47 insertions, 0 deletions
```diff
diff --git a/terms.py b/terms.py
new file mode 100644
index 0000000..ab3fcfc
--- /dev/null
+++ b/terms.py
@@ -0,0 +1,47 @@
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from nltk.stem import WordNetLemmatizer
+from nltk.util import ngrams
+import collections
+import documents
+import database
+import bs4
+import re
+
+STOPWORDS = set(stopwords.words('english')).difference({
+    "how", "because", "through", "or", "as", "about", "not",
+    "no", "who", "of", "can", "over", "you"
+}).union({chr(i) for i in range(97, 123)}.difference({"a", "i"}))
+LEM = WordNetLemmatizer()
+
+def main():
+    numdocs = documents.get_num_documents()
+    for document_id in range(1, numdocs):
+        parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+
+        # break
+
+def parse_document(document_id, document_path, numdocs):
+    with open(document_path, "r") as f:
+        soup = bs4.BeautifulSoup(f.read(), "lxml")
+
+    text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
+    text = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in text]
+
+    terms = []
+    for i in text:
+        terms += re.split(r"\s+|\n", i)
+
+    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
+    terms_counted = collections.Counter(terms)
+
+    with database.Database() as db:
+        db.append_terms(terms)
+        print("[%f%%] Added %d terms from docid: %d" % ((document_id/numdocs)*100, len(terms_counted), document_id))
+
+        db.append_terms_in_document(document_id, terms_counted)
+        print("Appended term frequency too")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
```
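The commit message mentions a tf-idf table, but this first version of `terms.py` only stores raw per-document term counts via `db.append_terms_in_document`. As a rough, hypothetical sketch (not part of this commit), the snippet below shows how tf-idf weights could be derived from counters shaped like `terms_counted`; the `tfidf` helper and the `term_counts_by_doc` input are illustrative names, not functions from this repository.

```python
import collections
import math

def tfidf(term_counts_by_doc):
    """Derive tf-idf weights from {doc_id: Counter} of term counts.

    tf  = term count / total terms in the document
    idf = log(total documents / documents containing the term)
    """
    num_docs = len(term_counts_by_doc)

    # Document frequency: in how many documents does each term appear?
    doc_freq = collections.Counter()
    for counts in term_counts_by_doc.values():
        doc_freq.update(counts.keys())

    weights = {}
    for doc_id, counts in term_counts_by_doc.items():
        total = sum(counts.values())
        weights[doc_id] = {
            term: (count / total) * math.log(num_docs / doc_freq[term])
            for term, count in counts.items()
        }
    return weights

# Toy example shaped like the terms_counted values built in parse_document():
docs = {
    1: collections.Counter({"search": 3, "engine": 2, "index": 1}),
    2: collections.Counter({"search": 1, "parser": 4}),
}
print(tfidf(docs)[2]["parser"])  # "parser" appears only in doc 2, so it scores highest there
```

In a schema like the one this commit implies, these weights would presumably be filled into the tf-idf table once term frequencies for the whole corpus have been collected.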