| author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
|---|---|---|
| committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
| commit | cb4be28b07823639a64afaa34bf3919220a81e12 (patch) | |
| tree | 21cbcc5597fcf3f86ddb019415c692b61683dcd6 /terms.py | |
| parent | 5583c58431d583753fc671b1ae5a93b380fd0e15 (diff) | |
first commit, create parser and tf-idf table
Diffstat (limited to 'terms.py')
| -rw-r--r-- | terms.py | 47 |
|---|---|---|

1 file changed, 47 insertions, 0 deletions
```diff
diff --git a/terms.py b/terms.py
new file mode 100644
index 0000000..ab3fcfc
--- /dev/null
+++ b/terms.py
@@ -0,0 +1,47 @@
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from nltk.stem import WordNetLemmatizer
+from nltk.util import ngrams
+import collections
+import documents
+import database
+import bs4
+import re
+
+STOPWORDS = set(stopwords.words('english')).difference({
+    "how", "because", "through", "or", "as", "about", "not",
+    "no", "who", "of", "can", "over", "you"
+}).union({chr(i) for i in range(97, 123)}.difference({"a", "i"}))
+LEM = WordNetLemmatizer()
+
+def main():
+    numdocs = documents.get_num_documents()
+    for document_id in range(1, numdocs):
+        parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+
+        # break
+
+def parse_document(document_id, document_path, numdocs):
+    with open(document_path, "r") as f:
+        soup = bs4.BeautifulSoup(f.read(), "lxml")
+
+    text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
+    text = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in text]
+
+    terms = []
+    for i in text:
+        terms += re.split(r"\s+|\n", i)
+
+    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
+    terms_counted = collections.Counter(terms)
+
+    with database.Database() as db:
+        db.append_terms(terms)
+        print("[%f%%] Added %d terms from docid: %d" % ((document_id/numdocs)*100, len(terms_counted), document_id))
+
+        db.append_terms_in_document(document_id, terms_counted)
+        print("Appended term frequency too")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
```
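The commit message mentions a tf-idf table, but this first version of `terms.py` only stores raw per-document term counts via `db.append_terms_in_document`. As a rough, hypothetical sketch (not part of this commit), the snippet below shows how tf-idf weights could be derived from counters shaped like `terms_counted`; the `tfidf` helper and the `term_counts_by_doc` input are illustrative names, not functions from this repository.

```python
import collections
import math

def tfidf(term_counts_by_doc):
    """Derive tf-idf weights from {doc_id: Counter} of term counts.

    tf  = term count / total terms in the document
    idf = log(total documents / documents containing the term)
    """
    num_docs = len(term_counts_by_doc)

    # Document frequency: in how many documents does each term appear?
    doc_freq = collections.Counter()
    for counts in term_counts_by_doc.values():
        doc_freq.update(counts.keys())

    weights = {}
    for doc_id, counts in term_counts_by_doc.items():
        total = sum(counts.values())
        weights[doc_id] = {
            term: (count / total) * math.log(num_docs / doc_freq[term])
            for term, count in counts.items()
        }
    return weights

# Toy example shaped like the terms_counted values built in parse_document():
docs = {
    1: collections.Counter({"search": 3, "engine": 2, "index": 1}),
    2: collections.Counter({"search": 1, "parser": 4}),
}
print(tfidf(docs)[2]["parser"])  # "parser" appears only in doc 2, so it scores highest there
```

In a schema like the one this commit implies, these weights would presumably be filled into the tf-idf table once term frequencies for the whole corpus have been collected.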