aboutsummaryrefslogtreecommitdiffstats
path: root/terms.py
diff options
context:
space:
mode:
author: jwansek <eddie.atten.ea29@gmail.com> 2021-11-25 16:39:25 +0000
committer: jwansek <eddie.atten.ea29@gmail.com> 2021-11-25 16:39:25 +0000
commit: fd2b9c85377df274514c6f0542cd6d1dbcbab183 (patch)
tree: eca0fe2058c9981386018a578c3368b238153e74 /terms.py
parent: 95563cbf5c0d66016892e8d580033088f865010b (diff)
download: searchEngine-fd2b9c85377df274514c6f0542cd6d1dbcbab183.tar.gz
searchEngine-fd2b9c85377df274514c6f0542cd6d1dbcbab183.zip
made some minor fixes
Diffstat (limited to 'terms.py')
-rw-r--r-- terms.py | 16
1 files changed, 9 insertions, 7 deletions
diff --git a/terms.py b/terms.py
index 5da0acc..74eafd9 100644
--- a/terms.py
+++ b/terms.py
@@ -19,22 +19,24 @@ import re
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
+nlp.max_length = 100000000
STOPWORDS = set(stopwords.words('english')).difference({
"how", "because", "through", "or", "as", "about", "not",
"no", "who", "of", "can", "over", "you"
-}).union({chr(i) for i in range(97, 123)}.difference({"a", "i"}))
+}).union({chr(i) for i in range(98, 123)})
+STOPWORDS.remove("a")
LEM = WordNetLemmatizer()
def main():
numdocs = documents.get_num_documents()
- docid = random.randint(1, numdocs)
- parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
+ #docid = random.randint(1, numdocs)
+ #parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
- # for document_id in range(1, numdocs):
- # parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+ for document_id in range(1, numdocs):
+ parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
- # # break
+ #break
def parse_region(raw_text, region_weight, document_id):
terms = word_tokenize(raw_text)
@@ -126,4 +128,4 @@ def parse_document(document_id, document_path, numdocs):
if __name__ == "__main__":
- main() \ No newline at end of file
+ main()