diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-25 16:39:25 +0000 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-25 16:39:25 +0000 |
commit | fd2b9c85377df274514c6f0542cd6d1dbcbab183 (patch) | |
tree | eca0fe2058c9981386018a578c3368b238153e74 /terms.py | |
parent | 95563cbf5c0d66016892e8d580033088f865010b (diff) | |
download | searchEngine-fd2b9c85377df274514c6f0542cd6d1dbcbab183.tar.gz searchEngine-fd2b9c85377df274514c6f0542cd6d1dbcbab183.zip |
made some minor fixes
Diffstat (limited to 'terms.py')
-rw-r--r-- | terms.py | 16 |
1 files changed, 9 insertions, 7 deletions
```diff
@@ -19,22 +19,24 @@ import re
 import spacy
 from spacy import displacy
 nlp = spacy.load("en_core_web_sm")
+nlp.max_length = 100000000
 
 STOPWORDS = set(stopwords.words('english')).difference({
     "how", "because", "through", "or", "as", "about", "not",
     "no", "who", "of", "can", "over", "you"
-}).union({chr(i) for i in range(97, 123)}.difference({"a", "i"}))
+}).union({chr(i) for i in range(98, 123)})
+STOPWORDS.remove("a")
 LEM = WordNetLemmatizer()
 
 def main():
     numdocs = documents.get_num_documents()
-    docid = random.randint(1, numdocs)
-    parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
+    #docid = random.randint(1, numdocs)
+    #parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
 
-    # for document_id in range(1, numdocs):
-    #     parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
+    for document_id in range(1, numdocs):
+        parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
 
-    #     # break
+        #break
 
 def parse_region(raw_text, region_weight, document_id):
     terms = word_tokenize(raw_text)
@@ -126,4 +128,4 @@ def parse_document(document_id, document_path, numdocs):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
```