diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2021-11-23 13:05:48 +0000 |
commit | cb4be28b07823639a64afaa34bf3919220a81e12 (patch) | |
tree | 21cbcc5597fcf3f86ddb019415c692b61683dcd6 /search.py | |
parent | 5583c58431d583753fc671b1ae5a93b380fd0e15 (diff) | |
download | searchEngine-cb4be28b07823639a64afaa34bf3919220a81e12.tar.gz searchEngine-cb4be28b07823639a64afaa34bf3919220a81e12.zip |
first commit, create parser and tf-idf table
Diffstat (limited to 'search.py')
-rw-r--r-- | search.py | 18 |
1 files changed, 18 insertions, 0 deletions
# Reconstructed from the flattened diff hunk for search.py:
#   diff --git a/search.py b/search.py
#   new file mode 100644
#   index 0000000..d41d0a2
#   --- /dev/null
#   +++ b/search.py
#   @@ -0,0 +1,18 @@
import terms
import sys
import re


def main(search_words):
    """Normalise the query strings into search terms and print them.

    Each string in *search_words* has every character other than ASCII
    letters and whitespace removed, is lower-cased, and is split on
    whitespace. Tokens found in ``terms.STOPWORDS`` are dropped and the
    remaining ones are passed through ``terms.LEM.lemmatize``. The resulting
    list is printed; nothing is returned.
    """
    tokens = []
    for raw in search_words:
        letters_only = re.sub(r"[^a-zA-Z\s]", "", raw)
        # Argument-less split() breaks on whitespace runs and never yields
        # empty strings, so no separate empty-token filter is needed.
        tokens.extend(letters_only.lower().split())

    lemmas = [
        terms.LEM.lemmatize(token)
        for token in tokens
        if token not in terms.STOPWORDS
    ]

    print(lemmas)


if __name__ == "__main__":
    main(sys.argv[1:])
\ No newline at end of file |