aboutsummaryrefslogtreecommitdiffstats
path: root/terms.py
diff options
context:
space:
mode:
author: jwansek <eddie.atten.ea29@gmail.com> 2021-12-02 17:31:36 +0000
committer: jwansek <eddie.atten.ea29@gmail.com> 2021-12-02 17:31:36 +0000
commit: fae80d80bbfe7168535d45775f0e60abb897bf1b (patch)
tree: ee65573503d4343785f8cba7345d78a9426fe377 /terms.py
parent: 1f5dec8047af8c58ce3acb5014d82caf7e6766df (diff)
downloadsearchEngine-fae80d80bbfe7168535d45775f0e60abb897bf1b.tar.gz
searchEngine-fae80d80bbfe7168535d45775f0e60abb897bf1b.zip
added query expansion, linked term searches
Diffstat (limited to 'terms.py')
-rw-r--r-- terms.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/terms.py b/terms.py
index 07055dd..b9d77e2 100644
--- a/terms.py
+++ b/terms.py
@@ -41,7 +41,7 @@ def main():
#break
def parse_region(raw_text, region_weight, document_id):
- print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
+ print("d: %d; w: %d; len: %d" % (document_id, region_weight, len(raw_text)), end = "")
terms = word_tokenize(raw_text)
terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -71,6 +71,7 @@ def append_region(terms, linked_words, region_weight, document_id):
weighted_terms = {i[0]:i[1] * region_weight for i in collections.Counter(terms).items()}
weighted_linked_terms = {i[0]:i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+ print("; t: %d; lt: %d" % (len(weighted_terms), len(weighted_linked_terms)))
return weighted_terms, weighted_linked_terms
def parse_document(document_id, document_path, numdocs):