"""Command-line TF-IDF search.

Takes the search terms from the command-line arguments, looks up their
TF-IDF scores in the document database, and logs the best-scoring documents
to the console and to ``searches.log``.
"""
import database
import logging
import terms
import sys
import re
# Configure the root logger once at import time: INFO and above is emitted,
# every record is prefixed with a timestamp and a tab, and each record goes
# both to the "searches.log" file and to a default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s]\t%(message)s",
    handlers=[
        logging.FileHandler("searches.log"),
        logging.StreamHandler(),
    ],
)
def _tokenize(raw_words):
    """Return lowercase alphabetic tokens extracted from the raw search words.

    Every character that is neither an ASCII letter nor whitespace is removed
    from each input string; the remainder is lowercased and split on
    whitespace, so a single argument containing spaces yields several tokens.
    Empty tokens are never produced.
    """
    tokens = []
    for raw in raw_words:
        letters_only = re.sub(r"[^a-zA-Z\s]", "", raw)
        # str.split() without a separator splits on runs of whitespace and
        # drops empty strings, so no separate "skip empty token" filter is
        # needed afterwards.
        tokens.extend(letters_only.lower().split())
    return tokens


def _merge_scores(score_maps, num_documents):
    """Sum several ``{doc_id: score}`` mappings into one ``{doc_id: total}``."""
    # Seed ids 1..num_documents with zero so that documents matching no term
    # still take part in the ranking.
    totals = dict.fromkeys(range(1, num_documents + 1), 0)
    for score_map in score_maps:
        for doc_id, score in score_map.items():
            # .get() tolerates ids outside 1..num_documents (e.g. gaps in the
            # id sequence) instead of raising KeyError.
            totals[doc_id] = totals.get(doc_id, 0) + score
    return totals


def main(search_words, toshow=20):
    """Search the document database for the given words and log the top hits.

    The words are reduced to lowercase alphabetic tokens, stop words
    (``terms.STOPWORDS``) are dropped and the rest are lemmatized with
    ``terms.LEM``. For every remaining term the TF-IDF scores are fetched from
    the database, summed per document, and the highest-scoring documents are
    logged as ``score - document id - document name``.

    Args:
        search_words: Iterable of raw search strings (for example the
            command-line arguments). A string may contain several words.
        toshow: Maximum number of ranked documents to log. Defaults to 20;
            values below zero are treated as zero.

    Returns:
        None. Results are reported through the ``logging`` module only.
    """
    search_words = [
        terms.LEM.lemmatize(token)
        for token in _tokenize(search_words)
        if token not in terms.STOPWORDS
    ]
    if not search_words:
        # Without any term every document would score 0 and the "ranking"
        # would be an arbitrary list, so stop before touching the database.
        logging.warning("No usable search terms were given; nothing to search for.")
        return

    logging.info("Started searching. Using terms: %s", " ".join(search_words))
    with database.Database() as db:
        tf_idf_scores = []
        for term in search_words:
            scores = db.get_tf_idf_score(term, tf_idf_thresh=1, limit=1000)
            tf_idf_scores.append(scores)
            logging.info("Fetched %d scores for term '%s'...", len(scores), term)

        merged_scores = _merge_scores(tf_idf_scores, db.get_num_documents())
        logging.info("Merged scores...")

        # Stable ascending sort followed by a reversal: highest score first,
        # and documents with equal scores keep the same relative order the
        # ranking has always produced.
        ranked = sorted(merged_scores.items(), key=lambda item: item[1])
        ranked.reverse()
        logging.info("Sorted scores...")

        for doc_id, score in ranked[:max(toshow, 0)]:
            logging.info(
                "%.2f - %d - %s",
                score,
                doc_id,
                db.get_document_name_by_id(doc_id),
            )
# Script entry point: everything after the script name on the command line
# is passed to main() as the raw search words.
if __name__ == "__main__":
    cli_terms = sys.argv[1:]
    main(cli_terms)
|