author    jwansek <eddie.atten.ea29@gmail.com>  2021-12-05 16:02:30 +0000
committer jwansek <eddie.atten.ea29@gmail.com>  2021-12-05 16:02:30 +0000
commit    df2a6e1a882b9b85f42f391f2bd129c9663c9e42 (patch)
tree      f9ed452500cb6199ac3771bd1dfd1e5b50dc725f
parent    fae80d80bbfe7168535d45775f0e60abb897bf1b (diff)
download  searchEngine-df2a6e1a882b9b85f42f391f2bd129c9663c9e42.tar.gz
          searchEngine-df2a6e1a882b9b85f42f391f2bd129c9663c9e42.zip
added rendering to html
-rw-r--r--  .gitignore             2
-rw-r--r--  markdown_renderer.py  26
-rw-r--r--  reportWriter.py       69
-rw-r--r--  requirements.txt       7
-rw-r--r--  search.py             25
5 files changed, 118 insertions, 11 deletions
diff --git a/.gitignore b/.gitignore
index 8275cc8..0f28e9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.db
+searches/*.md
+searches/*.html
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/markdown_renderer.py b/markdown_renderer.py
new file mode 100644
index 0000000..2f78a0d
--- /dev/null
+++ b/markdown_renderer.py
@@ -0,0 +1,26 @@
+import tempfile
+import webbrowser
+import misaka
+import sys
+import os
+
+class SearchReportRenderer(misaka.HtmlRenderer):
+    # override the default renderer to add line breaks after every paragraph
+    def paragraph(self, text):
+        return "<p>%s</p><br><br>" % text
+
+def render(md_path):
+    renderer = SearchReportRenderer()
+    md = misaka.Markdown(renderer)
+    with open(md_path, "r", encoding='utf-8') as f:
+        return md(f.read())
+
+def render_and_view(md_path):
+    html_path = os.path.join(*os.path.split(md_path)[:-1], os.path.splitext(os.path.split(md_path)[-1])[0] + ".html")
+    with open(html_path, "w", encoding='utf-8') as f:
+        f.writelines(render(md_path))
+
+    webbrowser.open(html_path)
+
+if __name__ == "__main__":
+    render_and_view(sys.argv[1])
\ No newline at end of file
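
Note: the new module hooks misaka's paragraph callback to force blank space between report entries. A minimal standalone sketch of the override in use (not part of the commit; the sample markdown string is invented for illustration, assuming misaka 2.x):

import misaka

class SearchReportRenderer(misaka.HtmlRenderer):
    # append two explicit line breaks after every rendered paragraph
    def paragraph(self, text):
        return "<p>%s</p><br><br>" % text

md = misaka.Markdown(SearchReportRenderer())
print(md("First paragraph.\n\nSecond paragraph."))
# roughly: <p>First paragraph.</p><br><br><p>Second paragraph.</p><br><br>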
diff --git a/reportWriter.py b/reportWriter.py
new file mode 100644
index 0000000..fb68f42
--- /dev/null
+++ b/reportWriter.py
@@ -0,0 +1,69 @@
+from nltk.tokenize import sent_tokenize, word_tokenize
+import documents
+import datetime
+import time
+import bs4
+import os
+import re
+
+
+def write(search_terms, results, timetaken, expanded_terms):
+
+    out_path = os.path.join("searches", "search_%s-%f.md" % ("+".join(expanded_terms)[:20], time.time()))
+
+    with open(out_path, "w", encoding='utf-8') as f:
+        f.write("# Search %s\n" % str(datetime.datetime.now()))
+        f.write("**Using search terms: %s**\n" % " ".join(search_terms))
+        f.write("*Found %d results in %.2f minutes*\n\n" % (len(results), timetaken / 60))
+
+        for docid, score in results[:30]:
+            fullpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), documents.get_document_name_by_id(docid))
+            title, ext = os.path.splitext(os.path.split(fullpath)[-1])
+            f.write("### %.2f - [%s](file://%s) [%s]\n" % (score, title, fullpath, ext[1:].upper()))
+            f.write("###### *%s*\n" % fullpath)
+            f.write("%s\n" % get_snippet(fullpath, expanded_terms))
+
+            f.write("\n")
+
+    return out_path
+
+def get_snippet(docpath, expanded_terms, num_to_get = 2):
+    with open(docpath, "r", encoding='utf-8') as f:
+        soup = bs4.BeautifulSoup(f.read(), "lxml")
+
+    text = " ".join([e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)])
+    found_sentences = []
+    for sentence in sent_tokenize(text):
+        if len(found_sentences) == num_to_get:
+            break
+
+        found_terms = re.search("|".join(expanded_terms), sentence, re.IGNORECASE)
+        if found_terms is not None:
+            sentence = shorten_sentence(sentence, expanded_terms)
+            new_sentence = sentence
+            for match in re.finditer("|".join(expanded_terms), sentence, re.IGNORECASE):
+                if not new_sentence[match.start() - 4:match.start()].startswith("*"):
+                    new_sentence = new_sentence.replace(match.group(0), "**" + match.group(0) + "**").replace("\n", " ")
+
+            found_sentences.append(new_sentence)
+
+    return "*[...]* " + "\t*[...]*\t".join(found_sentences) + " *[...]*"
+
+def shorten_sentence(sentence, expanded_terms):
+    if len(sentence) > 200:
+        match = re.search("|".join(expanded_terms), sentence, re.IGNORECASE)
+        add_to_end = 0
+        startindex = match.start() - 106
+        if startindex < 0:
+            add_to_end += abs(startindex)
+            startindex = 0
+
+        endindex = match.end() + 106 + add_to_end
+        return " ".join(word_tokenize(sentence[startindex:endindex])[1:-1])
+    else:
+        return sentence
+
+if __name__ == "__main__":
+    # print(get_snippet("../Wikibooks/Wikibooks/History of video games Print version Timeline.html", ['cosmonaut', 'astronaut', 'spaceman']))
+
+    print(shorten_sentence('star cloud") maryan....constellation ("collection of stars") marmeg....comet ("star rock") mommeg....meteor ("space rock") mammeg....meteorite ("sky rock") The following are vehicles and derivatives that are specific to one of the above physical spheres: Vehicles Specific to Various Spheres mompur....spaceship momper....travel through space momput....cosmonaut, astronaut mampur....airplane mamper....fly mamput....flyer, pilot mempur....automobile memper....ride, drive memput....rider, driver mimpur....shipobmimpar....submarine mimper....sail, navigate mimput....sailor, navigatorobmimput....submariner mumpur....subway mumper....tunnel, go by metro mumput....metro rider Note: marpur = starship and muarpur = lunar module Names of the Planets[edit | edit source] Here are the names of the planets in our solar system.', ["cosmonaut", 'astronaut', 'spaceman']))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..64fdf1d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.10.0
+misaka==2.1.1
+nltk==3.6.5
+spacy==3.2.0
+lxml
+
+# python -m spacy download en_core_web_sm
\ No newline at end of file
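
Note: one setup detail the requirements file does not cover (an assumption worth stating, as it is not in this commit): nltk's sent_tokenize and word_tokenize, used by reportWriter.py, rely on the punkt tokenizer data, which pip does not install. A minimal sketch:

import nltk

# punkt is the pretrained sentence/word tokenizer model behind
# sent_tokenize() and word_tokenize(); it is fetched separately from pip
nltk.download("punkt")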
diff --git a/search.py b/search.py
index 60cbd2a..331e113 100644
--- a/search.py
+++ b/search.py
@@ -1,5 +1,7 @@
 from nltk.corpus import wordnet
 from nltk import pos_tag
+import markdown_renderer
+import reportWriter
 import collections
 import itertools
 import database
@@ -70,9 +72,9 @@ def main(search_words):
logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight))
tf_idf_scores += scores
- for linked_terms, search_weight in linked_terms.items():
- scores = db.get_tf_idf_score_linked(linked_terms.split(","), tf_idf_thresh=0, multiplier=search_weight)
- logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), str(linked_terms), search_weight))
+ for linked_term, search_weight in linked_terms.items():
+ scores = db.get_tf_idf_score_linked(linked_term.split(","), tf_idf_thresh=0, multiplier=search_weight)
+ logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), str(linked_term), search_weight))
tf_idf_scores += scores
sorted_scores = list(reversed(sorted(tf_idf_scores.items(), key = lambda i: i[1])))
@@ -80,15 +82,16 @@ def main(search_words):
logging.info("Sorted scores...")
logging.info("Results:\n\n")
- for i, j in enumerate(sorted_scores, 0):
- if i >= toshow:
- break
-
- docid, score = j
+ for docid, score in sorted_scores[:30]:
logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
- logging.info("Got %d results in total. Took %.2f minutes (%.2fs per term)" % (len(tf_idf_scores), (time.time() - starttime) / 60, (time.time() - starttime) / (len(single_terms) + len(linked_terms))))
-
+ timetaken = time.time() - starttime
+ logging.info("Got %d results in total. Took %.2f minutes (%.2fs per term)" % (len(tf_idf_scores), timetaken / 60, timetaken / (len(single_terms) + len(linked_terms))))
+ md_path = reportWriter.write(sys.argv[1:], sorted_scores, timetaken, list(single_terms.keys()) + [i.replace(",", " ") for i in linked_terms.keys()])
+ logging.info("Report written to %s..." % md_path)
+ markdown_renderer.render_and_view(md_path)
+ logging.info("Report rendered as HTML and showing..")
+
if __name__ == "__main__":
- main(sys.argv[1:])
+ main(sys.argv[1:]) \ No newline at end of file
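
Note: for reference, a minimal sketch (not part of the commit; the docid-to-score mapping is invented) of the descending sort-and-slice that main() now uses to pick the top 30 results, written with reverse=True instead of list(reversed(sorted(...))):

scores = {3: 0.91, 7: 2.48, 1: 1.10}  # hypothetical docid -> summed tf-idf scores
sorted_scores = sorted(scores.items(), key=lambda i: i[1], reverse=True)
for docid, score in sorted_scores[:30]:
    print("%.2f - %d" % (score, docid))
# 2.48 - 7
# 1.10 - 1
# 0.91 - 3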