From 5e528a1484533e64815db07608dc7fb1613fa36f Mon Sep 17 00:00:00 2001
From: jwansek <eddie.atten.ea29@gmail.com>
Date: Mon, 15 Mar 2021 16:14:47 +0000
Subject: switched markdown parser, added greentexts

---
 parser.py | 159 +++++++++++---------------------------------------------------
 1 file changed, 28 insertions(+), 131 deletions(-)

(limited to 'parser.py')
diff --git a/parser.py b/parser.py
index 8fb7408..1ba94cd 100755
--- a/parser.py
+++ b/parser.py
@@ -1,26 +1,40 @@
 #!/usr/bin/env python3
 
 from urllib.parse import urlparse
+from pygments import highlight
+from pygments.formatters import HtmlFormatter, ClassNotFound
+from pygments.lexers import get_lexer_by_name
 import webbrowser
 import database
 import argparse
 import getpass
+import houdini
+import misaka
 import app
 import sys
 import re
 import os
 
-# DISCLAIMER
-# There is almost certainly a python package to
-# do this better. I wanted to do it myself as a challenge.
-
-# TODO:
-#   - Add table formatting
-#   - Fix <br>s with newlines
-#   - Fix nested markdown elements
-
-HEADER_INCREMENTER = 1
-IMAGE_TYPES = [".png", ".jpg"]
+class HighlighterRenderer(misaka.SaferHtmlRenderer):
+    """Misaka renderer with Pygments-highlighted code blocks and greentext quotes."""
+
+    def blockcode(self, text, lang):
+        """Return highlighted HTML for `text`, or an escaped <pre> if `lang` has no lexer."""
+        try:
+            lexer = get_lexer_by_name(lang, stripall=True)
+        except ClassNotFound:
+            # Unknown or missing language: emit the code verbatim, HTML-escaped.
+            return '\n<pre><code>{}</code></pre>\n'.format(houdini.escape_html(text.strip()))
+        return highlight(text, lexer, HtmlFormatter())
+
+    def blockquote(self, content):
+        """Return a <blockquote> whose lines are each wrapped in a `quote` span."""
+        # The quote body normally arrives wrapped as "<p>...</p>\n"; strip it only when present.
+        if content.startswith('<p>') and content.endswith('</p>\n'):
+            content = content[3:-5]
+        lines = houdini.escape_html(content.strip()).split("\n")
+        spans = ''.join('\n<span class="quote">{}</span><br>'.format(line) for line in lines)
+        return '\n<blockquote>' + spans + '\n</blockquote>'
 
 def get_thought_from_id(db, id_):
     category_name, title, dt, markdown = db.get_thought(id_)
@@ -33,127 +47,10 @@ def parse_file(path):
     return parse_text(unformatted)    
 
 def parse_text(unformatted):
-    formatted = parse_headers(unformatted)
-    formatted = parse_asteriscs(formatted)
-    formatted = parse_links(formatted)
-    formatted = parse_code(formatted)
-    formatted = parse_lists(formatted)
-    formatted = add_linebreaks(formatted)
-
-    return formatted
-
-def parse_headers(test_str):
-    regex = r"^#{1,5}\s\w.*$"
-    matches = re.finditer(regex, test_str, re.MULTILINE)
-    offset = 0
-
-    for match in matches:
-        # work out if its h2, h3 etc. from the number of #s
-        headerNo = len(match.group().split(" ")[0]) + HEADER_INCREMENTER
-        
-        replacement = "<h%i>%s</h%i>" % (headerNo, " ".join(match.group().split(" ")[1:]), headerNo)
-
-        #don't use .replace() in the unlikely case the the regex hit appears in a block
-        test_str = test_str[:match.start()+offset] + replacement + test_str[match.end()+offset:]
-        #replacing the hits fucks up the indexes, accommodate for this
-        offset += (len(replacement) - (match.end() - match.start()))
-
-    return test_str
-
-def parse_asteriscs(test_str):
-    regex = r"(?<!\\)\*{1,3}.*?\*{1,3}"
-    matches = re.finditer(regex, test_str, re.MULTILINE)
-    offset = 0
-
-    for match in matches:
-        if len(re.findall(r"\*{1,3}.*?\\\*{1,3}", match.group())) == 0:     #need to find a way of doing this with regexes
-            if match.group().startswith(re.findall(r"\w\*{1,3}", match.group())[0][1:]):    #this too
-                if match.group().startswith("***"):
-                    replacement = "<b><i>%s</i></b>" % (match.group()[3:-3])
-                elif match.group().startswith("**"):
-                    replacement = "<b>%s</b>" % (match.group()[2:-2])
-                else:
-                    replacement = "<i>%s</i>" % (match.group()[1:-1])
-        
-                test_str = test_str[:match.start()+offset] + replacement + test_str[match.end()+offset:]
-                offset += (len(replacement) - (match.end() - match.start()))
-
-    return test_str
-
-def parse_links(test_str):
-    regex = r"(?<!\\)\[.*?\]\(.*?\)"
-    matches = re.finditer(regex, test_str, re.MULTILINE)
-    offset = 0
-
-    for match in matches:
-        s = match.group().split("(")
-        label = s[0][1:-1]
-        url = s[1][:-1]
-
-        if os.path.splitext(urlparse(url).path)[1] in IMAGE_TYPES:
-            replacement = "<img alt='%s' src=%s>" % (label, url)
-        else:
-            replacement = "<a href=%s>%s</a>" % (url, label)
-
-        test_str = test_str[:match.start()+offset] + replacement + test_str[match.end()+offset:]
-        offset += (len(replacement) - (match.end() - match.start()))
-
-    return test_str
-
-def parse_code(test_str):
-    regex = r"(?<!\\)`\w{1,}?`"
-    # this only matches single words, but escaping is less complicated
-    matches = re.finditer(regex, test_str, re.MULTILINE)
-    offset = 0
-
-    for match in matches:
-        replacement = "<em class=inlineCode style='font-family: monospace;font-style: normal;'>%s</em>" % match.group()[1:-1]
-        test_str = test_str[:match.start()+offset] + replacement + test_str[match.end()+offset:]
-        offset += (len(replacement) - (match.end() - match.start()))
-
-    out = ""
-    inBlock = 0
-    for line in test_str.split("\n"):
-        if line == "```":
-            if inBlock % 2 == 0:
-                out += "<p class=codeBlock style='font-family: monospace;font-style: normal;white-space: pre-wrap;'>\n"
-            else:
-                out += "</p>\n"
-            inBlock += 1
-        else:
-            out += line + "\n"
-
-    return out
-
-def parse_lists(test_str):
-    regex = r"^[1-9][.)] .*$|- .*$"
-    matches = re.finditer(regex, test_str, re.MULTILINE)
-    offset = 0
-    theFirstOne = True
-
-    for match in matches:
-        if theFirstOne:
-            if match.group()[0].isdigit():
-                listType = "ol"
-                cutoff = 3
-            else:
-                listType = "ul"
-                cutoff = 2
-            replacement = "<%s>\n<li>%s</li>" % (listType, match.group()[cutoff:])
-            theFirstOne = False
-        else:
-            if re.match(regex, [i for i in test_str[match.end()+offset:].split("\n") if i != ''][0]) is None:
-                theFirstOne = True
-                replacement = "<li>%s</li>\n</%s>" % (match.group()[cutoff:], listType)
-            else:
-                replacement = "<li>%s</li>" % match.group()[cutoff:]
-        test_str = test_str[:match.start()+offset] + replacement + test_str[match.end()+offset:]
-        offset += (len(replacement) - (match.end() - match.start()))
-
-    return test_str
+    renderer = HighlighterRenderer()
+    md = misaka.Markdown(renderer, extensions=('fenced-code', 'quote'))
 
-def add_linebreaks(test_str):
-    return re.sub(r"^$", "<br><br>", test_str, 0, re.MULTILINE)
+    return md(unformatted)
 
 def preview_markdown(path, title, category):
     def startBrowser():
-- 
cgit v1.2.3