1 files changed, 19 insertions, 3 deletions
diff --git a/get_images.py b/get_images.py
index 817b9c1..9d25599 100644..100755
--- a/get_images.py
+++ b/get_images.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from PIL import Image, ImageDraw
+from lxml.html import parse
 from io import StringIO
 from lxml import etree
 import requests
@@ -85,10 +86,21 @@ def get_num_pages(tags):
     else:
         return int(int(urllib.parse.parse_qs(page_element.get("href"))["pid"][0]) / (5*8))
 
+def check_pixiv_404(url):
+    text = requests.get(url).text
+    return text[text.find("<title>") + 7 : text.find("</title>")] == "イラストコミュニケーションサービス[pixiv]"
+
 def fix_source_url(url):
-    if "pixiv.net" in url or "pximg.net" in url: 
-        return "https://www.pixiv.net/en/artworks/%s" % url.split("/")[-1][:8]                
-    return url 
+    parsed = urllib.parse.urlparse(url)
+    if parsed.netloc == "www.pixiv.net":
+        return "https://www.pixiv.net/en/artworks/" + urllib.parse.parse_qs(parsed.query)["illust_id"][0]
+    elif parsed.netloc in ["bishie.booru.org", "www.secchan.net"]:
+        return ConnectionError("Couldn't get source")
+    elif "pximg.net" in parsed.netloc or "pixiv.net" in parsed.netloc:
+        return "https://www.pixiv.net/en/artworks/" + parsed.path.split("/")[-1][:8]
+    elif parsed.netloc == "twitter.com":
+        return url.replace("twitter.com", "nitter.eda.gay")
+    return url
 
 def append_blacklisted(id_):
     with open(CONFIG["blacklist"], "a") as f:
@@ -127,6 +139,10 @@ def main(draw_faces = False):
         logging.info("Retried, already posted image...")
         return main()
 
+    if check_pixiv_404(fix_source_url(simg.source)):
+        logging.warning("Skipping since pixiv linked 404'd")
+        return main()
+
     append_blacklisted(simg.id)
 
     with DownloadedImage(simg.imurl) as impath: