diff options
Diffstat (limited to 'get_images.py')
-rwxr-xr-x[-rw-r--r--] | get_images.py | 22 |
1 files changed, 19 insertions, 3 deletions
diff --git a/get_images.py b/get_images.py
index 817b9c1..9d25599 100644..100755
--- a/get_images.py
+++ b/get_images.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from PIL import Image, ImageDraw
+from lxml.html import parse
 from io import StringIO
 from lxml import etree
 import requests
@@ -85,10 +86,21 @@ def get_num_pages(tags):
     else:
         return int(int(urllib.parse.parse_qs(page_element.get("href"))["pid"][0]) / (5*8))
 
+def check_pixiv_404(url):  # True when the fetched page's <title> is exactly pixiv's generic site title
+    text = requests.get(url).text
+    return text[text.find("<title>") + 7 : text.find("</title>")] == "イラストコミュニケーションサービス[pixiv]"
+
 def fix_source_url(url):
-    if "pixiv.net" in url or "pximg.net" in url:
-        return "https://www.pixiv.net/en/artworks/%s" % url.split("/")[-1][:8]
-    return url
+    parsed = urllib.parse.urlparse(url)  # rewrite known source hosts to a canonical/viewable URL
+    if parsed.netloc == "www.pixiv.net" and "illust_id" in urllib.parse.parse_qs(parsed.query):  # no illust_id: use the path-based pixiv branch below instead of a KeyError
+        return "https://www.pixiv.net/en/artworks/" + urllib.parse.parse_qs(parsed.query)["illust_id"][0]
+    elif parsed.netloc in ["bishie.booru.org", "www.secchan.net"]:
+        raise ConnectionError("Couldn't get source")  # raise, not return: a returned exception object would be used as if it were a URL
+    elif "pximg.net" in parsed.netloc or "pixiv.net" in parsed.netloc:
+        return "https://www.pixiv.net/en/artworks/" + parsed.path.split("/")[-1][:8]  # last path segment, first 8 characters
+    elif parsed.netloc == "twitter.com":
+        return url.replace("twitter.com", "nitter.eda.gay")
+    return url  # any other host is passed through unchanged
 
 def append_blacklisted(id_):
     with open(CONFIG["blacklist"], "a") as f:
@@ -127,6 +139,10 @@ def main(draw_faces = False):
         logging.info("Retried, already posted image...")
         return main()
 
+    if check_pixiv_404(fix_source_url(simg.source)):
+        logging.warning("Skipping since pixiv linked 404'd")
+        return main()
+
     append_blacklisted(simg.id)
 
     with DownloadedImage(simg.imurl) as impath: