author | jwansek <eddie.atten.ea29@gmail.com> | 2020-08-23 22:18:06 +0100
committer | jwansek <eddie.atten.ea29@gmail.com> | 2020-08-23 22:18:06 +0100
commit | 0740f143a22555183d41a4ff78b4958a95a12944 (patch)
tree | 9eab9536e4f1214c1b8e659c67237c49834fbf7b /get_images.py
parent | 5bfdbe9b3af2c70d459378ee237542997bab1da0 (diff)
download | yaoi-communism-0740f143a22555183d41a4ff78b4958a95a12944.tar.gz yaoi-communism-0740f143a22555183d41a4ff78b4958a95a12944.zip
added functions to get images
Diffstat (limited to 'get_images.py')
-rw-r--r-- | get_images.py | 74
1 file changed, 74 insertions, 0 deletions
diff --git a/get_images.py b/get_images.py
new file mode 100644
index 0000000..8d4d826
--- /dev/null
+++ b/get_images.py
@@ -0,0 +1,74 @@
+from lxml import etree
+from io import StringIO
+# import xml.etree.ElementTree as etree
+import requests
+import urllib
+from http.cookiejar import LWPCookieJar
+from dataclasses import dataclass
+import random
+import time
+
+# all of these tags are added to all queries. Preceded with '-' to blacklist
+base_tags = ["yaoi", "-muscle"]
+# one of these will be added
+search_tags = ["looking_at_another", "kiss", "trap", "2boys", "promare"]
+
+def get_random_searchtag():
+    return [random.choice(search_tags)]
+
+@dataclass
+class SafebooruImage:
+    id: int
+    tags: list
+    source: str
+    imurl: str
+
+def get_id_from_url(url):
+    return int(urllib.parse.parse_qs(url)["id"][0])
+
+def get_image(tags):
+    search_url = "https://safebooru.org/index.php?page=post&s=list&tags=%s&pid=%i" % ("+".join(base_tags+tags), (random.randint(1, get_num_pages(tags))-1)*5*8)
+    tree = etree.parse(StringIO(requests.get(search_url).text), etree.HTMLParser())
+    elements = [e for e in tree.xpath("/html/body/div[6]/div/div[2]/div[1]")[0].iter(tag = "a")]
+    element = random.choice(elements)
+    simg = SafebooruImage(
+        id = get_id_from_url(element.get("href")),
+        tags = element.find("img").get("alt").split(),
+        source = get_source("https://safebooru.org/" + element.get("href")),
+        imurl = get_imurl("https://safebooru.org/" + element.get("href"))
+    )
+    if simg.source is None:
+        print("https://safebooru.org/" + element.get("href"))
+    return simg
+
+def get_source(url):
+    tree = etree.parse(StringIO(requests.get(url).text), etree.HTMLParser())
+    for element in tree.xpath('//*[@id="stats"]')[0].iter("li"):
+        if element.text.startswith("Source: h"):
+            return element.text[8:]
+        elif element.text.startswith("Source:"):
+            for child in element.iter():
+                if child.get("href") is not None:
+                    return child.get("href")
+    raise ConnectionError("Couldn't find source image for id %i" % get_id_from_url(url))
+
+def get_imurl(url):
+    tree = etree.parse(StringIO(requests.get(url).text), etree.HTMLParser())
+    return tree.xpath('//*[@id="image"]')[0].get("src")
+
+def get_num_pages(tags):
+    search_url = "https://safebooru.org/index.php?page=post&s=list&tags=%s" % "+".join(base_tags+tags)
+    html = requests.get(search_url).text
+    tree = etree.parse(StringIO(html), etree.HTMLParser())
+    try:
+        page_element = tree.xpath("/html/body/div[6]/div/div[2]/div[2]/div/a[12]")[0]
+    except IndexError:
+        return 1
+    else:
+        return int(int(urllib.parse.parse_qs(page_element.get("href"))["pid"][0]) / (5*8))
+
+if __name__ == "__main__":
+    # get_page_images(tags = ["yaoi"])
+    print(get_image(get_random_searchtag()))
+
+
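
For context, a minimal usage sketch (not part of the commit) showing how the new module might be driven from another script. It assumes `get_images.py` sits on the import path and that safebooru.org is reachable; all names are taken from the `SafebooruImage` dataclass and functions in the diff above.

```python
# Rough usage sketch, assuming get_images.py is importable from the working
# directory and the network is available. Names match the diff above.
from get_images import SafebooruImage, get_image, get_random_searchtag

simg = get_image(get_random_searchtag())  # returns a SafebooruImage
print(simg.id)                # numeric post id parsed from the post URL
print(simg.imurl)             # direct URL of the image file
print(simg.source)            # upstream source URL scraped from the post page
print(", ".join(simg.tags))   # tag list taken from the thumbnail's alt text
```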