-rwxr-xr-x   .gitignore                 2
-rw-r--r--   .gitmodules                4
-rwxr-xr-x   app.py                     5
-rwxr-xr-x   database.py              156
-rwxr-xr-x   docker-compose.yml        44
m---------   nitter/nitter              0
-rwxr-xr-x   services.py              115
-rwxr-xr-x   templates/diary.html.j2    2
8 files changed, 208 insertions, 120 deletions
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,8 @@
 static/images/random.jpg
 static/zips/*.zip
 .nfs*
 static/images/Thumbs.db
+nitter/nitter.conf
+nitter/guest_accounts.json
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..1b086ba
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "nitter/nitter"]
+	path = nitter/nitter
+	url = git@github.com:zedeus/nitter.git
+	branch = guest_accounts

diff --git a/app.py b/app.py
@@ -71,7 +71,7 @@ def index():
         markdown = parser.parse_text(f.read())[0],
         featured_thoughts = db.get_featured_thoughts(),
         tweets = db.get_cached_tweets(7) + [("view all tweets...", db.get_my_twitter())],
-        commits = db.get_cached_commits(since = datetime.datetime.now() - datetime.timedelta(days = 7)),
+        commits = db.get_cached_commits()[:7],
         sidebar_img = get_sidebar_img(db)
     )
 
@@ -85,7 +85,8 @@ def diary():
     return flask.render_template(
         "diary.html.j2",
         **get_template_items("diary", db),
-        diary = db.get_diary()
+        diary = db.get_diary(),
+        diary_account = db.config.get("twitter", "diary_account")
     )
 
 @app.route("/discord")

diff --git a/database.py b/database.py
index 1877ab7..ea3835c 100755
--- a/database.py
+++ b/database.py
@@ -1,10 +1,10 @@
 from urllib.parse import urlparse
 from dataclasses import dataclass
-from github import Github
 from lxml import html
 import configparser
 import curiouscat
 import threading
+import services
 import operator
 import datetime
 import requests
@@ -40,6 +40,7 @@ class Database:
         return self
 
     def __exit__(self, type, value, traceback):
+        self.__connection.commit()
         self.__connection.close()
 
     def get_header_links(self):
@@ -132,24 +133,18 @@ class Database:
         """)
         return cursor.fetchall()
 
-    def get_cached_tweets(self, numToGet = None, recurse = True):
+    def get_cached_tweets(self, numToGet = None):
         with self.__connection.cursor() as cursor:
+            sql = "SELECT tweet, tweet_id, account FROM diary WHERE account = %s ORDER BY tweeted_at"
+            args = (self.config.get("twitter", "main_account"), )
             if numToGet is not None:
-                cursor.execute("SELECT text, url FROM twitterCache ORDER BY appended DESC LIMIT %s;", (numToGet, ))
+                sql += " LIMIT %s;"
+                args = (self.config.get("twitter", "main_account"), numToGet)
             else:
-                cursor.execute("SELECT text, url FROM twitterCache ORDER BY appended DESC;")
-            if recurse:
-                threading.Thread(target = update_cache).start()
-            return list(cursor.fetchall())
+                sql += ";"
+            cursor.execute(sql, args)
 
-    def update_twitter_cache(self, requested):
-        with self.__connection.cursor() as cursor:
-            cursor.execute("SELECT DISTINCT url FROM twitterCache;")
-            urls = [i[0] for i in cursor.fetchall()]
-            for url, text in requested:
-                if url not in urls:
-                    cursor.execute("INSERT INTO twitterCache (text, url) VALUES (%s, %s);", (text, url))
-            self.__connection.commit()
+            return [(i[0], "https://%s/%s/status/%d" % (self.config.get("nitter", "outsideurl"), i[2], i[1])) for i in cursor.fetchall()]
 
     def get_cached_commits(self, since = None, recurse = True):
         with self.__connection.cursor() as cursor:
@@ -214,29 +209,38 @@ class Database:
         self.__connection.commit()
         return id_
 
-    def append_diary(self, tweet_id, tweeted_at, replying_to, tweet):
+    def append_diary(self, tweet_id, tweeted_at, replying_to, tweet, account):
+        if tweet is None:
+            tweet = "(Image only)"
         with self.__connection.cursor() as cursor:
-            cursor.execute("INSERT INTO diary VALUES (%s, %s, %s, %s);", (tweet_id, tweeted_at, replying_to, tweet))
-            print("Appended diary with tweet '%s'" % tweet)
+            cursor.execute("INSERT INTO diary VALUES (%s, %s, %s, %s, %s);", (tweet_id, tweeted_at, replying_to, tweet, account))
+            self.__connection.commit()
+
+        print("Appended diary with tweet " + tweet + " @ " + str(tweeted_at))
 
     def append_diary_images(self, tweet_id, imurl):
         with self.__connection.cursor() as cursor:
             cursor.execute("INSERT INTO diaryimages (tweet_id, link) VALUES (%s, %s);", (tweet_id, imurl))
+            self.__connection.commit()
 
-    def get_diary(self):
+    def get_diary(self, account = None):
         threading.Thread(target = update_cache).start()
         out = {}
+        if account is None:
+            account = self.get_my_diary_twitter()
+
         with self.__connection.cursor() as cursor:
             # cursor.execute("SELECT tweet_id, tweeted_at, tweet FROM diary WHERE replying_to IS NULL ORDER BY tweeted_at DESC;")
             # attempt to ignore curiouscat automatic tweets by comparing with the q&a table
-            cursor.execute("SELECT tweet_id, tweeted_at, tweet FROM diary WHERE replying_to IS NULL AND tweet_id NOT IN (SELECT tweet_id FROM diary INNER JOIN qnas ON SUBSTRING(tweet, 1, 16) = SUBSTRING(question, 1, 16)) ORDER BY tweeted_at DESC;")
+            cursor.execute("SELECT tweet_id, tweeted_at, tweet FROM diary WHERE replying_to IS NULL AND tweet_id NOT IN (SELECT tweet_id FROM diary INNER JOIN qnas ON SUBSTRING(tweet, 1, 16) = SUBSTRING(question, 1, 16)) AND account = %s ORDER BY tweeted_at DESC;", (account, ))
             for tweet_id, tweeted_at, tweet_text in cursor.fetchall():
                 # print(tweet_id, tweeted_at, tweet_text)
                 out[tweeted_at] = [{
                     "text": tweet_text,
                     "images": self.get_diary_image(tweet_id),
                     "link": "https://%s/%s/status/%d" % (
-                        self.config.get("nitter", "domain"),
+                        self.config.get("nitter", "outsideurl"),
                         self.get_my_diary_twitter(),
                         tweet_id
                     )
@@ -268,46 +272,17 @@ class Database:
                 "text": out[1],
                 "images": self.get_diary_image(id_),
                 "link": "https://%s/%s/status/%d" % (
-                    self.config.get("nitter", "domain"), self.get_my_diary_twitter(), id_
+                    self.config.get("nitter", "outsideurl"), self.get_my_diary_twitter(), id_
                 )
             }, id_
 
-    def get_newest_diary_tweet_id(self):
+    def get_newest_diary_tweet_id(self, account = None):
+        if account is None:
+            account = self.get_my_diary_twitter()
         with self.__connection.cursor() as cursor:
-            cursor.execute("SELECT MAX(tweet_id) FROM diary;")
+            cursor.execute("SELECT MAX(tweet_id) FROM diary WHERE account = %s;", (account, ))
             return cursor.fetchone()[0]
 
-    def fetch_diary(self):
-        twitteracc = self.get_my_diary_twitter()
-
-        twitter = twython.Twython(
-            self.config.get("twitter", "app_key"),
-            access_token = self.config.get("twitter", "oauth_2_token")
-        )
-
-        for tweet in twitter.search(
-            q = "(from:%s)" % twitteracc, since_id = self.get_newest_diary_tweet_id(),
-            tweet_mode = 'extended'
-        )["statuses"]:
-
-            tweet_id = tweet["id"]
-            tweeted_at = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
-            replying_to = tweet["in_reply_to_status_id"]
-            tweet_text = re.sub(r"https://t\.co/\w{10}", "", tweet["full_text"], 0, re.MULTILINE)
-
-            if tweet["in_reply_to_screen_name"] == twitteracc or tweet["in_reply_to_screen_name"] is None:
-                self.append_diary(tweet_id, tweeted_at, replying_to, tweet_text)
-
-                if "media" in tweet["entities"].keys():
-                    associated_images = [
-                        i["media_url_https"].replace("pbs.twimg.com", self.config.get("nitter", "domain") + "/pic")
-                        for i in tweet["entities"]["media"]
-                    ]
-                    for im in associated_images:
-                        self.append_diary_images(tweet_id, im)
-
-        self.__connection.commit()
-
     def get_curiouscat_username(self):
        with self.__connection.cursor() as cursor:
            cursor.execute("SELECT link FROM headerLinks WHERE name = 'curiouscat';")
@@ -339,7 +314,7 @@ class Database:
         return sorted(cursor.fetchall(), key = operator.itemgetter(2), reverse = True)
 
 def update_cache():
-    # print("updating cache...")
+    print("Updating cache...")
     with Database() as db:
         db.append_curiouscat_qnas(
             curiouscat.get_all_curiouscat_qnas_before(
@@ -347,58 +322,25 @@ def update_cache():
                 db.get_biggest_curiouscat_timestamp()
             )
         )
-        db.fetch_diary()
-        db.update_twitter_cache(request_recent_tweets(10000))
-        # print("Done updating twitter cache...")
-        db.update_commit_cache(request_recent_commits(since = db.get_last_commit_time()))
-        # print("Done updating commit cache...")
-
-CONFIG = configparser.ConfigParser()
-CONFIG.read("edaweb.conf")
-
-def request_recent_tweets(numToGet):
-    tweets = []
-    domain = "http://" + CONFIG.get("nitter", "domain")
-    with Database() as db:
-        for title, url in db.get_header_links():
-            if title == "twitter":
-                break
-    tree = html.fromstring(requests.get(url).content)
-    for i, tweetUrlElement in enumerate(tree.xpath('//*[@class="tweet-link"]'), 0):
-        if i > 0:
-            tweets.append((
-                domain + tweetUrlElement.get("href"),
-                tweetUrlElement.getparent().find_class("tweet-content media-body")[0].text
-            ))
-        if len(tweets) >= numToGet:
-            break
-    return tweets
-
-def request_recent_commits(since = datetime.datetime.now() - datetime.timedelta(days=7)):
-    g = Github(CONFIG.get("github", "access_code"))
-    out = []
-    for repo in g.get_user().get_repos():
-        # print(repo.name, list(repo.get_branches()))
-        try:
-            for commit in repo.get_commits(since = since):
-                out.append({
-                    "repo": repo.name,
-                    "message": commit.commit.message,
-                    "url": commit.html_url,
-                    "datetime": commit.commit.author.date,
-                    "stats": {
-                        "additions": commit.stats.additions,
-                        "deletions": commit.stats.deletions,
-                        "total": commit.stats.total
-                    }
-                })
-        except Exception as e:
-            print(e)
-
-    return sorted(out, key = lambda a: a["datetime"], reverse = True)
+        print("Finished adding curiouscat...")
+
+        db.update_commit_cache(services.request_recent_commits(since = db.get_last_commit_time()))
+        print("Finished adding github commits...")
+
+        for id_, dt, replying_to, text, username, images in services.scrape_nitter(db.get_my_diary_twitter(), db.get_newest_diary_tweet_id()):
+            db.append_diary(id_, dt, replying_to, text, username)
+            for image in images:
+                db.append_diary_images(id_, image)
+        print("Finished getting diary tweets...")
+
+        for id_, dt, replying_to, text, username, images in services.scrape_nitter(
+            db.config.get("twitter", "main_account"), db.get_newest_diary_tweet_id(db.config.get("twitter", "main_account"))
+        ):
+            db.append_diary(id_, dt, replying_to, text, username)
+            for image in images:
+                db.append_diary_images(id_, image)
+        print("Done updating commit cache...")
 
 if __name__ == "__main__":
-    # print(request_recent_commits())
     with Database() as db:
-        print(db.get_curiouscat_qnas())
+        print(db.get_cached_tweets())
+
+    # update_cache()
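Note that append_diary() now inserts five values and every diary query filters on account, so this change implicitly adds an account column to the diary table. The migration itself is not in the diff; a minimal sketch of the assumed shape, for orientation only:

# Assumed shape of the diary table after this change -- the real DDL (exact
# types, keys, and the diaryimages table) is not shown anywhere in the diff.
DIARY_DDL = """
CREATE TABLE diary (
    tweet_id    BIGINT UNSIGNED PRIMARY KEY,
    tweeted_at  DATETIME,
    replying_to BIGINT UNSIGNED NULL,
    tweet       TEXT,
    account     VARCHAR(64)  -- new: which twitter account the row belongs to
);
"""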
diff --git a/docker-compose.yml b/docker-compose.yml
index c8a70c1..2a78e22 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -37,6 +37,47 @@ services:
       - mariadb:mysql
     restart: unless-stopped
 
+  nitter:
+    build:
+      context: ./nitter/nitter
+      dockerfile: Dockerfile
+    image: jwansek/nitter:latest
+    ports:
+      - "127.0.0.1:7777:7777" # Replace with "8080:8080" if you don't use a reverse proxy
+    volumes:
+      - ./nitter/nitter.conf:/src/nitter.conf:Z,ro
+      - ./nitter/guest_accounts.json:/src/guest_accounts.json:Z,ro
+    depends_on:
+      - nitter-redis
+    healthcheck:
+      test: wget -nv --tries=1 --spider http://127.0.0.1:8080/Jack/status/20 || exit 1
+      interval: 30s
+      timeout: 5s
+      retries: 2
+    user: "998:998"
+    read_only: true
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+
+  nitter-redis:
+    image: redis:6-alpine
+    command: redis-server --save 60 1 --loglevel warning
+    volumes:
+      - nitter-redis:/data
+    healthcheck:
+      test: redis-cli ping
+      interval: 30s
+      timeout: 5s
+      retries: 2
+    user: "999:1000"
+    read_only: true
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+
 networks:
   edaweb-net:
     external:
@@ -45,3 +86,6 @@ networks:
   db-network:
     external:
       name: mariadb
+
+volumes:
+  nitter-redis:

diff --git a/nitter/nitter b/nitter/nitter
new file mode 160000
+Subproject commit fcd74e8048362fcf8284871ee067099e8de28a8
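With the nitter and nitter-redis services above, the scraper reaches nitter on the host-mapped port (note the container healthcheck still probes port 8080 in-container, which depends on the untracked nitter.conf, while the mapping exposes 7777). A quick sanity check before running update_cache(), assuming the 127.0.0.1:7777 mapping; the /Jack/status/20 path mirrors the healthcheck:

import requests

# Assumes the "127.0.0.1:7777:7777" mapping from docker-compose.yml above.
resp = requests.get("http://127.0.0.1:7777/Jack/status/20", timeout = 5)
print(resp.status_code)  # expect 200 once nitter and its guest accounts work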
diff --git a/services.py b/services.py
index 77361fa..0a10d34 100755
--- a/services.py
+++ b/services.py
@@ -1,9 +1,11 @@
 from dataclasses import dataclass
 from io import StringIO
 from lxml import html, etree
+from github import Github
 import multiprocessing
 import pihole as ph
 import qbittorrent
+import configparser
 import requests
 import datetime
 import urllib
@@ -13,10 +15,11 @@
 import random
 import queue
 import json
 import time
-import app
 import os
 
 theLastId = 0
+CONFIG = configparser.ConfigParser(interpolation = None)
+CONFIG.read("edaweb.conf")
 
 def humanbytes(B):
     'Return the given bytes as a human friendly KB, MB, GB, or TB string'
@@ -51,7 +54,7 @@ def timeout(func):
 
         t = multiprocessing.Process(target = runFunc, args = (returnVan, func))
         t.start()
-        t.join(timeout = app.CONFIG["servicetimeout"].getint("seconds"))
+        t.join(timeout = CONFIG["servicetimeout"].getint("seconds"))
         # print("Request took:", time.time() - ti)
 
         try:
@@ -64,7 +67,7 @@
 
 @timeout
 def get_docker_stats():
-    client = docker.DockerClient(base_url = "tcp://%s:%s" % (app.CONFIG["docker"]["url"], app.CONFIG["docker"]["port"]))
+    client = docker.DockerClient(base_url = "tcp://%s:%s" % (CONFIG["docker"]["url"], CONFIG["docker"]["port"]))
     return {
         container.name: container.status
         for container in client.containers.list(all = True)
@@ -76,8 +79,8 @@ def get_qbit_stats():
     numtorrents = 0
     bytes_dl = 0
     bytes_up = 0
-    qb = qbittorrent.Client('http://%s:%s/' % (app.CONFIG["qbittorrent"]["url"], app.CONFIG["qbittorrent"]["port"]))
-    qb.login(username = app.CONFIG["qbittorrent"]["user"], password = app.CONFIG["qbittorrent"]["passwd"])
+    qb = qbittorrent.Client('http://%s:%s/' % (CONFIG["qbittorrent"]["url"], CONFIG["qbittorrent"]["port"]))
+    qb.login(username = CONFIG["qbittorrent"]["user"], password = CONFIG["qbittorrent"]["passwd"])
 
     for torrent in qb.torrents():
         numtorrents += 1
@@ -94,9 +97,9 @@ def get_qbit_stats():
 @timeout
 def get_trans_stats():
     client = clutch.client.Client(
-        address = "http://%s:%s/transmission/rpc" % (app.CONFIG["transmission"]["url"], app.CONFIG["transmission"]["port"]),
-        # username = app.CONFIG["transmission"]["username"],
-        # password = app.CONFIG["transmission"]["password"]
+        address = "http://%s:%s/transmission/rpc" % (CONFIG["transmission"]["url"], CONFIG["transmission"]["port"]),
+        # username = CONFIG["transmission"]["username"],
+        # password = CONFIG["transmission"]["password"]
     )
     stats = json.loads(client.session.stats().json())
     active_for = datetime.timedelta(seconds = stats["arguments"]["cumulative_stats"]["seconds_active"])
@@ -110,7 +113,7 @@
 
 @timeout
 def get_pihole_stats():
-    pihole = ph.PiHole(app.CONFIG["pihole"]["url"])
+    pihole = ph.PiHole(CONFIG["pihole"]["url"])
     return {
         "status": pihole.status,
         "queries": pihole.total_queries,
@@ -230,5 +233,97 @@ def link_deleted(url):
     text = requests.get(url).text
     return text[text.find("<title>") + 7 : text.find("</title>")] in ["Error | nitter", "イラストコミュニケーションサービス[pixiv]"]
 
+def request_recent_commits(since = datetime.datetime.now() - datetime.timedelta(days=7)):
+    g = Github(CONFIG.get("github", "access_code"))
+    out = []
+    for repo in g.get_user().get_repos():
+        # print(repo.name, list(repo.get_branches()))
+        try:
+            for commit in repo.get_commits(since = since):
+                out.append({
+                    "repo": repo.name,
+                    "message": commit.commit.message,
+                    "url": commit.html_url,
+                    "datetime": commit.commit.author.date,
+                    "stats": {
+                        "additions": commit.stats.additions,
+                        "deletions": commit.stats.deletions,
+                        "total": commit.stats.total
+                    }
+                })
+        except Exception as e:
+            print(e)
+
+    return sorted(out, key = lambda a: a["datetime"], reverse = True)
+
+def scrape_nitter(username, get_until:int):
+    new_tweets = []
+    nitter_url = CONFIG.get("nitter", "internalurl")
+    nitter_port = CONFIG.getint("nitter", "internalport")
+    scrape_new_pages = True
+    url = "http://%s:%d/%s" % (nitter_url, nitter_port, username)
+
+    while scrape_new_pages:
+        tree = html.fromstring(requests.get(url).content)
+        for i, tweetUrlElement in enumerate(tree.xpath('//*[@class="tweet-link"]'), 0):
+            if i > 0 and tweetUrlElement.get("href").split("/")[1] == username:
+                id_ = int(urllib.parse.urlparse(tweetUrlElement.get("href")).path.split("/")[-1])
+                tweet_link = "http://%s:%d%s" % (nitter_url, nitter_port, tweetUrlElement.get("href"))
+
+                if id_ == get_until:
+                    scrape_new_pages = False
+                    break
+
+                try:
+                    dt, replying_to, text, images = parse_tweet(tweet_link)
+                    new_tweets.append((id_, dt, replying_to, text, username, images))
+                    print(dt, text)
+                except IndexError:
+                    print("Couldn't get any more tweets")
+                    scrape_new_pages = False
+                    break
+
+        try:
+            cursor = tree.xpath('//*[@class="show-more"]/a')[0].get("href")
+        except IndexError:
+            # no more elements
+            break
+        url = "http://%s:%d/%s%s" % (nitter_url, nitter_port, username, cursor)
+
+    return new_tweets
+
+def parse_tweet(tweet_url):
+    # print(tweet_url)
+    tree = html.fromstring(requests.get(tweet_url).content)
+    # with open("2images.html", "r") as f:
+    #     tree = html.fromstring(f.read())
+
+    main_tweet_elem = tree.xpath('//*[@class="main-tweet"]')[0]
+
+    dt_str = main_tweet_elem.xpath('//*[@class="tweet-published"]')[0].text
+    dt = datetime.datetime.strptime(dt_str.replace("Â", ""), "%b %d, %Y · %I:%M %p UTC")
+    text = tree.xpath('//*[@class="main-tweet"]/div/div/div[2]')[0].text
+    replying_to_elems = tree.xpath('//*[@class="before-tweet thread-line"]/div/a')
+    if replying_to_elems != []:
+        replying_to = int(urllib.parse.urlparse(replying_to_elems[-1].get("href")).path.split("/")[-1])
+    else:
+        replying_to = None
+
+    images = []
+    images_elems = tree.xpath('//*[@class="main-tweet"]/div/div/div[3]/div/div/a/img')
+    for image_elem in images_elems:
+        images.append("https://" + CONFIG.get("nitter", "outsideurl") + urllib.parse.urlparse(image_elem.get("src")).path)
+
+    return dt, replying_to, text, images
+
+
+
+
 if __name__ == "__main__":
-    print(get_trans_stats())
+    # print(get_trans_stats())
+
+    print(scrape_nitter(CONFIG.get("twitter", "diary_account"), 1694624180405260291))
+
+    # print(parse_tweet("https://nitter.net/HONMISGENDERER/status/1694231618443981161#m"))
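The dt_str.replace("Â", "") in parse_tweet() works around a charset artefact: nitter renders timestamps like "Sep 2, 2023 · 4:30 PM UTC", and when the page bytes are decoded with the wrong charset the UTF-8 bytes before the "·" surface as a stray "Â". A small illustration with a made-up timestamp:

import datetime

# Hypothetical mojibake'd value as parse_tweet() might receive it.
raw = "Sep 2, 2023 Â· 4:30 PM UTC"
dt = datetime.datetime.strptime(raw.replace("Â", ""), "%b %d, %Y · %I:%M %p UTC")
print(dt)  # 2023-09-02 16:30:00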
diff --git a/templates/diary.html.j2 b/templates/diary.html.j2
index d7c363b..f6604f7 100755
--- a/templates/diary.html.j2
+++ b/templates/diary.html.j2
@@ -1,7 +1,7 @@
 {% extends "template.html.j2" %}
 {% block content %}
     <h4>this page might not be up-to-date if my diary account is search banned</h4>
-    <p>if in doubt check my <a href="https://twitter.com/FORMER_SHOTA">diary account</a> <br> <s>serves me right for using the official API instead of HTML scraping like i did with the <a href="/">recent tweets</a> thing</s> <br> <a href="https://shadowban.yuzurisa.com/FORMER_SHOTA">check if i'm currently search banned</a></p>
+    <p>if in doubt check my <a href="https://twitter.com/{{ diary_account }}">diary account</a> <br> <a href="https://shadowban.yuzurisa.com/{{ diary_account }}">check if i'm currently search banned</a></p>
     <dl>
     {% for dt, entries in diary.items() %}
         <dt><a href="{{ entries[0]['link'] }}">{{ dt }}</a></dt>
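Taken together, the change reads several previously unused config keys ([nitter] internalurl, internalport and outsideurl; [twitter] main_account and diary_account; plus the existing [github] access_code). They live in the untracked edaweb.conf, so a fresh checkout can fail at runtime; a small sanity check under those assumptions:

import configparser

# (section, option) pairs read by the code in this diff; the values are
# deployment-specific and deliberately not part of the repository.
EXPECTED = [
    ("nitter", "internalurl"),   # host the scraper uses to reach nitter
    ("nitter", "internalport"),  # e.g. 7777, matching docker-compose.yml
    ("nitter", "outsideurl"),    # public nitter domain used in status links
    ("twitter", "main_account"),
    ("twitter", "diary_account"),
    ("github", "access_code"),
]

config = configparser.ConfigParser(interpolation = None)
config.read("edaweb.conf")
missing = [pair for pair in EXPECTED if not config.has_option(*pair)]
if missing:
    raise SystemExit("edaweb.conf is missing: %r" % missing)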