Updated whispa scraping (HEAD, master)

author: jwansek <eddie.atten.ea29@gmail.com> 2026-06-10 20:46:28 +0100
committer: jwansek <eddie.atten.ea29@gmail.com> 2026-06-10 20:46:28 +0100
commit: d79696e80100cb0ee45d52dec8ee4b230f5b2df6 (patch)
tree: 8ae723c28b9488bc815a8383d065a9a993de43f8
parent: c4279ce733214f45deb71301ff2d1caad039fb78 (diff)
download: eda.gay-master.tar.gz eda.gay-master.zip
1 file changed, 62 insertions, 35 deletions
diff --git a/edaweb/services.py b/edaweb/services.py
index b01bee2..4565955 100644
--- a/edaweb/services.py
+++ b/edaweb/services.py
@@ -20,6 +20,7 @@ import queue
 import json
 import time
 import os
+import re
 
 theLastId = 0
 config_path = os.path.join(os.path.dirname(__file__), "..", "edaweb.conf")
@@ -250,6 +251,30 @@ def parse_tweet(tweet_url):
 
     return dt, replying_to, text, images
 
+def parse_rsc_flight(payload: str) -> dict:
+    """Split newline-delimited ``ID:payload`` text into a ``{chunk_id: data}`` dict.
+
+    Payloads that decode as JSON are stored decoded; any other payload is stored
+    as its raw string.  Blank lines and lines without an ``ID:`` prefix are skipped."""
+    chunks = {}
+    lines = payload.strip().split('\n')
+    for line in lines:
+        if not line.strip():
+            continue
+        # Match ID:payload -- ID is one or more word characters or "$"
+        match = re.match(r'^([\w$]+):(.*)$', line.strip())
+        if not match:
+            continue
+        chunk_id, content = match.groups()
+        try:
+            # Try to parse as JSON-like (handles most data chunks)
+            data = json.loads(content)
+            chunks[chunk_id] = data
+        except json.JSONDecodeError:
+            # Keep raw for complex chunks (I[...], $ references, etc.)
+            chunks[chunk_id] = content
+    return chunks
+
 def scrape_whispa(whispa_url, since = None):
     def query_answer(answer_url, max_retries = 10):
         for i in range(max_retries):
@@ -280,37 +305,39 @@ def scrape_whispa(whispa_url, since = None):
         js = str(script.text)
         if "receivedFeedback" in js:
             # my god this is horrible...
-            parsed_json = json.loads(json.loads(js[19:-1])[1][2:])[0][3]["loadedUser"]["receivedFeedback"]
-            # print(json.dumps(parsed_json, indent = 4))
-            # with open("whispas_%i.json" % i, "w") as f:
-            #     json.dump(parsed_json, f, indent = 4)
-            for j in parsed_json:
-                if j["_count"]["childFeedback"] < 0:
-                    continue
-
-                answer_url = "https://apiv4.whispa.sh/feedbacks/%s/children/public" % j["id"]
-                req = query_answer(answer_url)
-                try:
-                    firstanswer = req.json()["data"][0]
-                except IndexError:
-                    continue
-                dt = datetime.datetime.fromisoformat(firstanswer["createdAt"][:-1])
-
-                qna = {
-                    # "id": int(j["id"], base = 16),
-                    "id": int(dt.timestamp()),
-                    "link": answer_url,
-                    "datetime": dt,
-                    "question": j["content"],
-                    "answer": firstanswer["content"],
-                    "host": "whispa.sh"
-                }
-                print(qna)     
-                qnas.append(qna)
-                time.sleep(2.03)
-                if dt <= stop_at:
-                    print("Met the threshold for oldest Q&A, so stopped looking.")
-                    break
+            j = parse_rsc_flight(json.loads(js[19:-1])[1])
+            for j, v in j.items():
+                if type(v) is not str:
+                    parsed_json = v[0][3]["loadedUser"]["receivedFeedback"]
+
+                    for j in parsed_json:
+                        if j["_count"]["childFeedback"] < 0:
+                            continue
+
+                        answer_url = "https://apiv7.whispa.sh/feedbacks/%s/children/public" % j["id"]
+                        print(answer_url)
+                        req = query_answer(answer_url)
+                        try:
+                            firstanswer = req.json()["data"][0]
+                        except IndexError:
+                            continue
+                        dt = datetime.datetime.fromisoformat(firstanswer["createdAt"][:-1])
+
+                        qna = {
+                            # "id": int(j["id"], base = 16),
+                            "id": int(dt.timestamp()),
+                            "link": answer_url,
+                            "datetime": dt,
+                            "question": j["content"],
+                            "answer": firstanswer["content"],
+                            "host": "whispa.sh"
+                        }
+                        print(qna)     
+                        qnas.append(qna)
+                        time.sleep(2.03)
+                        if dt <= stop_at:
+                            print("Met the threshold for oldest Q&A, so stopped looking.")
+                            break
     return qnas
 
 def get_docker_containers(host, ssh_key_path):
@@ -414,10 +441,10 @@ def get_recent_commits(db, max_per_repo = 3):
     return sorted(out, key = lambda a: a["datetime"], reverse = True)
 
 if __name__ == "__main__":
-    # print(scrape_whispa(CONFIG.get("qnas", "url")))
-    # import database
-    print(cache_all_docker_containers(os.path.join(os.path.dirname(__file__), "..", "edaweb-docker.pem")))
-    print(get_all_docker_containers())
+    print(scrape_whispa(CONFIG.get("qnas", "url")))
+    # # import database
+    # print(cache_all_docker_containers(os.path.join(os.path.dirname(__file__), "..", "edaweb-docker.pem")))
+    # print(get_all_docker_containers())
 
     # with database.Database() as db:
     #     print(json.dumps(get_recent_commits(db), indent=4))
author	jwansek <eddie.atten.ea29@gmail.com>	2026-06-10 20:46:28 +0100
committer	jwansek <eddie.atten.ea29@gmail.com>	2026-06-10 20:46:28 +0100
commit	d79696e80100cb0ee45d52dec8ee4b230f5b2df6 (patch)
tree	8ae723c28b9488bc815a8383d065a9a993de43f8
parent	c4279ce733214f45deb71301ff2d1caad039fb78 (diff)
download	eda.gay-master.tar.gz eda.gay-master.zip