🗝
summary refs log tree commit diff
path: root/3_archive.py
diff options
context:
space:
mode:
Diffstat (limited to '3_archive.py')
-rw-r--r-- 3_archive.py 21
1 file changed, 15 insertions, 6 deletions
diff --git a/3_archive.py b/3_archive.py
index 6eef0e1..39affdd 100644
--- a/3_archive.py
+++ b/3_archive.py
@@ -1,4 +1,5 @@
 import json
+import time
 from http.client import HTTPResponse
 from pathlib import Path
 from shutil import copyfileobj
@@ -16,11 +17,13 @@ conn: psycopg.Connection = config["connect"]()
 graph = parse_graph()
 print("reading filterlist")
 filtered = Path("filtered.list").read_text().strip().splitlines()
+filtered = list(map(lambda line: line.split(' ')[0], filtered))
 
 collected_users = {}
 def collect_user(id: str):
     if id in collected_users:
         return
+    time.sleep(0.001)
     user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
     if user is None:
         return None
@@ -44,10 +47,11 @@ def collect_note(id: str):
     output = {}
     output["id"] = id
 
-    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
+    time.sleep(0.001)
+    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds", cw from note where id = %s', [id]).fetchone()
     if note is None:
         return None
-    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
+    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids, cw = note
     collect_user(user_id)
 
     output["text"] = text
@@ -59,6 +63,7 @@ def collect_note(id: str):
     output["reactions"] = reactions
     output["renotes"] = renotes
     output["visibility"] = Visibility.from_db(visibility).code()
+    output["cw"] = cw
 
     node = graph[id]
     replies = [collect_note(reply) for reply in node["replies"]]
@@ -68,6 +73,7 @@ def collect_note(id: str):
 
     output["attachments"] = []
     for file_id in file_ids:
+        time.sleep(0.0005)
         name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
         attachment = {
             "id": file_id,
@@ -117,13 +123,15 @@ pb = progressbar.ProgressBar(
 )
 
 for id, note in collected_notes:
-    outfile = outdir / "note" / f"{id}.mpk.br"
+    outfile = outdir / "note" / id[:3] / f"{id[3:]}.mpk.br"
+    outfile.parent.mkdir(exist_ok=True)
     with outfile.open("wb") as f:
         f.write(brotli.compress(msgpack.dumps(note)))
     pb.increment()
 
 for id, user in collected_users.items():
-    outfile = outdir / "user" / f"{id}.mpk.br"
+    outfile = outdir / "user" / id[:2] / f"{id[2:]}.mpk.br"
+    outfile.parent.mkdir(exist_ok=True)
     with outfile.open("wb") as f:
         f.write(brotli.compress(msgpack.dumps(note)))
     pb.increment()
@@ -134,8 +142,9 @@ pb = progressbar.ProgressBar(
     len(files_to_collect),
     prefix="downloading attachments ",
 )
-for (id, url) in files_to_collect: 
-    outfile = outdir / "file" / id
+for (id, url) in files_to_collect:
+    outfile = outdir / "file" / id[:2] / id[2:]
+    outfile.parent.mkdir(exist_ok=True)
     response: HTTPResponse = urlopen(url)
     with outfile.open("wb") as f:
         copyfileobj(response, f)