🗝
summary refs log tree commit diff
path: root/3_archive.py
diff options
context:
space:
mode:
Diffstat (limited to '3_archive.py')
-rw-r--r-- 3_archive.py 144
1 files changed, 144 insertions, 0 deletions
diff --git a/3_archive.py b/3_archive.py
new file mode 100644
index 0000000..6eef0e1
--- /dev/null
+++ b/3_archive.py
@@ -0,0 +1,144 @@
+import json
+from http.client import HTTPResponse
+from pathlib import Path
+from shutil import copyfileobj
+from urllib.request import urlopen
+
+import brotli
+import msgpack
+import psycopg
+
+from com import Visibility, eval_config, parse_graph, progressbar
+
+# Evaluate the configuration and obtain the database connection from its
+# "connect" factory.
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+
+# Per-note graph data; each node is read below for its "replies", "quotes"
+# and "flags" entries.
+graph = parse_graph()
+print("reading filterlist")
+# One note id per line; leading/trailing whitespace of the file is stripped
+# before splitting.
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+# User records gathered so far, keyed by user id.
+collected_users = {}
+def collect_user(id: str):
+    """Look up user `id` and store a serialisable record in `collected_users`.
+
+    Nothing happens when the user is already collected or when the "user"
+    table has no row for `id`. When no user_profile row exists, the
+    description defaults to "" and the fields to an empty list.
+    Always returns None.
+    """
+    if id in collected_users:
+        return
+
+    user_row = conn.execute(
+        'select username, host, "avatarUrl" from "user" where id = %s', [id]
+    ).fetchone()
+    if user_row is None:
+        return None
+    username, host, avatar_url = user_row
+
+    profile_row = conn.execute(
+        'select description, fields from user_profile where "userId" = %s', [id]
+    ).fetchone()
+    if not profile_row:
+        profile_row = ("", [])
+    description, fields = profile_row
+
+    collected_users[id] = {
+        "id": id,
+        "username": username,
+        "host": host,
+        "description": description,
+        "fields": fields,
+        "avatar_url": avatar_url,
+    }
+
+# (note id, record) pairs for the top-level notes of the filter list.
+collected_notes = []
+# (file id, url) pairs of attachments that have to be downloaded.
+files_to_collect = []
+def collect_note(id: str):
+    """Build the archive record for note `id`, including its replies and quotes.
+
+    The author is registered through collect_user(). Every reply and quote
+    listed for the note in `graph` is collected recursively. Attachments of
+    notes flagged "self" are queued in `files_to_collect` and stored with a
+    url of None; other attachments keep their remote url.
+
+    Returns the record as a dict, or None when the note table has no row for
+    `id`. The "attachments", "replies" and "quotes" keys are left out when
+    they would be empty.
+    """
+    output = {}
+    output["id"] = id
+
+    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
+    if note is None:
+        return None
+    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
+    collect_user(user_id)
+
+    output["text"] = text
+    output["user_id"] = user_id
+    output["created_at"] = created_at.astimezone(tz=None).isoformat()
+    output["updated_at"] = None
+    if updated_at is not None:
+        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
+    output["reactions"] = reactions
+    output["renotes"] = renotes
+    output["visibility"] = Visibility.from_db(visibility).code()
+
+    node = graph[id]
+    # Children that are missing from the database come back as None and are
+    # dropped. Replies are collected before quotes, as before.
+    replies = [reply for reply in map(collect_note, node["replies"]) if reply is not None]
+    quotes = [quote for quote in map(collect_note, node["quotes"]) if quote is not None]
+
+    attachments = []
+    for file_id in file_ids:
+        file_row = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
+        if file_row is None:
+            # No drive_file row for this id: skip it rather than fail on
+            # unpacking None.
+            continue
+        _name, type_, comment, url = file_row
+        attachment = {
+            "id": file_id,
+            "type": type_,
+            "comment": comment,
+        }
+        if "self" in node["flags"]:  # archive own attachments
+            files_to_collect.append((file_id, url))
+            attachment["url"] = None
+        else:
+            attachment["url"] = url
+        # Without this append the metadata built above would be discarded.
+        attachments.append(attachment)
+
+    # Only non-empty collections are stored, in the order
+    # attachments, replies, quotes.
+    if attachments:
+        output["attachments"] = attachments
+    if replies:
+        output["replies"] = replies
+    if quotes:
+        output["quotes"] = quotes
+
+    return output
+
+# Collect every note named in the filter list. A note missing from the
+# database is recorded with a None payload.
+pb = progressbar.ProgressBar(0, len(filtered), prefix="collecting data ")
+for note_id in filtered:
+    collected_notes.append((note_id, collect_note(note_id)))
+    pb.increment()
+pb.finish()
+
+# Output layout: out/note, out/user and out/file. `parents=True` also creates
+# "out" itself, and `exist_ok=True` makes re-runs safe without a separate
+# exists() check.
+outdir = Path("out")
+for subdir in ("note", "user", "file"):
+    (outdir / subdir).mkdir(parents=True, exist_ok=True)
+
+# Write one brotli-compressed msgpack file per collected note and per user.
+pb = progressbar.ProgressBar(
+    0,
+    len(collected_notes) + len(collected_users),
+    prefix="writing data ",
+)
+
+for note_id, note in collected_notes:
+    outfile = outdir / "note" / f"{note_id}.mpk.br"
+    outfile.write_bytes(brotli.compress(msgpack.dumps(note)))
+    pb.increment()
+
+for user_id, user in collected_users.items():
+    outfile = outdir / "user" / f"{user_id}.mpk.br"
+    # Each user file must hold that user's own record, not a note.
+    outfile.write_bytes(brotli.compress(msgpack.dumps(user)))
+    pb.increment()
+pb.finish()
+
+# Download every queued attachment to out/file/<file id>.
+pb = progressbar.ProgressBar(
+    0,
+    len(files_to_collect),
+    prefix="downloading attachments ",
+)
+for file_id, url in files_to_collect:
+    outfile = outdir / "file" / file_id
+    # The request is opened before the file, so a failed request leaves no
+    # empty file behind. The `with` statement closes both the response and
+    # the file even when the copy raises.
+    with urlopen(url) as response, outfile.open("wb") as f:
+        copyfileobj(response, f)
+    pb.increment()
+pb.finish()