diff options
Diffstat (limited to '3_archive.py')
-rw-r--r-- | 3_archive.py | 144 |
1 files changed, 144 insertions, 0 deletions
"""Archive a filtered set of notes, their authors and own attachments.

Phases, all executed at import time (this file is a script):

1. Read the note ids listed in ``filtered.list`` and collect each note,
   recursively including its replies and quotes as given by the parsed graph.
2. Write every collected note and user as brotli-compressed msgpack below
   ``out/note`` and ``out/user``.
3. Download the attachments of notes flagged ``"self"`` into ``out/file``.
"""

import json
from http.client import HTTPResponse
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import brotli
import msgpack
import psycopg

from com import Visibility, eval_config, parse_graph, progressbar

config = eval_config()
conn: psycopg.Connection = config["connect"]()

graph = parse_graph()
print("reading filterlist")
filtered = Path("filtered.list").read_text().strip().splitlines()

# user id -> serializable user record; doubles as the "already seen" set.
collected_users = {}


def collect_user(id: str):
    """Fetch the user *id* once and store its record in ``collected_users``.

    Does nothing when the user was already collected or does not exist in
    the database. A missing ``user_profile`` row yields an empty description
    and no fields. Always returns ``None``.
    """
    if id in collected_users:
        return
    user = conn.execute(
        'select username, host, "avatarUrl" from "user" where id = %s',
        [id],
    ).fetchone()
    if user is None:
        return None
    username, host, avatar_url = user
    profile = conn.execute(
        'select description, fields from user_profile where "userId" = %s',
        [id],
    ).fetchone()
    description, fields = profile or ("", [])

    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }


collected_notes = []
# (file id, source url) pairs of attachments that must be downloaded.
files_to_collect = []


def collect_note(id: str):
    """Build the serializable record for note *id*, including its subtree.

    The note's author is collected via ``collect_user``; replies and quotes
    listed in ``graph[id]`` are collected recursively and nested into the
    result. Attachments of notes flagged ``"self"`` are queued in
    ``files_to_collect`` and stored with ``url`` set to ``None``; other
    attachments keep their original url.

    Returns the record as a dict, or ``None`` when the note is not in the
    database. The keys ``attachments``, ``replies`` and ``quotes`` are
    omitted when they would be empty.
    """
    note = conn.execute(
        'select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s',
        [id],
    ).fetchone()
    if note is None:
        return None
    (
        text,
        user_id,
        created_at,
        updated_at,
        reactions,
        renotes,
        visibility,
        file_ids,
    ) = note
    collect_user(user_id)

    output = {
        "id": id,
        "text": text,
        "user_id": user_id,
        # astimezone(tz=None) converts to the system's local timezone.
        "created_at": created_at.astimezone(tz=None).isoformat(),
        "updated_at": None,
    }
    if updated_at is not None:
        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()

    node = graph[id]
    # Children missing from the database come back as None and are dropped.
    replies = [collect_note(reply) for reply in node["replies"]]
    replies = [reply for reply in replies if reply is not None]
    quotes = [collect_note(quote) for quote in node["quotes"]]
    quotes = [quote for quote in quotes if quote is not None]

    archive_files = "self" in node["flags"]  # archive own attachments
    attachments = []
    for file_id in file_ids:
        row = conn.execute(
            'select name, type, comment, url from drive_file where id = %s',
            [file_id],
        ).fetchone()
        if row is None:
            # The note references a drive file that no longer exists; skip
            # it instead of aborting the whole archive run.
            continue
        _name, type_, comment, url = row
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if archive_files:
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        # The original built this dict but never stored it, so attachment
        # metadata was silently lost from every note.
        attachments.append(attachment)

    # Empty collections are left out of the record entirely.
    if attachments:
        output["attachments"] = attachments
    if replies:
        output["replies"] = replies
    if quotes:
        output["quotes"] = quotes

    return output


pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for note_id in filtered:
    collected_notes.append((note_id, collect_note(note_id)))
    pb.increment()
pb.finish()

outdir = Path("out")
for subdir in ("note", "user", "file"):
    (outdir / subdir).mkdir(parents=True, exist_ok=True)

pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)

for note_id, note in collected_notes:
    outfile = outdir / "note" / f"{note_id}.mpk.br"
    outfile.write_bytes(brotli.compress(msgpack.dumps(note)))
    pb.increment()

for user_id, user in collected_users.items():
    outfile = outdir / "user" / f"{user_id}.mpk.br"
    # Serialize the user record itself (the original wrote the stale
    # ``note`` variable left over from the previous loop).
    outfile.write_bytes(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()

pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for file_id, url in files_to_collect:
    outfile = outdir / "file" / file_id
    # Context managers close the response and the file even if the copy
    # fails part-way through.
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()