import json
from http.client import HTTPResponse
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import brotli
import msgpack
import psycopg

from com import Visibility, eval_config, parse_graph, progressbar

config = eval_config()
conn: psycopg.Connection = config["connect"]()

graph = parse_graph()
print("reading filterlist")
# One note id per line; these are the root notes to export.
filtered = Path("filtered.list").read_text().strip().splitlines()

# Users referenced by any collected note, keyed by user id.
collected_users = {}


def collect_user(id: str) -> None:
    """Load the user with the given id into ``collected_users``.

    Does nothing if the user was already collected or if no row with that
    id exists in the ``user`` table. A missing ``user_profile`` row is
    tolerated: description defaults to ``""`` and fields to ``[]``.
    """
    if id in collected_users:
        return
    user = conn.execute(
        'select username, host, "avatarUrl" from "user" where id = %s', [id]
    ).fetchone()
    if user is None:
        return
    username, host, avatar_url = user
    profile = conn.execute(
        'select description, fields from user_profile where "userId" = %s', [id]
    ).fetchone()
    description, fields = profile or ("", [])
    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }


# (note id, note dict or None) pairs for every id in the filter list.
collected_notes = []
# (file id, url) pairs of attachments that must be downloaded into the archive.
files_to_collect = []


def collect_note(id: str):
    """Build the export record for a note, including its replies and quotes.

    Recursively collects every reply and quote listed for the note in
    ``graph``, registers the author via ``collect_user`` and, for notes
    flagged ``"self"`` in the graph, queues their attachments for download
    in ``files_to_collect``.

    Returns a dict describing the note, or ``None`` if the note does not
    exist in the database. The ``attachments``, ``replies`` and ``quotes``
    keys are only present when non-empty.
    """
    note = conn.execute(
        'select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s',
        [id],
    ).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
    collect_user(user_id)

    output = {
        "id": id,
        "text": text,
        "user_id": user_id,
        # astimezone(tz=None) converts to the system local timezone.
        "created_at": created_at.astimezone(tz=None).isoformat(),
        "updated_at": (
            updated_at.astimezone(tz=None).isoformat()
            if updated_at is not None
            else None
        ),
        "reactions": reactions,
        "renotes": renotes,
        "visibility": Visibility.from_db(visibility).code(),
    }

    node = graph[id]
    # Children that no longer exist in the database come back as None and
    # are dropped.
    replies = [
        reply
        for reply in (collect_note(reply_id) for reply_id in node["replies"])
        if reply is not None
    ]
    quotes = [
        quote
        for quote in (collect_note(quote_id) for quote_id in node["quotes"])
        if quote is not None
    ]

    archive_files = "self" in node["flags"]
    attachments = []
    for file_id in file_ids:
        file_row = conn.execute(
            'select name, type, comment, url from drive_file where id = %s',
            [file_id],
        ).fetchone()
        if file_row is None:
            # The note references a file that has no drive_file row; skip
            # it instead of failing on the tuple unpacking.
            continue
        _name, type_, comment, url = file_row
        attachment = {
            "id": file_id,
            "type": type_,
            "comment": comment,
        }
        if archive_files:
            # archive own attachments: the file is downloaded later, so the
            # record carries no remote url.
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        attachments.append(attachment)

    # Empty collections are omitted from the record entirely.
    if attachments:
        output["attachments"] = attachments
    if replies:
        output["replies"] = replies
    if quotes:
        output["quotes"] = quotes
    return output


pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for note_id in filtered:
    # A note missing from the database is still recorded (as None), so it
    # still gets an output file below.
    collected_notes.append((note_id, collect_note(note_id)))
    pb.increment()
pb.finish()

outdir = Path("out")
for subdir in ("note", "user", "file"):
    (outdir / subdir).mkdir(parents=True, exist_ok=True)

pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)
for note_id, note in collected_notes:
    outfile = outdir / "note" / f"{note_id}.mpk.br"
    outfile.write_bytes(brotli.compress(msgpack.dumps(note)))
    pb.increment()
for user_id, user in collected_users.items():
    outfile = outdir / "user" / f"{user_id}.mpk.br"
    # Serialize the user record itself (not the leftover `note` variable).
    outfile.write_bytes(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()

pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
response: HTTPResponse
for file_id, url in files_to_collect:
    outfile = outdir / "file" / file_id
    # The request is made before the output file is opened, so a failed
    # request leaves no empty file behind; both handles are closed even if
    # the copy fails midway.
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()