"""Export a filtered set of notes — plus their authors and attachments —
from a Misskey/Firefish-style Postgres database into brotli-compressed
msgpack files under ./out, following the reply/quote graph.

NOTE(review): relies on the project-local `com` module (config, graph,
Visibility, progress bar) — semantics assumed from usage here.
"""

import json
import time
from http.client import HTTPResponse
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen

import brotli
import msgpack
import psycopg

from com import Visibility, eval_config, parse_graph, progressbar

config = eval_config()
conn: psycopg.Connection = config["connect"]()
graph = parse_graph()

print("reading filterlist")
# each line starts with a note id; anything after the first space is ignored
filtered = Path("filtered.list").read_text().strip().splitlines()
filtered = [line.split(' ')[0] for line in filtered]

# user id -> serializable user dict; doubles as a "seen" set
collected_users = {}


def collect_user(id: str):
    """Fetch one user (plus profile) into collected_users, at most once.

    Returns None when the user row no longer exists; otherwise stores the
    assembled dict in collected_users keyed by user id.
    """
    if id in collected_users:
        return
    time.sleep(0.001)  # crude rate limit to keep load off the DB
    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
    if user is None:
        return None
    username, host, avatar_url = user
    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
    # the profile row may be missing entirely — fall back to empty values
    description, fields = profile or ("", [])
    collected_users[id] = {
        "id": id,
        "username": username,
        "host": host,
        "description": description,
        "fields": fields,
        "avatar_url": avatar_url,
    }


collected_notes = []   # (note id, note dict or None), in filterlist order
files_to_collect = []  # (file id, url) of own attachments to download later


def collect_note(id: str):
    """Collect one note into a plain dict, recursing into replies/quotes.

    Returns None when the note row no longer exists.  Side effects:
    records the author via collect_user() and queues own attachments in
    files_to_collect for the download pass at the end.
    """
    output = {}
    output["id"] = id
    time.sleep(0.001)  # crude rate limit
    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds", cw from note where id = %s', [id]).fetchone()
    if note is None:
        return None
    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids, cw = note
    collect_user(user_id)
    output["text"] = text
    output["user_id"] = user_id
    output["created_at"] = created_at.astimezone(tz=None).isoformat()
    # updated_at is nullable in the schema
    output["updated_at"] = updated_at.astimezone(tz=None).isoformat() if updated_at is not None else None
    output["reactions"] = reactions
    output["renotes"] = renotes
    output["visibility"] = Visibility.from_db(visibility).code()
    output["cw"] = cw

    node = graph[id]
    # recurse into children first; drop any whose rows have been deleted
    replies = [r for r in (collect_note(reply) for reply in node["replies"]) if r is not None]
    quotes = [q for q in (collect_note(quote) for quote in node["quotes"]) if q is not None]

    output["attachments"] = []
    for file_id in file_ids:
        time.sleep(0.0005)
        row = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
        if row is None:
            continue
        name, type_, comment, url = row
        attachment = {
            "id": file_id,
            "name": name,  # BUG FIX: name was fetched from drive_file but dropped
            "type": type_,
            "comment": comment,
        }
        if "self" in node["flags"]:
            # archive our own attachments locally; url becomes implicit
            files_to_collect.append((file_id, url))
            attachment["url"] = None
        else:
            attachment["url"] = url
        # BUG FIX: the original never appended, so "attachments" was always
        # empty and deleted below — attachments silently vanished from output
        output["attachments"].append(attachment)

    output["replies"] = replies
    output["quotes"] = quotes
    # keep the serialized notes compact: drop empty child collections
    for key in ("attachments", "replies", "quotes"):
        if not output[key]:
            del output[key]
    return output


pb = progressbar.ProgressBar(
    0,
    len(filtered),
    prefix="collecting data ",
)
for note_id in filtered:
    collected_notes.append((note_id, collect_note(note_id)))
    pb.increment()
pb.finish()

outdir = Path("out")
for sub in ("note", "user", "file"):
    (outdir / sub).mkdir(parents=True, exist_ok=True)

pb = progressbar.ProgressBar(
    0,
    len(collected_notes) + len(collected_users),
    prefix="writing data ",
)
# notes are sharded by the first 3 id chars, users by the first 2
for note_id, note in collected_notes:
    outfile = outdir / "note" / note_id[:3] / f"{note_id[3:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    outfile.write_bytes(brotli.compress(msgpack.dumps(note)))
    pb.increment()
for user_id, user in collected_users.items():
    outfile = outdir / "user" / user_id[:2] / f"{user_id[2:]}.mpk.br"
    outfile.parent.mkdir(exist_ok=True)
    # BUG FIX: the original serialized the stale loop variable `note` here,
    # writing the last collected note into every user file; serialize `user`.
    outfile.write_bytes(brotli.compress(msgpack.dumps(user)))
    pb.increment()
pb.finish()

pb = progressbar.ProgressBar(
    0,
    len(files_to_collect),
    prefix="downloading attachments ",
)
for file_id, url in files_to_collect:
    outfile = outdir / "file" / file_id[:2] / file_id[2:]
    outfile.parent.mkdir(exist_ok=True)
    response: HTTPResponse
    # context managers guarantee the response and file close even on error
    with urlopen(url) as response, outfile.open("wb") as f:
        copyfileobj(response, f)
    pb.increment()
pb.finish()