diff options
author | mia <mia@mia.jetzt> | 2024-10-04 15:43:40 -0700 |
---|---|---|
committer | mia <mia@mia.jetzt> | 2024-10-04 15:43:40 -0700 |
commit | 7e060e5cf2656a0a53d41ea0ff42b753316cd441 (patch) | |
tree | 2629f3d1e12d21c406974000dd195518aa5b6041 /3_archive.py | |
parent | bb8a48fd4d85ba4f8224c68aaaf9069d5d79dae2 (diff) | |
download | scrubber-7e060e5cf2656a0a53d41ea0ff42b753316cd441.tar.gz scrubber-7e060e5cf2656a0a53d41ea0ff42b753316cd441.zip |
she's goin
Diffstat (limited to '3_archive.py')
-rw-r--r-- | 3_archive.py | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/3_archive.py b/3_archive.py index 6eef0e1..39affdd 100644 --- a/3_archive.py +++ b/3_archive.py @@ -1,4 +1,5 @@ import json +import time from http.client import HTTPResponse from pathlib import Path from shutil import copyfileobj @@ -16,11 +17,13 @@ conn: psycopg.Connection = config["connect"]() graph = parse_graph() print("reading filterlist") filtered = Path("filtered.list").read_text().strip().splitlines() +filtered = list(map(lambda line: line.split(' ')[0], filtered)) collected_users = {} def collect_user(id: str): if id in collected_users: return + time.sleep(0.001) user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone() if user is None: return None @@ -44,10 +47,11 @@ def collect_note(id: str): output = {} output["id"] = id - note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone() + time.sleep(0.001) + note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds", cw from note where id = %s', [id]).fetchone() if note is None: return None - text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note + text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids, cw = note collect_user(user_id) output["text"] = text @@ -59,6 +63,7 @@ def collect_note(id: str): output["reactions"] = reactions output["renotes"] = renotes output["visibility"] = Visibility.from_db(visibility).code() + output["cw"] = cw node = graph[id] replies = [collect_note(reply) for reply in node["replies"]] @@ -68,6 +73,7 @@ def collect_note(id: str): output["attachments"] = [] for file_id in file_ids: + time.sleep(0.0005) name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone() attachment = { "id": file_id, @@ -117,13 +123,15 @@ pb = progressbar.ProgressBar( ) for id, note in collected_notes: - outfile = outdir / "note" / f"{id}.mpk.br" + outfile = outdir / "note" / id[:3] / f"{id[3:]}.mpk.br" + outfile.parent.mkdir(exist_ok=True) with outfile.open("wb") as f: f.write(brotli.compress(msgpack.dumps(note))) pb.increment() for id, user in collected_users.items(): - outfile = outdir / "user" / f"{id}.mpk.br" + outfile = outdir / "user" / id[:2] / f"{id[2:]}.mpk.br" + outfile.parent.mkdir(exist_ok=True) with outfile.open("wb") as f: f.write(brotli.compress(msgpack.dumps(note))) pb.increment() @@ -134,8 +142,9 @@ pb = progressbar.ProgressBar( len(files_to_collect), prefix="downloading attachments ", ) -for (id, url) in files_to_collect: - outfile = outdir / "file" / id +for (id, url) in files_to_collect: + outfile = outdir / "file" / id[:2] / id[2:] + outfile.parent.mkdir(exist_ok=True) response: HTTPResponse = urlopen(url) with outfile.open("wb") as f: copyfileobj(response, f) |