From bb8a48fd4d85ba4f8224c68aaaf9069d5d79dae2 Mon Sep 17 00:00:00 2001
From: mia
Date: Wed, 4 Sep 2024 04:47:13 -0700
Subject: desktop changes

---
Review notes (text in this section is ignored when the patch is applied):

* The line structure and Python indentation of this patch were rebuilt from a
  whitespace-collapsed copy. Every hunk was re-counted against its @@ header.
  In the 1_graph.py and 2_filter.py hunks the exact position of one blank
  context line each could not be recovered; it was placed so that the header
  counts hold and should be checked against the real files.
* 3_archive.py: the user writer loop serialised `note` (left over from the
  previous loop) instead of `user`; it now writes `user`.
* 3_archive.py: collect_note() built each `attachment` dict but never added it
  to output["attachments"], so the key was always dropped as empty; the dict
  is now appended (+1 line, counts below updated from 144/366 to 145/367).
* conf_mia.py: the "dms involved" check used min() over Visibility values, but
  `direct` is the highest value (4), so it only fired when every note in the
  thread was a DM; it now uses max().
* The `index` blob ids of 3_archive.py and conf_mia.py are those of the
  original commit and do not describe the amended contents.

 .gitignore       |   5 ++
 1_graph.py       |  12 ++---
 2_filter.py      |  24 ++--------
 3_archive.py     | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4_delete.py      |  33 +++++++++++++
 com.py           |  97 +++++++++++++++++++++++++++++++++++++
 conf_mia.py      |  46 ++++++++++++++++++
 conf_pain.py     |  14 ++++++
 go.sh            |  13 +++++
 proxy.sh         |   2 +
 requirements.txt |   5 ++
 ty.py            |  61 -----------------------
 12 files changed, 367 insertions(+), 90 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 3_archive.py
 create mode 100644 4_delete.py
 create mode 100644 com.py
 create mode 100644 conf_mia.py
 create mode 100644 conf_pain.py
 create mode 100755 go.sh
 create mode 100755 proxy.sh
 delete mode 100644 ty.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2caa084
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/__pycache__/
+/graph.db
+/filtered.list
+/out/
+/sec.py
diff --git a/1_graph.py b/1_graph.py
index bc8116c..824d723 100644
--- a/1_graph.py
+++ b/1_graph.py
@@ -3,24 +3,20 @@ import sys
 from collections import namedtuple
 from functools import cache
 from pathlib import Path
+from typing import Optional
 
 import psycopg
 
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
+from com import eval_config, progressbar
 
 
 Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
 Tree = namedtuple("Tree", ["id", "replies", "renotes"])
 
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 user_id: str = config["user_id"]
-early_exit = config.get("early_exit")
+early_exit: Optional[int] = config.get("early_exit")
 
 print("fetching note ids", file=sys.stderr)
 
diff --git a/2_filter.py b/2_filter.py
index 816e762..8e77945 100644
--- a/2_filter.py
+++ b/2_filter.py
@@ -4,32 +4,14 @@ from typing import Callable, List
 
 import psycopg
 
-from ty import FilterableNote, Visibility
+from com import FilterableNote, Visibility, eval_config, parse_graph, progressbar
 
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
-
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 criteria: Callable[[FilterableNote], bool] = config["criteria"]
 
 
-intermediate = {}
-
-print("parsing")
-for line in Path("graph.db").read_text().splitlines():
-    id, replies, quotes, flags = line.split("\t")
-    intermediate[id] = {
-        "id": id,
-        "replies": replies.split(",") if len(replies) > 0 else [],
-        "quotes": quotes.split(",") if len(quotes) > 0 else [],
-        "flags": flags.split(",") if len(flags) > 0 else [],
-    }
-
+intermediate = parse_graph()
 
 def transform(entry: dict) -> FilterableNote:
     note = conn.execute(
diff --git a/3_archive.py b/3_archive.py
new file mode 100644
index 0000000..6eef0e1
--- /dev/null
+++ b/3_archive.py
@@ -0,0 +1,145 @@
+import json
+from http.client import HTTPResponse
+from pathlib import Path
+from shutil import copyfileobj
+from urllib.request import urlopen
+
+import brotli
+import msgpack
+import psycopg
+
+from com import Visibility, eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+collected_users = {}
+def collect_user(id: str):
+    if id in collected_users:
+        return
+    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
+    if user is None:
+        return None
+    username, host, avatar_url = user
+    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
+    description, fields = profile or ("", [])
+
+    output = {}
+    output["id"] = id
+    output["username"] = username
+    output["host"] = host
+    output["description"] = description
+    output["fields"] = fields
+    output["avatar_url"] = avatar_url
+
+    collected_users[id] = output
+
+collected_notes = []
+files_to_collect = []
+def collect_note(id: str):
+    output = {}
+    output["id"] = id
+
+    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
+    if note is None:
+        return None
+    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
+    collect_user(user_id)
+
+    output["text"] = text
+    output["user_id"] = user_id
+    output["created_at"] = created_at.astimezone(tz=None).isoformat()
+    output["updated_at"] = None
+    if updated_at is not None:
+        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
+    output["reactions"] = reactions
+    output["renotes"] = renotes
+    output["visibility"] = Visibility.from_db(visibility).code()
+
+    node = graph[id]
+    replies = [collect_note(reply) for reply in node["replies"]]
+    replies = filter(lambda reply: reply is not None, replies)
+    quotes = [collect_note(quote) for quote in node["quotes"]]
+    quotes = filter(lambda quote: quote is not None, quotes)
+
+    output["attachments"] = []
+    for file_id in file_ids:
+        name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
+        attachment = {
+            "id": file_id,
+            "type": type_,
+            "comment": comment,
+        }
+        if "self" in node["flags"]:  # archive own attachments
+            files_to_collect.append((file_id, url))
+            attachment["url"] = None
+        else:
+            attachment["url"] = url
+        output["attachments"].append(attachment)
+
+    output["replies"] = list(replies)
+    output["quotes"] = list(quotes)
+
+    if len(output["attachments"]) == 0: del output["attachments"]
+    if len(output["replies"]) == 0: del output["replies"]
+    if len(output["quotes"]) == 0: del output["quotes"]
+
+    return output
+
+pb = progressbar.ProgressBar(
+    0,
+    len(filtered),
+    prefix="collecting data ",
+)
+for id in filtered:
+    note = collect_note(id)
+    collected_notes.append((id, note))
+    pb.increment()
+pb.finish()
+
+outdir = Path("out")
+if not outdir.exists():
+    outdir.mkdir()
+if not (outdir / "note").exists():
+    (outdir / "note").mkdir()
+if not (outdir / "user").exists():
+    (outdir / "user").mkdir()
+if not (outdir / "file").exists():
+    (outdir / "file").mkdir()
+
+pb = progressbar.ProgressBar(
+    0,
+    len(collected_notes) + len(collected_users),
+    prefix="writing data ",
+)
+
+for id, note in collected_notes:
+    outfile = outdir / "note" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        f.write(brotli.compress(msgpack.dumps(note)))
+    pb.increment()
+
+for id, user in collected_users.items():
+    outfile = outdir / "user" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        f.write(brotli.compress(msgpack.dumps(user)))
+    pb.increment()
+pb.finish()
+
+pb = progressbar.ProgressBar(
+    0,
+    len(files_to_collect),
+    prefix="downloading attachments ",
+)
+for (id, url) in files_to_collect:
+    outfile = outdir / "file" / id
+    response: HTTPResponse = urlopen(url)
+    with outfile.open("wb") as f:
+        copyfileobj(response, f)
+    response.close()
+    pb.increment()
+pb.finish()
diff --git a/4_delete.py b/4_delete.py
new file mode 100644
index 0000000..51e1ef3
--- /dev/null
+++ b/4_delete.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import httpx
+import psycopg
+
+from com import eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+token: str = config["token"]
+api: str = config["api"]
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+queue = []
+
+def enqueue(note):
+    for reply in note["replies"]:
+        enqueue(graph[reply])
+    for quote in note["quotes"]:
+        enqueue(graph[quote])
+    if "self" in note["flags"]:
+        files = conn.execute('select "fileIds" from note where id = %s', [note["id"]]).fetchone()[0]
+        queue.append((note["id"], files))
+
+for id in filtered:
+    enqueue(graph[id])
+
+print(queue)
+
+# client = httpx.Client()
diff --git a/com.py b/com.py
new file mode 100644
index 0000000..4ceb849
--- /dev/null
+++ b/com.py
@@ -0,0 +1,97 @@
+import sys
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Callable, Dict, List
+
+try:
+    import progressbar2 as progressbar
+except ImportError:
+    import progressbar
+
+
+class Visibility(Enum):
+    public = 1
+    unlisted = 2
+    followers = 3
+    direct = 4
+
+    @classmethod
+    def from_db(cls, raw: str) -> "Visibility":
+        match raw:
+            case "public": return cls.public
+            case "home": return cls.unlisted
+            case "followers": return cls.followers
+            case "specified": return cls.direct
+            case _: raise ValueError(f"unknown visibility `{raw}`")
+
+    def code(self) -> str:
+        match self:
+            case self.public: return "p"
+            case self.unlisted: return "u"
+            case self.followers: return "f"
+            case self.direct: return "d"
+
+
+@dataclass
+class FilterableNote:
+    id: str
+    mine: bool
+    replies: List["FilterableNote"]
+    quotes: List["FilterableNote"]
+    when: datetime
+    reactions: int
+    renotes: int
+    visibility: Visibility
+
+    def thread(self) -> List["FilterableNote"]:
+        acc = []
+        for reply in self.replies:
+            acc += reply.thread()
+        for quote in self.quotes:
+            acc += quote.thread()
+        acc.append(self)
+        return acc
+
+    def thread_self(self) -> List["FilterableNote"]:
+        acc = []
+        for reply in self.replies:
+            acc += reply.thread_self()
+        for quote in self.quotes:
+            acc += quote.thread_self()
+        if self.mine:
+            acc.append(self)
+        return acc
+
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "mine": self.mine,
+            "replies": [note.to_dict() for note in self.replies],
+            "quotes": [note.to_dict() for note in self.quotes],
+            "when": self.when.isoformat(),
+            "reactions": self.reactions,
+            "renotes": self.renotes,
+        }
+
+
+def eval_config() -> dict:
+    print("configuring")
+    config = {}
+    exec(Path(sys.argv[1]).read_text(), config)
+    return config
+
+
+def parse_graph() -> Dict[str, dict]:
+    print("parsing graph")
+    graph = {}
+    for line in Path("graph.db").read_text().splitlines():
+        id, replies, quotes, flags = line.split("\t")
+        graph[id] = {
+            "id": id,
+            "replies": replies.split(",") if len(replies) > 0 else [],
+            "quotes": quotes.split(",") if len(quotes) > 0 else [],
+            "flags": flags.split(",") if len(flags) > 0 else [],
+        }
+    return graph
diff --git a/conf_mia.py b/conf_mia.py
new file mode 100644
index 0000000..6496e3b
--- /dev/null
+++ b/conf_mia.py
@@ -0,0 +1,46 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote, Visibility
+from sec import connect, tokens
+
+user_id = "9gf2ev4ex5dflllo"
+token = tokens["mia"]
+api = "https://void.rehab/api/"
+early_exit = 0xFFF
+
+now = datetime.now(UTC)
+threshold = 0.1
+
+def criteria(root: FilterableNote) -> bool:
+    thread = root.thread()
+    thread_self = root.thread_self()
+
+    # if there are dms involved...
+    low_vis = max(thread, key=lambda note: note.visibility.value)
+    if low_vis.visibility == Visibility.direct:
+        is_direct = lambda note: note.visibility == Visibility.direct
+        most_recent_direct = max(filter(is_direct, thread), key=lambda note: note.when)
+        # ...and the dms are younger than two months...
+        if now - most_recent_direct.when < timedelta(days=30 * 2):
+            # ...do not delete the thread
+            return False
+
+    # get the most recent post...
+    others_recency = max(thread, key=lambda note: note.when)
+    # ...and bail if it's too new
+    if now - others_recency.when < timedelta(days=14):
+        return False
+
+    # get my...
+    most_recent_post = max(thread_self, key=lambda note: note.when)  # ...most recent post...
+    score = lambda note: note.reactions + note.renotes*5
+    high_score_post = max(thread_self, key=score)  # ...highest scoring post...
+    # ...and their values...
+    most_recent = most_recent_post.when
+    most_recent_age = now - most_recent
+    high_score = score(high_score_post)
+    # ...weigh it...
+    weighted_score = high_score / math.sqrt(most_recent_age.days)
+    # ...and check it against a threshold
+    return weighted_score < threshold
diff --git a/conf_pain.py b/conf_pain.py
new file mode 100644
index 0000000..85e7095
--- /dev/null
+++ b/conf_pain.py
@@ -0,0 +1,14 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote
+from sec import connect, tokens
+
+user_id = "9gszslkcdfnomssj"
+token = tokens["pain"]
+api = "https://void.rehab/api/"
+
+def criteria(root: FilterableNote) -> bool:
+    # if it's more than two months old, delete
+    # return (datetime.now(UTC) - root.when).days > 60
+    return (datetime.now(UTC) - root.when).days > (12 * 30)
diff --git a/go.sh b/go.sh
new file mode 100755
index 0000000..39f3779
--- /dev/null
+++ b/go.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -ex
+
+test -f graph.db && rm graph.db
+test -f filtered.list && rm filtered.list
+test -d out && rm -r out
+python3 1_graph.py conf_$1.py
+python3 2_filter.py conf_$1.py
+# python3 3_archive.py conf_$1.py
+# echo uploading to memorial
+# rsync -r -e 'ssh -p23' --progress out/ memorial:fediverse/$1/
+# python3 4_delete.py conf_$1.py
diff --git a/proxy.sh b/proxy.sh
new file mode 100755
index 0000000..9628fab
--- /dev/null
+++ b/proxy.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec ssh -NL 5432:localhost:5432 vr
diff --git a/requirements.txt b/requirements.txt
index e69de29..094393e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+httpx
+progressbar2
+psycopg
+brotli
+msgpack
diff --git a/ty.py b/ty.py
deleted file mode 100644
index e17c046..0000000
--- a/ty.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from dataclasses import dataclass
-from typing import List, Callable
-from datetime import datetime
-from enum import Enum
-
-class Visibility(Enum):
-    public = 1
-    unlisted = 2
-    followers = 3
-    direct = 4
-
-    @classmethod
-    def from_db(cls, raw: str) -> "Visibility":
-        match raw:
-            case "public": return cls.public
-            case "home": return cls.unlisted
-            case "followers": return cls.followers
-            case "specified": return cls.direct
-            case _: raise ValueError(f"unknown visibility `{raw}`")
-
-
-@dataclass
-class FilterableNote:
-    id: str
-    mine: bool
-    replies: List["FilterableNote"]
-    quotes: List["FilterableNote"]
-    when: datetime
-    reactions: int
-    renotes: int
-    visibility: Visibility
-
-    def thread(self) -> List["FilterableNote"]:
-        acc = []
-        for reply in self.replies:
-            acc += reply.thread()
-        for quote in self.quotes:
-            acc += quote.thread()
-        acc.append(self)
-        return acc
-
-    def thread_self(self) -> List["FilterableNote"]:
-        acc = []
-        for reply in self.replies:
-            acc += reply.thread_self()
-        for quote in self.quotes:
-            acc += quote.thread_self()
-        if self.mine:
-            acc.append(self)
-        return acc
-
-    def to_dict(self):
-        return {
-            "id": self.id,
-            "mine": self.mine,
-            "replies": [note.to_dict() for note in self.replies],
-            "quotes": [note.to_dict() for note in self.quotes],
-            "when": self.when.isoformat(),
-            "reactions": self.reactions,
-            "renotes": self.renotes,
-        }
-- 
cgit 1.4.1