diff options
author | mia <mia@mia.jetzt> | 2024-07-26 09:36:56 -0700 |
---|---|---|
committer | mia <mia@mia.jetzt> | 2024-07-26 09:36:56 -0700 |
commit | 81071e8feefdf815e29318226c668664e1706da2 (patch) | |
tree | bda31195ca8018b8c5fe2d6f0286f97fe6bde4c6 | |
download | scrubber-81071e8feefdf815e29318226c668664e1706da2.tar.gz scrubber-81071e8feefdf815e29318226c668664e1706da2.zip |
initial commit
-rw-r--r-- | 1_graph.py | 159 | ||||
-rw-r--r-- | 2_filter.py | 84 | ||||
-rw-r--r-- | requirements.txt | 0 | ||||
-rw-r--r-- | ty.py | 61 |
4 files changed, 304 insertions, 0 deletions
# ===== 1_graph.py (new file) =====
# Step 1 of the scrubber pipeline: collect every note written by the
# configured user, link each one into its reply/renote tree, expand the trees
# with the replies found in the database, and dump the result to `graph.db`
# as tab-separated lines: id, reply ids, renote ids, flags.
import json
import sys
from collections import namedtuple
from functools import cache
from pathlib import Path

import psycopg

try:
    import progressbar2 as progressbar
except ImportError:
    import progressbar


# Columns read from the `note` table for a single note.
Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
# A node of the graph; `replies` and `renotes` are lists of child Trees.
Tree = namedtuple("Tree", ["id", "replies", "renotes"])

print("configuring")
config = {}
# NOTE(review): config.py is executed verbatim, so it must be trusted local
# code. It has to define `connect` and `user_id`; `early_exit` is optional.
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
user_id: str = config["user_id"]
early_exit = config.get("early_exit")


print("fetching note ids", file=sys.stderr)
note_ids = set()
# Skip pure renotes (a renote with no text of its own).
cur = conn.execute(
    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
    [user_id],
)
# The flag is needed because `break` alone only leaves the inner loop; without
# it the outer loop kept fetching batches after the early-exit limit was hit.
limit_reached = False
while not limit_reached and (rows := cur.fetchmany(0xFF)):
    for row in rows:
        note_ids.add(row[0])
        if early_exit and len(note_ids) > early_exit:
            limit_reached = True
            break


@cache
def get_note(id: str) -> Note:
    """Fetch the renote/reply/author columns of note `id` (memoized).

    Raises:
        LookupError: if no note with that id exists.
    """
    row = conn.execute(
        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
    ).fetchone()
    if row is None:
        # Unpacking None would otherwise fail with an opaque TypeError.
        raise LookupError(f"note {id} not found")
    return Note(*row)


roots = {}  # id -> Tree for notes that are neither a reply nor a renote
trees = {}  # id -> Tree for every note seen so far


def tree_init(id: str, seek: bool = True) -> Tree:
    """Return the Tree for note `id`, creating it and its ancestors on demand.

    A new tree is attached to the tree of the note it replies to and/or the
    note it renotes; a note with neither is recorded in `roots`.  `seek` is
    accepted for call compatibility but is currently unused.
    """
    existing = trees.get(id)
    if existing is not None:
        return existing
    tree = Tree(id, [], [])
    note = get_note(id)
    if note.reply_id or note.renote_id:
        if note.reply_id:
            tree_init(note.reply_id).replies.append(tree)
        if note.renote_id:
            tree_init(note.renote_id, False).renotes.append(tree)
    else:
        roots[id] = tree
    trees[id] = tree
    return tree


def make_widgets(msg, trees, roots):
    """Build the progress bar widget list; `trees`/`roots` toggle the counters."""
    widgets = [
        f"{msg} ",
        progressbar.Percentage(),
        " ",
        progressbar.Bar(),
        " ",
        progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"),
        " ",
    ]
    if trees:
        widgets += [progressbar.Variable("trees"), " "]
    if roots:
        widgets += [progressbar.Variable("roots"), " "]
    widgets += [progressbar.ETA()]
    return widgets


pb = progressbar.ProgressBar(
    0,
    len(note_ids),
    widgets=make_widgets("building trees", True, True),
)
for note_id in note_ids:
    tree_init(note_id)
    pb.increment(trees=len(trees), roots=len(roots))
pb.finish()


def traverse(tree: Tree):
    """Descend through replies until a note by `user_id` is found, then expand it."""
    note = get_note(tree.id)
    if note.user_id == user_id:
        expand(tree)
    else:
        for child in tree.replies:
            traverse(child)


def expand(tree: Tree):
    """Attach the not-yet-known notes returned by `note_replies` to `tree`, recursively."""
    for row in conn.execute(
        "select id from note_replies(%s, 1, 1000)", [tree.id]
    ).fetchall():
        child_id = row[0]
        if child_id in trees:
            continue
        note = get_note(child_id)
        new = Tree(child_id, [], [])
        if note.reply_id == tree.id:
            # is a reply
            tree.replies.append(new)
            trees[child_id] = new
        if note.renote_id == tree.id:
            # is a renote
            tree.renotes.append(new)
            trees[child_id] = new
    for child in tree.replies:
        expand(child)


roots_len = len(roots)
pb = progressbar.ProgressBar(
    0, roots_len, widgets=make_widgets("expanding roots", True, False)
)

for root in roots.values():
    traverse(root)
    pb.increment(trees=len(trees))
pb.finish()


with Path("graph.db").open("w") as f:
    pb = progressbar.ProgressBar(
        0, len(trees), widgets=make_widgets("saving graph", False, False)
    )
    for tree in trees.values():
        note = get_note(tree.id)
        flags = []
        if tree.id in roots:
            flags.append("root")
        if note.user_id == user_id:
            flags.append("self")
        # One line per note: id, reply ids, renote ids, flags (tab-separated,
        # each list comma-separated).
        columns = (
            tree.id,
            ",".join(reply.id for reply in tree.replies),
            ",".join(renote.id for renote in tree.renotes),
            ",".join(flags),
        )
        f.write("\t".join(columns) + "\n")
        pb.increment()
    pb.finish()


# ===== 2_filter.py (new file) =====
# Step 2 of the scrubber pipeline: read `graph.db`, rebuild every root thread
# as a FilterableNote tree with fresh data from the database, and write the
# ids of the roots accepted by the configured `criteria` to `filtered.list`.
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List

import psycopg

from ty import FilterableNote, Visibility

try:
    import progressbar2 as progressbar
except ImportError:
    import progressbar


print("configuring")
config = {}
# NOTE(review): config.py is executed verbatim, so it must be trusted local
# code. It has to define `connect` and `criteria`.
exec(Path("config.py").read_text(), config)
conn: psycopg.Connection = config["connect"]()
criteria: Callable[[FilterableNote], bool] = config["criteria"]

intermediate = {}  # note id -> parsed graph.db row

print("parsing")
for line in Path("graph.db").read_text().splitlines():
    note_id, replies, quotes, flags = line.split("\t")
    intermediate[note_id] = {
        "id": note_id,
        # "".split(",") would give [""], so empty columns map to [] explicitly.
        "replies": replies.split(",") if replies else [],
        "quotes": quotes.split(",") if quotes else [],
        "flags": flags.split(",") if flags else [],
    }


def transform(entry: dict) -> FilterableNote | None:
    """Turn a parsed graph entry into a FilterableNote, recursing into children.

    Returns None when the note, or any note below it, can no longer be found
    in the database (part of the thread disappeared during processing).
    """
    note = conn.execute(
        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
        [entry["id"]],
    ).fetchone()
    if note is None:
        return None
    when, reactions, renotes, visibility = note

    replies = [transform(intermediate[reply]) for reply in entry["replies"]]
    quotes = [transform(intermediate[quote]) for quote in entry["quotes"]]
    if any(child is None for child in replies + quotes):
        return None  # a missing descendant invalidates the whole thread

    return FilterableNote(
        entry["id"],
        "self" in entry["flags"],
        replies,
        quotes,
        when.astimezone(),
        sum(reactions.values()),
        renotes,
        Visibility.from_db(visibility),
    )


root_count = sum(1 for entry in intermediate.values() if "root" in entry["flags"])


pb = progressbar.ProgressBar(
    0,
    root_count,
    prefix="processing ",
)
targets = []
for entry in intermediate.values():
    if "root" not in entry["flags"]:
        continue
    transformed = transform(entry)
    # Count every root, including skipped ones, so the bar reaches its maximum.
    pb.increment()
    if transformed is None:
        continue  # incomplete thread; it is left for a later run
    if criteria(transformed):
        targets.append(entry["id"])
pb.finish()


Path("filtered.list").write_text("\n".join(targets))


# ===== requirements.txt (new file, empty) =====
b/requirements.txt diff --git a/ty.py b/ty.py new file mode 100644 index 0000000..e17c046 --- /dev/null +++ b/ty.py @@ -0,0 +1,61 @@ +from dataclasses import dataclass +from typing import List, Callable +from datetime import datetime +from enum import Enum + +class Visibility(Enum): + public = 1 + unlisted = 2 + followers = 3 + direct = 4 + + @classmethod + def from_db(cls, raw: str) -> "Visibility": + match raw: + case "public": return cls.public + case "home": return cls.unlisted + case "followers": return cls.followers + case "specified": return cls.direct + case _: raise ValueError(f"unknown visibility `{raw}`") + + +@dataclass +class FilterableNote: + id: str + mine: bool + replies: List["FilterableNote"] + quotes: List["FilterableNote"] + when: datetime + reactions: int + renotes: int + visibility: Visibility + + def thread(self) -> List["FilterableNote"]: + acc = [] + for reply in self.replies: + acc += reply.thread() + for quote in self.quotes: + acc += quote.thread() + acc.append(self) + return acc + + def thread_self(self) -> List["FilterableNote"]: + acc = [] + for reply in self.replies: + acc += reply.thread_self() + for quote in self.quotes: + acc += quote.thread_self() + if self.mine: + acc.append(self) + return acc + + def to_dict(self): + return { + "id": self.id, + "mine": self.mine, + "replies": [note.to_dict() for note in self.replies], + "quotes": [note.to_dict() for note in self.quotes], + "when": self.when.isoformat(), + "reactions": self.reactions, + "renotes": self.renotes, + } |