🗝
summary refs log tree commit diff
diff options
context:
space:
mode:
author mia <mia@mia.jetzt> 2024-07-26 09:36:56 -0700
committer mia <mia@mia.jetzt> 2024-07-26 09:36:56 -0700
commit 81071e8feefdf815e29318226c668664e1706da2 (patch)
tree bda31195ca8018b8c5fe2d6f0286f97fe6bde4c6
download scrubber-81071e8feefdf815e29318226c668664e1706da2.tar.gz
scrubber-81071e8feefdf815e29318226c668664e1706da2.zip
initial commit
-rw-r--r-- 1_graph.py 159
-rw-r--r-- 2_filter.py 84
-rw-r--r-- requirements.txt 0
-rw-r--r-- ty.py 61
4 files changed, 304 insertions, 0 deletions
diff --git a/1_graph.py b/1_graph.py
new file mode 100644
index 0000000..bc8116c
--- /dev/null
+++ b/1_graph.py
@@ -0,0 +1,159 @@
+import json
+import sys
+from collections import namedtuple
+from functools import cache
+from pathlib import Path
+
+import psycopg
+
+try:
+    import progressbar2 as progressbar
+except ImportError:
+    import progressbar
+
+
+# Row shape produced by get_note(): the note's renote target, reply target
+# and author, in that order.
+Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
+# Node of the in-memory thread graph: a note id plus two lists of child
+# Tree nodes (its replies and its renotes).
+Tree = namedtuple("Tree", ["id", "replies", "renotes"])
+
+print("configuring")
+config = {}
+# NOTE(review): config.py is executed as ordinary Python code, so it must
+# come from a trusted source. This script reads `connect` (called with no
+# arguments to obtain the database connection), `user_id`, and the optional
+# `early_exit` from it.
+exec(Path("config.py").read_text(), config)
+conn: psycopg.Connection = config["connect"]()
+user_id: str = config["user_id"]
+# Optional cap on how many note ids are collected; None when not configured.
+early_exit = config.get("early_exit")
+
+
+# Collect the ids of the user's notes, leaving out rows that have a
+# "renoteId" but no text.
+print("fetching note ids", file=sys.stderr)
+note_ids = set()
+cur = conn.execute(
+    'select id from note where "userId" = %s and not ("renoteId" is not null and text is null)',
+    [user_id],
+)
+while True:
+    # Pull the result set in batches of 255 rows.
+    batch = cur.fetchmany(0xFF)
+    if not batch:
+        break
+    note_ids.update(record[0] for record in batch)
+    # Stop early once the optional configured cap has been exceeded.
+    if early_exit and len(note_ids) > early_exit:
+        break
+
+
+@cache
+def get_note(id: str) -> Note:
+    """Fetch the renote target, reply target and author of one note.
+
+    Successful lookups are memoised by ``functools.cache``, so repeated
+    calls with the same id do not query the database again.
+
+    Args:
+        id: Value matched against the ``id`` column of the ``note`` table.
+
+    Returns:
+        A ``Note`` built from the row's "renoteId", "replyId" and "userId"
+        columns.
+
+    Raises:
+        LookupError: If the query returns no row for ``id``.
+    """
+    row = conn.execute(
+        'select "renoteId", "replyId", "userId" from note where id = %s', [id]
+    ).fetchone()
+    if row is None:
+        # fetchone() yields None for an empty result; unpacking that would
+        # fail with a TypeError that does not say which note is missing.
+        raise LookupError(f"note {id!r} not found")
+    return Note(*row)
+
+
+# Trees of notes that have neither a reply target nor a renote target,
+# keyed by note id (filled in by tree_init).
+roots = {}
+# Every Tree created so far, keyed by note id.
+trees = {}
+
+
+def tree_init(id: str, seek: bool = True) -> Tree:
+    """Return the Tree for note ``id``, creating it and its ancestors on demand.
+
+    A newly created tree is appended to the ``replies`` list of the tree of
+    the note it replies to and to the ``renotes`` list of the tree of the
+    note it renotes; those trees are created recursively when missing. A
+    note with neither link is recorded in ``roots``. Every new tree is
+    recorded in ``trees``.
+
+    Args:
+        id: Id of the note to build a tree for.
+        seek: Accepted for callers but never read by this function.
+
+    Returns:
+        The already known or newly created Tree.
+    """
+    known = trees.get(id)
+    if known:
+        return known
+
+    node = Tree(id, [], [])
+    note = get_note(id)
+    if note.reply_id:
+        tree_init(note.reply_id).replies.append(node)
+    if note.renote_id:
+        tree_init(note.renote_id, False).renotes.append(node)
+    if not (note.reply_id or note.renote_id):
+        roots[id] = node
+    # The node is registered only after its ancestors have been built.
+    trees[id] = node
+    return node
+
+
+def make_widgets(msg, trees, roots):
+    """Assemble the progress bar widget list for one processing stage.
+
+    Args:
+        msg: Label placed at the start of the bar.
+        trees: When truthy, include a "trees" variable widget.
+        roots: When truthy, include a "roots" variable widget.
+
+    Returns:
+        A list of widgets: label, percentage, bar, value/max counter, the
+        requested variable widgets, and an ETA.
+    """
+    layout = [
+        f"{msg} ",
+        progressbar.Percentage(),
+        " ",
+        progressbar.Bar(),
+        " ",
+        progressbar.SimpleProgress("%(value_s)s/%(max_value_s)s"),
+        " ",
+    ]
+    # Optional counters, each followed by a separating space.
+    for enabled, variable_name in ((trees, "trees"), (roots, "roots")):
+        if enabled:
+            layout.extend((progressbar.Variable(variable_name), " "))
+    layout.append(progressbar.ETA())
+    return layout
+
+
+# Build a tree for every collected note id. tree_init also creates trees for
+# reply/renote ancestors, so the "trees" counter can exceed the number of
+# processed ids.
+pb = progressbar.ProgressBar(
+    0,
+    len(note_ids),
+    widgets=make_widgets("building trees", True, True),
+)
+for note_id in note_ids:
+    tree_init(note_id)
+    pb.increment(trees=len(trees), roots=len(roots))
+pb.finish()
+
+
+def traverse(tree: Tree):
+    """Walk down the reply chain until a note by the configured user is found.
+
+    A tree whose note was written by ``user_id`` is handed to ``expand``;
+    otherwise each of its replies is traversed in turn.
+    """
+    if get_note(tree.id).user_id != user_id:
+        for reply in tree.replies:
+            traverse(reply)
+        return
+    expand(tree)
+
+
+def expand(tree: Tree):
+    """Attach not-yet-known replies and renotes of ``tree``, then recurse.
+
+    Ids returned by the database function ``note_replies`` that are not in
+    ``trees`` yet are attached to ``tree.replies`` and/or ``tree.renotes``,
+    depending on which of the note's links point at ``tree``, and are
+    registered in ``trees``. Every reply of ``tree`` is then expanded too.
+
+    NOTE(review): ``note_replies`` is defined in the database, not in this
+    file; the arguments 1 and 1000 are presumably a depth and a row limit.
+    Confirm against the schema.
+    """
+    found = conn.execute(
+        "select id from note_replies(%s, 1, 1000)", [tree.id]
+    ).fetchall()
+    for record in found:
+        child_id = record[0]
+        if child_id in trees:
+            continue
+        note = get_note(child_id)
+        node = Tree(child_id, [], [])
+        # The same node may be both a reply to and a renote of this tree.
+        for target_id, bucket in (
+            (note.reply_id, tree.replies),
+            (note.renote_id, tree.renotes),
+        ):
+            if target_id == tree.id:
+                bucket.append(node)
+                trees[child_id] = node
+    for reply in tree.replies:
+        expand(reply)
+
+
+# Expand every root thread. expand() adds entries to `trees` but not to
+# `roots`, so the number of roots fixed here stays valid for the bar.
+roots_len = len(roots)
+pb = progressbar.ProgressBar(
+    0, roots_len, widgets=make_widgets("expanding roots", True, False)
+)
+
+for root in roots.values():
+    traverse(root)
+    pb.increment(trees=len(trees))
+pb.finish()
+
+
+# Write the graph as one line per tree with four tab-separated columns:
+# note id, reply ids, renote ids, flags. The last three columns are
+# comma-separated lists and may be empty. Flags are "root" (the tree is in
+# `roots`) and "self" (the note was written by `user_id`).
+with Path("graph.db").open("w") as f:
+    pb = progressbar.ProgressBar(
+        0, len(trees), widgets=make_widgets("saving graph", False, False)
+    )
+    for tree in trees.values():
+        flags = []
+        if tree.id in roots:
+            flags.append("root")
+        if get_note(tree.id).user_id == user_id:
+            flags.append("self")
+        columns = (
+            tree.id,
+            ",".join(reply.id for reply in tree.replies),
+            ",".join(renote.id for renote in tree.renotes),
+            ",".join(flags),
+        )
+        # One write per row instead of one per column and separator.
+        f.write("\t".join(columns) + "\n")
+        pb.increment()
+    pb.finish()
diff --git a/2_filter.py b/2_filter.py
new file mode 100644
index 0000000..816e762
--- /dev/null
+++ b/2_filter.py
@@ -0,0 +1,84 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, List
+
+import psycopg
+
+from ty import FilterableNote, Visibility
+
+try:
+    import progressbar2 as progressbar
+except ImportError:
+    import progressbar
+
+
+print("configuring")
+config = {}
+# NOTE(review): config.py is executed as ordinary Python code, so it must
+# come from a trusted source. This script reads `connect` (called with no
+# arguments to obtain the database connection) and `criteria` from it.
+exec(Path("config.py").read_text(), config)
+conn: psycopg.Connection = config["connect"]()
+# Predicate applied to each transformed root note; roots for which it
+# returns a truthy value are written to the output list.
+criteria: Callable[[FilterableNote], bool] = config["criteria"]
+
+# Parsed graph.db rows keyed by note id. Each value is a dict holding the
+# note id plus its reply ids, quote ids and flags as lists of strings.
+intermediate = {}
+
+print("parsing")
+for line in Path("graph.db").read_text().splitlines():
+    if not line:
+        # Tolerate blank lines instead of failing on the unpack below.
+        continue
+    # Four tab-separated columns; the last three are comma-separated lists.
+    note_id, replies, quotes, flags = line.split("\t")
+    intermediate[note_id] = {
+        "id": note_id,
+        # "".split(",") would give [""], so empty columns map to [].
+        "replies": replies.split(",") if replies else [],
+        "quotes": quotes.split(",") if quotes else [],
+        "flags": flags.split(",") if flags else [],
+    }
+
+
+def _transform_children(ids: List[str]) -> "List[FilterableNote] | None":
+    """Transform each listed note id, or return None as soon as one is missing."""
+    children = []
+    for child_id in ids:
+        child = transform(intermediate[child_id])
+        if child is None:
+            return None
+        children.append(child)
+    return children
+
+
+def transform(entry: dict) -> "FilterableNote | None":
+    """Build a FilterableNote, including all descendants, for a graph entry.
+
+    Args:
+        entry: One value of ``intermediate`` (keys "id", "replies",
+            "quotes" and "flags").
+
+    Returns:
+        The populated note, or None when the note itself or any of its
+        replies or quotes is no longer present in the database.
+    """
+    row = conn.execute(
+        'select "createdAt", reactions, "renoteCount", visibility from note where id = %s',
+        [entry["id"]],
+    ).fetchone()
+    if row is None:
+        return None  # part of thread disappeared during processing
+    when, reactions, renotes, visibility = row
+
+    # Stop at the first missing descendant; the whole thread is dropped in
+    # that case, so querying the remaining children would be wasted work.
+    replies = _transform_children(entry["replies"])
+    if replies is None:
+        return None
+    quotes = _transform_children(entry["quotes"])
+    if quotes is None:
+        return None
+
+    return FilterableNote(
+        entry["id"],
+        "self" in entry["flags"],
+        replies,
+        quotes,
+        when.astimezone(),
+        # NOTE(review): assumes `reactions` is a mapping of reaction to
+        # count. Confirm against the column type.
+        sum(reactions.values()),
+        renotes,
+        Visibility.from_db(visibility),
+    )
+
+
+# Only entries flagged "root" are evaluated; collect them once so the bar
+# total and the loop below share a single scan of `intermediate`.
+root_entries = [
+    entry for entry in intermediate.values() if "root" in entry["flags"]
+]
+root_count = len(root_entries)
+
+
+pb = progressbar.ProgressBar(
+    0,
+    root_count,
+    prefix="processing ",
+)
+targets = []
+for entry in root_entries:
+    transformed = transform(entry)
+    # None means part of the thread vanished while processing; skip it here
+    # and leave it for a later run. It still counts as processed so the bar
+    # reaches its total.
+    if transformed is not None and criteria(transformed):
+        targets.append(entry["id"])
+    pb.increment()
+pb.finish()
+
+
+# One selected root id per line.
+Path("filtered.list").write_text("\n".join(targets))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/requirements.txt
diff --git a/ty.py b/ty.py
new file mode 100644
index 0000000..e17c046
--- /dev/null
+++ b/ty.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import List, Callable
+from datetime import datetime
+from enum import Enum
+
+class Visibility(Enum):
+    """Visibility level of a note."""
+
+    public = 1
+    unlisted = 2
+    followers = 3
+    direct = 4
+
+    @classmethod
+    def from_db(cls, raw: str) -> "Visibility":
+        """Map the visibility string stored in the database to a member.
+
+        Args:
+            raw: One of "public", "home", "followers" or "specified".
+
+        Returns:
+            The corresponding member.
+
+        Raises:
+            ValueError: If ``raw`` is not one of the known strings.
+        """
+        # Database spelling paired with the member it maps to.
+        known = (
+            ("public", cls.public),
+            ("home", cls.unlisted),
+            ("followers", cls.followers),
+            ("specified", cls.direct),
+        )
+        for db_name, member in known:
+            if raw == db_name:
+                return member
+        raise ValueError(f"unknown visibility `{raw}`")
+
+
+@dataclass
+class FilterableNote:
+    """A note together with its reply and quote subtrees."""
+
+    # Note id.
+    id: str
+    # Whether the note carries the "self" flag, i.e. belongs to the user.
+    mine: bool
+    # Direct replies to this note.
+    replies: List["FilterableNote"]
+    # Notes listed as quotes of this note.
+    quotes: List["FilterableNote"]
+    # Creation time of the note.
+    when: datetime
+    # Total number of reactions on the note.
+    reactions: int
+    # Renote count of the note.
+    renotes: int
+    # Visibility level of the note.
+    visibility: Visibility
+
+    def thread(self) -> List["FilterableNote"]:
+        """Return every note in this subtree, descendants first, self last."""
+        acc = []
+        for reply in self.replies:
+            acc += reply.thread()
+        for quote in self.quotes:
+            acc += quote.thread()
+        acc.append(self)
+        return acc
+
+    def thread_self(self) -> List["FilterableNote"]:
+        """Return the notes of this subtree whose ``mine`` flag is set.
+
+        The order matches ``thread()``: descendants first, then this note.
+        """
+        acc = []
+        for reply in self.replies:
+            acc += reply.thread_self()
+        for quote in self.quotes:
+            acc += quote.thread_self()
+        if self.mine:
+            acc.append(self)
+        return acc
+
+    def to_dict(self):
+        """Return a nested plain-dict representation of this subtree.
+
+        ``when`` is rendered with ``isoformat()`` and ``visibility`` as the
+        member name, so every field of the note is represented.
+        """
+        return {
+            "id": self.id,
+            "mine": self.mine,
+            "replies": [note.to_dict() for note in self.replies],
+            "quotes": [note.to_dict() for note in self.quotes],
+            "when": self.when.isoformat(),
+            "reactions": self.reactions,
+            "renotes": self.renotes,
+            # Previously omitted, which silently dropped the field.
+            "visibility": self.visibility.name,
+        }