author     mia <mia@mia.jetzt>  2024-09-04 04:47:13 -0700
committer  mia <mia@mia.jetzt>  2024-09-04 04:47:13 -0700
commit     bb8a48fd4d85ba4f8224c68aaaf9069d5d79dae2
tree       bdb0654c667f37c69addc9efd1e29b9cfe710c51
parent     81071e8feefdf815e29318226c668664e1706da2

desktop changes
-rw-r--r--  .gitignore                     5
-rw-r--r--  1_graph.py                    12
-rw-r--r--  2_filter.py                   24
-rw-r--r--  3_archive.py                 144
-rw-r--r--  4_delete.py                   33
-rw-r--r--  com.py (renamed from ty.py)   38
-rw-r--r--  conf_mia.py                   46
-rw-r--r--  conf_pain.py                  14
-rwxr-xr-x  go.sh                         13
-rwxr-xr-x  proxy.sh                       2
-rw-r--r--  requirements.txt               5
11 files changed, 306 insertions, 30 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2caa084
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/__pycache__/
+/graph.db
+/filtered.list
+/out/
+/sec.py
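
sec.py is kept out of the repository by the ignore rule above, but conf_mia.py and
conf_pain.py import connect and tokens from it, and every stage calls
config["connect"]() expecting a psycopg connection. A minimal sketch of the shape it
presumably has (the DSN, database name and token values below are placeholders, not
part of this commit):

    import psycopg

    # database reached through the tunnel opened by proxy.sh (localhost:5432)
    def connect() -> psycopg.Connection:
        return psycopg.connect("postgresql://user:password@localhost:5432/misskey")

    # API tokens keyed by the account name passed to go.sh
    tokens = {
        "mia": "xxxxxxxxxxxx",
        "pain": "xxxxxxxxxxxx",
    }
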
diff --git a/1_graph.py b/1_graph.py
index bc8116c..824d723 100644
--- a/1_graph.py
+++ b/1_graph.py
@@ -3,24 +3,20 @@ import sys
 from collections import namedtuple
 from functools import cache
 from pathlib import Path
+from typing import Optional
 
 import psycopg
 
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
+from com import eval_config, progressbar
 
 
 Note = namedtuple("Note", ["renote_id", "reply_id", "user_id"])
 Tree = namedtuple("Tree", ["id", "replies", "renotes"])
 
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 user_id: str = config["user_id"]
-early_exit = config.get("early_exit")
+early_exit: Optional[int] = config.get("early_exit")
 
 
 print("fetching note ids", file=sys.stderr)
diff --git a/2_filter.py b/2_filter.py
index 816e762..8e77945 100644
--- a/2_filter.py
+++ b/2_filter.py
@@ -4,32 +4,14 @@ from typing import Callable, List
 
 import psycopg
 
-from ty import FilterableNote, Visibility
+from com import FilterableNote, Visibility, eval_config, parse_graph, progressbar
 
-try:
-    import progressbar2 as progressbar
-except ImportError:
-    import progressbar
 
-
-print("configuring")
-config = {}
-exec(Path("config.py").read_text(), config)
+config = eval_config()
 conn: psycopg.Connection = config["connect"]()
 criteria: Callable[[FilterableNote], bool] = config["criteria"]
 
-intermediate = {}
-
-print("parsing")
-for line in Path("graph.db").read_text().splitlines():
-    id, replies, quotes, flags = line.split("\t")
-    intermediate[id] = {
-        "id": id,
-        "replies": replies.split(",") if len(replies) > 0 else [],
-        "quotes": quotes.split(",") if len(quotes) > 0 else [],
-        "flags": flags.split(",") if len(flags) > 0 else [],
-    }
-
+intermediate = parse_graph()
 
 def transform(entry: dict) -> FilterableNote:
     note = conn.execute(
diff --git a/3_archive.py b/3_archive.py
new file mode 100644
index 0000000..6eef0e1
--- /dev/null
+++ b/3_archive.py
@@ -0,0 +1,144 @@
+import json
+from http.client import HTTPResponse
+from pathlib import Path
+from shutil import copyfileobj
+from urllib.request import urlopen
+
+import brotli
+import msgpack
+import psycopg
+
+from com import Visibility, eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+collected_users = {}
+def collect_user(id: str):
+    if id in collected_users:
+        return
+    user = conn.execute('select username, host, "avatarUrl" from "user" where id = %s', [id]).fetchone()
+    if user is None:
+        return None
+    username, host, avatar_url = user
+    profile = conn.execute('select description, fields from user_profile where "userId" = %s', [id]).fetchone()
+    description, fields = profile or ("", [])
+
+    output = {}
+    output["id"] = id
+    output["username"] = username
+    output["host"] = host
+    output["description"] = description
+    output["fields"] = fields
+    output["avatar_url"] = avatar_url
+
+    collected_users[id] = output
+
+collected_notes = []
+files_to_collect = []
+def collect_note(id: str):
+    output = {}
+    output["id"] = id
+
+    note = conn.execute('select text, "userId", "createdAt", "updatedAt", reactions, "renoteCount", visibility, "fileIds" from note where id = %s', [id]).fetchone()
+    if note is None:
+        return None
+    text, user_id, created_at, updated_at, reactions, renotes, visibility, file_ids = note
+    collect_user(user_id)
+
+    output["text"] = text
+    output["user_id"] = user_id
+    output["created_at"] = created_at.astimezone(tz=None).isoformat()
+    output["updated_at"] = None
+    if updated_at is not None:
+        output["updated_at"] = updated_at.astimezone(tz=None).isoformat()
+    output["reactions"] = reactions
+    output["renotes"] = renotes
+    output["visibility"] = Visibility.from_db(visibility).code()
+
+    node = graph[id]
+    replies = [collect_note(reply) for reply in node["replies"]]
+    replies = filter(lambda reply: reply is not None, replies)
+    quotes = [collect_note(quote) for quote in node["quotes"]]
+    quotes = filter(lambda quote: quote is not None, quotes)
+
+    output["attachments"] = []
+    for file_id in file_ids:
+        name, type_, comment, url = conn.execute('select name, type, comment, url from drive_file where id = %s', [file_id]).fetchone()
+        attachment = {
+            "id": file_id,
+            "type": type_,
+            "comment": comment,
+        }
+        if "self" in node["flags"]: # archive own attachments
+            files_to_collect.append((file_id, url))
+            attachment["url"] = None
+        else:
+            attachment["url"] = url
+
+    output["replies"] = list(replies)
+    output["quotes"] = list(quotes)
+
+    if len(output["attachments"]) == 0: del output["attachments"]
+    if len(output["replies"]) == 0: del output["replies"]
+    if len(output["quotes"]) == 0: del output["quotes"]
+
+    return output
+
+pb = progressbar.ProgressBar(
+    0,
+    len(filtered),
+    prefix="collecting data ",
+)
+for id in filtered:
+    note = collect_note(id)
+    collected_notes.append((id, note))
+    pb.increment()
+pb.finish()
+
+outdir = Path("out")
+if not outdir.exists():
+    outdir.mkdir()
+if not (outdir / "note").exists():
+    (outdir / "note").mkdir()
+if not (outdir / "user").exists():
+    (outdir / "user").mkdir()
+if not (outdir / "file").exists():
+    (outdir / "file").mkdir()
+
+pb = progressbar.ProgressBar(
+    0,
+    len(collected_notes) + len(collected_users),
+    prefix="writing data ",
+)
+
+for id, note in collected_notes:
+    outfile = outdir / "note" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        f.write(brotli.compress(msgpack.dumps(note)))
+    pb.increment()
+
+for id, user in collected_users.items():
+    outfile = outdir / "user" / f"{id}.mpk.br"
+    with outfile.open("wb") as f:
+        f.write(brotli.compress(msgpack.dumps(user)))
+    pb.increment()
+pb.finish()
+
+pb = progressbar.ProgressBar(
+    0,
+    len(files_to_collect),
+    prefix="downloading attachments ",
+)
+for (id, url) in files_to_collect: 
+    outfile = outdir / "file" / id
+    response: HTTPResponse = urlopen(url)
+    with outfile.open("wb") as f:
+        copyfileobj(response, f)
+    response.close()
+    pb.increment()
+pb.finish()
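
3_archive.py writes each note and each user as a brotli-compressed msgpack blob under
out/. A quick sketch of reading one back, assuming the naming scheme above (the note id
in the path is a placeholder):

    import brotli
    import msgpack
    from pathlib import Path

    # decode one archived note written by 3_archive.py
    raw = Path("out/note/xxxxxxxxxxxx.mpk.br").read_bytes()
    note = msgpack.loads(brotli.decompress(raw))
    print(note["text"], note["visibility"])
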
diff --git a/4_delete.py b/4_delete.py
new file mode 100644
index 0000000..51e1ef3
--- /dev/null
+++ b/4_delete.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import httpx
+import psycopg
+
+from com import eval_config, parse_graph, progressbar
+
+config = eval_config()
+conn: psycopg.Connection = config["connect"]()
+token: str = config["token"]
+api: str = config["api"]
+
+graph = parse_graph()
+print("reading filterlist")
+filtered = Path("filtered.list").read_text().strip().splitlines()
+
+queue = []
+
+def enqueue(note):
+    for reply in note["replies"]:
+        enqueue(graph[reply])
+    for quote in note["quotes"]:
+        enqueue(graph[quote])
+    if "self" in note["flags"]:
+        files = conn.execute('select "fileIds" from note where id = %s', [note["id"]]).fetchone()[0]
+        queue.append((note["id"], files))
+
+for id in filtered:
+    enqueue(graph[id])
+
+print(queue)
+
+# client = httpx.Client()
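
4_delete.py currently stops at printing the queue, with the httpx client commented out.
A hedged sketch of how the loop might continue, assuming Misskey-style notes/delete and
drive/files/delete endpoints that take the token as "i" (the endpoint names are an
assumption, not part of this commit; the snippet reuses the script's api, token and
queue):

    # sketch only: endpoint names assume the Misskey API
    client = httpx.Client()
    for note_id, file_ids in queue:
        client.post(api + "notes/delete", json={"i": token, "noteId": note_id})
        for file_id in file_ids:
            client.post(api + "drive/files/delete", json={"i": token, "fileId": file_id})
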
diff --git a/ty.py b/com.py
index e17c046..4ceb849 100644
--- a/ty.py
+++ b/com.py
@@ -1,7 +1,15 @@
+import sys
 from dataclasses import dataclass
-from typing import List, Callable
 from datetime import datetime
 from enum import Enum
+from pathlib import Path
+from typing import Callable, Dict, List
+
+try:
+    import progressbar2 as progressbar
+except ImportError:
+    import progressbar
+
 
 class Visibility(Enum):
     public = 1
@@ -17,6 +25,13 @@ class Visibility(Enum):
             case "followers": return cls.followers
             case "specified": return cls.direct
             case _: raise ValueError(f"unknown visibility `{raw}`")
+    
+    def code(self) -> str:
+        match self:
+            case self.public: return "p"
+            case self.unlisted: return "u"
+            case self.followers: return "f"
+            case self.direct: return "d"
 
 
 @dataclass
@@ -59,3 +74,24 @@ class FilterableNote:
             "reactions": self.reactions,
             "renotes": self.renotes,
         }
+
+
+def eval_config() -> dict:
+    print("configuring")
+    config = {}
+    exec(Path(sys.argv[1]).read_text(), config)
+    return config
+
+
+def parse_graph() -> Dict[str, dict]:
+    print("parsing graph")
+    graph = {}
+    for line in Path("graph.db").read_text().splitlines():
+        id, replies, quotes, flags = line.split("\t")
+        graph[id] = {
+            "id": id,
+            "replies": replies.split(",") if len(replies) > 0 else [],
+            "quotes": quotes.split(",") if len(quotes) > 0 else [],
+            "flags": flags.split(",") if len(flags) > 0 else [],
+        }
+    return graph
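
parse_graph() expects graph.db, produced by the graph stage, to contain one
tab-separated record per note: id, reply ids, quote ids and flags, with each of the
last three comma-separated and possibly empty. A small illustration of the line format
(note ids are placeholders):

    # one graph.db record as consumed by parse_graph()
    line = "noteA\tnoteB,noteC\t\tself"
    id, replies, quotes, flags = line.split("\t")
    # id == "noteA"; replies -> ["noteB", "noteC"]; no quotes; flags -> ["self"]
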
diff --git a/conf_mia.py b/conf_mia.py
new file mode 100644
index 0000000..6496e3b
--- /dev/null
+++ b/conf_mia.py
@@ -0,0 +1,46 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote, Visibility
+from sec import connect, tokens
+
+user_id = "9gf2ev4ex5dflllo"
+token = tokens["mia"]
+api = "https://void.rehab/api/"
+early_exit = 0xFFF
+
+now = datetime.now(UTC)
+threshold = 0.1
+
+def criteria(root: FilterableNote) -> bool:
+    thread = root.thread()
+    thread_self = root.thread_self()
+
+    # if there are dms involved...
+    low_vis = min(thread, key=lambda note: note.visibility.value)
+    if low_vis.visibility == Visibility.direct:
+        is_direct = lambda note: note.visibility == Visibility.direct
+        most_recent_direct = max(filter(is_direct, thread), key=lambda note: note.when)
+        # ...and the dms are younger than two months...
+        if now - most_recent_direct.when < timedelta(days=30 * 2):
+            # ...do not delete the thread
+            return False
+
+    # get the most recent post...
+    others_recency = max(thread, key=lambda note: note.when)
+    # ...and bail if it's too new
+    if now - others_recency.when < timedelta(days=14):
+        return False
+
+    # get my...
+    most_recent_post = max(thread_self, key=lambda note: note.when) # ...most recent post...
+    score = lambda note: note.reactions + note.renotes*5
+    high_score_post = max(thread_self, key=score) # ...highest scoring post...
+    # ...and their values...
+    most_recent = most_recent_post.when
+    most_recent_age = now - most_recent
+    high_score = score(high_score_post)
+    # ...weigh it...
+    weighted_score = high_score / math.sqrt(most_recent_age.days)
+    # ...and check it against a threshold
+    return weighted_score < threshold
diff --git a/conf_pain.py b/conf_pain.py
new file mode 100644
index 0000000..85e7095
--- /dev/null
+++ b/conf_pain.py
@@ -0,0 +1,14 @@
+import math
+from datetime import UTC, datetime, timedelta
+
+from com import FilterableNote
+from sec import connect, tokens
+
+user_id = "9gszslkcdfnomssj"
+token = tokens["pain"]
+api = "https://void.rehab/api/"
+
+def criteria(root: FilterableNote) -> bool:
+    # if it's more than two months old, delete
+    # return (datetime.now(UTC) - root.when).days > 60
+    return (datetime.now(UTC) - root.when).days > (12 * 30)
diff --git a/go.sh b/go.sh
new file mode 100755
index 0000000..39f3779
--- /dev/null
+++ b/go.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -ex
+
+test -f graph.db && rm graph.db
+test -f filtered.list && rm filtered.list
+test -d out && rm -r out
+python3 1_graph.py conf_$1.py
+python3 2_filter.py conf_$1.py
+# python3 3_archive.py conf_$1.py
+# echo uploading to memorial
+# rsync -r -e 'ssh -p23' --progress out/ memorial:fediverse/$1/
+# python3 4_delete.py conf_$1.py
diff --git a/proxy.sh b/proxy.sh
new file mode 100755
index 0000000..9628fab
--- /dev/null
+++ b/proxy.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec ssh -NL 5432:localhost:5432 vr
diff --git a/requirements.txt b/requirements.txt
index e69de29..094393e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+httpx
+progressbar2
+psycopg
+brotli
+msgpack