diff --git a/__pycache__/pick-most-skincpython-314.pyc b/__pycache__/pick-most-skincpython-314.pyc
new file mode 100644
index 0000000..457778f
Binary files /dev/null and b/__pycache__/pick-most-skincpython-314.pyc differ
diff --git a/pick-most-skin b/pick-most-skin
new file mode 100755
index 0000000..033f254
--- /dev/null
+++ b/pick-most-skin
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+#
+# Scan ~/webGoggles (or given path) for all session screenshots.
+# For each user (site/username), pick the one with the highest
+# skin-to-total pixel ratio and save it as most_skin.png
+# next to their bio_screenshot.png.
+#
+# Usage:
+# ./pick-most-skin [--force] [path ...]
+#
+# If no path given, defaults to ~/webGoggles.
+# Multiple paths can be specified.
+#
+# Requires: opencv-python (pip install opencv-python)
+
+import os
+import sys
+import glob as _glob
+import shutil
+
+# If cv2 isn't on the normal path, check pipx-installed opencv-python
+try:
+    import cv2
+    import numpy as np
+except ImportError:
+    _pipx_venv = os.path.expanduser(
+        "~/.local/pipx/venvs/opencv-python/lib/python*/site-packages"
+    )
+    _matches = sorted(_glob.glob(_pipx_venv))
+    if _matches:
+        sys.path.insert(0, _matches[-1])
+        import cv2
+        import numpy as np
+    else:
+        print("Error: opencv-python not installed.")
+        print(" pip install opencv-python")
+        sys.exit(1)
+
+
+# HSV (lower, upper) bounds treated as skin tones (tunable); masks are OR-ed together
+SKIN_RANGES = [
+    ((0, 20, 70), (20, 255, 255)), # warm/peach
+    ((170, 20, 70), (180, 255, 255)), # reddish wrap-around
+]
+
+
+def skin_ratio(path: str) -> float:
+    """Return the fraction (0.0 – 1.0) of pixels within SKIN_RANGES; 0.0 if the image cannot be read."""
+    img = cv2.imread(path)
+    if img is None:
+        return 0.0
+    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+    mask = np.zeros(img.shape[:2], dtype=np.uint8)
+    for lower, upper in SKIN_RANGES:
+        mask |= cv2.inRange(hsv, np.array(lower, dtype=np.uint8),
+                            np.array(upper, dtype=np.uint8))
+    return float(cv2.countNonZero(mask)) / (img.shape[0] * img.shape[1])
+
+
+def find_most_skin(site_user: str) -> tuple[float, str | None]:
+    """Walk site_user/sessions for screenshot.png files; return (best_ratio, best_path), or (0.0, None) if none scores above 0."""
+    sessions_dir = os.path.join(site_user, "sessions")
+    if not os.path.isdir(sessions_dir):
+        return 0.0, None
+
+    best_ratio = 0.0
+    best_path = None
+
+    for root, dirs, files in os.walk(sessions_dir):
+        if "screenshot.png" not in files:
+            continue
+        path = os.path.join(root, "screenshot.png")
+        ratio = skin_ratio(path)
+        if ratio > best_ratio:
+            best_ratio = ratio
+            best_path = path
+
+    return best_ratio, best_path
+
+
+def main() -> None:
+    """Write most_skin.png for each site/user under every root, then rebuild that root's GALLERY."""
+    args = [a for a in sys.argv[1:] if not a.startswith("-")]
+    force = "--force" in sys.argv[1:]
+
+    roots = args if args else [os.path.expanduser("~/webGoggles")]
+
+    for root in roots:
+        root = os.path.abspath(root)
+        if not os.path.isdir(root):
+            print(f"[pick-most-skin] Skipping {root} – not a directory")
+            continue
+
+        print(f"[pick-most-skin] Scanning {root}")
+
+        # Walk site/user directories
+        for site in sorted(os.listdir(root)):
+            site_path = os.path.join(root, site)
+            if not os.path.isdir(site_path):
+                continue
+            for user in sorted(os.listdir(site_path)):
+                user_path = os.path.join(site_path, user)
+                if not os.path.isdir(user_path):
+                    continue
+                dest = os.path.join(user_path, "most_skin.png")
+
+                if os.path.exists(dest) and not force:
+                    print(f" {site}/{user}: most_skin.png exists (use --force to re-scan)")
+                    continue
+
+                ratio, best = find_most_skin(user_path)
+                if best is None:
+                    continue
+
+                shutil.copy2(best, dest)
+                print(f" {site}/{user}: {ratio:.1%} skin -> {dest}")
+
+        # Build/refresh GALLERY symlinks
+        gallery = os.path.join(root, "GALLERY")
+        os.makedirs(gallery, exist_ok=True)
+
+        # Remove stale symlinks (and any leftover *_ms.png files); index.html is rewritten below
+        for entry in os.listdir(gallery):
+            if entry == "index.html":
+                continue
+            entry_path = os.path.join(gallery, entry)
+            if os.path.islink(entry_path) or entry.endswith("_ms.png"):
+                os.unlink(entry_path)
+
+        entries = []
+        for site in sorted(os.listdir(root)):
+            site_path = os.path.join(root, site)
+            if not os.path.isdir(site_path):
+                continue
+            for user in sorted(os.listdir(site_path)):
+                user_path = os.path.join(site_path, user)
+                most_skin = os.path.join(user_path, "most_skin.png")
+                if not os.path.isfile(most_skin):
+                    continue
+                link_name = f"{user}_ms.png"
+                full_link = os.path.join(gallery, link_name)
+                if os.path.lexists(full_link):
+                    # Same username under another site: qualify the link so os.symlink does not raise
+                    link_name = f"{site}_{user}_ms.png"
+                    full_link = os.path.join(gallery, link_name)
+                os.symlink(os.path.relpath(most_skin, gallery), full_link)
+                entries.append((user, link_name))
+
+        # Generate gallery HTML
+        html_path = os.path.join(gallery, "index.html")
+        count = len(entries)
+        rows = "".join(
+            f' {u}\n'
+            for i, (u, fn) in enumerate(entries)
+        )
+        lbs = "".join(
+            f' \n'
+            for i, (u, fn) in enumerate(entries)
+        )
+        html = f"""
+
+
+
+
+webGoggles Gallery
+
+
+
+

GALLERY · {count} users

+
+{rows}
+{lbs}
+"""
+        # The template contains non-ASCII text, so do not rely on the locale's default encoding
+        with open(html_path, "w", encoding="utf-8") as f:
+            f.write(html)
+        print(f" GALLERY/index.html — {len(entries)} entries")
+
+    print("[pick-most-skin] Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sync-webgoggles b/sync-webgoggles
index 6d07d19..56f7ccc 100755
--- a/sync-webgoggles
+++ b/sync-webgoggles
@@ -7,8 +7,9 @@
 # echo -e "username=schmeeve\npassword=yourpass" > ~/.smb/ts.nas
 # chmod 600 ~/.smb/ts.nas
 #
-# No deletes or overwrites: rsync --backup renames any existing dest file
-# with a -CONFLICT suffix before writing the source version.
+# Post-sync cleanup: deduplicates identical screenshots by content hash,
+# and thins screenshots in sessions older than 6 weeks to 1 per 2 minutes
+# (based on page-info.json timestamps, not filenames).
 #
 # Adjust SHARE, SUBPATH, and SOURCE to match your environment.
 
@@ -57,5 +58,129 @@ rsync -vrau \
     --exclude=".DS_Store" \
     --progress --stats
 
+# 3. Cleanup: deduplicate and thin old screenshots on destination
+cleanup_destination() {
+    local dest="$1"
+    local thin_days="${2:-42}" # default 6 weeks
+    local keep_interval="${3:-120}" # default 2 minutes
+
+    echo "[sync-webgoggles] Running cleanup on ${dest}"
+
+    python3 - "${dest}" "${thin_days}" "${keep_interval}" <<'PYEOF'
+import os, json, time, sys, hashlib
+from collections import defaultdict
+from datetime import datetime
+
+dest = sys.argv[1]
+thin_days = int(sys.argv[2])
+keep_interval = int(sys.argv[3])
+cutoff = time.time() - thin_days * 86400
+
+# --- Phase 1: Deduplicate by content hash ---
+print("[sync-webgoggles cleanup] Deduplicating screenshots...")
+
+# Collect all screenshot variants (CONFLICT copies included), bucketed by size so only same-size files get hashed
+size_groups = defaultdict(list)
+for root, dirs, files in os.walk(dest):
+    for f in files:
+        if f in ('screenshot.png', 'screenshot-CONFLICT.png'):
+            path = os.path.join(root, f)
+            try:
+                size_groups[os.path.getsize(path)].append(path)
+            except OSError:
+                pass
+
+dedup_removed = 0
+for size, paths in size_groups.items():
+    if len(paths) < 2:
+        continue
+    hash_groups = defaultdict(list)
+    for p in paths:
+        try:
+            with open(p, 'rb') as fh:
+                h = hashlib.md5(fh.read()).hexdigest()
+            hash_groups[h].append(p)
+        except OSError:
+            pass
+    for h, same_paths in hash_groups.items():
+        if len(same_paths) < 2:
+            continue
+        # Keep the one with the oldest mtime (first after sorting), delete the rest
+        same_paths.sort(key=lambda x: os.path.getmtime(x))
+        for p in same_paths[1:]:
+            try:
+                os.remove(p)
+                dedup_removed += 1
+            except OSError:
+                pass
+
+print(f"[sync-webgoggles cleanup] Deduplication removed {dedup_removed} files")
+
+# --- Phase 2: Thin old session screenshots ---
+print("[sync-webgoggles cleanup] Thinning old session screenshots...")
+
+# Collect all page dirs that have both page-info.json and screenshot.png,
+# grouped by session (site/user/sessions/TS/)
+sessions = defaultdict(list) # session_dir -> [(page_dir, timestamp_epoch)]
+
+for root, dirs, files in os.walk(dest):
+    if 'page-info.json' not in files or 'screenshot.png' not in files:
+        continue
+    # Only process pages inside sessions/ structure
+    parent = os.path.dirname(root)
+    grandparent = os.path.dirname(parent)
+    if os.path.basename(grandparent) != 'sessions':
+        continue
+    session_dir = parent # .../sessions/2026-05-17T01-10-23-730Z
+    info_path = os.path.join(root, 'page-info.json')
+    try:
+        with open(info_path) as f:
+            info = json.load(f)
+        ts_str = info.get('timestamp', '')
+        if not ts_str:
+            continue
+        # Parse ISO-8601, handle Z suffix
+        if ts_str.endswith('Z'):
+            ts_str = ts_str[:-1] + '+00:00'
+        ts = datetime.fromisoformat(ts_str).timestamp()
+        sessions[session_dir].append((root, ts))
+    except (json.JSONDecodeError, KeyError, ValueError, OSError, AttributeError, TypeError):
+        pass
+
+thin_removed = 0
+thin_kept = 0
+for session_dir, pages in sessions.items():
+    pages.sort(key=lambda x: x[1])
+    # Only thin sessions where the first page is older than cutoff
+    if pages[0][1] > cutoff:
+        continue
+    session_start = pages[0][1]
+    # Group pages into keep_interval-second slots (K) by proximity to ideal keep times
+    # slot N covers [start+N*K - K/2, start+N*K + K/2)
+    slots = defaultdict(list)
+    for page_dir, ts in pages:
+        slot = int((ts - session_start + keep_interval / 2) / keep_interval)
+        slots[slot].append((page_dir, ts))
+    for slot, slot_pages in sorted(slots.items()):
+        ideal = session_start + slot * keep_interval
+        # Keep the screenshot closest to the ideal slot center
+        best = min(slot_pages, key=lambda x: abs(x[1] - ideal))
+        thin_kept += 1
+        for page_dir, ts in slot_pages:
+            if page_dir != best[0]:
+                ss_path = os.path.join(page_dir, 'screenshot.png')
+                try:
+                    os.remove(ss_path)
+                    thin_removed += 1
+                except OSError:
+                    pass
+
+print(f"[sync-webgoggles cleanup] Thinning kept {thin_kept}, removed {thin_removed} screenshots")
+print("[sync-webgoggles cleanup] Done")
+PYEOF
+}
+
+cleanup_destination "${MP}" 42 120
+
 DURATION=$(( $(date +%s) - START ))
 echo "[sync-webgoggles] Done in ${DURATION}s at $(date)"