#!/bin/bash
#
# Sync ~/webGoggles to a NAS share.
#
# Uses mount -t cifs with sudo (prompts for password if no credentials file).
# Create ~/.smb/ts.nas to avoid the password prompt each time:
#   echo -e "username=schmeeve\npassword=yourpass" > ~/.smb/ts.nas
#   chmod 600 ~/.smb/ts.nas
#
# Post-sync cleanup: deduplicates identical screenshots by content hash,
# and thins screenshots in sessions older than 6 weeks to 1 per 2 minutes
# (based on page-info.json timestamps, not filenames).
#
# Exit status: 1 if the source is missing or the share cannot be mounted or
# prepared; otherwise the rsync status if rsync failed, else the status of
# the cleanup step (0 on full success).
#
# Adjust SHARE, SUBPATH, and SOURCE to match your environment.

for arg in "$@"; do
  case "$arg" in
    --help|-h)
      cat <<'EOF'
Usage: sync-webgoggles [options]

Sync ~/webGoggles to a NAS share, with deduplication and old-session thinning.

Options:
  -h, --help    Show this help message and exit
EOF
      exit 0
      ;;
  esac
done

SHARE="//ts.nas/aura"
SUBPATH="webGoggles"
MOUNTPOINT="${HOME}/mnt/ts.nas/aura"
CREDENTIALS="${HOME}/.smb/ts.nas"
SOURCE="${HOME}/webGoggles"

START="$(date +%s)"
echo "[sync-webgoggles] Starting at $(date)"

#######################################
# Report whether a filesystem is mounted exactly at the given path.
# The path is matched as a fixed string delimited as " on <path> " in the
# output of mount, so regex metacharacters in the path (such as the "." in
# "ts.nas") and longer paths sharing the same prefix cannot produce a false
# positive.
# Arguments: $1 - mountpoint path
# Returns:   0 if mounted, non-zero otherwise
#######################################
is_mounted() {
  mount | grep -qF -- " on $1 "
}

# Verify source exists
if [[ ! -d "${SOURCE}" ]]; then
  echo "[sync-webgoggles] ERROR: Source ${SOURCE} does not exist" >&2
  exit 1
fi

# 1. Create mountpoint and mount if not already mounted
if is_mounted "${MOUNTPOINT}"; then
  echo "[sync-webgoggles] Already mounted at ${MOUNTPOINT}"
else
  echo "[sync-webgoggles] Mounting ${SHARE} → ${MOUNTPOINT}"
  if ! mkdir -p "${MOUNTPOINT}"; then
    echo "[sync-webgoggles] ERROR: Failed to create ${MOUNTPOINT}" >&2
    exit 1
  fi
  OPTS="username=schmeeve,uid=$(id -u),gid=$(id -g),forceuid,forcegid,nounix,serverino"
  if [[ -f "${CREDENTIALS}" ]]; then
    OPTS="${OPTS},credentials=${CREDENTIALS}"
  fi
  sudo mount -t cifs "${SHARE}" "${MOUNTPOINT}" -o "${OPTS}"
  # The mount table, not sudo's exit status, decides whether we proceed.
  if ! is_mounted "${MOUNTPOINT}"; then
    echo "[sync-webgoggles] ERROR: Failed to mount ${SHARE}" >&2
    exit 1
  fi
  echo "[sync-webgoggles] Mounted at ${MOUNTPOINT}"
fi

# 2. Run pick-most-skin on local source to find best screenshot per user
PICK_SCRIPT="$(dirname -- "$0")/pick-most-skin"
if [[ -x "${PICK_SCRIPT}" ]]; then
  echo "[sync-webgoggles] Running ${PICK_SCRIPT} ${SOURCE}"
  # Best effort: a failure of the helper does not stop the sync.
  "${PICK_SCRIPT}" "${SOURCE}"
else
  echo "[sync-webgoggles] Warning: ${PICK_SCRIPT} not found, skipping"
fi

# 3. One-way rsync of SOURCE into the share subpath
MP="${MOUNTPOINT}/${SUBPATH}"
if ! mkdir -p "${MP}"; then
  echo "[sync-webgoggles] ERROR: Failed to create ${MP}" >&2
  exit 1
fi

echo "[sync-webgoggles] Syncing ${SOURCE}/ → ${MP}/"
# --backup with --suffix keeps an overwritten destination file next to the
# new one as "<name>-CONFLICT" (e.g. screenshot.png-CONFLICT).
RSYNC_ARGS=(
  -vrau
  --backup
  --suffix="-CONFLICT"
  --exclude=".DS_Store"
  --progress
  --stats
)
rsync "${RSYNC_ARGS[@]}" "${SOURCE}/" "${MP}/"
SYNC_STATUS=$?
if (( SYNC_STATUS != 0 )); then
  # Keep going so cleanup still runs, but remember the failure for the
  # script's exit status.
  echo "[sync-webgoggles] WARNING: rsync exited with status ${SYNC_STATUS}" >&2
fi

# 4. Cleanup: deduplicate and thin old screenshots on destination

#######################################
# Deduplicate and thin screenshots below a destination directory.
#
# Phase 1 removes byte-identical screenshot files (including CONFLICT
# copies), keeping the one with the oldest modification time.
# Phase 2 looks at page directories laid out as
# .../sessions/<session>/<page>/ that contain both page-info.json and
# screenshot.png. For sessions whose first page is older than the cutoff it
# keeps one screenshot per keep-interval slot and deletes the others.
#
# NOTE(review): rsync above runs without --delete or --ignore-existing, so
# screenshots removed here that still exist in SOURCE look like they will be
# copied again on the next run -- confirm this is intended.
#
# Arguments:
#   $1 - destination directory to clean
#   $2 - age in days after which a session is thinned (default 42)
#   $3 - seconds between kept screenshots (default 120)
# Outputs:   progress and summary lines on stdout
# Returns:   exit status of the embedded python3 program
#######################################
cleanup_destination() {
  local dest="$1"
  local thin_days="${2:-42}"      # default 6 weeks
  local keep_interval="${3:-120}" # default 2 minutes

  echo "[sync-webgoggles] Running cleanup on ${dest}"

  # Pass the defaulted locals, not the raw positional parameters, so the
  # defaults above actually reach Python when arguments are omitted.
  python3 - "${dest}" "${thin_days}" "${keep_interval}" <<'PYEOF'
import hashlib
import json
import os
import sys
import time
from collections import defaultdict
from datetime import datetime

dest = sys.argv[1]
thin_days = int(sys.argv[2])
keep_interval = int(sys.argv[3])
if keep_interval <= 0:
    # The slot computation below divides by keep_interval.
    sys.exit("[sync-webgoggles cleanup] keep interval must be a positive number of seconds")
cutoff = time.time() - thin_days * 86400

# 'screenshot.png-CONFLICT' is the name rsync --backup --suffix=-CONFLICT
# gives a replaced file; 'screenshot-CONFLICT.png' is kept for copies that
# already use that spelling.
SCREENSHOT_NAMES = (
    'screenshot.png',
    'screenshot-CONFLICT.png',
    'screenshot.png-CONFLICT',
)


def file_md5(path):
    """Return the hex MD5 digest of a file, read in 1 MiB chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


# --- Phase 1: Deduplicate by content hash ---
print("[sync-webgoggles cleanup] Deduplicating screenshots...")

# Collect all screenshot variants (CONFLICT copies included), bucketed by
# size so that only files that could be identical get hashed.
size_groups = defaultdict(list)
for root, dirs, files in os.walk(dest):
    for name in files:
        if name in SCREENSHOT_NAMES:
            path = os.path.join(root, name)
            try:
                size_groups[os.path.getsize(path)].append(path)
            except OSError:
                pass

dedup_removed = 0
for size, paths in size_groups.items():
    if len(paths) < 2:
        continue
    hash_groups = defaultdict(list)  # digest -> [(mtime, path)]
    for path in paths:
        try:
            hash_groups[file_md5(path)].append((os.path.getmtime(path), path))
        except OSError:
            # Unreadable or vanished file: leave it alone.
            pass
    for same_files in hash_groups.values():
        if len(same_files) < 2:
            continue
        # Keep the chronologically first one, delete the rest
        same_files.sort(key=lambda entry: entry[0])
        for _mtime, path in same_files[1:]:
            try:
                os.remove(path)
                dedup_removed += 1
            except OSError:
                pass

print(f"[sync-webgoggles cleanup] Deduplication removed {dedup_removed} files")

# --- Phase 2: Thin old session screenshots ---
print("[sync-webgoggles cleanup] Thinning old session screenshots...")

# Collect all page dirs that have both page-info.json and screenshot.png,
# grouped by session (site/user/sessions/TS/)
sessions = defaultdict(list)  # session_dir -> [(page_dir, timestamp_epoch)]

for root, dirs, files in os.walk(dest):
    if 'page-info.json' not in files or 'screenshot.png' not in files:
        continue
    # Only process pages inside sessions/ structure
    parent = os.path.dirname(root)
    grandparent = os.path.dirname(parent)
    if os.path.basename(grandparent) != 'sessions':
        continue
    session_dir = parent  # .../sessions/2026-05-17T01-10-23-730Z

    info_path = os.path.join(root, 'page-info.json')
    try:
        with open(info_path) as f:
            info = json.load(f)
        if not isinstance(info, dict):
            # Valid JSON but not an object: no timestamp to read.
            continue
        ts_str = info.get('timestamp', '')
        if not ts_str:
            continue
        # Parse ISO-8601, handle Z suffix
        if ts_str.endswith('Z'):
            ts_str = ts_str[:-1] + '+00:00'
        ts = datetime.fromisoformat(ts_str).timestamp()
        sessions[session_dir].append((root, ts))
    except (json.JSONDecodeError, KeyError, ValueError, TypeError,
            AttributeError, OSError):
        # Unparseable page info: skip the page rather than abort the run.
        pass

thin_removed = 0
thin_kept = 0

for session_dir, pages in sessions.items():
    pages.sort(key=lambda x: x[1])

    # Only thin sessions where the first page is older than cutoff
    if pages[0][1] > cutoff:
        continue

    session_start = pages[0][1]

    # Group pages into keep_interval-second slots by proximity to ideal
    # keep times: slot N covers [start+N*K - K/2, start+N*K + K/2)
    slots = defaultdict(list)
    for page_dir, ts in pages:
        slot = int((ts - session_start + keep_interval / 2) / keep_interval)
        slots[slot].append((page_dir, ts))

    for slot, slot_pages in sorted(slots.items()):
        ideal = session_start + slot * keep_interval
        # Keep the screenshot closest to the ideal slot center
        best = min(slot_pages, key=lambda x: abs(x[1] - ideal))
        thin_kept += 1
        for page_dir, ts in slot_pages:
            if page_dir != best[0]:
                ss_path = os.path.join(page_dir, 'screenshot.png')
                try:
                    os.remove(ss_path)
                    thin_removed += 1
                except OSError:
                    pass

print(f"[sync-webgoggles cleanup] Thinning kept {thin_kept}, removed {thin_removed} screenshots")
print(f"[sync-webgoggles cleanup] Done")
PYEOF
}

cleanup_destination "${MP}" 42 120
CLEANUP_STATUS=$?
if (( CLEANUP_STATUS != 0 )); then
  echo "[sync-webgoggles] WARNING: cleanup exited with status ${CLEANUP_STATUS}" >&2
fi

DURATION=$(( $(date +%s) - START ))
echo "[sync-webgoggles] Done in ${DURATION}s at $(date)"

# Report a failed sync or cleanup to the caller instead of always exiting 0.
if (( SYNC_STATUS != 0 )); then
  exit "${SYNC_STATUS}"
fi
exit "${CLEANUP_STATUS}"