Files
schmeeve-toolz/sync-webgoggles
2026-05-16 19:47:22 -07:00

212 lines
6.7 KiB
Bash
Executable File

#!/bin/bash
#
# Sync ~/webGoggles to a NAS share.
#
# Uses mount -t cifs via sudo. When no credentials file exists, mount.cifs
# prompts for the share password on every run. To avoid that prompt, create
# ~/.smb/ts.nas once:
#   mkdir -p ~/.smb
#   echo -e "username=schmeeve\npassword=yourpass" > ~/.smb/ts.nas
#   chmod 600 ~/.smb/ts.nas
#
# Post-sync cleanup: deduplicates identical screenshots by content hash,
# and thins screenshots in sessions older than 6 weeks to 1 per 2 minutes
# (based on page-info.json timestamps, not filenames).
#
# Adjust SHARE, SUBPATH, and SOURCE to match your environment.
# Scan every command-line argument for a help flag. The first -h/--help found
# prints the usage text and ends the script successfully; any other argument
# is ignored.
for arg; do
  if [[ "${arg}" == "--help" || "${arg}" == "-h" ]]; then
    cat <<'EOF'
Usage: sync-webgoggles [options]
Sync ~/webGoggles to a NAS share, with deduplication and old-session thinning.
Options:
-h, --help Show this help message and exit
EOF
    exit 0
  fi
done
# --- Environment-specific settings ------------------------------------------
# SHARE is the device handed to `mount -t cifs`; SUBPATH is the directory
# below the mountpoint that receives the sync.
SHARE='//ts.nas/aura'
SUBPATH='webGoggles'
MOUNTPOINT="$HOME/mnt/ts.nas/aura"
# Optional credentials file; it is only used when it exists.
CREDENTIALS="$HOME/.smb/ts.nas"
# Local directory that gets synced.
SOURCE="$HOME/webGoggles"

# Epoch seconds at start, used at the end to report the total run time.
START=$(date +%s)
printf '%s\n' "[sync-webgoggles] Starting at $(date)"
# Bail out before touching the NAS when the local source directory is missing.
[[ -d "${SOURCE}" ]] || {
  echo "[sync-webgoggles] ERROR: Source ${SOURCE} does not exist"
  exit 1
}
# 1. Create mountpoint and mount if not already mounted
#
# Refuse to continue with empty settings: an empty MOUNTPOINT would make the
# "already mounted" probe meaningless and hand an empty target to mkdir/mount.
if [[ -z "${SHARE:-}" || -z "${MOUNTPOINT:-}" ]]; then
  echo "[sync-webgoggles] ERROR: SHARE and MOUNTPOINT must be set"
  exit 1
fi

#######################################
# Report whether MOUNTPOINT is currently a mount target.
# The path is matched as a fixed string framed by " on " and a trailing space,
# so dots in the path are literal and a longer path that merely contains
# MOUNTPOINT does not count. Assumes the "SRC on DIR ..." line layout that
# `mount` prints.
# Globals:   MOUNTPOINT (read; one trailing slash is ignored)
# Returns:   0 when mounted, non-zero otherwise
#######################################
is_share_mounted() {
  mount | grep -qF -- " on ${MOUNTPOINT%/} "
}

if is_share_mounted; then
  echo "[sync-webgoggles] Already mounted at ${MOUNTPOINT}"
else
  echo "[sync-webgoggles] Mounting ${SHARE} → ${MOUNTPOINT}"
  mkdir -p "${MOUNTPOINT}"
  # Map every file on the share to the invoking user so rsync can write there.
  OPTS="username=schmeeve,uid=$(id -u),gid=$(id -g),forceuid,forcegid,nounix,serverino"
  # With a credentials file mount.cifs does not need to prompt for a password.
  if [[ -f "${CREDENTIALS}" ]]; then
    OPTS="${OPTS},credentials=${CREDENTIALS}"
  fi
  sudo mount -t cifs "${SHARE}" "${MOUNTPOINT}" -o "${OPTS}"
  # Judge success by the resulting mount table rather than by mount's status.
  if ! is_share_mounted; then
    echo "[sync-webgoggles] ERROR: Failed to mount ${SHARE}"
    exit 1
  fi
  echo "[sync-webgoggles] Mounted at ${MOUNTPOINT}"
fi
# 2. Hand the local source tree to the pick-most-skin helper that lives next
#    to this script (per the original note, it picks the best screenshot per
#    user). The step is skipped with a warning when the helper is missing or
#    not executable.
PICK_SCRIPT="$(dirname "$0")/pick-most-skin"
if [[ ! -x "${PICK_SCRIPT}" ]]; then
  echo "[sync-webgoggles] Warning: ${PICK_SCRIPT} not found, skipping"
else
  echo "[sync-webgoggles] Running ${PICK_SCRIPT} ${SOURCE}"
  "${PICK_SCRIPT}" "${SOURCE}"
fi
# 3. One-way rsync of SOURCE into the share subpath
#
# Guard against empty settings: with both unset the command below would
# degenerate into syncing "/" onto "//".
if [[ -z "${SOURCE:-}" || -z "${MOUNTPOINT:-}" ]]; then
  echo "[sync-webgoggles] ERROR: SOURCE and MOUNTPOINT must be set before syncing"
  exit 1
fi
MP="${MOUNTPOINT}/${SUBPATH}"
if ! mkdir -p "${MP}"; then
  echo "[sync-webgoggles] ERROR: Cannot create ${MP}"
  exit 1
fi
echo "[sync-webgoggles] Syncing ${SOURCE}/ → ${MP}/"
# -u skips files that are newer on the destination. --backup/--suffix keep a
# replaced destination file under its old name with "-CONFLICT" appended
# (e.g. screenshot.png-CONFLICT) instead of overwriting it.
rsync_args=(
  -vrau
  --backup
  --suffix="-CONFLICT"
  --exclude=".DS_Store"
  --progress --stats
)
rsync_status=0
rsync "${rsync_args[@]}" "${SOURCE}/" "${MP}/" || rsync_status=$?
case "${rsync_status}" in
  0)
    ;;
  24)
    # rsync exit 24: source files vanished mid-transfer; the rest was copied.
    echo "[sync-webgoggles] Warning: some source files vanished during the sync (rsync exit 24), continuing"
    ;;
  *)
    # Do not report success (or go on to clean up) after a failed sync.
    echo "[sync-webgoggles] ERROR: rsync failed with exit code ${rsync_status}"
    exit "${rsync_status}"
    ;;
esac
# 4. Cleanup: deduplicate and thin old screenshots on destination
#######################################
# Deduplicate and thin screenshots below a destination directory.
#
# Phase 1 removes byte-identical screenshot files (including the
# "-CONFLICT" backups), keeping one copy per distinct content.
# Phase 2 looks at page directories laid out as
# .../sessions/<session>/<page>/ that contain both page-info.json and
# screenshot.png. In sessions whose first page is older than the cutoff it
# keeps one screenshot per interval and deletes the other screenshot.png
# files; page-info.json files are never removed.
#
# NOTE(review): the sync step copies any file missing on the destination, so
# screenshots removed here reappear on the next run unless the source is
# pruned as well - confirm this is intended.
#
# Arguments:
#   $1 - destination directory to clean (required)
#   $2 - age in days after which a session is thinned (default 42 = 6 weeks)
#   $3 - seconds between kept screenshots in a thinned session (default 120)
# Outputs:   progress messages on stdout
# Returns:   1 if $1 is not a directory, otherwise python3's exit status
#######################################
cleanup_destination() {
  local dest="$1"
  local thin_days="${2:-42}"      # default 6 weeks
  local keep_interval="${3:-120}" # default 2 minutes

  if [[ ! -d "${dest}" ]]; then
    echo "[sync-webgoggles] ERROR: Cleanup target ${dest} is not a directory"
    return 1
  fi

  echo "[sync-webgoggles] Running cleanup on ${dest}"
  # Pass the locals, not the raw positionals, so the defaults above apply.
  python3 - "${dest}" "${thin_days}" "${keep_interval}" <<'PYEOF'
import hashlib
import json
import os
import sys
import time
from collections import defaultdict
from datetime import datetime

dest = sys.argv[1]
thin_days = int(sys.argv[2])
keep_interval = int(sys.argv[3])
if keep_interval <= 0:
    # A zero interval would divide by zero when pages are assigned to slots.
    print("[sync-webgoggles cleanup] ERROR: keep interval must be a positive number of seconds")
    sys.exit(1)
cutoff = time.time() - thin_days * 86400

CANONICAL = 'screenshot.png'
# rsync --backup --suffix=-CONFLICT appends the suffix to the whole file name,
# producing 'screenshot.png-CONFLICT'. The older spelling stays recognised too.
SCREENSHOT_NAMES = (CANONICAL, 'screenshot.png-CONFLICT', 'screenshot-CONFLICT.png')


def file_digest(path, chunk_size=1 << 20):
    """Return the MD5 hex digest of a file, read in chunks and closed after."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def keep_order(path):
    """Sort key for duplicates: canonical names first, then oldest mtime."""
    try:
        mtime = os.path.getmtime(path)
    except OSError:
        mtime = float('inf')  # unreadable entries sort last
    return (os.path.basename(path) != CANONICAL, mtime)


# --- Phase 1: Deduplicate by content hash ---
print("[sync-webgoggles cleanup] Deduplicating screenshots...")
# Group by size first so only files that could be identical get hashed.
size_groups = defaultdict(list)
for root, dirs, files in os.walk(dest):
    for name in files:
        if name in SCREENSHOT_NAMES:
            path = os.path.join(root, name)
            try:
                size_groups[os.path.getsize(path)].append(path)
            except OSError:
                pass  # vanished or unreadable: leave it alone

dedup_removed = 0
for paths in size_groups.values():
    if len(paths) < 2:
        continue
    hash_groups = defaultdict(list)
    for path in paths:
        try:
            hash_groups[file_digest(path)].append(path)
        except OSError:
            pass
    for same_paths in hash_groups.values():
        if len(same_paths) < 2:
            continue
        # Keep the first entry: a real screenshot.png is preferred over a
        # backup copy (so a page never loses its screenshot to its own
        # backup), and among equals the chronologically first one wins.
        same_paths.sort(key=keep_order)
        for path in same_paths[1:]:
            try:
                os.remove(path)
                dedup_removed += 1
            except OSError:
                pass
print(f"[sync-webgoggles cleanup] Deduplication removed {dedup_removed} files")

# --- Phase 2: Thin old session screenshots ---
print("[sync-webgoggles cleanup] Thinning old session screenshots...")
# Collect all page dirs that have both page-info.json and screenshot.png,
# grouped by session (site/user/sessions/TS/)
sessions = defaultdict(list)  # session_dir -> [(page_dir, timestamp_epoch)]
for root, dirs, files in os.walk(dest):
    if 'page-info.json' not in files or 'screenshot.png' not in files:
        continue
    # Only process pages inside sessions/ structure
    parent = os.path.dirname(root)
    grandparent = os.path.dirname(parent)
    if os.path.basename(grandparent) != 'sessions':
        continue
    session_dir = parent  # .../sessions/2026-05-17T01-10-23-730Z
    info_path = os.path.join(root, 'page-info.json')
    try:
        with open(info_path) as f:
            info = json.load(f)
        ts_str = info.get('timestamp', '')
        if not ts_str:
            continue
        # Parse ISO-8601, handle Z suffix
        if ts_str.endswith('Z'):
            ts_str = ts_str[:-1] + '+00:00'
        ts = datetime.fromisoformat(ts_str).timestamp()
        sessions[session_dir].append((root, ts))
    except (json.JSONDecodeError, KeyError, ValueError, OSError,
            TypeError, AttributeError):
        # Malformed page-info.json (including a non-object document or a
        # non-string timestamp): skip the page, never delete its screenshot.
        pass

thin_removed = 0
thin_kept = 0
for session_dir, pages in sessions.items():
    pages.sort(key=lambda x: x[1])
    # Only thin sessions where the first page is older than cutoff
    if pages[0][1] > cutoff:
        continue
    session_start = pages[0][1]
    # Group pages into keep_interval-second slots by proximity to the ideal
    # keep times: slot N covers [start+N*K - K/2, start+N*K + K/2)
    slots = defaultdict(list)
    for page_dir, ts in pages:
        slot = int((ts - session_start + keep_interval / 2) / keep_interval)
        slots[slot].append((page_dir, ts))
    for slot, slot_pages in sorted(slots.items()):
        ideal = session_start + slot * keep_interval
        # Keep the screenshot closest to the ideal slot center
        best = min(slot_pages, key=lambda x: abs(x[1] - ideal))
        thin_kept += 1
        for page_dir, ts in slot_pages:
            if page_dir != best[0]:
                ss_path = os.path.join(page_dir, 'screenshot.png')
                try:
                    os.remove(ss_path)
                    thin_removed += 1
                except OSError:
                    pass
print(f"[sync-webgoggles cleanup] Thinning kept {thin_kept}, removed {thin_removed} screenshots")
print("[sync-webgoggles cleanup] Done")
PYEOF
}
# Clean up the synced copy (sessions older than 42 days, one screenshot kept
# per 120 seconds), then report how long the whole run took.
cleanup_destination "${MP}" 42 120
finished_at=$(date +%s)
DURATION=$(( finished_at - START ))
printf '%s\n' "[sync-webgoggles] Done in ${DURATION}s at $(date)"