#!/bin/bash
#
# Sync ~/webGoggles to a NAS share.
#
# Uses mount -t cifs with sudo (prompts for password if no credentials file).
# Create ~/.smb/ts.nas to avoid the password prompt each time:
#   echo -e "username=schmeeve\npassword=yourpass" > ~/.smb/ts.nas
#   chmod 600 ~/.smb/ts.nas
#
# Post-sync cleanup: deduplicates identical screenshots by content hash,
# and thins screenshots in sessions older than 6 weeks to 1 per 2 minutes
# (based on page-info.json timestamps, not filenames).
#
# Adjust SHARE, SUBPATH, and SOURCE to match your environment.

# --- Configuration -----------------------------------------------------------
# These values never change after start-up, so they are marked readonly to
# catch accidental reassignment further down the script.

# SMB share to mount (//host/share).
readonly SHARE="//ts.nas/aura"
# Directory inside the share that receives the synced files.
readonly SUBPATH="webGoggles"
# Local directory the share is mounted on.
readonly MOUNTPOINT="${HOME}/mnt/ts.nas/aura"
# Optional mount.cifs credentials file (username=/password= lines).
readonly CREDENTIALS="${HOME}/.smb/ts.nas"
# Local directory whose contents are synced to the share.
readonly SOURCE="${HOME}/webGoggles"

# Epoch seconds at start-up; used at the end to report the total duration.
# Assigned separately from `readonly` so a failing `date` is not masked.
START="$(date +%s)"
readonly START

# Announce the run with a wall-clock timestamp.
printf '%s\n' "[sync-webgoggles] Starting at $(date)"

# Verify source exists: bail out with status 1 when SOURCE is not a directory.
[[ -d "${SOURCE}" ]] || {
  printf '%s\n' "[sync-webgoggles] ERROR: Source ${SOURCE} does not exist"
  exit 1
}

# 1. Create mountpoint and mount if not already mounted

# Succeeds when a filesystem is mounted exactly at $1 according to `mount`.
# The match is a fixed string anchored on " on <dir> type ", so regex
# metacharacters in the path (e.g. the dot in "ts.nas") and longer paths that
# merely share the prefix cannot produce a false positive.
is_mounted() {
  mount | grep -qF -- " on $1 type "
}

if is_mounted "${MOUNTPOINT}"; then
  echo "[sync-webgoggles] Already mounted at ${MOUNTPOINT}"
else
  echo "[sync-webgoggles] Mounting ${SHARE} → ${MOUNTPOINT}"
  if ! mkdir -p "${MOUNTPOINT}"; then
    echo "[sync-webgoggles] ERROR: Cannot create mountpoint ${MOUNTPOINT}" >&2
    exit 1
  fi

  # uid/gid together with forceuid/forcegid present the files as owned by the
  # invoking user; without a credentials file mount.cifs asks for the password.
  OPTS="username=schmeeve,uid=$(id -u),gid=$(id -g),forceuid,forcegid,nounix,serverino"
  if [[ -f "${CREDENTIALS}" ]]; then
    OPTS="${OPTS},credentials=${CREDENTIALS}"
  fi

  # Fail on a non-zero exit from mount, and also re-check the mount table in
  # case mount reported success but nothing is attached at the mountpoint.
  if ! sudo mount -t cifs "${SHARE}" "${MOUNTPOINT}" -o "${OPTS}" \
    || ! is_mounted "${MOUNTPOINT}"; then
    echo "[sync-webgoggles] ERROR: Failed to mount ${SHARE}" >&2
    exit 1
  fi
  echo "[sync-webgoggles] Mounted at ${MOUNTPOINT}"
fi

# 2. One-way rsync of SOURCE into the share subpath
MP="${MOUNTPOINT}/${SUBPATH}"
if ! mkdir -p "${MP}"; then
  echo "[sync-webgoggles] ERROR: Cannot create ${MP}" >&2
  exit 1
fi

echo "[sync-webgoggles] Syncing ${SOURCE}/ → ${MP}/"

# -a  archive mode (already implies -r; -r is kept from the original flags)
# -u  skip files that are newer on the destination
# --backup/--suffix  a destination file about to be replaced is first renamed
#     to "<name>-CONFLICT" (the suffix is appended after the extension)
rsync -vrau \
  --backup \
  --suffix="-CONFLICT" \
  --exclude=".DS_Store" \
  --progress --stats \
  "${SOURCE}/" \
  "${MP}/"
RSYNC_STATUS=$?

# The run continues after a failed transfer, as it always did, but the failure
# is now reported instead of being silently ignored.
if ((RSYNC_STATUS != 0)); then
  echo "[sync-webgoggles] WARNING: rsync exited with status ${RSYNC_STATUS}" >&2
fi

# 3. Cleanup: deduplicate and thin old screenshots on destination

#######################################
# Deduplicate and thin screenshots below a destination tree.
#
# Phase 1 removes screenshot files (including rsync "-CONFLICT" backups) whose
# content is byte-identical to another one, keeping the copy with the oldest
# modification time.
# Phase 2 looks at page directories laid out as
# .../sessions/<session>/<page>/ that contain both page-info.json and
# screenshot.png. For sessions whose earliest page is older than the age
# threshold it keeps one screenshot per interval and deletes the others.
#
# Arguments:
#   $1 - destination directory to clean
#   $2 - age threshold in days before a session is thinned (default 42)
#   $3 - seconds covered by each kept screenshot (default 120)
# Outputs:
#   Progress and summary lines on stdout.
# Returns:
#   Exit status of the embedded python3 program.
#######################################
cleanup_destination() {
  local dest="$1"
  local thin_days="${2:-42}"      # default 6 weeks
  local keep_interval="${3:-120}" # default 2 minutes

  echo "[sync-webgoggles] Running cleanup on ${dest}"

  # Pass the resolved locals rather than the raw "$2"/"$3" so the defaults
  # above actually apply when the optional arguments are omitted.
  python3 - "${dest}" "${thin_days}" "${keep_interval}" <<'PYEOF'
import os, json, time, sys, hashlib
from collections import defaultdict
from datetime import datetime

dest = sys.argv[1]
thin_days = int(sys.argv[2])
keep_interval = int(sys.argv[3])
if keep_interval <= 0:
    # The slot arithmetic below divides by keep_interval.
    sys.exit("[sync-webgoggles cleanup] ERROR: keep interval must be > 0 seconds")
cutoff = time.time() - thin_days * 86400

# rsync --backup --suffix=-CONFLICT appends the suffix to the whole file name
# (screenshot.png-CONFLICT). The infix spelling is still accepted so files
# that were named that way by other means keep being handled.
SCREENSHOT_NAMES = (
    'screenshot.png',
    'screenshot.png-CONFLICT',
    'screenshot-CONFLICT.png',
)


def file_md5(path, chunk_size=1 << 20):
    """Return the hex MD5 of a file, reading in chunks and closing the handle."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def mtime_or_inf(path):
    """Modification time of path; files that vanished sort last."""
    try:
        return os.path.getmtime(path)
    except OSError:
        return float('inf')


# --- Phase 1: Deduplicate by content hash ---
print("[sync-webgoggles cleanup] Deduplicating screenshots...")

# Collect all screenshot variants (CONFLICT copies included), bucketed by
# size first so only files that could possibly be identical get hashed.
size_groups = defaultdict(list)
for root, dirs, files in os.walk(dest):
    for f in files:
        if f in SCREENSHOT_NAMES:
            path = os.path.join(root, f)
            try:
                size_groups[os.path.getsize(path)].append(path)
            except OSError:
                pass  # best effort: file vanished or is unreadable

dedup_removed = 0
for size, paths in size_groups.items():
    if len(paths) < 2:
        continue
    hash_groups = defaultdict(list)
    for p in paths:
        try:
            hash_groups[file_md5(p)].append(p)
        except OSError:
            pass  # best effort: skip files that cannot be read
    for h, same_paths in hash_groups.items():
        if len(same_paths) < 2:
            continue
        # Keep the chronologically first one, delete the rest
        same_paths.sort(key=mtime_or_inf)
        for p in same_paths[1:]:
            try:
                os.remove(p)
                dedup_removed += 1
            except OSError:
                pass

print(f"[sync-webgoggles cleanup] Deduplication removed {dedup_removed} files")

# --- Phase 2: Thin old session screenshots ---
print("[sync-webgoggles cleanup] Thinning old session screenshots...")

# Collect all page dirs that have both page-info.json and screenshot.png,
# grouped by session (site/user/sessions/TS/)
sessions = defaultdict(list)  # session_dir -> [(page_dir, timestamp_epoch)]

for root, dirs, files in os.walk(dest):
    if 'page-info.json' not in files or 'screenshot.png' not in files:
        continue
    # Only process pages inside sessions/ structure
    parent = os.path.dirname(root)
    grandparent = os.path.dirname(parent)
    if os.path.basename(grandparent) != 'sessions':
        continue
    session_dir = parent  # .../sessions/2026-05-17T01-10-23-730Z
    info_path = os.path.join(root, 'page-info.json')
    try:
        with open(info_path) as f:
            info = json.load(f)
        ts_str = info.get('timestamp', '')
        if not ts_str:
            continue
        # Parse ISO-8601, handle Z suffix
        if ts_str.endswith('Z'):
            ts_str = ts_str[:-1] + '+00:00'
        ts = datetime.fromisoformat(ts_str).timestamp()
        sessions[session_dir].append((root, ts))
    except (json.JSONDecodeError, KeyError, ValueError, TypeError,
            AttributeError, OSError):
        # AttributeError/TypeError: JSON that is not an object, or a
        # timestamp that is not a string. Such pages are left untouched.
        pass

thin_removed = 0
thin_kept = 0
for session_dir, pages in sessions.items():
    pages.sort(key=lambda x: x[1])
    # Only thin sessions where the first page is older than cutoff
    if pages[0][1] > cutoff:
        continue
    session_start = pages[0][1]
    # Group pages into keep_interval-sized slots by proximity to ideal keep
    # times: slot N covers [start+N*K - K/2, start+N*K + K/2)
    slots = defaultdict(list)
    for page_dir, ts in pages:
        slot = int((ts - session_start + keep_interval / 2) / keep_interval)
        slots[slot].append((page_dir, ts))
    for slot, slot_pages in sorted(slots.items()):
        ideal = session_start + slot * keep_interval
        # Keep the screenshot closest to the ideal slot center
        best = min(slot_pages, key=lambda x: abs(x[1] - ideal))
        thin_kept += 1
        for page_dir, ts in slot_pages:
            if page_dir != best[0]:
                ss_path = os.path.join(page_dir, 'screenshot.png')
                try:
                    os.remove(ss_path)
                    thin_removed += 1
                except OSError:
                    pass

print(f"[sync-webgoggles cleanup] Thinning kept {thin_kept}, removed {thin_removed} screenshots")
print("[sync-webgoggles cleanup] Done")
PYEOF
}

# Deduplicate and thin the synced tree: 42-day age threshold, one screenshot
# kept per 120 seconds.
cleanup_destination "${MP}" 42 120

# Report how long the whole run took, measured from START.
finished_at="$(date +%s)"
DURATION=$((finished_at - START))
printf '%s\n' "[sync-webgoggles] Done in ${DURATION}s at $(date)"