msitarzewski--agency-agents/scripts/check-agent-originality.sh

#!/usr/bin/env bash
#
# check-agent-originality.sh — Flag agent files that substantially duplicate
# an existing agent (or another agent in the same change set).
#
# Why: a new agent should be genuinely new. Find-replace "re-skins" of an
# existing agent (e.g. swapping a country/platform name) are easy to miss in
# review because they're mergeable and well-formed — but they bloat the
# library with duplicates. This compares each candidate against the whole
# existing roster using entity-neutralized 8-word shingle overlap, so a
# swapped proper noun can't hide the copy.
#
# Usage:
#   ./scripts/check-agent-originality.sh [file ...]
#     With files: checks those agent .md files (used by CI on changed files).
#     With no args: checks every agent in the repo against every other (audit).
#
# Exit status:
#   0  all candidates below the FAIL threshold
#   1  at least one candidate at/above FAIL threshold (likely duplicate)
#   2  python3 is not installed, so the check could not run
#
# Tunables (env):
#   ORIGINALITY_FAIL   default 40  — at/above this %, treated as a duplicate (exit 1)
#   ORIGINALITY_WARN   default 20  — at/above this %, surfaced as a warning (no fail)
#
# Calibration: across the existing 184-agent library the worst same-pair
# similarity is ~1.5% (median 0%). Anything in the double digits is a strong
# anomaly; the defaults leave a wide safety margin against false positives.

# Strict mode: stop on errors, on unset variables, and on failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Absolute directory of this script, and its parent directory (used as the
# repository root by the checker below).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# The checker is written in Python; without an interpreter, exit with status 2.
if ! command -v python3 >/dev/null 2>&1; then
  echo "ERROR: python3 is required for the originality check." >&2
  exit 2
fi

# Run the embedded Python checker, reading its source from the heredoc below.
# The two thresholds fall back to 40 / 20 when unset or empty, and REPO_ROOT is
# handed over through the environment of the python3 process only. The quoted
# heredoc delimiter stops the shell from expanding anything inside the Python
# source, and "$@" forwards this script's file arguments unchanged.
ORIGINALITY_FAIL="${ORIGINALITY_FAIL:-40}" \
ORIGINALITY_WARN="${ORIGINALITY_WARN:-20}" \
REPO_ROOT="$REPO_ROOT" \
python3 - "$@" <<'PYEOF'
import os, re, sys, glob

# Settings supplied by the shell wrapper through the environment. All three
# are required; the thresholds are percentages parsed as floats.
REPO_ROOT = os.environ["REPO_ROOT"]
FAIL, WARN = (float(os.environ[key])
              for key in ("ORIGINALITY_FAIL", "ORIGINALITY_WARN"))

# Directories under REPO_ROOT that are searched for agent files.
AGENT_DIRS = [
    "academic", "design", "engineering", "finance", "game-development",
    "marketing", "paid-media", "product", "project-management", "sales",
    "spatial-computing", "specialized", "strategy", "support", "testing",
]

# Market and platform names that are blanked out before comparison, so that a
# find-replace re-skin (same text, different country/platform) still scores as
# a near-duplicate. Append to the tuple as new markets appear. The pattern has
# no case-insensitive flag, so it only matches lower-case text.
_ENTITY_WORDS = (
    'vietnam', 'vietnamese', 'china', 'chinese', 'douyin', 'tiktok',
    'korea', 'korean', 'japan', 'japanese',
    'india', 'indian', 'indonesia', 'indonesian', 'thailand', 'thai',
    'philippines', 'filipino', 'brazil',
    'brazilian', 'mexico', 'mexican', 'wechat', 'weixin', 'weibo',
    'xiaohongshu', 'rednote', 'kuaishou',
    'bilibili', 'zhihu', 'baidu', 'shopee', 'lazada', 'zalo', 'tokopedia',
    'taobao', 'tmall', 'pinduoduo',
    'instagram', 'facebook', 'youtube', 'reels', 'shorts', 'linkedin',
    'twitter', 'threads', 'snapchat',
)
ENTITY = re.compile(r'\b(' + '|'.join(_ENTITY_WORDS) + r')\b')

def strip_frontmatter(t):
    """Return `t` without its leading frontmatter block.

    Frontmatter is recognised when `t` starts with '---' and a later line
    consists of '---' alone (trailing spaces/tabs and a CR are tolerated).
    Everything after that closing fence is returned, starting with the line
    break that follows it. Text without an opening fence, or without a closing
    fence on its own line, is returned unchanged.

    The closing fence must sit on its own line so that a '---' appearing
    inside a frontmatter value (e.g. in a description) is not mistaken for the
    end of the block, which would leak the remaining metadata into the body.
    """
    if not t.startswith('---'):
        return t
    first_newline = t.find('\n')
    if first_newline == -1:
        return t
    # Search only from the second line on; '^' still anchors at real line
    # starts because the search position directly follows a newline.
    closing = re.compile(r'^---[ \t]*\r?$', re.M).search(t, first_newline + 1)
    if closing is None:
        return t
    return t[closing.start() + 3:]

def tokens(text):
    """Turn raw agent text into a list of normalized word tokens.

    The frontmatter is dropped and the rest lower-cased; every ENTITY match
    is replaced by a space, as is any character other than a-z, 0-9 or a
    space; the result is split on whitespace.
    """
    body = strip_frontmatter(text).lower()
    neutralized = ENTITY.sub(' ', body)
    cleaned = re.sub(r'[^a-z0-9 ]', ' ', neutralized)
    return cleaned.split()

def shingles(words, k=8):
    """Return the set of all runs of `k` consecutive words, space-joined.

    A word list shorter than `k` yields an empty set.
    """
    window_count = len(words) - k + 1
    if window_count <= 0:
        return set()
    return {' '.join(words[start:start + k]) for start in range(window_count)}

def jaccard(a, b):
    """Jaccard similarity of two sets: |a & b| / |a | b|.

    Returns 0.0 when either set is empty.
    """
    if not a or not b:
        return 0.0
    shared = a & b
    combined = a | b
    return len(shared) / len(combined)

def is_agent(path):
    """Return True when the file at `path` opens with a frontmatter fence.

    A file counts as an agent when its first line, stripped of surrounding
    whitespace, is exactly '---'. Paths that cannot be opened or read yield
    False.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    answer does not depend on the process locale and a stray invalid byte
    cannot raise UnicodeDecodeError (which is not an OSError and would
    otherwise abort the whole run).
    """
    try:
        with open(path, encoding='utf-8', errors='replace') as fh:
            return fh.readline().strip() == '---'
    except OSError:
        return False

def rel(p):
    """Return `p` relative to REPO_ROOT for display.

    Falls back to `p` unchanged when os.path.relpath raises ValueError.
    """
    try:
        shown = os.path.relpath(p, REPO_ROOT)
    except ValueError:
        shown = p
    return shown

# --- Build the existing-library corpus -------------------------------------
# Maps absolute path -> shingle set for every agent file found (recursively)
# under the AGENT_DIRS of REPO_ROOT.
#  * The directory part of the pattern is glob-escaped so a checkout path
#    containing characters such as '[' or '*' is matched literally.
#  * Matches are visited in sorted order so audit output and tie-breaking
#    between equally similar files do not depend on filesystem order.
#  * Files are read as UTF-8 (undecodable bytes replaced) inside a context
#    manager: decoding is locale-independent and handles are closed promptly.
corpus = {}
for d in AGENT_DIRS:
    pattern = os.path.join(glob.escape(os.path.join(REPO_ROOT, d)), '**', '*.md')
    for f in sorted(glob.glob(pattern, recursive=True)):
        if is_agent(f):
            with open(f, encoding='utf-8', errors='replace') as fh:
                corpus[os.path.abspath(f)] = shingles(tokens(fh.read()))

# --- Determine candidates ---------------------------------------------------
# With file arguments: each one is resolved to an absolute path and kept only
# if it exists and looks like an agent; skipped arguments are reported.
# Without arguments: audit mode, every corpus entry is a candidate.
args = sys.argv[1:]
if not args:
    candidates = list(corpus.keys())   # audit mode: everything vs everything
else:
    candidates = []
    for arg in args:
        # abspath() resolves relative paths against the current directory.
        path = os.path.abspath(arg)
        if not os.path.isfile(path):
            print(f"  skip (not found): {arg}")
        elif not is_agent(path):
            print(f"  skip (no frontmatter, not an agent): {rel(path)}")
        else:
            candidates.append(path)

# Nothing left to check is a success, not an error.
if not candidates:
    print("No agent files to check.")
    sys.exit(0)

def _load_shingles(path):
    """Read `path` as UTF-8 (undecodable bytes replaced) and shingle its text."""
    # Context manager closes the handle; explicit encoding avoids depending on
    # the process locale.
    with open(path, encoding='utf-8', errors='replace') as fh:
        return shingles(tokens(fh.read()))

# Shingle set per candidate: reuse the corpus entry when the candidate is
# already part of the library, otherwise read the file.
cand_sh = {p: corpus[p] if p in corpus else _load_shingles(p) for p in candidates}

# (candidate, closest match, percent) triples collected for the final report.
fails, warns = [], []

for p in candidates:
    sh = cand_sh[p]
    best_name, best_score = "", 0.0
    # vs existing library (exclude the candidate itself by path)
    for cf, csh in corpus.items():
        if cf == p:
            continue
        s = jaccard(sh, csh)
        if s > best_score:
            best_name, best_score = rel(cf), s
    # vs other candidates in this same change set. Candidates that are also in
    # the corpus were already scored by the loop above (and could never win
    # here, since only a strictly higher score replaces the best match), so
    # they are skipped instead of being compared a second time.
    for op in candidates:
        if op == p or op in corpus:
            continue
        s = jaccard(sh, cand_sh[op])
        if s > best_score:
            best_name, best_score = rel(op) + " (same change set)", s

    # Classify by the single highest similarity found, as a percentage.
    pct = best_score * 100
    tag = "OK   "
    if pct >= FAIL:
        tag = "FAIL "
        fails.append((rel(p), best_name, pct))
    elif pct >= WARN:
        tag = "WARN "
        warns.append((rel(p), best_name, pct))
    print(f"  [{tag}] {pct:5.1f}%  {rel(p)}")
    if best_name:
        print(f"            closest: {best_name}")

# Final report: thresholds first, then either the failure list (exit 1) or an
# optional warning count followed by PASSED (exit 0).
summary = [
    "",
    f"Thresholds: WARN >= {WARN:.0f}%, FAIL >= {FAIL:.0f}%  "
    f"(existing-library baseline max ~1.5%)",
]
exit_code = 0

if fails:
    exit_code = 1
    summary.append("")
    summary.append(f"FAILED: {len(fails)} agent(s) substantially duplicate existing content:")
    summary.extend(f"  - {name}  ~{pct:.0f}% like  {match}" for name, match, pct in fails)
    summary.extend([
        "",
        "A new agent should be genuinely new. If this is intended market/platform",
        "localization, make the body distinct (different platforms, tactics, examples)",
        "rather than a find-replace of an existing agent.",
    ])
else:
    # Warnings are informational only and never change the exit status.
    if warns:
        summary.append(f"\n{len(warns)} warning(s) — review for overlap, but not blocking.")
    summary.append("\nPASSED")

print("\n".join(summary))
sys.exit(exit_code)
PYEOF