mirror of
https://github.com/msitarzewski/agency-agents/
synced 2026-06-09 10:13:17 +00:00
feat: add agent originality check (script + CI + docs) (#560)
Adds scripts/check-agent-originality.sh, which flags new agents that substantially duplicate an existing one. It compares each candidate against the whole roster (and other files in the same change set) using entity-neutralized 8-word shingle overlap, so a find-replace "re-skin" that only swaps a country/platform name can't slip past review. - CI: new "Check agent originality" step in lint-agents.yml runs it on changed agent files; a >=40% match fails the build. - Docs: CONTRIBUTING.md gains a self-run "before submitting" step, a checklist item, and a "things we'll always close" bullet for re-skins. Calibration: across the existing 184-agent library the worst same-pair similarity is ~1.5% (median 0%), so the WARN >=20% / FAIL >=40% defaults leave a wide margin against false positives. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
083ce47e13
commit
5032f7e75c
Executable
+176
@@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# check-agent-originality.sh — Flag agent files that substantially duplicate
|
||||
# an existing agent (or another agent in the same change set).
|
||||
#
|
||||
# Why: a new agent should be genuinely new. Find-replace "re-skins" of an
|
||||
# existing agent (e.g. swapping a country/platform name) are easy to miss in
|
||||
# review because they're mergeable and well-formed — but they bloat the
|
||||
# library with duplicates. This compares each candidate against the whole
|
||||
# existing roster using entity-neutralized 8-word shingle overlap, so a
|
||||
# swapped proper noun can't hide the copy.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/check-agent-originality.sh [file ...]
|
||||
# With files: checks those agent .md files (used by CI on changed files).
|
||||
# With no args: checks every agent in the repo against every other (audit).
|
||||
#
|
||||
# Exit status:
# 0 all candidates below the FAIL threshold
# 1 at least one candidate at/above FAIL threshold (likely duplicate)
# 2 python3 is not available
|
||||
#
|
||||
# Tunables (env):
|
||||
# ORIGINALITY_FAIL default 40 — at/above this %, treated as a duplicate (exit 1)
|
||||
# ORIGINALITY_WARN default 20 — at/above this %, surfaced as a warning (no fail)
|
||||
#
|
||||
# Calibration: across the existing 184-agent library the worst same-pair
|
||||
# similarity is ~1.5% (median 0%). Anything in the double digits is a strong
|
||||
# anomaly; the defaults leave a wide safety margin against false positives.
|
||||
|
||||
# Strict mode: abort on command failure, on unset variables, and on a failing
# stage anywhere in a pipeline.
set -euo pipefail

# Resolve the directory holding this script and the repository root (its parent).
# `cd` prints the target directory on stdout when it resolves it through CDPATH,
# which would end up inside the captured value; discard that output so only
# `pwd` contributes. `--` protects against paths that start with a dash.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." >/dev/null && pwd)"

# The comparison itself is implemented in Python; bail out early (status 2)
# with a readable message when the interpreter is missing.
command -v python3 >/dev/null 2>&1 || {
  echo "ERROR: python3 is required for the originality check." >&2
  exit 2
}

# Hand the thresholds (with their defaults) and the repo root to the embedded
# Python program through its environment; file arguments are forwarded as-is.
# The quoted heredoc delimiter keeps the shell from expanding anything inside
# the Python source.
ORIGINALITY_FAIL="${ORIGINALITY_FAIL:-40}" \
ORIGINALITY_WARN="${ORIGINALITY_WARN:-20}" \
REPO_ROOT="$REPO_ROOT" \
python3 - "$@" <<'PYEOF'
|
||||
import os, re, sys, glob
|
||||
|
||||
# Absolute path of the repository root, exported by the shell wrapper.
REPO_ROOT = os.environ["REPO_ROOT"]


def _env_threshold(name):
    """Return the percentage threshold held in environment variable *name*.

    A value that is not a number is a configuration error, not a duplicate
    finding, so it is reported on stderr and the program exits with status 2
    instead of dying with a traceback and status 1.
    """
    raw = os.environ[name]
    try:
        return float(raw)
    except ValueError:
        print(f"ERROR: {name} must be a number, got {raw!r}.", file=sys.stderr)
        sys.exit(2)


# Similarity percentages at/above which a candidate fails or is warned about.
FAIL = _env_threshold("ORIGINALITY_FAIL")
WARN = _env_threshold("ORIGINALITY_WARN")
|
||||
|
||||
# Top-level directories (relative to the repo root) that are scanned for agents.
AGENT_DIRS = [
    "academic",
    "design",
    "engineering",
    "finance",
    "game-development",
    "marketing",
    "paid-media",
    "product",
    "project-management",
    "sales",
    "spatial-computing",
    "specialized",
    "strategy",
    "support",
    "testing",
]
|
||||
|
||||
# Country, nationality and platform names that get blanked out before
# comparison, so that a re-skin which swaps only these words still lines up
# with the agent it was copied from. Matching is whole-word and expects
# lower-cased input. Add new markets/platforms to the tuple as they appear.
_ENTITY_WORDS = (
    'vietnam', 'vietnamese', 'china', 'chinese', 'douyin', 'tiktok',
    'korea', 'korean', 'japan', 'japanese',
    'india', 'indian', 'indonesia', 'indonesian', 'thailand', 'thai',
    'philippines', 'filipino', 'brazil',
    'brazilian', 'mexico', 'mexican', 'wechat', 'weixin', 'weibo',
    'xiaohongshu', 'rednote', 'kuaishou',
    'bilibili', 'zhihu', 'baidu', 'shopee', 'lazada', 'zalo', 'tokopedia',
    'taobao', 'tmall', 'pinduoduo',
    'instagram', 'facebook', 'youtube', 'reels', 'shorts', 'linkedin',
    'twitter', 'threads', 'snapchat',
)
ENTITY = re.compile(r'\b(' + '|'.join(_ENTITY_WORDS) + r')\b')
|
||||
|
||||
def strip_frontmatter(t):
    """Return *t* without its leading frontmatter block.

    A frontmatter block is recognised only when the first line is ``---`` and
    a later line is again ``---`` (surrounding whitespace ignored); the text
    after that closing line is returned. A ``---`` that merely appears inside
    a line, e.g. in a description value, is not treated as a delimiter. When
    there is no opening or no closing delimiter line, *t* is returned unchanged.
    """
    lines = t.split('\n')
    if lines[0].strip() != '---':
        return t
    for idx in range(1, len(lines)):
        if lines[idx].strip() == '---':
            # Everything after the closing delimiter line is the body.
            return '\n'.join(lines[idx + 1:])
    # Opening delimiter without a closing one: treat the whole text as body.
    return t
|
||||
|
||||
def tokens(text):
    """Split *text* into normalized word tokens.

    The frontmatter is removed, the rest is lower-cased, every ENTITY match is
    blanked out, and any character other than ``a-z``, ``0-9`` or a space is
    turned into a space before splitting on whitespace.
    """
    body = strip_frontmatter(text)
    lowered = body.lower()
    neutralized = ENTITY.sub(' ', lowered)
    cleaned = re.sub(r'[^a-z0-9 ]', ' ', neutralized)
    return cleaned.split()
|
||||
|
||||
def shingles(words, k=8):
    """Return the set of all runs of *k* consecutive words, space-joined.

    A sequence that yields no window (fewer than *k* words) gives an empty set.
    """
    window_count = len(words) - k + 1
    if window_count <= 0:
        return set()
    return {' '.join(words[start:start + k]) for start in range(window_count)}
|
||||
|
||||
def jaccard(a, b):
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    If either set is empty the similarity is defined as 0.0.
    """
    if not (a and b):
        return 0.0
    shared = a & b
    combined = a | b
    return len(shared) / len(combined)
|
||||
|
||||
def is_agent(path):
    """Return True when the first line of the file at *path* is ``---``.

    The file is decoded as UTF-8 regardless of the process locale. A leading
    byte-order mark is ignored (``utf-8-sig``) so a BOM cannot hide the
    frontmatter marker, and undecodable bytes are replaced rather than raised,
    so a stray byte cannot abort the whole run. A file that cannot be opened
    or read is reported as not being an agent.
    """
    try:
        with open(path, encoding='utf-8-sig', errors='replace') as fh:
            first_line = fh.readline()
    except OSError:
        return False
    return first_line.strip() == '---'
|
||||
|
||||
def rel(p):
    """Return *p* expressed relative to REPO_ROOT, for display.

    Falls back to *p* itself when ``os.path.relpath`` raises ValueError.
    """
    try:
        shown = os.path.relpath(p, REPO_ROOT)
    except ValueError:
        shown = p
    return shown
|
||||
|
||||
# --- Build the existing-library corpus -------------------------------------
# Maps the absolute path of every agent file found under AGENT_DIRS to the
# shingle set of its body.
corpus = {}
for d in AGENT_DIRS:
    pattern = os.path.join(REPO_ROOT, d, '**', '*.md')
    # glob returns matches in arbitrary order; sorting keeps tie-breaking
    # between equally similar files and the audit output order reproducible.
    for f in sorted(glob.glob(pattern, recursive=True)):
        if not is_agent(f):
            continue
        # Explicit encoding avoids locale-dependent decoding; the context
        # manager closes each file promptly instead of leaving it to the GC.
        with open(f, encoding='utf-8', errors='replace') as fh:
            corpus[os.path.abspath(f)] = shingles(tokens(fh.read()))
|
||||
|
||||
# --- Determine candidates ---------------------------------------------------
# With file arguments: check exactly those files. Without: audit the whole
# corpus against itself.
args = sys.argv[1:]
if args:
    candidates = []
    seen = set()
    for a in args:
        p = os.path.abspath(a)  # relative paths resolve against the cwd
        if not os.path.isfile(p) and not os.path.isabs(a):
            # Fallback for invocations from outside the repo root: the path
            # may have been given relative to the repository (presumably how
            # CI lists changed files — verify against the workflow).
            alt = os.path.abspath(os.path.join(REPO_ROOT, a))
            if os.path.isfile(alt):
                p = alt
        if not os.path.isfile(p):
            print(f" skip (not found): {a}")
            continue
        if not is_agent(p):
            print(f" skip (no frontmatter, not an agent): {rel(p)}")
            continue
        if p in seen:
            # The same file passed twice must not be reported or counted twice.
            continue
        seen.add(p)
        candidates.append(p)
else:
    candidates = list(corpus)  # audit mode: everything vs everything

if not candidates:
    print("No agent files to check.")
    sys.exit(0)
|
||||
|
||||
def _load_shingles(path):
    """Read the file at *path* and return the shingle set of its body."""
    with open(path, encoding='utf-8', errors='replace') as fh:
        return shingles(tokens(fh.read()))


# Shingle set per candidate. A candidate that is already part of the corpus
# reuses the set computed there (even when that set is empty) instead of
# re-reading the file.
cand_sh = {}
for p in candidates:
    cached = corpus.get(p)
    cand_sh[p] = cached if cached is not None else _load_shingles(p)
cand_set = set(candidates)

# Only explicitly passed files form a "change set"; in audit mode every file
# is a candidate and the suffix would be noise.
same_set_suffix = " (same change set)" if args else ""

# (candidate, closest match, similarity %) for each finding.
fails, warns = [], []

for p in candidates:
    sh = cand_sh[p]
    best_name, best_score = "", 0.0

    # vs existing library. Candidates (including p itself) are skipped here:
    # they are compared in the next loop, which labels them as belonging to
    # the same change set. Without this skip a candidate that also lives in
    # the corpus would always be matched here first and never get the label.
    for cf, csh in corpus.items():
        if cf in cand_set:
            continue
        s = jaccard(sh, csh)
        if s > best_score:
            best_name, best_score = rel(cf), s

    # vs other candidates in this same change set
    for op in candidates:
        if op == p:
            continue
        s = jaccard(sh, cand_sh[op])
        if s > best_score:
            best_name, best_score = rel(op) + same_set_suffix, s

    pct = best_score * 100
    if pct >= FAIL:
        tag = "FAIL "
        fails.append((rel(p), best_name, pct))
    elif pct >= WARN:
        tag = "WARN "
        warns.append((rel(p), best_name, pct))
    else:
        tag = "OK "
    print(f" [{tag}] {pct:5.1f}% {rel(p)}")
    if best_name:
        print(f" closest: {best_name}")
|
||||
|
||||
def _say(*lines):
    """Print each given line on its own output line."""
    for line in lines:
        print(line)


# Summary header: the thresholds in effect for this run.
_say(
    "",
    f"Thresholds: WARN >= {WARN:.0f}%, FAIL >= {FAIL:.0f}% "
    f"(existing-library baseline max ~1.5%)",
)

# Any failure lists the offending agents with guidance and ends the run with
# exit status 1.
if fails:
    _say(
        "",
        f"FAILED: {len(fails)} agent(s) substantially duplicate existing content:",
    )
    for name, match, pct in fails:
        _say(f" - {name} ~{pct:.0f}% like {match}")
    _say(
        "",
        "A new agent should be genuinely new. If this is intended market/platform",
        "localization, make the body distinct (different platforms, tactics, examples)",
        "rather than a find-replace of an existing agent.",
    )
    sys.exit(1)

# Warnings are surfaced but never block; the run still passes with status 0.
if warns:
    _say(f"\n{len(warns)} warning(s) — review for overlap, but not blocking.")
_say("\nPASSED")
sys.exit(0)
|
||||
PYEOF
|
||||
Reference in New Issue
Block a user