Skip to content

Index Health Monitor #181

Index Health Monitor

Index Health Monitor #181

name: Index Health Monitor

# Triggers: a fixed schedule plus manual dispatch with two boolean switches.
on:
  schedule:
    # Minute 0 of every fourth hour.
    - cron: "0 */4 * * *"
  workflow_dispatch:
    inputs:
      force_notify:
        description: "Send notifications regardless of state machine"
        type: boolean
        default: false
      dry_run:
        description: "Run checks but do not notify or update state"
        type: boolean
        default: false

# The workflow token is limited to read access on repository contents.
permissions:
  contents: read
# Job definitions for this workflow.
jobs:
# The monitor job; its steps follow below.
monitor:
runs-on: ubuntu-latest
# Cancel the job if it has not finished within ten minutes.
timeout-minutes: 10
steps:
# Bring the previously saved monitor state back onto the runner.
# The primary key is only a prefix, so the lookup relies on the
# restore-keys prefix match to find a saved state entry.
- name: Restore state from cache
  uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
  with:
    path: /tmp/health-monitor-state.json
    key: index-health-state-
    restore-keys: |
      index-health-state-
# Main step: probes each instance, runs the notification state machine, and
# writes the state and notification files under /tmp for the later steps.
- name: Run health probes
env:
# Sent as the bearer token on the GitHub API requests made by the script.
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Resolves to 'false' when the input is absent or false.
FORCE_NOTIFY: ${{ inputs.force_notify || 'false' }}
# NOTE(review): no visible line of the script reads DRY_RUN; dry-run
# behavior depends solely on the `if:` conditions of the later steps.
DRY_RUN: ${{ inputs.dry_run || 'false' }}
run: |
# Abort on command failure, unset variables, and failed pipeline stages.
set -euo pipefail

# Files shared with the later workflow steps.
STATE_FILE="/tmp/health-monitor-state.json"
NOTIFY_FILE="/tmp/health-monitor-notifications.txt"

# Link to this workflow run, embedded in alert messages.
RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"

# Run timestamp as epoch seconds and as an ISO-8601 UTC string.
NOW=$(date -u +%s)
NOW_ISO=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# UTF-8 encoding of the bullet character used in issue lists.
BULLET=$'\xe2\x80\xa2'

# Start every run with an empty notification queue.
: > "$NOTIFY_FILE"

# Fall back to an empty state object unless a parsable state file exists.
if ! { [ -f "$STATE_FILE" ] && jq empty "$STATE_FILE" 2>/dev/null; }; then
  echo '{}' > "$STATE_FILE"
fi

# --- Instance configurations ---
# Base URL of each instance; the health probe requests "<url>/health".
declare -A INSTANCE_URLS=(
  [copilotkit-docs]="https://mcp.copilotkit.ai"
  [pathfinder-docs]="https://mcp.pathfinder.copilotkit.dev"
  [aimock-docs]="https://mcp.aimock.copilotkit.dev" # TODO: verify domain
)

# Minimum total chunk count below which an instance is reported unhealthy.
declare -A CHUNK_FLOORS=(
  [copilotkit-docs]=1000
  [pathfinder-docs]=50
  [aimock-docs]=50
)

# Per-instance source lists; each entry is "source:owner/repo:branch".
COPILOTKIT_DOCS_SOURCES=(
  "docs:CopilotKit/CopilotKit:main"
  "code:CopilotKit/CopilotKit:main"
  "ag-ui-docs:ag-ui-protocol/ag-ui:main"
  "ag-ui-code:ag-ui-protocol/ag-ui:main"
)
PATHFINDER_DOCS_SOURCES=("pathfinder-docs:CopilotKit/pathfinder:main")
AIMOCK_DOCS_SOURCES=(
  "docs:CopilotKit/aimock:main"
  "code:CopilotKit/aimock:main"
)
# --- GitHub API: prefetch HEAD commits ---
# JSON object mapping "owner/repo:branch" to the first 8 characters of that
# branch's HEAD SHA. An empty string records a lookup that failed.
HEAD_COMMITS_FILE="/tmp/head-commits.json"

# prefetch_head_commits ENTRY...
#   ENTRY has the form "source:owner/repo:branch". Every distinct
#   owner/repo:branch pair is looked up once through the GitHub commits API
#   and the result is written to $HEAD_COMMITS_FILE, replacing its previous
#   contents. A failed lookup is stored as "" and logged as a warning; it
#   never aborts the script.
prefetch_head_commits() {
  local entries=("$@")
  local -A seen=()
  local entry remainder repo branch cache_key sha api_response tmp

  echo '{}' > "$HEAD_COMMITS_FILE"

  for entry in "${entries[@]}"; do
    # Drop the leading "source:" field, then split "owner/repo:branch".
    remainder="${entry#*:}"
    repo="${remainder%%:*}"
    branch="${remainder##*:}"
    cache_key="${repo}:${branch}"

    # Several sources can share one repo/branch; query each pair only once.
    if [ -n "${seen[$cache_key]+x}" ]; then
      continue
    fi
    seen[$cache_key]=1

    sha=""
    if api_response=$(curl -sf --max-time 30 \
        -H "Authorization: Bearer $GITHUB_TOKEN" \
        "https://api.github.com/repos/${repo}/commits/${branch}" 2>/dev/null); then
      # This function runs in the main shell, where `set -e` and pipefail are
      # active: a body jq cannot parse must degrade to an empty SHA rather
      # than terminate the whole monitor before any instance is checked.
      sha=$(jq -r '(.sha // "") | tostring | .[0:8]' <<<"$api_response" 2>/dev/null) || sha=""
    fi

    if [ -z "$sha" ]; then
      echo "::warning::Could not resolve HEAD commit for ${repo}@${branch}"
    fi

    tmp=$(jq --arg k "$cache_key" --arg v "$sha" '.[$k] = $v' "$HEAD_COMMITS_FILE")
    echo "$tmp" > "$HEAD_COMMITS_FILE"
  done
}
# get_head_commit REPO BRANCH
#   Prints the value stored under "REPO:BRANCH" in $HEAD_COMMITS_FILE.
#   Prints nothing when that key is missing or null.
get_head_commit() {
  jq -r --arg repo "$1" --arg branch "$2" \
    '.[$repo + ":" + $branch] // empty' "$HEAD_COMMITS_FILE"
}
# --- Check one instance ---
# Prints the given issue strings as one compact JSON array ("[]" when none).
_issues_to_json() {
  if [ "$#" -eq 0 ]; then
    echo "[]"
    return 0
  fi
  printf '%s\n' "$@" | jq -R . | jq -sc .
}

# Prints the three result lines: ISSUES JSON, total chunks, source count.
_emit_check_result() {
  echo "ISSUES:$1"
  echo "TOTAL_CHUNKS:$2"
  echo "SOURCE_COUNT:$3"
}

# check_instance INSTANCE URL CHUNK_FLOOR SOURCE_ENTRY...
#   Requests URL/health and prints exactly three lines on stdout:
#     ISSUES:<compact JSON array of issue strings>
#     TOTAL_CHUNKS:<integer>
#     SOURCE_COUNT:<integer>
#   SOURCE_ENTRY has the form "source:owner/repo:branch"; only sources named
#   in these entries are inspected. Reads the global NOW (epoch seconds) and
#   calls get_head_commit for the drift check. INSTANCE is accepted for the
#   caller's convenience and is not otherwise used.
check_instance() {
  local instance="$1"
  local url="$2"
  local chunk_floor="$3"
  shift 3
  local sources=("$@")
  local issues=()
  local total_chunks=0
  local source_count=0
  local health_json=""
  local entry

  # 1. Liveness check -- nothing else can be evaluated without a response.
  if ! health_json=$(curl -sf --max-time 30 "${url}/health" 2>/dev/null); then
    issues+=("liveness: unreachable (HTTP error or timeout)")
    _emit_check_result "$(_issues_to_json "${issues[@]}")" 0 0
    return 0
  fi

  # 2. The body must be valid JSON.
  if ! jq empty <<<"$health_json" 2>/dev/null; then
    issues+=("liveness: invalid JSON response")
    _emit_check_result "$(_issues_to_json "${issues[@]}")" 0 0
    return 0
  fi

  # 3. Service status
  local status
  status=$(jq -r '.status // "unknown"' <<<"$health_json")
  if [ "$status" != "ok" ]; then
    issues+=("service degraded: ${status}")
  fi

  # 4. Source errors -- only for sources that appear in our mapping.
  local known_keys=()
  for entry in "${sources[@]}"; do
    known_keys+=("${entry%%:*}")
  done

  local source_line src_key src_status src_error known k
  while IFS= read -r source_line; do
    src_key=$(jq -r '.key' <<<"$source_line")
    src_status=$(jq -r '.status' <<<"$source_line")
    src_error=$(jq -r '.error // empty' <<<"$source_line")

    known=false
    for k in "${known_keys[@]}"; do
      if [ "$k" = "$src_key" ]; then
        known=true
        break
      fi
    done
    [ "$known" = true ] || continue

    if [ "$src_status" = "error" ]; then
      issues+=("source error: ${src_key} -- ${src_error}")
    fi
  done < <(jq -c '.index.sources[]? // empty' <<<"$health_json")

  # 5. Commit drift
  local remainder src_repo src_branch src_data indexed_commit head_commit
  local last_indexed_ts indexed_epoch age_hours
  for entry in "${sources[@]}"; do
    src_key="${entry%%:*}"
    remainder="${entry#*:}"
    src_repo="${remainder%%:*}"
    src_branch="${remainder##*:}"

    # Sources missing from the health response are skipped here.
    src_data=$(jq -c --arg key "$src_key" '.index.sources[]? | select(.key == $key)' <<<"$health_json" 2>/dev/null)
    [ -n "$src_data" ] || continue

    # A source that is mid-reindex is expected to lag; skip it.
    src_status=$(jq -r '.status // empty' <<<"$src_data")
    [ "$src_status" != "indexing" ] || continue

    indexed_commit=$(jq -r '.commit // empty' <<<"$src_data")
    [ -n "$indexed_commit" ] || continue

    head_commit=$(get_head_commit "$src_repo" "$src_branch")
    if [ -z "$head_commit" ]; then
      issues+=("commit drift: ${src_key} -- could not fetch HEAD for ${src_repo}@${src_branch} (GitHub API error)")
      continue
    fi

    # The cached HEAD SHA is abbreviated. Compare over its length so that an
    # instance reporting a longer (e.g. full) SHA for the same commit is not
    # treated as permanently out of date.
    if [ "${indexed_commit:0:${#head_commit}}" != "$head_commit" ]; then
      last_indexed_ts=$(jq -r '.last_indexed // empty' <<<"$src_data")
      if [ -n "$last_indexed_ts" ]; then
        if ! indexed_epoch=$(date -d "$last_indexed_ts" +%s 2>/dev/null); then
          issues+=("commit drift: ${src_key} -- indexed ${indexed_commit}, HEAD is ${head_commit} (could not parse last_indexed timestamp '${last_indexed_ts}')")
          continue
        fi
        # Drift is only reported once the last index run is 25h old or more.
        age_hours=$(( (NOW - indexed_epoch) / 3600 ))
        if [ "$age_hours" -ge 25 ]; then
          issues+=("commit drift: ${src_key} -- indexed ${indexed_commit}, HEAD is ${head_commit} (last indexed ${age_hours}h ago)")
        fi
      else
        issues+=("commit drift: ${src_key} -- indexed ${indexed_commit}, HEAD is ${head_commit} (last_indexed timestamp unavailable)")
      fi
    fi
  done

  # 6. Chunk floor. A value that is not an integer cannot be compared with
  # `-lt` (the test errors out and the check would be skipped silently), so
  # report it as an issue instead.
  total_chunks=$(jq -r '.index.total_chunks // 0' <<<"$health_json")
  if [[ "$total_chunks" =~ ^-?[0-9]+$ ]]; then
    if [ "$total_chunks" -lt "$chunk_floor" ]; then
      issues+=("chunk count: ${total_chunks} below minimum ${chunk_floor}")
    fi
  else
    issues+=("chunk count: unexpected value '${total_chunks}' (expected an integer)")
    total_chunks=0
  fi

  # Count sources
  source_count=$(jq '[.index.sources[]?] | length' <<<"$health_json")
  source_count="${source_count:-0}"

  # Output results
  _emit_check_result "$(_issues_to_json "${issues[@]}")" "$total_chunks" "$source_count"
}
# --- State machine ---
# Prints the value of the first "NAME:value" line in OUTPUT, or nothing when
# no such line exists. Always succeeds, so a missing line cannot trip
# `set -e`/pipefail in the caller.
_check_output_field() {
  local name="$1" line
  while IFS= read -r line; do
    if [[ "$line" == "${name}:"* ]]; then
      printf '%s\n' "${line#"${name}:"}"
      return 0
    fi
  done <<<"$2"
  return 0
}

# process_instance INSTANCE URL CHUNK_FLOOR SOURCE_ENTRY...
#   Runs check_instance, derives the new status (green when there are no
#   issues, red otherwise), decides from the stored status whether to notify,
#   appends any notification as one JSON line to $NOTIFY_FILE, and writes the
#   instance's updated entry into $STATE_FILE.
#   Reads the globals STATE_FILE, NOTIFY_FILE, NOW, NOW_ISO, BULLET, RUN_URL
#   and FORCE_NOTIFY. Delivery timestamps (notifications_24h, last_notified)
#   are only pruned/carried over here; this function never appends to them.
process_instance() {
  local instance="$1"
  local url="$2"
  local chunk_floor="$3"
  shift 3
  local sources=("$@")

  echo "=== Checking ${instance} ==="

  # Run checks and capture output
  local check_output
  check_output=$(check_instance "$instance" "$url" "$chunk_floor" "${sources[@]}")

  local issues_json total_chunks source_count
  issues_json=$(_check_output_field ISSUES "$check_output")
  total_chunks=$(_check_output_field TOTAL_CHUNKS "$check_output")
  source_count=$(_check_output_field SOURCE_COUNT "$check_output")

  # A missing or malformed ISSUES line means the probe itself misbehaved;
  # surface that as an issue rather than aborting or reading it as healthy.
  if ! jq -e 'type == "array"' <<<"$issues_json" >/dev/null 2>&1; then
    issues_json='["monitor error: health check produced no parsable result"]'
  fi
  total_chunks="${total_chunks:-0}"
  source_count="${source_count:-0}"

  local issue_count
  issue_count=$(jq 'length' <<<"$issues_json")
  echo " Issues found: ${issue_count}"
  if [ "$issue_count" -gt 0 ]; then
    jq -r '.[] | " - \(.)"' <<<"$issues_json"
  fi

  # Determine new status
  local new_status="green"
  if [ "$issue_count" -gt 0 ]; then
    new_status="red"
  fi

  # Read existing state for this instance. Anything that is not an object is
  # replaced by the "never seen" default.
  local old_state old_status old_since
  old_state=$(jq -c --arg inst "$instance" \
    '(.[$inst] | objects) // {"status":"unknown","since":"","last_notified":"","notifications_24h":[],"issues":[]}' \
    "$STATE_FILE")
  old_status=$(jq -r '.status // "unknown"' <<<"$old_state")
  old_since=$(jq -r '.since // empty' <<<"$old_state")

  # A stored status outside the known set would match no transition below and
  # silently suppress alerts; treat it as never seen.
  case "$old_status" in
    unknown|green|red) ;;
    *)
      echo " Unrecognized stored status '${old_status}'; treating as unknown"
      old_status="unknown"
      ;;
  esac
  echo " State transition: ${old_status} -> ${new_status}"

  # Delivery timestamps from the last 24 hours; used for the rate limit and
  # carried into the new state.
  local new_notifications_24h
  new_notifications_24h=$(jq -c --argjson now "$NOW" \
    '[.notifications_24h[]? | select((. | tonumber) > ($now - 86400))]' <<<"$old_state")

  # Determine if we should notify
  local should_notify=false
  local notify_type=""
  case "${old_status}:${new_status}" in
    unknown:green)
      should_notify=true
      notify_type="baseline"
      ;;
    unknown:red|green:red)
      should_notify=true
      notify_type="alert"
      ;;
    green:green)
      should_notify=false
      ;;
    red:green)
      should_notify=true
      notify_type="recovery"
      ;;
    red:red)
      # Rate-limit: max 2 notifications per 24h
      local recent_count
      recent_count=$(jq 'length' <<<"$new_notifications_24h")
      if [ "$recent_count" -lt 2 ]; then
        should_notify=true
        notify_type="repeat"
      else
        echo " Notification suppressed (rate limit: ${recent_count} in last 24h)"
      fi
      ;;
  esac

  # Force notify override: pick a message type when the state machine did
  # not choose one.
  if [ "$FORCE_NOTIFY" = "true" ]; then
    should_notify=true
    if [ -z "$notify_type" ]; then
      if [ "$new_status" = "green" ]; then
        notify_type="baseline"
      else
        notify_type="alert"
      fi
    fi
  fi

  # Build notification message
  if [ "$should_notify" = true ]; then
    local message=""
    local issues_bullets=""
    if [ "$issue_count" -gt 0 ]; then
      issues_bullets=$(jq -r --arg b "$BULLET" '.[] | "\($b) \(.)"' <<<"$issues_json")
    fi

    case "$notify_type" in
      alert|repeat)
        # An alert for an instance that was already red (forced notify) uses
        # the same wording as a repeat.
        if [ "$notify_type" = "repeat" ] || [ "$old_status" = "red" ]; then
          message=$(printf 'Pathfinder index health: %s still RED (since %s UTC)\n%s\n%s' "$instance" "$old_since" "$issues_bullets" "$RUN_URL")
        else
          message=$(printf 'Pathfinder index health: %s is RED\n%s\n%s' "$instance" "$issues_bullets" "$RUN_URL")
        fi
        ;;
      recovery)
        message=$(printf 'Pathfinder index health: %s recovered (was RED since %s UTC)\nAll sources indexed and current.' "$instance" "$old_since")
        ;;
      baseline)
        message=$(printf 'Pathfinder index health: %s baseline established -- GREEN\n%s sources indexed, %s total chunks.' "$instance" "$source_count" "$total_chunks")
        ;;
    esac

    if [ -n "$message" ]; then
      echo " Notification: ${notify_type}"
      # Write notification as JSON line
      jq -nc --arg inst "$instance" --arg text "$message" '{instance: $inst, text: $text}' >> "$NOTIFY_FILE"
    fi
  fi

  # Update state: "since" only moves when the status changes.
  local new_since="$old_since"
  if [ "$old_status" != "$new_status" ]; then
    new_since="$NOW_ISO"
  fi
  local new_last_notified
  new_last_notified=$(jq -r '.last_notified // empty' <<<"$old_state")

  local updated_state
  updated_state=$(jq -nc \
    --arg status "$new_status" \
    --arg since "$new_since" \
    --arg last_notified "$new_last_notified" \
    --argjson notifications_24h "$new_notifications_24h" \
    --argjson issues "$issues_json" \
    '{status: $status, since: $since, last_notified: $last_notified, notifications_24h: $notifications_24h, issues: $issues}')

  # Merge into state file: write a sibling temp file, then rename over it.
  local tmp_state
  tmp_state=$(jq --arg inst "$instance" --argjson state "$updated_state" '.[$inst] = $state' "$STATE_FILE")
  echo "$tmp_state" > "${STATE_FILE}.tmp"
  mv "${STATE_FILE}.tmp" "$STATE_FILE"
  echo " State saved: status=${new_status}, since=${new_since}"
}
# --- Main ---
# Resolve HEAD commits for every configured repo/branch once, up front
# (avoids subshell cache loss).
prefetch_head_commits \
  "${COPILOTKIT_DOCS_SOURCES[@]}" \
  "${PATHFINDER_DOCS_SOURCES[@]}" \
  "${AIMOCK_DOCS_SOURCES[@]}"

# run_instance NAME SOURCE_ENTRY...
#   Looks up NAME's URL and chunk floor and hands everything to
#   process_instance.
run_instance() {
  local name="$1"
  shift
  process_instance "$name" "${INSTANCE_URLS[$name]}" "${CHUNK_FLOORS[$name]}" "$@"
}

run_instance "copilotkit-docs" "${COPILOTKIT_DOCS_SOURCES[@]}"
run_instance "pathfinder-docs" "${PATHFINDER_DOCS_SOURCES[@]}"
run_instance "aimock-docs" "${AIMOCK_DOCS_SOURCES[@]}"

# Dump the resulting state and the queued notifications into the job log.
printf '\n=== Final state ===\n'
jq . "$STATE_FILE"
printf '\n=== Notifications ===\n'
if [ ! -s "$NOTIFY_FILE" ]; then
  echo "(none)"
else
  cat "$NOTIFY_FILE"
fi
# Posts each queued notification to Slack and records every successful
# delivery in the state file so the rate limiter can count it.
- name: Send Slack notifications
  # `inputs.dry_run` is a boolean in the `inputs` context. Comparing it only
  # with the string 'true' coerces both sides to numbers (1 vs NaN), so the
  # old condition was always true and dry runs still sent messages. Both the
  # boolean and the string form are checked; an unset input still runs.
  if: ${{ always() && inputs.dry_run != true && inputs.dry_run != 'true' }}
  env:
    SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  run: |
    set -euo pipefail
    NOTIFY_FILE="/tmp/health-monitor-notifications.txt"
    STATE_FILE="/tmp/health-monitor-state.json"

    if [ ! -s "$NOTIFY_FILE" ]; then
      echo "No notifications to send."
      exit 0
    fi

    # The delivery bookkeeping below needs a parsable state file; without one
    # jq would fail after a message had already been posted.
    if [ ! -f "$STATE_FILE" ] || ! jq empty "$STATE_FILE" 2>/dev/null; then
      echo '{}' > "$STATE_FILE"
    fi

    NOW_EPOCH=$(date -u +%s)
    NOW_ISO=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # One JSON object per line: {"instance": ..., "text": ...}
    while IFS= read -r line; do
      [ -n "$line" ] || continue
      instance=$(echo "$line" | jq -r '.instance')
      text=$(echo "$line" | jq -r '.text')

      if [ -n "$SLACK_WEBHOOK" ]; then
        payload=$(jq -nc --arg t "$text" '{text: $t}')
        if curl -sf --max-time 30 -X POST "$SLACK_WEBHOOK" \
            -H 'Content-Type: application/json' \
            -d "$payload" 2>/dev/null; then
          # Only deliveries that succeeded count toward the 24h rate limit.
          tmp_state=$(jq --arg inst "$instance" --argjson ts "$NOW_EPOCH" --arg iso "$NOW_ISO" \
            '.[$inst].notifications_24h += [$ts] | .[$inst].last_notified = $iso' \
            "$STATE_FILE")
          echo "$tmp_state" > "${STATE_FILE}.tmp"
          mv "${STATE_FILE}.tmp" "$STATE_FILE"
        else
          echo "::warning::Slack notification failed for ${instance}"
        fi
      else
        echo "::warning::SLACK_WEBHOOK not configured -- notification not sent: ${instance}"
      fi
    done < "$NOTIFY_FILE"
# Persists the state file for the next run.
- name: Save state to cache
  uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
  # `inputs.dry_run` is a boolean in the `inputs` context; comparing it only
  # with the string 'true' never matches, so dry runs used to overwrite the
  # saved state. Check both the boolean and the string form.
  if: ${{ always() && inputs.dry_run != true && inputs.dry_run != 'true' }}
  with:
    path: /tmp/health-monitor-state.json
    # Cache entries cannot be overwritten, so the key includes the attempt
    # number: a re-run of the same workflow run can then still save its
    # state. The restore step's prefix match covers both key shapes.
    key: index-health-state-${{ github.run_id }}-${{ github.run_attempt }}