#!/bin/sh
# Munin multigraph plugin: shelley_
#
# Monitors a Shelley instance via its SQLite database.
#
# Graphs:
#   shelley_conversations — total conversations (DERIVE, shows creation rate)
#   shelley_messages      — messages by role: user, agent, tool, system (DERIVE)
#   shelley_requests      — LLM requests by model (DERIVE, stacked)
#   shelley_latency       — average request duration by model (GAUGE)
#   shelley_tokens        — input/output tokens by model (DERIVE, stacked)
#   shelley_cache         — cache read vs creation tokens by model (GAUGE)
#
# Config:
#   [shelley_]
#   env.SHELLEY_DB /home/exedev/.config/shelley/shelley.db
#   user root
#
# Note: munin-node on Debian runs with ProtectHome=true by default,
# which hides /home/. Either set ProtectHome=read-only via a systemd
# drop-in override, or place the DB outside /home/.
#
#%# family=manual
#%# capabilities=autoconf

# Path to the Shelley SQLite database; overridable through the environment.
SHELLEY_DB="${SHELLEY_DB:-/home/exedev/.config/shelley/shelley.db}"

# --- helpers ---------------------------------------------------------------

# Run one SQL statement against the Shelley database.
#   $1 - SQL text
# Prints the sqlite3 result on stdout; the exit status is sqlite3's.
sqlite_query() {
  sqlite3 "$SHELLEY_DB" "$1"
}

# Sanitize a model name into a valid munin field name:
# must match ^[A-Za-z_][A-Za-z0-9_]*$
#   $1 - raw model name
# Every character outside [A-Za-z0-9_] becomes "_", and a leading digit is
# replaced by "_".
sanitize_field() {
  printf '%s\n' "$1" | sed 's/[^A-Za-z0-9_]/_/g; s/^[0-9]/_/'
}

# Escape a value for use inside a single-quoted SQL string literal by
# doubling every single quote.
#   $1 - raw value
sql_quote() {
  printf '%s\n' "$1" | sed "s/'/''/g"
}

# --- autoconf --------------------------------------------------------------

if [ "${1:-}" = "autoconf" ]; then
  if [ ! -f "$SHELLEY_DB" ]; then
    echo "no (database not found at ${SHELLEY_DB})"
    exit 0
  fi
  if ! sqlite3 "$SHELLEY_DB" "SELECT 1" >/dev/null 2>&1; then
    echo "no (cannot query database at ${SHELLEY_DB})"
    exit 0
  fi
  echo "yes"
  exit 0
fi

# --- discover models -------------------------------------------------------

# Get distinct models from llm_requests for request/latency graphs
MODELS_REQUESTS=$(sqlite_query "SELECT DISTINCT model FROM llm_requests ORDER BY model;")

# Get distinct models from messages.usage_data for token/cache graphs.
# "\$" keeps the JSON path's dollar sign literal inside the double quotes.
MODELS_TOKENS=$(sqlite_query "SELECT DISTINCT json_extract(usage_data, '\$.model')
  FROM messages
  WHERE usage_data IS NOT NULL
    AND json_extract(usage_data, '\$.input_tokens') > 0
  ORDER BY 1;")

# --- config ----------------------------------------------------------------

if [ "${1:-}" = "config" ]; then

  # --- shelley_conversations ---
  # Quoted heredoc delimiter: ${graph_period} must reach munin unexpanded.
  cat <<'EOF'
multigraph shelley_conversations
graph_title Shelley conversations
graph_vlabel conversations / ${graph_period}
graph_category shelley
graph_args -l 0 --base 1000
graph_info Total conversations tracked. Munin computes the creation rate per period.
conversations.label Conversations
conversations.type DERIVE
conversations.min 0

EOF

  # --- shelley_messages ---
  cat <<'EOF'
multigraph shelley_messages
graph_title Shelley messages by role
graph_vlabel messages / ${graph_period}
graph_category shelley
graph_args -l 0 --base 1000
graph_info Message count by role. Munin computes the rate per period.
user.label User
user.type DERIVE
user.min 0
user.draw AREA
agent.label Agent
agent.type DERIVE
agent.min 0
agent.draw STACK
tool.label Tool
tool.type DERIVE
tool.min 0
tool.draw STACK
system.label System
system.type DERIVE
system.min 0
system.draw STACK

EOF

  # --- shelley_requests ---
  echo "multigraph shelley_requests"
  echo "graph_title Shelley LLM requests by model"
  # Single quotes on purpose: ${graph_period} is a munin placeholder, not a
  # shell variable, and must be emitted literally.
  # shellcheck disable=SC2016
  echo 'graph_vlabel requests / ${graph_period}'
  echo "graph_category shelley"
  echo "graph_args -l 0 --base 1000"
  echo "graph_info LLM request count by model. Munin computes the rate per period."

  # FIRST is copied into the pipeline's subshell and only updated there,
  # which is enough because it is only read inside the same loop.
  FIRST=1
  printf '%s\n' "$MODELS_REQUESTS" | while read -r model; do
    [ -z "$model" ] && continue
    field=$(sanitize_field "$model")
    printf '%s\n' "${field}.label ${model}"
    printf '%s\n' "${field}.type DERIVE"
    printf '%s\n' "${field}.min 0"
    if [ "$FIRST" = 1 ]; then
      printf '%s\n' "${field}.draw AREA"
      FIRST=0
    else
      printf '%s\n' "${field}.draw STACK"
    fi
  done
  echo

  # --- shelley_latency ---
  echo "multigraph shelley_latency"
  echo "graph_title Shelley LLM request latency"
  echo "graph_vlabel ms"
  echo "graph_category shelley"
  echo "graph_args -l 0 --base 1000"
  echo "graph_info Average request duration in milliseconds by model."

  printf '%s\n' "$MODELS_REQUESTS" | while read -r model; do
    [ -z "$model" ] && continue
    field=$(sanitize_field "$model")
    printf '%s\n' "${field}.label ${model}"
    printf '%s\n' "${field}.type GAUGE"
    printf '%s\n' "${field}.min 0"
  done
  echo

  # --- shelley_tokens ---
  echo "multigraph shelley_tokens"
  echo "graph_title Shelley token throughput by model"
  # Literal munin placeholder, see shelley_requests above.
  # shellcheck disable=SC2016
  echo 'graph_vlabel tokens / ${graph_period}'
  echo "graph_category shelley"
  echo "graph_args -l 0 --base 1000"
  echo "graph_info Input and output token counts by model. Munin computes the rate per period."

  printf '%s\n' "$MODELS_TOKENS" | while read -r model; do
    [ -z "$model" ] && continue
    field=$(sanitize_field "$model")
    printf '%s\n' \
      "${field}_in.label ${model} input" \
      "${field}_in.type DERIVE" \
      "${field}_in.min 0" \
      "${field}_out.label ${model} output" \
      "${field}_out.type DERIVE" \
      "${field}_out.min 0" \
      "${field}_out.draw STACK"
  done
  echo

  # --- shelley_cache ---
  echo "multigraph shelley_cache"
  echo "graph_title Shelley cache efficiency by model"
  echo "graph_vlabel tokens"
  echo "graph_category shelley"
  echo "graph_args -l 0 --base 1000"
  echo "graph_info Prompt cache read vs creation tokens by model. High cache_read means good cache hit rate."

  printf '%s\n' "$MODELS_TOKENS" | while read -r model; do
    [ -z "$model" ] && continue
    field=$(sanitize_field "$model")
    printf '%s\n' \
      "${field}_read.label ${model} cache read" \
      "${field}_read.type GAUGE" \
      "${field}_read.min 0" \
      "${field}_read.draw AREA" \
      "${field}_create.label ${model} cache create" \
      "${field}_create.type GAUGE" \
      "${field}_create.min 0" \
      "${field}_create.draw STACK"
  done
  echo

  exit 0
fi

# --- fetch (values) --------------------------------------------------------
# An empty query result (failed query or SQL NULL) is reported as "U".

# conversations
CONV_COUNT=$(sqlite_query "SELECT COUNT(*) FROM conversations;")
echo "multigraph shelley_conversations"
echo "conversations.value ${CONV_COUNT:-U}"
echo

# messages by role
echo "multigraph shelley_messages"
for role in user agent tool system; do
  COUNT=$(sqlite_query "SELECT COUNT(*) FROM messages WHERE type='${role}';")
  echo "${role}.value ${COUNT:-U}"
done
echo

# requests by model
echo "multigraph shelley_requests"
printf '%s\n' "$MODELS_REQUESTS" | while read -r model; do
  [ -z "$model" ] && continue
  field=$(sanitize_field "$model")
  # Model names come from the database; escape them before splicing into SQL.
  qmodel=$(sql_quote "$model")
  COUNT=$(sqlite_query "SELECT COUNT(*) FROM llm_requests WHERE model='${qmodel}';")
  printf '%s\n' "${field}.value ${COUNT:-U}"
done
echo

# latency by model
echo "multigraph shelley_latency"
printf '%s\n' "$MODELS_REQUESTS" | while read -r model; do
  [ -z "$model" ] && continue
  field=$(sanitize_field "$model")
  qmodel=$(sql_quote "$model")
  AVG=$(sqlite_query "SELECT CAST(AVG(duration_ms) AS INTEGER)
    FROM llm_requests
    WHERE model='${qmodel}' AND duration_ms IS NOT NULL;")
  printf '%s\n' "${field}.value ${AVG:-U}"
done
echo

# tokens by model
echo "multigraph shelley_tokens"
printf '%s\n' "$MODELS_TOKENS" | while read -r model; do
  [ -z "$model" ] && continue
  field=$(sanitize_field "$model")
  qmodel=$(sql_quote "$model")
  # usage_data is read as JSON with top-level keys (model, input_tokens,
  # output_tokens); sum the per-message token counts for the matching model.
  IN_TOKENS=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '\$.input_tokens')),0)
    FROM messages
    WHERE usage_data IS NOT NULL
      AND json_extract(usage_data, '\$.model')='${qmodel}'
      AND json_extract(usage_data, '\$.input_tokens') > 0;")
  OUT_TOKENS=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '\$.output_tokens')),0)
    FROM messages
    WHERE usage_data IS NOT NULL
      AND json_extract(usage_data, '\$.model')='${qmodel}'
      AND json_extract(usage_data, '\$.input_tokens') > 0;")
  printf '%s\n' "${field}_in.value ${IN_TOKENS:-U}"
  printf '%s\n' "${field}_out.value ${OUT_TOKENS:-U}"
done
echo

# cache by model
echo "multigraph shelley_cache"
printf '%s\n' "$MODELS_TOKENS" | while read -r model; do
  [ -z "$model" ] && continue
  field=$(sanitize_field "$model")
  qmodel=$(sql_quote "$model")
  CACHE_READ=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '\$.cache_read_input_tokens')),0)
    FROM messages
    WHERE usage_data IS NOT NULL
      AND json_extract(usage_data, '\$.model')='${qmodel}'
      AND json_extract(usage_data, '\$.input_tokens') > 0;")
  CACHE_CREATE=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '\$.cache_creation_input_tokens')),0)
    FROM messages
    WHERE usage_data IS NOT NULL
      AND json_extract(usage_data, '\$.model')='${qmodel}'
      AND json_extract(usage_data, '\$.input_tokens') > 0;")
  printf '%s\n' "${field}_read.value ${CACHE_READ:-U}"
  printf '%s\n' "${field}_create.value ${CACHE_CREATE:-U}"
done