1#!/bin/sh
2# Munin multigraph plugin: shelley_
3#
4# Monitors a Shelley instance via its SQLite database.
5#
6# Graphs:
7# shelley_conversations — total conversations (DERIVE, shows creation rate)
8# shelley_messages — messages by role: user, agent, tool, system (DERIVE)
9# shelley_requests — LLM requests by model (DERIVE, stacked)
10# shelley_latency — average request duration by model (GAUGE)
11# shelley_tokens — input/output tokens by model (DERIVE, stacked)
12# shelley_cache — cache read vs creation tokens by model (GAUGE)
13#
14# Config:
15# [shelley_]
16# env.SHELLEY_DB /home/exedev/.config/shelley/shelley.db
17# user root
18#
19# Note: munin-node on Debian runs with ProtectHome=true by default,
20# which hides /home/. Either set ProtectHome=read-only via a systemd
21# drop-in override, or place the DB outside /home/.
22#
23#%# family=manual
24#%# capabilities=autoconf
25
26SHELLEY_DB="${SHELLEY_DB:-/home/exedev/.config/shelley/shelley.db}"
27
28# --- helpers ---------------------------------------------------------------
29
30sqlite_query() {
31 sqlite3 "$SHELLEY_DB" "$1"
32}
33
34# Sanitize a model name into a valid munin field name:
35# must match ^[A-Za-z_][A-Za-z0-9_]*$
36sanitize_field() {
37 echo "$1" | sed 's/[^A-Za-z0-9_]/_/g; s/^[0-9]/_/'
38}
39
40# --- autoconf --------------------------------------------------------------
41
42if [ "${1:-}" = "autoconf" ]; then
43 if [ ! -f "$SHELLEY_DB" ]; then
44 echo "no (database not found at ${SHELLEY_DB})"
45 exit 0
46 fi
47 if ! sqlite3 "$SHELLEY_DB" "SELECT 1" >/dev/null 2>&1; then
48 echo "no (cannot query database at ${SHELLEY_DB})"
49 exit 0
50 fi
51 echo "yes"
52 exit 0
53fi
54
55# --- discover models -------------------------------------------------------
56
57# Get distinct models from llm_requests for request/latency graphs
58MODELS_REQUESTS=$(sqlite_query "SELECT DISTINCT model FROM llm_requests ORDER BY model;")
59# Get distinct models from messages.usage_data for token/cache graphs
60MODELS_TOKENS=$(sqlite_query "SELECT DISTINCT json_extract(usage_data, '$.model') FROM messages WHERE usage_data IS NOT NULL AND json_extract(usage_data, '$.input_tokens') > 0 ORDER BY 1;")
61
62# --- config ----------------------------------------------------------------
63
64if [ "${1:-}" = "config" ]; then
65
66 # --- shelley_conversations ---
67 cat <<'EOF'
68multigraph shelley_conversations
69graph_title Shelley conversations
70graph_vlabel conversations / ${graph_period}
71graph_category shelley
72graph_args -l 0 --base 1000
73graph_info Total conversations tracked. Munin computes the creation rate per period.
74conversations.label Conversations
75conversations.type DERIVE
76conversations.min 0
77
78EOF
79
80 # --- shelley_messages ---
81 cat <<'EOF'
82multigraph shelley_messages
83graph_title Shelley messages by role
84graph_vlabel messages / ${graph_period}
85graph_category shelley
86graph_args -l 0 --base 1000
87graph_info Message count by role. Munin computes the rate per period.
88user.label User
89user.type DERIVE
90user.min 0
91user.draw AREA
92agent.label Agent
93agent.type DERIVE
94agent.min 0
95agent.draw STACK
96tool.label Tool
97tool.type DERIVE
98tool.min 0
99tool.draw STACK
100system.label System
101system.type DERIVE
102system.min 0
103system.draw STACK
104
105EOF
106
107 # --- shelley_requests ---
108 echo "multigraph shelley_requests"
109 echo "graph_title Shelley LLM requests by model"
110 echo "graph_vlabel requests / ${graph_period}"
111 echo "graph_category shelley"
112 echo "graph_args -l 0 --base 1000"
113 echo "graph_info LLM request count by model. Munin computes the rate per period."
114 FIRST=1
115 echo "$MODELS_REQUESTS" | while read -r model; do
116 [ -z "$model" ] && continue
117 field=$(sanitize_field "$model")
118 echo "${field}.label ${model}"
119 echo "${field}.type DERIVE"
120 echo "${field}.min 0"
121 if [ "$FIRST" = 1 ]; then
122 echo "${field}.draw AREA"
123 FIRST=0
124 else
125 echo "${field}.draw STACK"
126 fi
127 done
128 echo
129
130 # --- shelley_latency ---
131 echo "multigraph shelley_latency"
132 echo "graph_title Shelley LLM request latency"
133 echo "graph_vlabel ms"
134 echo "graph_category shelley"
135 echo "graph_args -l 0 --base 1000"
136 echo "graph_info Average request duration in milliseconds by model."
137 echo "$MODELS_REQUESTS" | while read -r model; do
138 [ -z "$model" ] && continue
139 field=$(sanitize_field "$model")
140 echo "${field}.label ${model}"
141 echo "${field}.type GAUGE"
142 echo "${field}.min 0"
143 done
144 echo
145
146 # --- shelley_tokens ---
147 echo "multigraph shelley_tokens"
148 echo "graph_title Shelley token throughput by model"
149 echo "graph_vlabel tokens / ${graph_period}"
150 echo "graph_category shelley"
151 echo "graph_args -l 0 --base 1000"
152 echo "graph_info Input and output token counts by model. Munin computes the rate per period."
153 echo "$MODELS_TOKENS" | while read -r model; do
154 [ -z "$model" ] && continue
155 field=$(sanitize_field "$model")
156 echo "${field}_in.label ${model} input
157${field}_in.type DERIVE
158${field}_in.min 0
159${field}_out.label ${model} output
160${field}_out.type DERIVE
161${field}_out.min 0
162${field}_out.draw STACK"
163 done
164 echo
165
166 # --- shelley_cache ---
167 echo "multigraph shelley_cache"
168 echo "graph_title Shelley cache efficiency by model"
169 echo "graph_vlabel tokens"
170 echo "graph_category shelley"
171 echo "graph_args -l 0 --base 1000"
172 echo "graph_info Prompt cache read vs creation tokens by model. High cache_read means good cache hit rate."
173 echo "$MODELS_TOKENS" | while read -r model; do
174 [ -z "$model" ] && continue
175 field=$(sanitize_field "$model")
176 echo "${field}_read.label ${model} cache read
177${field}_read.type GAUGE
178${field}_read.min 0
179${field}_read.draw AREA
180${field}_create.label ${model} cache create
181${field}_create.type GAUGE
182${field}_create.min 0
183${field}_create.draw STACK"
184 done
185 echo
186
187 exit 0
188fi
189
190# --- fetch (values) --------------------------------------------------------
191
192# conversations
193CONV_COUNT=$(sqlite_query "SELECT COUNT(*) FROM conversations;")
194echo "multigraph shelley_conversations"
195echo "conversations.value ${CONV_COUNT:-U}"
196echo
197
198# messages by role
199echo "multigraph shelley_messages"
200for role in user agent tool system; do
201 COUNT=$(sqlite_query "SELECT COUNT(*) FROM messages WHERE type='${role}';")
202 echo "${role}.value ${COUNT:-U}"
203done
204echo
205
206# requests by model
207echo "multigraph shelley_requests"
208echo "$MODELS_REQUESTS" | while read -r model; do
209 [ -z "$model" ] && continue
210 field=$(sanitize_field "$model")
211 COUNT=$(sqlite_query "SELECT COUNT(*) FROM llm_requests WHERE model='${model}';")
212 echo "${field}.value ${COUNT:-U}"
213done
214echo
215
216# latency by model
217echo "multigraph shelley_latency"
218echo "$MODELS_REQUESTS" | while read -r model; do
219 [ -z "$model" ] && continue
220 field=$(sanitize_field "$model")
221 AVG=$(sqlite_query "SELECT CAST(AVG(duration_ms) AS INTEGER) FROM llm_requests WHERE model='${model}' AND duration_ms IS NOT NULL;")
222 echo "${field}.value ${AVG:-U}"
223done
224echo
225
226# tokens by model
227echo "multigraph shelley_tokens"
228echo "$MODELS_TOKENS" | while read -r model; do
229 [ -z "$model" ] && continue
230 field=$(sanitize_field "$model")
231 # usage_data is a JSON array; extract per-message token counts for matching model
232 IN_TOKENS=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '$.input_tokens')),0) FROM messages WHERE usage_data IS NOT NULL AND json_extract(usage_data, '$.model')='${model}' AND json_extract(usage_data, '$.input_tokens') > 0;")
233 OUT_TOKENS=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '$.output_tokens')),0) FROM messages WHERE usage_data IS NOT NULL AND json_extract(usage_data, '$.model')='${model}' AND json_extract(usage_data, '$.input_tokens') > 0;")
234 echo "${field}_in.value ${IN_TOKENS:-U}"
235 echo "${field}_out.value ${OUT_TOKENS:-U}"
236done
237echo
238
239# cache by model
240echo "multigraph shelley_cache"
241echo "$MODELS_TOKENS" | while read -r model; do
242 [ -z "$model" ] && continue
243 field=$(sanitize_field "$model")
244 CACHE_READ=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '$.cache_read_input_tokens')),0) FROM messages WHERE usage_data IS NOT NULL AND json_extract(usage_data, '$.model')='${model}' AND json_extract(usage_data, '$.input_tokens') > 0;")
245 CACHE_CREATE=$(sqlite_query "SELECT COALESCE(SUM(json_extract(usage_data, '$.cache_creation_input_tokens')),0) FROM messages WHERE usage_data IS NOT NULL AND json_extract(usage_data, '$.model')='${model}' AND json_extract(usage_data, '$.input_tokens') > 0;")
246 echo "${field}_read.value ${CACHE_READ:-U}"
247 echo "${field}_create.value ${CACHE_CREATE:-U}"
248done