// Package node — minimal Prometheus-format metrics. // // We expose counters, gauges, and histograms in the Prometheus text // exposition format on GET /metrics. A full-blown prometheus/client_golang // dependency would pull 10+ transitive modules; for our needs (a handful of // metrics + stable output format) a ~200 LOC in-tree implementation is // enough, with zero extra build surface. // // Concurrency: all metric types are safe for concurrent Inc/Add/Observe. // Registration happens at init time and is not reentrant. package node import ( "fmt" "net/http" "sort" "strconv" "strings" "sync" "sync/atomic" ) // ─── registry ──────────────────────────────────────────────────────────────── // metric is the internal interface every exposed metric implements. type metric interface { // write emits the metric's lines to w in Prometheus text format. write(w *strings.Builder) } type metricRegistry struct { mu sync.RWMutex entries []metric } var defaultRegistry = &metricRegistry{} func (r *metricRegistry) register(m metric) { r.mu.Lock() r.entries = append(r.entries, m) r.mu.Unlock() } // metricsHandler writes Prometheus exposition format for all registered metrics. func metricsHandler(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } var sb strings.Builder defaultRegistry.mu.RLock() for _, m := range defaultRegistry.entries { m.write(&sb) } defaultRegistry.mu.RUnlock() w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") _, _ = w.Write([]byte(sb.String())) } // ─── counter ───────────────────────────────────────────────────────────────── // MetricCounter is a monotonically increasing value. Typical use: number of // blocks committed, number of rejected txs. type MetricCounter struct { name, help string v atomic.Uint64 } // NewCounter registers and returns a new counter. func NewCounter(name, help string) *MetricCounter { c := &MetricCounter{name: name, help: help} defaultRegistry.register(c) return c } // Inc adds 1 to the counter. func (c *MetricCounter) Inc() { c.v.Add(1) } // Add adds n to the counter (must be ≥ 0 — counters are monotonic). func (c *MetricCounter) Add(n uint64) { c.v.Add(n) } func (c *MetricCounter) write(sb *strings.Builder) { fmt.Fprintf(sb, "# HELP %s %s\n# TYPE %s counter\n%s %d\n", c.name, c.help, c.name, c.name, c.v.Load()) } // ─── gauge ─────────────────────────────────────────────────────────────────── // MetricGauge is a value that can go up or down. Typical use: current // mempool size, active websocket connections, tip height. type MetricGauge struct { name, help string v atomic.Int64 fn func() int64 // optional live provider; if set, v is unused } // NewGauge registers a gauge backed by an atomic Int64. func NewGauge(name, help string) *MetricGauge { g := &MetricGauge{name: name, help: help} defaultRegistry.register(g) return g } // NewGaugeFunc registers a gauge whose value is fetched on scrape. Useful // when the source of truth is some other subsystem (chain.TipIndex, peer // count, etc.) and we don't want a separate mirror variable. func NewGaugeFunc(name, help string, fn func() int64) *MetricGauge { g := &MetricGauge{name: name, help: help, fn: fn} defaultRegistry.register(g) return g } // Set overrides the stored value. Only meaningful for non-fn gauges. func (g *MetricGauge) Set(v int64) { g.v.Store(v) } // Inc / Dec convenience for bookkeeping gauges. func (g *MetricGauge) Inc() { g.v.Add(1) } func (g *MetricGauge) Dec() { g.v.Add(-1) } func (g *MetricGauge) write(sb *strings.Builder) { v := g.v.Load() if g.fn != nil { v = g.fn() } fmt.Fprintf(sb, "# HELP %s %s\n# TYPE %s gauge\n%s %d\n", g.name, g.help, g.name, g.name, v) } // ─── histogram ─────────────────────────────────────────────────────────────── // MetricHistogram is a fixed-bucket latency histogram. Typical use: time to // commit a block, time to apply a tx. We use user-supplied buckets (upper // bounds in seconds) and track sum + count alongside for Prometheus-standard // output. type MetricHistogram struct { name, help string buckets []float64 counts []atomic.Uint64 inf atomic.Uint64 sum atomic.Uint64 // sum in microseconds to avoid floats count atomic.Uint64 } // NewHistogram registers a histogram with explicit bucket upper bounds (s). // Buckets must be strictly increasing. The implicit +Inf bucket is added // automatically per Prometheus spec. func NewHistogram(name, help string, buckets []float64) *MetricHistogram { sorted := make([]float64, len(buckets)) copy(sorted, buckets) sort.Float64s(sorted) h := &MetricHistogram{ name: name, help: help, buckets: sorted, counts: make([]atomic.Uint64, len(sorted)), } defaultRegistry.register(h) return h } // Observe records a single sample (duration in seconds). func (h *MetricHistogram) Observe(seconds float64) { // Record in every bucket whose upper bound ≥ sample. for i, b := range h.buckets { if seconds <= b { h.counts[i].Add(1) } } h.inf.Add(1) h.sum.Add(uint64(seconds * 1_000_000)) // µs resolution h.count.Add(1) } func (h *MetricHistogram) write(sb *strings.Builder) { fmt.Fprintf(sb, "# HELP %s %s\n# TYPE %s histogram\n", h.name, h.help, h.name) for i, b := range h.buckets { fmt.Fprintf(sb, `%s_bucket{le="%s"} %d`+"\n", h.name, strconv.FormatFloat(b, 'g', -1, 64), h.counts[i].Load()) } fmt.Fprintf(sb, `%s_bucket{le="+Inf"} %d`+"\n", h.name, h.inf.Load()) fmt.Fprintf(sb, "%s_sum %f\n", h.name, float64(h.sum.Load())/1_000_000) fmt.Fprintf(sb, "%s_count %d\n", h.name, h.count.Load()) } // ─── registered metrics (called from main.go) ──────────────────────────────── // // Keeping these as package-level vars lets callers just do // `MetricBlocksTotal.Inc()` instead of threading a registry through every // component. Names follow Prometheus naming conventions: // ___ var ( MetricBlocksTotal = NewCounter( "dchain_blocks_total", "Total number of blocks committed by this node", ) MetricTxsTotal = NewCounter( "dchain_txs_total", "Total number of transactions included in committed blocks", ) MetricTxSubmitAccepted = NewCounter( "dchain_tx_submit_accepted_total", "Transactions accepted into the mempool via /api/tx or WS submit_tx", ) MetricTxSubmitRejected = NewCounter( "dchain_tx_submit_rejected_total", "Transactions rejected at API validation (bad sig, timestamp, etc.)", ) MetricWSConnections = NewGauge( "dchain_ws_connections", "Currently open websocket connections on this node", ) MetricPeers = NewGauge( "dchain_peer_count", "Currently connected libp2p peers", ) // Block-commit latency — how long AddBlock takes end-to-end. Catches // slow contract calls before they reach the freeze threshold. MetricBlockCommitSeconds = NewHistogram( "dchain_block_commit_seconds", "Wall-clock seconds spent inside chain.AddBlock", []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30}, ) // Worst validator liveness in the current set — how many seqNums have // passed without a commit vote from the most-delinquent validator. // Alert on this staying > 20 for more than a few minutes; that // validator is either down, partitioned, or wedged. MetricMaxMissedBlocks = NewGauge( "dchain_max_missed_blocks", "Highest missed-block count among current validators (0 if all healthy)", ) )