Files
dchain/node/metrics.go
vsecoder 7e7393e4f8 chore: initial commit for v0.0.1
DChain single-node blockchain + React Native messenger client.

Core:
- PBFT consensus with multi-sig validator admission + equivocation slashing
- BadgerDB + schema migration scaffold (CurrentSchemaVersion=0)
- libp2p gossipsub (tx/v1, blocks/v1, relay/v1, version/v1)
- Native Go contracts (username_registry) alongside WASM (wazero)
- WebSocket gateway with topic-based fanout + Ed25519-nonce auth
- Relay mailbox with NaCl envelope encryption (X25519 + Ed25519)
- Prometheus /metrics, per-IP rate limit, body-size cap

Deployment:
- Single-node compose (deploy/single/) with Caddy TLS + optional Prometheus
- 3-node dev compose (docker-compose.yml) with mocked internet topology
- 3-validator prod compose (deploy/prod/) for federation
- Auto-update from Gitea via /api/update-check + systemd timer
- Build-time version injection (ldflags → node --version)
- UI / Swagger toggle flags (DCHAIN_DISABLE_UI, DCHAIN_DISABLE_SWAGGER)

Client (client-app/):
- Expo / React Native / NativeWind
- E2E NaCl encryption, typing indicator, contact requests
- Auto-discovery of canonical contracts, chain_id aware, WS reconnect on node switch

Documentation:
- README.md, CHANGELOG.md, CONTEXT.md
- deploy/single/README.md with 6 operator scenarios
- deploy/UPDATE_STRATEGY.md with 4-layer forward-compat design
- docs/contracts/*.md per contract
2026-04-17 14:16:44 +03:00

233 lines
8.0 KiB
Go

// Package node — minimal Prometheus-format metrics.
//
// We expose counters, gauges, and histograms in the Prometheus text
// exposition format on GET /metrics. A full-blown prometheus/client_golang
// dependency would pull 10+ transitive modules; for our needs (a handful of
// metrics + stable output format) a ~200 LOC in-tree implementation is
// enough, with zero extra build surface.
//
// Concurrency: all metric types are safe for concurrent Inc/Add/Observe.
// Registration happens at init time and is not reentrant.
package node
import (
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
)
// ─── registry ────────────────────────────────────────────────────────────────

// metric is the internal interface every exposed metric implements.
type metric interface {
	// write appends the metric's lines to w in Prometheus text format.
	write(w *strings.Builder)
}

// metricRegistry is an append-only list of metrics guarded by an RWMutex:
// registration takes the write lock, scrapes take the read lock.
type metricRegistry struct {
	mu      sync.RWMutex
	entries []metric
}

// defaultRegistry collects every metric created via the New* constructors.
var defaultRegistry = &metricRegistry{}

// register appends m to the registry. Called from the New* constructors at
// package-init time; not intended to be reentrant.
func (r *metricRegistry) register(m metric) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.entries = append(r.entries, m)
}

// metricsHandler serves GET /metrics: it renders every registered metric in
// the Prometheus text exposition format. Non-GET methods get a 405.
func metricsHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Render into a local builder first so the read lock is held only for
	// the in-memory formatting, never for the network write.
	var out strings.Builder
	defaultRegistry.mu.RLock()
	for _, m := range defaultRegistry.entries {
		m.write(&out)
	}
	defaultRegistry.mu.RUnlock()

	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
	_, _ = w.Write([]byte(out.String()))
}
// ─── counter ─────────────────────────────────────────────────────────────────

// MetricCounter is a monotonically increasing value. Typical use: number of
// blocks committed, number of rejected txs.
type MetricCounter struct {
	name, help string
	v          atomic.Uint64
}

// NewCounter registers and returns a new counter.
func NewCounter(name, help string) *MetricCounter {
	counter := &MetricCounter{name: name, help: help}
	defaultRegistry.register(counter)
	return counter
}

// Inc adds 1 to the counter.
func (c *MetricCounter) Inc() { c.v.Add(1) }

// Add adds n to the counter. Counters are monotonic; n is uint64 so a
// negative delta is impossible by construction.
func (c *MetricCounter) Add(n uint64) { c.v.Add(n) }

// write emits the HELP/TYPE header pair followed by the current value.
func (c *MetricCounter) write(sb *strings.Builder) {
	sb.WriteString("# HELP " + c.name + " " + c.help + "\n")
	sb.WriteString("# TYPE " + c.name + " counter\n")
	sb.WriteString(c.name + " " + strconv.FormatUint(c.v.Load(), 10) + "\n")
}
// ─── gauge ───────────────────────────────────────────────────────────────────

// MetricGauge is a value that can go up or down. Typical use: current
// mempool size, active websocket connections, tip height.
type MetricGauge struct {
	name, help string
	v          atomic.Int64
	fn         func() int64 // optional live provider; if set, v is unused
}

// NewGauge registers a gauge backed by an atomic Int64.
func NewGauge(name, help string) *MetricGauge {
	gauge := &MetricGauge{name: name, help: help}
	defaultRegistry.register(gauge)
	return gauge
}

// NewGaugeFunc registers a gauge whose value is fetched on scrape. Useful
// when the source of truth is some other subsystem (chain.TipIndex, peer
// count, etc.) and we don't want a separate mirror variable.
func NewGaugeFunc(name, help string, fn func() int64) *MetricGauge {
	gauge := &MetricGauge{name: name, help: help, fn: fn}
	defaultRegistry.register(gauge)
	return gauge
}

// Set overrides the stored value. Only meaningful for non-fn gauges.
func (g *MetricGauge) Set(v int64) { g.v.Store(v) }

// Inc adds 1 to the stored value; bookkeeping convenience.
func (g *MetricGauge) Inc() { g.v.Add(1) }

// Dec subtracts 1 from the stored value; bookkeeping convenience.
func (g *MetricGauge) Dec() { g.v.Add(-1) }

// write emits the HELP/TYPE header pair followed by the current value,
// preferring the live provider over the stored atomic when one is set.
func (g *MetricGauge) write(sb *strings.Builder) {
	current := g.v.Load()
	if g.fn != nil {
		current = g.fn()
	}
	sb.WriteString("# HELP " + g.name + " " + g.help + "\n")
	sb.WriteString("# TYPE " + g.name + " gauge\n")
	sb.WriteString(g.name + " " + strconv.FormatInt(current, 10) + "\n")
}
// ─── histogram ───────────────────────────────────────────────────────────────

// MetricHistogram is a fixed-bucket latency histogram. Typical use: time to
// commit a block, time to apply a tx. We use user-supplied buckets (upper
// bounds in seconds) and track sum + count alongside for Prometheus-standard
// output.
//
// Per the Prometheus exposition format, bucket counts are cumulative: a
// sample is recorded in every bucket whose upper bound is >= the sample.
type MetricHistogram struct {
	name, help string
	buckets    []float64       // sorted, de-duplicated upper bounds in seconds
	counts     []atomic.Uint64 // cumulative counts, parallel to buckets
	inf        atomic.Uint64   // implicit +Inf bucket (always equals count)
	sum        atomic.Uint64   // sum in microseconds to avoid floats
	count      atomic.Uint64   // total number of observations
}

// NewHistogram registers a histogram with explicit bucket upper bounds (s).
// Buckets are sorted and de-duplicated here, so callers need not pre-sort
// and a repeated bound cannot emit two identical — invalid — `le` labels.
// The implicit +Inf bucket is added automatically per Prometheus spec.
func NewHistogram(name, help string, buckets []float64) *MetricHistogram {
	sorted := make([]float64, len(buckets))
	copy(sorted, buckets)
	sort.Float64s(sorted)
	// Drop duplicate bounds: every le="..." label in the output must be
	// unique, and a duplicate bucket would also be double-counted.
	uniq := make([]float64, 0, len(sorted))
	for i, b := range sorted {
		if i == 0 || b != sorted[i-1] {
			uniq = append(uniq, b)
		}
	}
	h := &MetricHistogram{
		name: name, help: help,
		buckets: uniq,
		counts:  make([]atomic.Uint64, len(uniq)),
	}
	defaultRegistry.register(h)
	return h
}

// Observe records a single sample (duration in seconds).
//
// Non-positive (or NaN) samples still count toward the buckets and _count
// but contribute nothing to _sum: converting a negative float64 to uint64
// is implementation-defined in Go and previously produced a garbage sum.
func (h *MetricHistogram) Observe(seconds float64) {
	// Buckets are sorted, so the sample lands in every bucket from the
	// first one whose upper bound >= seconds onward. Binary-search the
	// starting index instead of testing each bucket linearly.
	for i := sort.SearchFloat64s(h.buckets, seconds); i < len(h.buckets); i++ {
		h.counts[i].Add(1)
	}
	h.inf.Add(1)
	if seconds > 0 {
		h.sum.Add(uint64(seconds * 1_000_000)) // µs resolution
	}
	h.count.Add(1)
}

// write emits the standard Prometheus histogram series: one _bucket line per
// bound, the implicit +Inf bucket, then _sum (converted back to seconds from
// the stored microseconds) and _count.
func (h *MetricHistogram) write(sb *strings.Builder) {
	fmt.Fprintf(sb, "# HELP %s %s\n# TYPE %s histogram\n", h.name, h.help, h.name)
	for i, b := range h.buckets {
		fmt.Fprintf(sb, `%s_bucket{le="%s"} %d`+"\n",
			h.name, strconv.FormatFloat(b, 'g', -1, 64), h.counts[i].Load())
	}
	fmt.Fprintf(sb, `%s_bucket{le="+Inf"} %d`+"\n", h.name, h.inf.Load())
	fmt.Fprintf(sb, "%s_sum %f\n", h.name, float64(h.sum.Load())/1_000_000)
	fmt.Fprintf(sb, "%s_count %d\n", h.name, h.count.Load())
}
// ─── registered metrics (called from main.go) ────────────────────────────────
//
// Keeping these as package-level vars lets callers just do
// `MetricBlocksTotal.Inc()` instead of threading a registry through every
// component. Names follow Prometheus naming conventions:
// <namespace>_<subsystem>_<metric>_<unit>
//
// NOTE(review): these initializers run at package init and each New* call
// appends to defaultRegistry, so every metric below appears on /metrics
// from process start with a zero value.
var (
	// Monotonic commit counter; incremented once per committed block.
	MetricBlocksTotal = NewCounter(
		"dchain_blocks_total",
		"Total number of blocks committed by this node",
	)
	// Monotonic tx counter over committed blocks (not mempool arrivals).
	MetricTxsTotal = NewCounter(
		"dchain_txs_total",
		"Total number of transactions included in committed blocks",
	)
	// Accepted/rejected submission counters form a pair: their sum is the
	// total number of submissions seen at the API edge.
	MetricTxSubmitAccepted = NewCounter(
		"dchain_tx_submit_accepted_total",
		"Transactions accepted into the mempool via /api/tx or WS submit_tx",
	)
	MetricTxSubmitRejected = NewCounter(
		"dchain_tx_submit_rejected_total",
		"Transactions rejected at API validation (bad sig, timestamp, etc.)",
	)
	// Up/down gauge maintained by the websocket gateway (Inc on open,
	// Dec on close — presumably; confirm against the gateway code).
	MetricWSConnections = NewGauge(
		"dchain_ws_connections",
		"Currently open websocket connections on this node",
	)
	// Up/down gauge of connected libp2p peers.
	MetricPeers = NewGauge(
		"dchain_peer_count",
		"Currently connected libp2p peers",
	)
	// Block-commit latency — how long AddBlock takes end-to-end. Catches
	// slow contract calls before they reach the freeze threshold.
	// Bucket bounds span 1 ms to 30 s; samples above 30 s land only in +Inf.
	MetricBlockCommitSeconds = NewHistogram(
		"dchain_block_commit_seconds",
		"Wall-clock seconds spent inside chain.AddBlock",
		[]float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30},
	)
	// Worst validator liveness in the current set — how many seqNums have
	// passed without a commit vote from the most-delinquent validator.
	// Alert on this staying > 20 for more than a few minutes; that
	// validator is either down, partitioned, or wedged.
	MetricMaxMissedBlocks = NewGauge(
		"dchain_max_missed_blocks",
		"Highest missed-block count among current validators (0 if all healthy)",
	)
)