From 65b19dbd962b8431fe3c7bf41b5ecbc7671a7d18 Mon Sep 17 00:00:00 2001
From: osmarks
Date: Sun, 25 Apr 2021 22:10:52 +0100
Subject: [PATCH] Big refactors, performance, status images

---
 nim.cfg        |   2 +-
 onstat.nimble  |   3 +-
 src/db.nim     |  37 +++++---
 src/onstat.nim | 248 ++++++++++++++++++++++++++++++++++---------------
 src/style.css  |  21 +++--
 5 files changed, 218 insertions(+), 93 deletions(-)

diff --git a/nim.cfg b/nim.cfg
index ace4e9c..f804694 100644
--- a/nim.cfg
+++ b/nim.cfg
@@ -1 +1 @@
--d:ssl
\ No newline at end of file
+-d:ssl --threads:on
\ No newline at end of file
diff --git a/onstat.nimble b/onstat.nimble
index cd36092..cc159bc 100644
--- a/onstat.nimble
+++ b/onstat.nimble
@@ -12,4 +12,5 @@ bin = @["onstat"]
 requires "nim >= 1.4.2"
 requires "https://github.com/GULPF/tiny_sqlite#8fe760d9"
 requires "karax >= 1.2.1"
-requires "cligen >= 1"
\ No newline at end of file
+requires "cligen >= 1"
+requires "imageman >= 0.8"
\ No newline at end of file
diff --git a/src/db.nim b/src/db.nim
index 2bdb2e1..168d32f 100644
--- a/src/db.nim
+++ b/src/db.nim
@@ -3,21 +3,34 @@ import options
 
 let migrations: seq[string] = @[
     """
-    CREATE TABLE sites (
-        sid INTEGER PRIMARY KEY,
-        url TEXT NOT NULL
-    );
+CREATE TABLE sites (
+    sid INTEGER PRIMARY KEY,
+    url TEXT NOT NULL
+);
 
-    CREATE TABLE reqs (
-        rid INTEGER PRIMARY KEY,
-        site INTEGER NOT NULL REFERENCES sites(sid),
-        timestamp INTEGER NOT NULL,
-        status INTEGER NOT NULL,
-        latency INTEGER NOT NULL
-    );
+CREATE TABLE reqs (
+    rid INTEGER PRIMARY KEY,
+    site INTEGER NOT NULL REFERENCES sites(sid),
+    timestamp INTEGER NOT NULL,
+    status INTEGER NOT NULL,
+    latency INTEGER NOT NULL
+);
     """,
     """
-    CREATE INDEX req_ts_idx ON reqs (timestamp);
+CREATE INDEX req_ts_idx ON reqs (timestamp);
+    """,
+    # rolling total/successful ping and latency count
+    # rc_data_since holds the older end of the interval the counters are from
+    # this slightly horribly migrates the existing data using a hardcoded 1 week window
+    """
+ALTER TABLE sites ADD COLUMN rc_total INTEGER NOT NULL DEFAULT 0;
+ALTER TABLE sites ADD COLUMN rc_success INTEGER NOT NULL DEFAULT 0;
+ALTER TABLE sites ADD COLUMN rc_latency INTEGER NOT NULL DEFAULT 0;
+ALTER TABLE sites ADD COLUMN rc_data_since INTEGER;
+UPDATE sites SET rc_total = (SELECT COUNT(*) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
+UPDATE sites SET rc_success = (SELECT COALESCE(SUM(status <= 0), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
+UPDATE sites SET rc_latency = (SELECT COALESCE(SUM(latency), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
+UPDATE sites SET rc_data_since = (strftime('%s') - (86400*7)) * 1000000;
     """
 ]
 
diff --git a/src/onstat.nim b/src/onstat.nim
index ee4705b..1dc9e42 100644
--- a/src/onstat.nim
+++ b/src/onstat.nim
@@ -10,8 +10,12 @@ import sugar
 import net
 import sequtils
 import strformat
-import std/exitprocs
+import strutils
 import cligen
+import imageman
+import math
+import hashes
+import tables
 
 import ./db
 
@@ -19,19 +23,6 @@ macro includeFile(x: string): string = newStrLitNode(readFile(x.strVal))
 
 const css = includeFile("./src/style.css")
 
-var threadDB {.threadvar.}: Option[DbConn]
-proc getDB(): DbConn {.gcsafe.} =
-    if isNone threadDB:
-        let x = openDatabase("./monitoring.sqlite3")
-        x.exec("PRAGMA journal_mode=WAL")
-        proc closeDB() =
-            try: close(x)
-            except: discard
-        addExitProc(closeDB)
-        when declared(onThreadDestroy): onThreadDestroy(closeDB)
-        threadDB = some x
-    get threadDB
-
 func timeToTimestamp*(t: Time): int64 = toUnix(t) * 1000000 + (nanosecond(t) div 1000)
 func timestampToTime*(ts: int64): Time = initTime(ts div 1000000, (ts mod 1000000) * 1000)
 
@@ -39,37 +30,42 @@ proc toDbValue(t: Time): DbValue = DbValue(kind: sqliteInteger, intVal: timeToTi
 proc fromDbValue(value: DbValue, T: typedesc[Time]): Time = timestampToTime(value.intVal)
 
 type
-    ResponseType = enum
-        rtHttpTeapot = -1
-        rtOk = 0
-        rtHttpError = 1
-        rtTimeout = 2
-        rtFetchError = 3
+    ResponseType {.pure.} = enum
+        HttpTeapot = -1
+        Ok = 0
+        HttpError = 1
+        Timeout = 2
+        FetchError = 3
 
     Response = object
        rtype: ResponseType
        latency: int64 # microseconds
 
     SiteStatus = object
+        id: int
         url: string
         lastPing: Time
         lastResponse: ResponseType
         lastLatency: float
         uptimePercent: float
+        averageLatency: float
 
+    Ctx = object
+        db: DbConn
+        dbPath: string
+        images: TableRef[int, (seq[byte], int)]
+        interval: int
 
-proc uptimeSince(sid: int, time: Time): float =
-    let okPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND status <= 0 AND timestamp >= ?", sid, time), int)
-    let totalPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND timestamp >= ?", sid, time), int)
-    okPings / totalPings
-
-proc fetchLatest(row: ResultRow): Option[SiteStatus] =
+proc fetchLatest(ctx: Ctx, row: ResultRow): Option[SiteStatus] =
     let weekAgo = getTime() + initTimeInterval(weeks= -1)
-    let (site, url) = row.unpack((int, string))
-    let row = getDB().one("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT 1", site)
+    let (site, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int, string, int64, int64, int64, int64))
+    # work around a bizarre SQLite query planner issue - it appears that if it has a literal value to compare site against it generates very fast VM code
+    # but if it has a prepared statement parameter it somehow refuses to use the index
+    let row = ctx.db.one("SELECT timestamp, status, latency FROM reqs WHERE site = -1 OR site = ? ORDER BY timestamp DESC LIMIT 1", site)
     if isNone row: return none(SiteStatus)
     let (ts, status, latency) = (get row).unpack((Time, int, int))
-    some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float64(latency) / 1e3, uptimePercent: uptimeSince(site, weekAgo))
+    some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float(latency) / 1e3, id: site,
+        uptimePercent: float(rollingSuccessfulPings) / float(rollingTotalPings), averageLatency: float(rollingLatency) / float(rollingTotalPings) / 1e3)
 
-proc mainPage(): string =
-    let sites = getDB().all("SELECT * FROM sites ORDER BY sid").map(fetchLatest).filter(x => isSome x).map(x => get x)
+proc mainPage(ctx: Ctx): string =
+    let sites = ctx.db.all("SELECT * FROM sites ORDER BY sid").map(x => ctx.fetchLatest(x)).filter(x => x.isSome).map(x => x.get)
     let up = sites.filter(x => int(x.lastResponse) <= 0).len()
     let vnode = buildHtml(html()):
         head:
@@ -83,23 +79,25 @@ proc mainPage(): string =
             h2(class="title"): text &"{up}/{sites.len} up"
             for site in sites:
                 tdiv(class="card " & $site.lastResponse):
-                    h2:
-                        case site.lastResponse
-                        of rtOk: text "✓ "
-                        of rtHttpError: text "⚠ "
-                        of rtTimeout: text "✕ "
-                        of rtFetchError: text "✕ "
-                        of rtHttpTeapot: text "🫖 "
-                        text site.url
-                    tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
-                    tdiv:
-                        case site.lastResponse
-                        of rtOk: text &"Latency {site.lastLatency}ms"
-                        of rtHttpError: text "HTTP error"
-                        of rtHttpTeapot: text &"Teapot, latency {site.lastLatency}ms"
-                        of rtTimeout: text "Timed out"
-                        of rtFetchError: text "Fetch failed"
-                    tdiv: text &"{site.uptimePercent * 100}% up in last week"
+                    tdiv(class="left"):
+                        h2:
+                            case site.lastResponse
+                            of ResponseType.Ok: text "✓ "
+                            of ResponseType.HttpError: text "⚠ "
+                            of ResponseType.Timeout: text "✕ "
+                            of ResponseType.FetchError: text "✕ "
+                            of ResponseType.HttpTeapot: text "🫖 "
+                            text site.url
+                        tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
+                        tdiv:
+                            case site.lastResponse
+                            of ResponseType.Ok: text &"Latency {site.lastLatency}ms"
+                            of ResponseType.HttpError: text "HTTP error"
+                            of ResponseType.HttpTeapot: text &"Teapot, latency {site.lastLatency:.5f}ms"
+                            of ResponseType.Timeout: text "Timed out"
+                            of ResponseType.FetchError: text "Fetch failed"
+                        tdiv: text &"{site.uptimePercent * 100:.5f}% up, {site.averageLatency:.5f}ms latency in last week"
+                    if site.id in ctx.images: img(src= &"/vis/{site.id}", class="right", title= &"{site.url} 12-week status visualization")
             hr()
             small:
                 text "made by "
@@ -109,54 +107,158 @@ proc mainPage(): string =
                 text "."
     $vnode
 
-proc onRequest(req: Request) {.async.} =
-    if req.reqMethod == HttpGet:
-        case req.url.path
-        of "/": await req.respond(Http200, mainPage(), headers=newHttpHeaders([("Content-Type", "text/html")]))
-        else: await req.respond(Http404, "not found")
-    else:
-        await req.respond(Http404, "not found")
+var imageReturnChannel: Channel[(int, seq[byte])]
 
-proc pollTarget(s: string): Future[Response] {.async.} =
+proc readIntoContext(ctx: Ctx) =
+    # this is a horrible workaround to avoid having to deal with a shared hash table
+    var available = true
+    while available:
+        let (av, data) = imageReturnChannel.tryRecv()
+        available = av
+        if available:
+            let (id, image) = data
+            ctx.images[id] = (image, image.hash)
+
+proc onRequest(ctx: Ctx): (proc(req: Request): Future[void] {.gcsafe.}) =
+    result = proc(req: Request) {.async.} =
+        readIntoContext(ctx)
+        if req.reqMethod == HttpGet:
+            var path = req.url.path
+            if path == "/":
+                await req.respond(Http200, mainPage(ctx), headers=newHttpHeaders([("Content-Type", "text/html")]))
+            elif path.startsWith("/vis/"):
+                path.removePrefix("/vis/")
+                var id = 0
+                try:
+                    id = parseInt path
+                except:
+                    await req.respond(Http404, "not found")
+                    return
+                if id in ctx.images:
+                    let (image, hash) = ctx.images[id]
+                    let etag = &"\"{hash}\""
+                    if etag == req.headers.getOrDefault("if-none-match"):
+                        await req.respond(Http304, "")
+                    else:
+                        await req.respond(Http200, cast[string](image), headers=newHttpHeaders([
+                            ("Content-Type", "image/png"), ("ETag", etag)]))
+                else: await req.respond(Http404, "not found")
+            else: await req.respond(Http404, "not found")
+        else:
+            await req.respond(Http405, "GET only")
+
+proc pollTarget(ctx: Ctx, s: string): Future[Response] {.async.} =
     var client = newAsyncHttpClient()
-    var x = Response(rtype: rtTimeout, latency: 0)
+    var x = Response(rtype: ResponseType.Timeout, latency: 0)
     proc doFetch() {.async.} =
         let ts = now().utc
         let res = await client.get(s)
         let latency = (now().utc - ts).inMicroseconds
-        if res.code.int == 418: x = Response(rtype: rtHttpTeapot, latency: latency)
-        elif res.code.is4xx or res.code.is5xx: x = Response(rtype: rtHttpError, latency: latency)
-        else: x = Response(rtype: rtOk, latency: latency)
+        if res.code.int == 418: x = Response(rtype: ResponseType.HttpTeapot, latency: latency)
+        elif res.code.is4xx or res.code.is5xx: x = Response(rtype: ResponseType.HttpError, latency: latency)
+        else: x = Response(rtype: ResponseType.Ok, latency: latency)
     try:
        discard await withTimeout(doFetch(), 10000)
     except:
-        x = Response(rtype: rtFetchError, latency: 0)
+        x = Response(rtype: ResponseType.FetchError, latency: 0)
     client.close()
     return x
 
-proc pollTargets() {.async.} =
-    for row in getDB().all("SELECT * FROM sites"):
-        let (id, url) = row.unpack((int64, string))
-        let res = await pollTarget(url)
-        getDB().exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
+proc pollTargets(ctx: Ctx) {.async.} =
+    for row in ctx.db.all("SELECT * FROM sites"):
+        var (id, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int64, string, int64, int64, int64, Option[Time]))
+        let res = await ctx.pollTarget(url)
+        let threshold = getTime() + initTimeInterval(weeks= -1)
 
-proc timerCallback(fd: AsyncFD): bool =
-    asyncCheck pollTargets()
-    false
+        # drop old data from rolling counters
+        if rollingDataSince.isSome:
+            for row in ctx.db.iterate("SELECT status, latency FROM reqs WHERE timestamp >= ? AND timestamp <= ? AND site = ?", rollingDataSince.get, threshold, id):
+                let (statusRaw, latency) = row.unpack((int, int))
+                rollingTotalPings -= 1
+                rollingLatency -= latency
+                if statusRaw <= 0:
+                    rollingSuccessfulPings -= 1
+
+        # add new data
+        rollingTotalPings += 1
+        rollingLatency += res.latency
+        if int(res.rtype) <= 0:
+            rollingSuccessfulPings += 1
+
+        ctx.db.transaction:
+            ctx.db.exec("UPDATE sites SET rc_total = ?, rc_success = ?, rc_latency = ?, rc_data_since = ? WHERE sid = ?", rollingTotalPings, rollingSuccessfulPings, rollingLatency, threshold, id)
+            ctx.db.exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
+
+proc drawLatencyImage(db: DbConn, site: int, interval: int): seq[byte] =
+    const width = 120 * 6
+    const height = 168 * 2
+    var image = initImage[ColorRGBU](width, height)
+    var count = 0
+    var lastTs = getTime()
+    for row in db.iterate("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT ?", site, width * height):
+        let (ts, statusRaw, latency) = row.unpack((Time, int, int))
+        let timeGap = lastTs - ts
+        if timeGap > initDuration(milliseconds = interval + 10000):
+            let pixels = timeGap.inMilliseconds div interval
+            for _ in 1..pixels:
+                image.data[count] = ColorRGBU([0x7Eu8, 0x1E, 0x9C])
+                count += 1
+                if count >= image.data.len: break
+        else:
+            let status = ResponseType(statusRaw)
+            case status
+            of ResponseType.HttpError:
+                image.data[count] = ColorRGBU([255u8, 127, 0])
+            of ResponseType.Timeout:
+                image.data[count] = ColorRGBU([0u8, 0, 0])
+            of ResponseType.FetchError:
+                image.data[count] = ColorRGBU([255u8, 0, 0])
+            else:
+                let latencyMultiplier = max(min(pow(10.0, 1.1) / pow(float(latency), 0.25), 1.0), 0.2)
+                image.data[count] = ColorRGBU([0u8, uint8(latencyMultiplier * 255.0), 0])
+
+        count += 1
+        if count >= image.data.len: break
+        lastTs = ts
+    writePNG(image, compression=6)
+
+proc generateImages(args: (string, int)) =
+    let (dbPath, interval) = args
+    let db = openDatabase(dbPath)
+    db.exec("PRAGMA journal_mode = WAL")
+    for row in db.all("SELECT sid FROM sites"):
+        let id = row[0].fromDbValue(int)
+        imageReturnChannel.send((id, drawLatencyImage(db, id, interval)))
+    close(db)
 
 proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) =
     ## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this.
     let database = openDatabase(dbPath)
+    database.exec("PRAGMA journal_mode = WAL")
     migrate(database)
     for url in urls:
         echo &"Adding {url}"
         database.exec("INSERT INTO sites (url) VALUES (?)", url)
-    close(database)
+
+    var ctx = Ctx(db: database, dbPath: dbPath, images: newTable[int, (seq[byte], int)](), interval: interval)
+
     echo "Starting up"
-    asyncCheck pollTargets()
-    addTimer(interval, false, timerCallback)
+    asyncCheck pollTargets(ctx)
+    imageReturnChannel.open()
+    var thread: Thread[(string, int)]
+    createThread(thread, generateImages, (dbPath, interval))
+    echo "Ready"
+    addTimer(interval, false, proc(fd: AsyncFD): bool =
+        asyncCheck pollTargets(ctx)
+        false)
+    addTimer(interval * 60, false, proc(fd: AsyncFD): bool =
+        createThread(thread, generateImages, (dbPath, interval))
+        let fut = sleepAsync(10000)
+        fut.addCallback(() => readIntoContext(ctx))
+        asyncCheck fut
+        false)
     var server = newAsyncHttpServer()
-    waitFor server.serve(Port(port), onRequest)
+    waitFor server.serve(Port(port), onRequest(ctx))
 
 dispatch(run, help={
     "dbPath": "path to SQLite3 database for historical data logging",
     "port": "port to serve HTTP on",
diff --git a/src/style.css b/src/style.css
index 99f5ce8..8e6cbf2 100644
--- a/src/style.css
+++ b/src/style.css
@@ -1,5 +1,4 @@
 body {
-    max-width: 40em;
     font-family: sans-serif;
 }
 
@@ -9,6 +8,7 @@ body {
 
 h1, h2 {
     font-weight: normal;
+    max-width: 100%;
     margin: 0;
 }
 
@@ -18,20 +18,29 @@ h1 {
 
 .card {
     margin-bottom: 1em;
+    display: flex;
+    justify-content: space-between;
+    flex-wrap: wrap;
 }
 
-.card.rtOk h2 {
+.card.Ok h2 {
     color: green;
 }
-.card.rtHttpError h2 {
+.card.HttpError h2 {
     color: orange;
 }
-.card.rtHttpTeapot h2 {
+.card.HttpTeapot h2 {
     color: blue;
 }
-.card.rtFetchError h2 {
+.card.FetchError h2 {
     color: red;
 }
-.card.rtTimeout h2 {
+.card.Timeout h2 {
     color: red;
 }
+
+img {
+    image-rendering: pixelated;
+    -ms-interpolation-mode: nearest-neighbor;
+    image-rendering: crisp-edges;
+}
\ No newline at end of file
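
The rolling counters added in the db.nim migration replace the old per-request COUNT(*) scan in uptimeSince: each site carries rc_total, rc_success and rc_latency over a sliding one-week window whose old edge is rc_data_since, and pollTargets subtracts expired rows and adds the fresh poll on every tick. The bookkeeping, reduced to a self-contained sketch (the type and proc names here are illustrative, not part of the patch):

    type Counters = object
        total, success, latency: int64

    proc expire(c: var Counters, statusRaw: int, latency: int64) =
        # a request slid out of the window (timestamp below the new rc_data_since)
        c.total -= 1
        c.latency -= latency
        if statusRaw <= 0: c.success -= 1

    proc record(c: var Counters, statusRaw: int, latency: int64) =
        # the poll that just completed enters the window
        c.total += 1
        c.latency += latency
        if statusRaw <= 0: c.success += 1

mainPage then derives uptime as success/total and mean latency as latency/total without touching the reqs table at render time.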
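
The `site = -1 OR site = ?` hack in fetchLatest, isolated for reference. The comment records an observation that SQLite picked a fast indexed plan when site was compared against a literal but not against a bound parameter; the redundant OR arm (sids here are auto-assigned and positive, so -1 never matches) appears to nudge the planner back onto the index. A minimal sketch of the same query using the tiny_sqlite API already in the patch; latestFor is a hypothetical name:

    import options
    import tiny_sqlite

    proc latestFor(db: DbConn, site: int): Option[ResultRow] =
        # the plain "WHERE site = ?" form was the slow one in practice
        db.one("""SELECT timestamp, status, latency FROM reqs
                  WHERE site = -1 OR site = ?
                  ORDER BY timestamp DESC LIMIT 1""", site)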
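
The status images are rendered on a separate thread (generateImages opens its own SQLite connection) and handed back through the global imageReturnChannel; readIntoContext drains the channel without blocking from both the request handler and the timer callback, so no hash table is shared across threads. The pattern in miniature, with the payload simplified to a string and all names hypothetical:

    import tables

    var chan: Channel[(int, string)]

    proc worker() {.thread.} =
        # stands in for generateImages: render, then send the result over
        chan.send((1, "png bytes"))

    proc drain(dest: TableRef[int, string]) =
        # stands in for readIntoContext: take whatever is ready, never block
        while true:
            let (ok, msg) = chan.tryRecv()
            if not ok: break
            dest[msg[0]] = msg[1]

    when isMainModule:
        chan.open()
        var t: Thread[void]
        createThread(t, worker)
        joinThread(t)
        let images = newTable[int, string]()
        drain(images)
        assert images[1] == "png bytes"

This needs --threads:on, which the nim.cfg change enables.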
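
The green channel of an "up" pixel in drawLatencyImage falls off with the fourth root of the latency: the multiplier is 10^1.1 / latency^0.25 clamped to [0.2, 1.0], with latency in microseconds. A worked check (the helper name is mine):

    import math

    proc latencyShade(latencyUs: float): float =
        # identical expression to drawLatencyImage's latencyMultiplier
        max(min(pow(10.0, 1.1) / pow(latencyUs, 0.25), 1.0), 0.2)

    when isMainModule:
        echo latencyShade(10_000.0)     # 10 ms  -> 1.0 (clamps; full green)
        echo latencyShade(100_000.0)    # 100 ms -> ~0.71
        echo latencyShade(1_000_000.0)  # 1 s    -> ~0.40

So responses around 10 ms or faster render at full brightness, and even a one-second response stays above the 0.2 floor rather than fading to black.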
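
Where the "12-week" in the image title comes from: the canvas is 120*6 = 720 by 168*2 = 336 pixels, one pixel per polling interval, so at the default interval of 30000 ms it covers 720 * 336 * 30 s = 84 days, i.e. exactly 12 weeks. As a quick arithmetic check:

    const width = 120 * 6
    const height = 168 * 2
    const intervalSeconds = 30  # default interval=30000 ms
    assert width * height * intervalSeconds div 86400 == 84  # days = 12 weeks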