Big refactors, performance, status images

This commit is contained in:
osmarks 2021-04-25 22:10:52 +01:00
parent dfbda82731
commit 65b19dbd96
5 changed files with 218 additions and 93 deletions

View File

@ -1 +1 @@
-d:ssl -d:ssl --threads:on

View File

@ -13,3 +13,4 @@ requires "nim >= 1.4.2"
requires "https://github.com/GULPF/tiny_sqlite#8fe760d9" requires "https://github.com/GULPF/tiny_sqlite#8fe760d9"
requires "karax >= 1.2.1" requires "karax >= 1.2.1"
requires "cligen >= 1" requires "cligen >= 1"
requires "imageman >= 0.8"

View File

@ -18,6 +18,19 @@ let migrations: seq[string] = @[
""", """,
""" """
CREATE INDEX req_ts_idx ON reqs (timestamp); CREATE INDEX req_ts_idx ON reqs (timestamp);
""",
# rolling total/successful ping and latency count
# rc_data_since holds the older end of the interval the counters are from
# this slightly horribly migrates the existing data using a hardcoded 1 week window
"""
ALTER TABLE sites ADD COLUMN rc_total INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_success INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_latency INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_data_since INTEGER;
UPDATE sites SET rc_total = (SELECT COUNT(*) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_success = (SELECT SUM(status <= 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_latency = (SELECT SUM(latency) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_data_since = (strftime('%s') - (86400*7)) * 1000000;
""" """
] ]

View File

@ -10,8 +10,12 @@ import sugar
import net import net
import sequtils import sequtils
import strformat import strformat
import std/exitprocs import strutils
import cligen import cligen
import imageman
import math
import hashes
import tables
import ./db import ./db
@ -19,19 +23,6 @@ macro includeFile(x: string): string = newStrLitNode(readFile(x.strVal))
const css = includeFile("./src/style.css") const css = includeFile("./src/style.css")
# Per-thread cache for the SQLite connection; getDB() checks it with isNone before opening one.
var threadDB {.threadvar.}: Option[DbConn]
# Returns the calling thread's connection, opening ./monitoring.sqlite3 and
# switching it to WAL journal mode the first time it is needed on that thread.
proc getDB(): DbConn {.gcsafe.} =
if isNone threadDB:
let x = openDatabase("./monitoring.sqlite3")
x.exec("PRAGMA journal_mode=WAL")
# Best-effort close: anything raised by close() is deliberately discarded.
proc closeDB() =
try: close(x)
except: discard
# Run the close at process exit and, where onThreadDestroy is declared, when the thread is torn down.
addExitProc(closeDB)
when declared(onThreadDestroy): onThreadDestroy(closeDB)
threadDB = some x
get threadDB
func timeToTimestamp*(t: Time): int64 = toUnix(t) * 1000000 + (nanosecond(t) div 1000) func timeToTimestamp*(t: Time): int64 = toUnix(t) * 1000000 + (nanosecond(t) div 1000)
func timestampToTime*(ts: int64): Time = initTime(ts div 1000000, (ts mod 1000000) * 1000) func timestampToTime*(ts: int64): Time = initTime(ts div 1000000, (ts mod 1000000) * 1000)
@ -39,37 +30,42 @@ proc toDbValue(t: Time): DbValue = DbValue(kind: sqliteInteger, intVal: timeToTi
proc fromDbValue(value: DbValue, T: typedesc[Time]): Time = timestampToTime(value.intVal) proc fromDbValue(value: DbValue, T: typedesc[Time]): Time = timestampToTime(value.intVal)
type type
ResponseType = enum ResponseType {.pure.} = enum
rtHttpTeapot = -1 HttpTeapot = -1
rtOk = 0 Ok = 0
rtHttpError = 1 HttpError = 1
rtTimeout = 2 Timeout = 2
rtFetchError = 3 FetchError = 3
Response = object Response = object
rtype: ResponseType rtype: ResponseType
latency: int64 # microseconds latency: int64 # microseconds
SiteStatus = object SiteStatus = object
id: int
url: string url: string
lastPing: Time lastPing: Time
lastResponse: ResponseType lastResponse: ResponseType
lastLatency: float lastLatency: float
uptimePercent: float uptimePercent: float
averageLatency: float
Ctx = object
db: DbConn
dbPath: string
images: TableRef[int, (seq[byte], int)]
interval: int
proc uptimeSince(sid: int, time: Time): float = proc fetchLatest(ctx: Ctx, row: ResultRow): Option[SiteStatus] =
let okPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND status <= 0 AND timestamp >= ?", sid, time), int)
let totalPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND timestamp >= ?", sid, time), int)
okPings / totalPings
proc fetchLatest(row: ResultRow): Option[SiteStatus] =
let weekAgo = getTime() + initTimeInterval(weeks= -1) let weekAgo = getTime() + initTimeInterval(weeks= -1)
let (site, url) = row.unpack((int, string)) let (site, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int, string, int64, int64, int64, int64))
let row = getDB().one("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT 1", site) # work around bizarre SQLite query planner issue - it appears that if it has a literal value to compare site against it generates very fast VM code
# but if it has a prepared state parameter it somehow refuses to use the index
let row = ctx.db.one("SELECT timestamp, status, latency FROM reqs WHERE site = -1 OR site = ? ORDER BY timestamp DESC LIMIT 1", site)
if isNone row: return none(SiteStatus) if isNone row: return none(SiteStatus)
let (ts, status, latency) = (get row).unpack((Time, int, int)) let (ts, status, latency) = (get row).unpack((Time, int, int))
some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float64(latency) / 1e3, uptimePercent: uptimeSince(site, weekAgo)) some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float(latency) / 1e3, id: site,
uptimePercent: float(rollingSuccessfulPings) / float(rollingTotalPings), averageLatency: float(rollingLatency) / float(rollingTotalPings) / 1e3)
proc mainPage(): string = proc mainPage(ctx: Ctx): string =
let sites = getDB().all("SELECT * FROM sites ORDER BY sid").map(fetchLatest).filter(x => isSome x).map(x => get x) let sites = ctx.db.all("SELECT * FROM sites ORDER BY sid").map(x => ctx.fetchLatest(x)).filter(x => x.isSome).map(x => x.get)
let up = sites.filter(x => int(x.lastResponse) <= 0).len() let up = sites.filter(x => int(x.lastResponse) <= 0).len()
let vnode = buildHtml(html()): let vnode = buildHtml(html()):
head: head:
@ -83,23 +79,25 @@ proc mainPage(): string =
h2(class="title"): text &"{up}/{sites.len} up" h2(class="title"): text &"{up}/{sites.len} up"
for site in sites: for site in sites:
tdiv(class="card " & $site.lastResponse): tdiv(class="card " & $site.lastResponse):
tdiv(class="left"):
h2: h2:
case site.lastResponse case site.lastResponse
of rtOk: text "" of ResponseType.Ok: text ""
of rtHttpError: text "" of ResponseType.HttpError: text ""
of rtTimeout: text "" of ResponseType.Timeout: text ""
of rtFetchError: text "" of ResponseType.FetchError: text ""
of rtHttpTeapot: text "🫖 " of ResponseType.HttpTeapot: text "🫖 "
text site.url text site.url
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy")) tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
tdiv: tdiv:
case site.lastResponse case site.lastResponse
of rtOk: text &"Latency {site.lastLatency}ms" of ResponseType.Ok: text &"Latency {site.lastLatency}ms"
of rtHttpError: text "HTTP error" of ResponseType.HttpError: text "HTTP error"
of rtHttpTeapot: text &"Teapot, latency {site.lastLatency}ms" of ResponseType.HttpTeapot: text &"Teapot, latency {site.lastLatency:.5f}ms"
of rtTimeout: text "Timed out" of ResponseType.Timeout: text "Timed out"
of rtFetchError: text "Fetch failed" of ResponseType.FetchError: text "Fetch failed"
tdiv: text &"{site.uptimePercent * 100}% up in last week" tdiv: text &"{site.uptimePercent * 100:.5f}% up, {site.averageLatency:.5f}ms latency in last week"
if site.id in ctx.images: img(src= &"/vis/{site.id}", class="right", title= "&{site.url} 12-week status visualization")
hr() hr()
small: small:
text "made by " text "made by "
@ -109,54 +107,158 @@ proc mainPage(): string =
text "." text "."
$vnode $vnode
proc onRequest(req: Request) {.async.} = var imageReturnChannel: Channel[(int, seq[byte])]
proc readIntoContext(ctx: Ctx) =
  ## Moves every image currently queued on `imageReturnChannel` into
  ## `ctx.images`, keyed by site id and stored together with its hash.
  ## Never blocks: it stops as soon as `tryRecv` reports an empty channel.
  # the channel stands in for a hash table shared between threads
  while true:
    let (received, payload) = imageReturnChannel.tryRecv()
    if not received:
      break
    let (siteId, png) = payload
    ctx.images[siteId] = (png, png.hash)
proc onRequest(ctx: Ctx): (proc(req: Request): Future[void] {.gcsafe.}) =
result = proc(req: Request) {.async.} =
readIntoContext(ctx)
if req.reqMethod == HttpGet: if req.reqMethod == HttpGet:
case req.url.path var path = req.url.path
of "/": await req.respond(Http200, mainPage(), headers=newHttpHeaders([("Content-Type", "text/html")])) if path == "/":
await req.respond(Http200, mainPage(ctx), headers=newHttpHeaders([("Content-Type", "text/html")]))
elif path.startsWith("/vis/"):
path.removePrefix("/vis/")
var id = 0
try:
id = parseInt path
except:
await req.respond(Http404, "not found")
return
if id in ctx.images:
let (image, hash) = ctx.images[id]
let etag = &"\"{hash}\""
if etag == req.headers.getOrDefault("if-none-match"):
await req.respond(Http304, "")
else:
await req.respond(Http200, cast[string](image), headers=newHttpHeaders([
("Content-Type", "image/png"), ("ETag", etag)]))
else: await req.respond(Http404, "not found")
else: await req.respond(Http404, "not found") else: await req.respond(Http404, "not found")
else: else:
await req.respond(Http404, "not found") await req.respond(Http405, "GET only")
proc pollTarget(s: string): Future[Response] {.async.} = proc pollTarget(ctx: Ctx, s: string): Future[Response] {.async.} =
var client = newAsyncHttpClient() var client = newAsyncHttpClient()
var x = Response(rtype: rtTimeout, latency: 0) var x = Response(rtype: ResponseType.Timeout, latency: 0)
proc doFetch() {.async.} = proc doFetch() {.async.} =
let ts = now().utc let ts = now().utc
let res = await client.get(s) let res = await client.get(s)
let latency = (now().utc - ts).inMicroseconds let latency = (now().utc - ts).inMicroseconds
if res.code.int == 418: x = Response(rtype: rtHttpTeapot, latency: latency) if res.code.int == 418: x = Response(rtype: ResponseType.HttpTeapot, latency: latency)
elif res.code.is4xx or res.code.is5xx: x = Response(rtype: rtHttpError, latency: latency) elif res.code.is4xx or res.code.is5xx: x = Response(rtype: ResponseType.HttpError, latency: latency)
else: x = Response(rtype: rtOk, latency: latency) else: x = Response(rtype: ResponseType.Ok, latency: latency)
try: try:
discard await withTimeout(doFetch(), 10000) discard await withTimeout(doFetch(), 10000)
except: except:
x = Response(rtype: rtFetchError, latency: 0) x = Response(rtype: ResponseType.FetchError, latency: 0)
client.close() client.close()
return x return x
proc pollTargets() {.async.} = proc pollTargets(ctx: Ctx) {.async.} =
for row in getDB().all("SELECT * FROM sites"): for row in ctx.db.all("SELECT * FROM sites"):
let (id, url) = row.unpack((int64, string)) var (id, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int64, string, int64, int64, int64, Option[Time]))
let res = await pollTarget(url) let res = await ctx.pollTarget(url)
getDB().exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency) let threshold = getTime() + initTimeInterval(weeks= -1)
proc timerCallback(fd: AsyncFD): bool = # drop old data from rolling counters
asyncCheck pollTargets() if rollingDataSince.isSome:
false for row in ctx.db.iterate("SELECT status, latency FROM reqs WHERE timestamp >= ? AND timestamp <= ? AND site = ?", rollingDataSince.get, threshold, id):
let (statusRaw, latency) = row.unpack((int, int))
rollingTotalPings -= 1
rollingLatency -= latency
if statusRaw <= 0:
rollingSuccessfulPings -= 1
# add new data
rollingTotalPings += 1
rollingLatency += res.latency
if int(res.rtype) <= 0:
rollingSuccessfulPings += 1
ctx.db.transaction:
ctx.db.exec("UPDATE sites SET rc_total = ?, rc_success = ?, rc_latency = ?, rc_data_since = ? WHERE sid = ?", rollingTotalPings, rollingSuccessfulPings, rollingLatency, threshold, id)
ctx.db.exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
proc drawLatencyImage(db: DbConn, site: int, interval: int): seq[byte] =
  ## Renders the most recent pings recorded for `site` as a PNG and returns
  ## the encoded bytes. Rows are read newest first and each one fills the
  ## next pixel of a `width` x `height` image.
  ##
  ## `interval` is the polling period in milliseconds. When two consecutive
  ## rows are more than `interval + 10000` ms apart, the gap is filled with
  ## one placeholder pixel per missed interval instead of a status pixel.
  const width = 120 * 6
  const height = 168 * 2
  var image = initImage[ColorRGBU](width, height)
  var count = 0
  var lastTs = getTime()
  # Labelled block: once the pixel buffer is full, drawing must stop entirely.
  # A plain `break` inside the gap-fill loop would only leave that inner loop
  # and the next row would then write past the end of `image.data`.
  block drawing:
    for row in db.iterate("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT ?", site, width * height):
      let (ts, statusRaw, latency) = row.unpack((Time, int, int))
      let timeGap = lastTs - ts
      if timeGap > initDuration(milliseconds = interval + 10000):
        # no data for this stretch: one placeholder pixel per missed poll
        let pixels = timeGap.inMilliseconds div interval
        for _ in 1..pixels:
          image.data[count] = ColorRGBU([0x7Eu8, 0x1E, 0x9C])
          count += 1
          if count >= image.data.len: break drawing
      else:
        let status = ResponseType(statusRaw)
        case status
        of ResponseType.HttpError:
          image.data[count] = ColorRGBU([255u8, 127, 0])
        of ResponseType.Timeout:
          image.data[count] = ColorRGBU([0u8, 0, 0])
        of ResponseType.FetchError:
          image.data[count] = ColorRGBU([255u8, 0, 0])
        else:
          # successful ping: green channel scaled by latency, clamped to [0.2, 1.0]
          let latencyMultiplier = max(min(pow(10.0, 1.1) / pow(float(latency), 0.25), 1.0), 0.2)
          image.data[count] = ColorRGBU([0u8, uint8(latencyMultiplier * 255.0), 0])
        count += 1
        if count >= image.data.len: break drawing
      lastTs = ts
  writePNG(image, compression=6)
proc generateImages(args: (string, int)) =
  ## Thread entry point. Opens its own connection to the database at
  ## `args[0]`, renders a status image for every row in `sites` using the
  ## polling interval `args[1]`, and sends each `(site id, PNG bytes)` pair
  ## over `imageReturnChannel`.
  let (dbPath, interval) = args
  let db = openDatabase(dbPath)
  # close the connection even if a query or the rendering raises
  defer: close(db)
  db.exec("PRAGMA journal_mode = WAL")
  for row in db.all("SELECT sid FROM sites"):
    let id = row[0].fromDbValue(int)
    imageReturnChannel.send((id, drawLatencyImage(db, id, interval)))
proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) = proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) =
## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this. ## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this.
let database = openDatabase(dbPath) let database = openDatabase(dbPath)
database.exec("PRAGMA journal_mode = WAL")
migrate(database) migrate(database)
for url in urls: for url in urls:
echo &"Adding {url}" echo &"Adding {url}"
database.exec("INSERT INTO sites (url) VALUES (?)", url) database.exec("INSERT INTO sites (url) VALUES (?)", url)
close(database)
var ctx = Ctx(db: database, dbPath: dbPath, images: newTable[int, (seq[byte], int)](), interval: interval)
echo "Starting up" echo "Starting up"
asyncCheck pollTargets() asyncCheck pollTargets(ctx)
addTimer(interval, false, timerCallback) imageReturnChannel.open()
var thread: Thread[(string, int)]
createThread(thread, generateImages, (dbPath, interval))
echo "Ready"
addTimer(interval, false, proc(fd: AsyncFD): bool =
asyncCheck pollTargets(ctx)
false)
addTimer(interval * 60, false, proc(fd: AsyncFD): bool =
createThread(thread, generateImages, (dbPath, interval))
let fut = sleepAsync(10000)
fut.addCallback(() => readIntoContext(ctx))
asyncCheck fut
false)
var server = newAsyncHttpServer() var server = newAsyncHttpServer()
waitFor server.serve(Port(port), onRequest) waitFor server.serve(Port(port), onRequest(ctx))
dispatch(run, help={ dispatch(run, help={
"dbPath": "path to SQLite3 database for historical data logging", "dbPath": "path to SQLite3 database for historical data logging",
"port": "port to serve HTTP on", "port": "port to serve HTTP on",

View File

@ -1,5 +1,4 @@
body { body {
max-width: 40em;
font-family: sans-serif; font-family: sans-serif;
} }
@ -9,6 +8,7 @@ body {
h1, h2 { h1, h2 {
font-weight: normal; font-weight: normal;
max-width: 100%;
margin: 0; margin: 0;
} }
@ -18,20 +18,29 @@ h1 {
.card { .card {
margin-bottom: 1em; margin-bottom: 1em;
display: flex;
justify-content: space-between;
flex-wrap: wrap;
} }
.card.rtOk h2 { .card.Ok h2 {
color: green; color: green;
} }
.card.rtHttpError h2 { .card.HttpError h2 {
color: orange; color: orange;
} }
.card.rtHttpTeapot h2 { .card.HttpTeapot h2 {
color: blue; color: blue;
} }
.card.rtFetchError h2 { .card.FetchError h2 {
color: red; color: red;
} }
.card.rtTimeout h2 { .card.Timeout h2 {
color: red; color: red;
} }
img {
image-rendering: pixelated;
-ms-interpolation-mode: nearest-neighbor;
image-rendering: crisp-edges;
}