Big refactors, performance, status images

This commit is contained in:
osmarks 2021-04-25 22:10:52 +01:00
parent dfbda82731
commit 65b19dbd96
5 changed files with 218 additions and 93 deletions

View File

@ -1 +1 @@
-d:ssl -d:ssl --threads:on

View File

@ -13,3 +13,4 @@ requires "nim >= 1.4.2"
requires "https://github.com/GULPF/tiny_sqlite#8fe760d9" requires "https://github.com/GULPF/tiny_sqlite#8fe760d9"
requires "karax >= 1.2.1" requires "karax >= 1.2.1"
requires "cligen >= 1" requires "cligen >= 1"
requires "imageman >= 0.8"

View File

@ -3,21 +3,34 @@ import options
let migrations: seq[string] = @[ let migrations: seq[string] = @[
""" """
CREATE TABLE sites ( CREATE TABLE sites (
sid INTEGER PRIMARY KEY, sid INTEGER PRIMARY KEY,
url TEXT NOT NULL url TEXT NOT NULL
); );
CREATE TABLE reqs ( CREATE TABLE reqs (
rid INTEGER PRIMARY KEY, rid INTEGER PRIMARY KEY,
site INTEGER NOT NULL REFERENCES sites(sid), site INTEGER NOT NULL REFERENCES sites(sid),
timestamp INTEGER NOT NULL, timestamp INTEGER NOT NULL,
status INTEGER NOT NULL, status INTEGER NOT NULL,
latency INTEGER NOT NULL latency INTEGER NOT NULL
); );
""", """,
""" """
CREATE INDEX req_ts_idx ON reqs (timestamp); CREATE INDEX req_ts_idx ON reqs (timestamp);
""",
# Rolling counters per site: total pings, successful pings and summed latency.
# rc_data_since holds the older end of the interval the counters are from.
# This slightly horribly backfills the counters from existing data using a
# hardcoded 1 week window; `* 1000000` converts the unix-seconds cutoff to the
# microsecond timestamps stored in reqs.
# SUM() yields NULL when a site has no rows inside the window, which would
# violate the NOT NULL constraints on rc_success / rc_latency and abort the
# migration, so both backfills fall back to 0 via COALESCE.
"""
ALTER TABLE sites ADD COLUMN rc_total INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_success INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_latency INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_data_since INTEGER;
UPDATE sites SET rc_total = (SELECT COUNT(*) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_success = (SELECT COALESCE(SUM(status <= 0), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_latency = (SELECT COALESCE(SUM(latency), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_data_since = (strftime('%s') - (86400*7)) * 1000000;
""" """
] ]

View File

@ -10,8 +10,12 @@ import sugar
import net import net
import sequtils import sequtils
import strformat import strformat
import std/exitprocs import strutils
import cligen import cligen
import imageman
import math
import hashes
import tables
import ./db import ./db
@ -19,19 +23,6 @@ macro includeFile(x: string): string = newStrLitNode(readFile(x.strVal))
const css = includeFile("./src/style.css") const css = includeFile("./src/style.css")
var threadDB {.threadvar.}: Option[DbConn]
proc getDB(): DbConn {.gcsafe.} =
if isNone threadDB:
let x = openDatabase("./monitoring.sqlite3")
x.exec("PRAGMA journal_mode=WAL")
proc closeDB() =
try: close(x)
except: discard
addExitProc(closeDB)
when declared(onThreadDestroy): onThreadDestroy(closeDB)
threadDB = some x
get threadDB
func timeToTimestamp*(t: Time): int64 = toUnix(t) * 1000000 + (nanosecond(t) div 1000) func timeToTimestamp*(t: Time): int64 = toUnix(t) * 1000000 + (nanosecond(t) div 1000)
func timestampToTime*(ts: int64): Time = initTime(ts div 1000000, (ts mod 1000000) * 1000) func timestampToTime*(ts: int64): Time = initTime(ts div 1000000, (ts mod 1000000) * 1000)
@ -39,37 +30,42 @@ proc toDbValue(t: Time): DbValue = DbValue(kind: sqliteInteger, intVal: timeToTi
proc fromDbValue(value: DbValue, T: typedesc[Time]): Time = timestampToTime(value.intVal) proc fromDbValue(value: DbValue, T: typedesc[Time]): Time = timestampToTime(value.intVal)
type type
ResponseType = enum ResponseType {.pure.} = enum
rtHttpTeapot = -1 HttpTeapot = -1
rtOk = 0 Ok = 0
rtHttpError = 1 HttpError = 1
rtTimeout = 2 Timeout = 2
rtFetchError = 3 FetchError = 3
Response = object Response = object
rtype: ResponseType rtype: ResponseType
latency: int64 # microseconds latency: int64 # microseconds
SiteStatus = object SiteStatus = object
id: int
url: string url: string
lastPing: Time lastPing: Time
lastResponse: ResponseType lastResponse: ResponseType
lastLatency: float lastLatency: float
uptimePercent: float uptimePercent: float
averageLatency: float
Ctx = object
db: DbConn
dbPath: string
images: TableRef[int, (seq[byte], int)]
interval: int
proc uptimeSince(sid: int, time: Time): float = proc fetchLatest(ctx: Ctx, row: ResultRow): Option[SiteStatus] =
let okPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND status <= 0 AND timestamp >= ?", sid, time), int)
let totalPings = fromDbValue(get getDB().value("SELECT COUNT(*) FROM reqs WHERE site = ? AND timestamp >= ?", sid, time), int)
okPings / totalPings
proc fetchLatest(row: ResultRow): Option[SiteStatus] =
let weekAgo = getTime() + initTimeInterval(weeks= -1) let weekAgo = getTime() + initTimeInterval(weeks= -1)
let (site, url) = row.unpack((int, string)) let (site, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int, string, int64, int64, int64, int64))
let row = getDB().one("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT 1", site) # work around bizarre SQLite query planner issue - it appears that if it has a literal value to compare site against it generates very fast VM code
# but if it has a prepared statement parameter it somehow refuses to use the index
let row = ctx.db.one("SELECT timestamp, status, latency FROM reqs WHERE site = -1 OR site = ? ORDER BY timestamp DESC LIMIT 1", site)
if isNone row: return none(SiteStatus) if isNone row: return none(SiteStatus)
let (ts, status, latency) = (get row).unpack((Time, int, int)) let (ts, status, latency) = (get row).unpack((Time, int, int))
some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float64(latency) / 1e3, uptimePercent: uptimeSince(site, weekAgo)) some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float(latency) / 1e3, id: site,
uptimePercent: float(rollingSuccessfulPings) / float(rollingTotalPings), averageLatency: float(rollingLatency) / float(rollingTotalPings) / 1e3)
proc mainPage(): string = proc mainPage(ctx: Ctx): string =
let sites = getDB().all("SELECT * FROM sites ORDER BY sid").map(fetchLatest).filter(x => isSome x).map(x => get x) let sites = ctx.db.all("SELECT * FROM sites ORDER BY sid").map(x => ctx.fetchLatest(x)).filter(x => x.isSome).map(x => x.get)
let up = sites.filter(x => int(x.lastResponse) <= 0).len() let up = sites.filter(x => int(x.lastResponse) <= 0).len()
let vnode = buildHtml(html()): let vnode = buildHtml(html()):
head: head:
@ -83,23 +79,25 @@ proc mainPage(): string =
h2(class="title"): text &"{up}/{sites.len} up" h2(class="title"): text &"{up}/{sites.len} up"
for site in sites: for site in sites:
tdiv(class="card " & $site.lastResponse): tdiv(class="card " & $site.lastResponse):
tdiv(class="left"):
h2: h2:
case site.lastResponse case site.lastResponse
of rtOk: text "" of ResponseType.Ok: text ""
of rtHttpError: text "" of ResponseType.HttpError: text ""
of rtTimeout: text "" of ResponseType.Timeout: text ""
of rtFetchError: text "" of ResponseType.FetchError: text ""
of rtHttpTeapot: text "🫖 " of ResponseType.HttpTeapot: text "🫖 "
text site.url text site.url
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy")) tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
tdiv: tdiv:
case site.lastResponse case site.lastResponse
of rtOk: text &"Latency {site.lastLatency}ms" of ResponseType.Ok: text &"Latency {site.lastLatency}ms"
of rtHttpError: text "HTTP error" of ResponseType.HttpError: text "HTTP error"
of rtHttpTeapot: text &"Teapot, latency {site.lastLatency}ms" of ResponseType.HttpTeapot: text &"Teapot, latency {site.lastLatency:.5f}ms"
of rtTimeout: text "Timed out" of ResponseType.Timeout: text "Timed out"
of rtFetchError: text "Fetch failed" of ResponseType.FetchError: text "Fetch failed"
tdiv: text &"{site.uptimePercent * 100}% up in last week" tdiv: text &"{site.uptimePercent * 100:.5f}% up, {site.averageLatency:.5f}ms latency in last week"
if site.id in ctx.images: img(src= &"/vis/{site.id}", class="right", title= "&{site.url} 12-week status visualization")
hr() hr()
small: small:
text "made by " text "made by "
@ -109,54 +107,158 @@ proc mainPage(): string =
text "." text "."
$vnode $vnode
proc onRequest(req: Request) {.async.} = var imageReturnChannel: Channel[(int, seq[byte])]
proc readIntoContext(ctx: Ctx) =
  ## Drains every image currently queued on `imageReturnChannel` into
  ## `ctx.images` without blocking, storing each PNG next to its hash.
  ##
  ## `ctx.images` is a `TableRef`, so the update is visible through every copy
  ## of `ctx`. Polling a channel here sidesteps sharing a hash table between
  ## the image thread and the server thread.
  while true:
    let (gotOne, payload) = imageReturnChannel.tryRecv()
    if not gotOne:
      break
    let (siteId, png) = payload
    ctx.images[siteId] = (png, hash(png))
proc onRequest(ctx: Ctx): (proc(req: Request): Future[void] {.gcsafe.}) =
result = proc(req: Request) {.async.} =
readIntoContext(ctx)
if req.reqMethod == HttpGet: if req.reqMethod == HttpGet:
case req.url.path var path = req.url.path
of "/": await req.respond(Http200, mainPage(), headers=newHttpHeaders([("Content-Type", "text/html")])) if path == "/":
await req.respond(Http200, mainPage(ctx), headers=newHttpHeaders([("Content-Type", "text/html")]))
elif path.startsWith("/vis/"):
path.removePrefix("/vis/")
var id = 0
try:
id = parseInt path
except:
await req.respond(Http404, "not found")
return
if id in ctx.images:
let (image, hash) = ctx.images[id]
let etag = &"\"{hash}\""
if etag == req.headers.getOrDefault("if-none-match"):
await req.respond(Http304, "")
else:
await req.respond(Http200, cast[string](image), headers=newHttpHeaders([
("Content-Type", "image/png"), ("ETag", etag)]))
else: await req.respond(Http404, "not found")
else: await req.respond(Http404, "not found") else: await req.respond(Http404, "not found")
else: else:
await req.respond(Http404, "not found") await req.respond(Http405, "GET only")
proc pollTarget(s: string): Future[Response] {.async.} = proc pollTarget(ctx: Ctx, s: string): Future[Response] {.async.} =
var client = newAsyncHttpClient() var client = newAsyncHttpClient()
var x = Response(rtype: rtTimeout, latency: 0) var x = Response(rtype: ResponseType.Timeout, latency: 0)
proc doFetch() {.async.} = proc doFetch() {.async.} =
let ts = now().utc let ts = now().utc
let res = await client.get(s) let res = await client.get(s)
let latency = (now().utc - ts).inMicroseconds let latency = (now().utc - ts).inMicroseconds
if res.code.int == 418: x = Response(rtype: rtHttpTeapot, latency: latency) if res.code.int == 418: x = Response(rtype: ResponseType.HttpTeapot, latency: latency)
elif res.code.is4xx or res.code.is5xx: x = Response(rtype: rtHttpError, latency: latency) elif res.code.is4xx or res.code.is5xx: x = Response(rtype: ResponseType.HttpError, latency: latency)
else: x = Response(rtype: rtOk, latency: latency) else: x = Response(rtype: ResponseType.Ok, latency: latency)
try: try:
discard await withTimeout(doFetch(), 10000) discard await withTimeout(doFetch(), 10000)
except: except:
x = Response(rtype: rtFetchError, latency: 0) x = Response(rtype: ResponseType.FetchError, latency: 0)
client.close() client.close()
return x return x
proc pollTargets() {.async.} = proc pollTargets(ctx: Ctx) {.async.} =
for row in getDB().all("SELECT * FROM sites"): for row in ctx.db.all("SELECT * FROM sites"):
let (id, url) = row.unpack((int64, string)) var (id, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int64, string, int64, int64, int64, Option[Time]))
let res = await pollTarget(url) let res = await ctx.pollTarget(url)
getDB().exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency) let threshold = getTime() + initTimeInterval(weeks= -1)
proc timerCallback(fd: AsyncFD): bool = # drop old data from rolling counters
asyncCheck pollTargets() if rollingDataSince.isSome:
false for row in ctx.db.iterate("SELECT status, latency FROM reqs WHERE timestamp >= ? AND timestamp <= ? AND site = ?", rollingDataSince.get, threshold, id):
let (statusRaw, latency) = row.unpack((int, int))
rollingTotalPings -= 1
rollingLatency -= latency
if statusRaw <= 0:
rollingSuccessfulPings -= 1
# add new data
rollingTotalPings += 1
rollingLatency += res.latency
if int(res.rtype) <= 0:
rollingSuccessfulPings += 1
ctx.db.transaction:
ctx.db.exec("UPDATE sites SET rc_total = ?, rc_success = ?, rc_latency = ?, rc_data_since = ? WHERE sid = ?", rollingTotalPings, rollingSuccessfulPings, rollingLatency, threshold, id)
ctx.db.exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
proc drawLatencyImage(db: DbConn, site: int, interval: int): seq[byte] =
## Renders the request history of `site` as a PNG, one pixel per ping, and
## returns the encoded bytes.
##
## Rows are read newest-first from `reqs`, so pixel 0 is the most recent ping.
## `interval` is compared against time gaps in milliseconds and is used to
## work out how many pings are missing from a gap.
##
## NOTE(review): indentation was lost in this view, so it cannot be told
## whether the second `count += 1` / bounds check sit inside the `else` branch
## or at loop level. Confirm that a bounds check runs after a gap fill before
## the next row writes to `image.data[count]`.
# 720 x 336 pixels; the query below is limited to width * height rows.
const width = 120 * 6
const height = 168 * 2
var image = initImage[ColorRGBU](width, height)
# index of the next pixel to write in image.data
var count = 0
# timestamp of the previously processed (newer) row; starts at "now"
var lastTs = getTime()
for row in db.iterate("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT ?", site, width * height):
let (ts, statusRaw, latency) = row.unpack((Time, int, int))
let timeGap = lastTs - ts
# a gap more than 10 s longer than the polling interval is treated as missing data
if timeGap > initDuration(milliseconds = interval + 10000):
# one purple (0x7E1E9C) pixel per polling interval that fits in the gap
let pixels = timeGap.inMilliseconds div interval
for _ in 1..pixels:
image.data[count] = ColorRGBU([0x7Eu8, 0x1E, 0x9C])
count += 1
if count >= image.data.len: break
else:
let status = ResponseType(statusRaw)
case status
of ResponseType.HttpError:
# orange
image.data[count] = ColorRGBU([255u8, 127, 0])
of ResponseType.Timeout:
# black
image.data[count] = ColorRGBU([0u8, 0, 0])
of ResponseType.FetchError:
# red
image.data[count] = ColorRGBU([255u8, 0, 0])
else:
# remaining statuses: green, dimmer as latency grows. The multiplier is
# 10^1.1 / latency^0.25 clamped to [0.2, 1.0], so it is at full brightness
# for latency values up to about 25119 (10^4.4).
let latencyMultiplier = max(min(pow(10.0, 1.1) / pow(float(latency), 0.25), 1.0), 0.2)
image.data[count] = ColorRGBU([0u8, uint8(latencyMultiplier * 255.0), 0])
count += 1
if count >= image.data.len: break
lastTs = ts
writePNG(image, compression=6)
proc generateImages(args: (string, int)) =
  ## Thread entry point: renders the status image of every site and sends each
  ## `(site id, PNG bytes)` pair over `imageReturnChannel`.
  ##
  ## `args` is `(dbPath, interval)`, packed into one tuple because
  ## `createThread` hands the thread procedure a single argument.
  let (dbPath, interval) = args
  # Opens its own connection from the path rather than reusing the caller's.
  let db = openDatabase(dbPath)
  try:
    db.exec("PRAGMA journal_mode = WAL")
    for row in db.all("SELECT sid FROM sites"):
      let id = row[0].fromDbValue(int)
      imageReturnChannel.send((id, drawLatencyImage(db, id, interval)))
  finally:
    # Release the connection even when a query or a render raises; otherwise
    # every failed run would leave an open handle on the database file.
    close(db)
proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) = proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) =
## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this. ## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this.
let database = openDatabase(dbPath) let database = openDatabase(dbPath)
database.exec("PRAGMA journal_mode = WAL")
migrate(database) migrate(database)
for url in urls: for url in urls:
echo &"Adding {url}" echo &"Adding {url}"
database.exec("INSERT INTO sites (url) VALUES (?)", url) database.exec("INSERT INTO sites (url) VALUES (?)", url)
close(database)
var ctx = Ctx(db: database, dbPath: dbPath, images: newTable[int, (seq[byte], int)](), interval: interval)
echo "Starting up" echo "Starting up"
asyncCheck pollTargets() asyncCheck pollTargets(ctx)
addTimer(interval, false, timerCallback) imageReturnChannel.open()
var thread: Thread[(string, int)]
createThread(thread, generateImages, (dbPath, interval))
echo "Ready"
addTimer(interval, false, proc(fd: AsyncFD): bool =
asyncCheck pollTargets(ctx)
false)
addTimer(interval * 60, false, proc(fd: AsyncFD): bool =
createThread(thread, generateImages, (dbPath, interval))
let fut = sleepAsync(10000)
fut.addCallback(() => readIntoContext(ctx))
asyncCheck fut
false)
var server = newAsyncHttpServer() var server = newAsyncHttpServer()
waitFor server.serve(Port(port), onRequest) waitFor server.serve(Port(port), onRequest(ctx))
dispatch(run, help={ dispatch(run, help={
"dbPath": "path to SQLite3 database for historical data logging", "dbPath": "path to SQLite3 database for historical data logging",
"port": "port to serve HTTP on", "port": "port to serve HTTP on",

View File

@ -1,5 +1,4 @@
body { body {
max-width: 40em;
font-family: sans-serif; font-family: sans-serif;
} }
@ -9,6 +8,7 @@ body {
h1, h2 { h1, h2 {
font-weight: normal; font-weight: normal;
max-width: 100%;
margin: 0; margin: 0;
} }
@ -18,20 +18,29 @@ h1 {
.card { .card {
margin-bottom: 1em; margin-bottom: 1em;
display: flex;
justify-content: space-between;
flex-wrap: wrap;
} }
.card.rtOk h2 { .card.Ok h2 {
color: green; color: green;
} }
.card.rtHttpError h2 { .card.HttpError h2 {
color: orange; color: orange;
} }
.card.rtHttpTeapot h2 { .card.HttpTeapot h2 {
color: blue; color: blue;
} }
.card.rtFetchError h2 { .card.FetchError h2 {
color: red; color: red;
} }
.card.rtTimeout h2 { .card.Timeout h2 {
color: red; color: red;
} }
/* Ask for scaled images to be drawn without smoothing. The same intent is
   declared three ways; a browser ignores any declaration it does not
   recognise, and the last recognised image-rendering value wins. */
img {
image-rendering: pixelated;
-ms-interpolation-mode: nearest-neighbor;
image-rendering: crisp-edges;
}