Big refactors, performance, status images

This commit is contained in:
osmarks 2021-04-25 22:10:52 +01:00
parent dfbda82731
commit 65b19dbd96
5 changed files with 218 additions and 93 deletions

View File

@ -1 +1 @@
-d:ssl
-d:ssl --threads:on

View File

@ -12,4 +12,5 @@ bin = @["onstat"]
requires "nim >= 1.4.2"
requires "https://github.com/GULPF/tiny_sqlite#8fe760d9"
requires "karax >= 1.2.1"
requires "cligen >= 1"
requires "cligen >= 1"
requires "imageman >= 0.8"

View File

@ -3,21 +3,34 @@ import options
let migrations: seq[string] = @[
  # Base schema: the monitored sites, and one row per request made to a site.
  """
  CREATE TABLE sites (
      sid INTEGER PRIMARY KEY,
      url TEXT NOT NULL
  );

  CREATE TABLE reqs (
      rid INTEGER PRIMARY KEY,
      site INTEGER NOT NULL REFERENCES sites(sid),
      timestamp INTEGER NOT NULL,
      status INTEGER NOT NULL,
      latency INTEGER NOT NULL
  );
  """,
  # Index requests by timestamp.
  """
  CREATE INDEX req_ts_idx ON reqs (timestamp);
  """,
  # Rolling total/successful ping counts and summed latency per site.
  # rc_data_since holds the older end of the interval the counters cover.
  # Existing data is migrated using a hardcoded one-week window;
  # strftime('%s') yields seconds, so the window start is scaled by 1000000
  # before being compared with reqs.timestamp.
  # SUM() over zero rows is NULL, which the NOT NULL counter columns reject,
  # so the sums are wrapped in COALESCE(..., 0) for sites with no pings in
  # the window. COUNT(*) never returns NULL and needs no wrapping.
  """
  ALTER TABLE sites ADD COLUMN rc_total INTEGER NOT NULL DEFAULT 0;
  ALTER TABLE sites ADD COLUMN rc_success INTEGER NOT NULL DEFAULT 0;
  ALTER TABLE sites ADD COLUMN rc_latency INTEGER NOT NULL DEFAULT 0;
  ALTER TABLE sites ADD COLUMN rc_data_since INTEGER;
  UPDATE sites SET rc_total = (SELECT COUNT(*) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
  UPDATE sites SET rc_success = (SELECT COALESCE(SUM(status <= 0), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
  UPDATE sites SET rc_latency = (SELECT COALESCE(SUM(latency), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
  UPDATE sites SET rc_data_since = (strftime('%s') - (86400*7)) * 1000000;
  """
]

View File

@ -10,8 +10,12 @@ import sugar
import net
import sequtils
import strformat
import std/exitprocs
import strutils
import cligen
import imageman
import math
import hashes
import tables
import ./db
@ -19,19 +23,6 @@ macro includeFile(x: string): string = newStrLitNode(readFile(x.strVal))
const css = includeFile("./src/style.css")
var threadDB {.threadvar.}: Option[DbConn]

proc getDB(): DbConn {.gcsafe.} =
  ## Returns this thread's database connection, opening it on first use.
  ##
  ## The connection is cached in the `threadDB` thread-local, switched to WAL
  ## journaling, and registered to be closed at process exit (and on thread
  ## destruction where `onThreadDestroy` is available).
  if isNone threadDB:
    let x = openDatabase("./monitoring.sqlite3")
    x.exec("PRAGMA journal_mode=WAL")
    proc closeDB() =
      # Best-effort close: a failure while shutting down is ignored, but
      # only catchable errors are swallowed so programming bugs still surface.
      try: close(x)
      except CatchableError: discard
    addExitProc(closeDB)
    when declared(onThreadDestroy): onThreadDestroy(closeDB)
    threadDB = some x
  get threadDB
func timeToTimestamp*(t: Time): int64 =
  ## Converts `t` to microseconds since the Unix epoch.
  ## Precision below one microsecond is discarded.
  let wholeSeconds = t.toUnix
  let microPart = t.nanosecond div 1000
  wholeSeconds * 1000000 + microPart
func timestampToTime*(ts: int64): Time =
  ## Converts a microsecond Unix timestamp (as produced by `timeToTimestamp`)
  ## back into a `Time`.
  # floorDiv/floorMod keep the sub-second part within 0..999_999 for negative
  # (pre-1970) timestamps. Plain div/mod truncate towards zero and would hand
  # initTime a negative nanosecond value, which it rejects.
  let seconds = floorDiv(ts, 1_000_000'i64)
  let micros = floorMod(ts, 1_000_000'i64)
  initTime(seconds, NanosecondRange(micros * 1000))
@ -39,37 +30,42 @@ proc toDbValue(t: Time): DbValue = DbValue(kind: sqliteInteger, intVal: timeToTi
proc fromDbValue(value: DbValue, T: typedesc[Time]): Time =
  ## Decodes a database value holding an integer microsecond timestamp
  ## into a `Time`.
  let micros = value.intVal
  timestampToTime(micros)
type
  # Outcome of a single poll. The ordinal value is what gets written to
  # reqs.status; values <= 0 are the ones counted as "up".
  ResponseType {.pure.} = enum
    HttpTeapot = -1
    Ok = 0
    HttpError = 1
    Timeout = 2
    FetchError = 3

  # Result of polling one URL.
  Response = object
    rtype: ResponseType
    latency: int64 # microseconds

  # Everything the status page shows for one site.
  SiteStatus = object
    id: int                     # sites.sid
    url: string
    lastPing: Time              # timestamp of the most recent request row
    lastResponse: ResponseType  # status of the most recent request row
    lastLatency: float          # milliseconds
    uptimePercent: float        # fraction in 0..1 (multiplied by 100 for display)
    averageLatency: float       # milliseconds, from the rolling counters

  # State shared by the HTTP handlers and the poller.
  Ctx = object
    db: DbConn
    dbPath: string
    images: TableRef[int, (seq[byte], int)]  # site id -> (image bytes, hash of those bytes)
    interval: int                            # polling interval in milliseconds
proc uptimeSince(sid: int, time: Time): float =
  ## Fraction of the pings made to site `sid` at or after `time` whose
  ## stored status is <= 0.
  let db = getDB()
  let succeeded = fromDbValue(get db.value("SELECT COUNT(*) FROM reqs WHERE site = ? AND status <= 0 AND timestamp >= ?", sid, time), int)
  let attempted = fromDbValue(get db.value("SELECT COUNT(*) FROM reqs WHERE site = ? AND timestamp >= ?", sid, time), int)
  succeeded / attempted
proc fetchLatest(ctx: Ctx, row: ResultRow): Option[SiteStatus] =
  ## Builds the status shown for one `sites` row.
  ##
  ## `row` must hold the columns sid, url, rc_total, rc_success, rc_latency
  ## and rc_data_since, in that order. Returns `none` when no request has
  ## been recorded for the site yet.
  # rc_data_since stays NULL until the site's first poll has been written, so
  # it is unpacked as an Option; its value is not needed here.
  let (site, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, _) =
    row.unpack((int, string, int64, int64, int64, Option[int64]))
  # work around bizarre SQLite query planner issue - it appears that if it has a literal value to compare site against it generates very fast VM code
  # but if it has a prepared state parameter it somehow refuses to use the index
  let latest = ctx.db.one("SELECT timestamp, status, latency FROM reqs WHERE site = -1 OR site = ? ORDER BY timestamp DESC LIMIT 1", site)
  if latest.isNone: return none(SiteStatus)
  let (ts, status, latency) = latest.get.unpack((Time, int, int))
  # With an empty rolling window there is nothing to average; report 0
  # instead of letting 0/0 put NaN on the page.
  let total = float(rollingTotalPings)
  let uptime = if rollingTotalPings > 0: float(rollingSuccessfulPings) / total else: 0.0
  let meanLatency = if rollingTotalPings > 0: float(rollingLatency) / total / 1e3 else: 0.0
  some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float(latency) / 1e3, id: site,
    uptimePercent: uptime, averageLatency: meanLatency)
proc mainPage(): string =
let sites = getDB().all("SELECT * FROM sites ORDER BY sid").map(fetchLatest).filter(x => isSome x).map(x => get x)
proc mainPage(ctx: Ctx): string =
let sites = ctx.db.all("SELECT * FROM sites ORDER BY sid").map(x => ctx.fetchLatest(x)).filter(x => x.isSome).map(x => x.get)
let up = sites.filter(x => int(x.lastResponse) <= 0).len()
let vnode = buildHtml(html()):
head:
@ -83,23 +79,25 @@ proc mainPage(): string =
h2(class="title"): text &"{up}/{sites.len} up"
for site in sites:
tdiv(class="card " & $site.lastResponse):
h2:
case site.lastResponse
of rtOk: text ""
of rtHttpError: text ""
of rtTimeout: text ""
of rtFetchError: text ""
of rtHttpTeapot: text "🫖 "
text site.url
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
tdiv:
case site.lastResponse
of rtOk: text &"Latency {site.lastLatency}ms"
of rtHttpError: text "HTTP error"
of rtHttpTeapot: text &"Teapot, latency {site.lastLatency}ms"
of rtTimeout: text "Timed out"
of rtFetchError: text "Fetch failed"
tdiv: text &"{site.uptimePercent * 100}% up in last week"
tdiv(class="left"):
h2:
case site.lastResponse
of ResponseType.Ok: text ""
of ResponseType.HttpError: text ""
of ResponseType.Timeout: text ""
of ResponseType.FetchError: text ""
of ResponseType.HttpTeapot: text "🫖 "
text site.url
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
tdiv:
case site.lastResponse
of ResponseType.Ok: text &"Latency {site.lastLatency}ms"
of ResponseType.HttpError: text "HTTP error"
of ResponseType.HttpTeapot: text &"Teapot, latency {site.lastLatency:.5f}ms"
of ResponseType.Timeout: text "Timed out"
of ResponseType.FetchError: text "Fetch failed"
tdiv: text &"{site.uptimePercent * 100:.5f}% up, {site.averageLatency:.5f}ms latency in last week"
if site.id in ctx.images: img(src= &"/vis/{site.id}", class="right", title= "&{site.url} 12-week status visualization")
hr()
small:
text "made by "
@ -109,54 +107,158 @@ proc mainPage(): string =
text "."
$vnode
proc onRequest(req: Request) {.async.} =
  ## Serves the status page for `GET /`; every other request gets a 404.
  if req.reqMethod != HttpGet:
    await req.respond(Http404, "not found")
    return
  if req.url.path == "/":
    await req.respond(Http200, mainPage(), headers=newHttpHeaders([("Content-Type", "text/html")]))
  else:
    await req.respond(Http404, "not found")
var imageReturnChannel: Channel[(int, seq[byte])]
proc pollTarget(s: string): Future[Response] {.async.} =
proc readIntoContext(ctx: Ctx) =
  ## Moves every image currently queued on `imageReturnChannel` into
  ## `ctx.images`, stored together with a hash of its bytes.
  # Workaround: images arrive over a channel so that no hash table has to be
  # shared with the thread that renders them.
  while true:
    let (gotOne, payload) = imageReturnChannel.tryRecv()
    if not gotOne: break
    let (siteId, png) = payload
    ctx.images[siteId] = (png, png.hash)
proc onRequest(ctx: Ctx): (proc(req: Request): Future[void] {.gcsafe.}) =
  ## Builds the HTTP request handler bound to `ctx`.
  ##
  ## Routes (GET only; any other method gets 405):
  ## * `/` - the HTML status page.
  ## * `/vis/<id>` - the stored image for site `<id>`, served with an ETag
  ##   derived from the stored hash; a matching `If-None-Match` gets 304.
  ## * anything else - 404.
  result = proc(req: Request) {.async.} =
    # Pick up any images the generator thread has finished since last time.
    readIntoContext(ctx)
    if req.reqMethod == HttpGet:
      var path = req.url.path
      if path == "/":
        await req.respond(Http200, mainPage(ctx), headers=newHttpHeaders([("Content-Type", "text/html")]))
      elif path.startsWith("/vis/"):
        path.removePrefix("/vis/")
        # Only a malformed id is treated as "not found"; other exceptions
        # are no longer swallowed, and no await happens inside the handler.
        var id = none(int)
        try:
          id = some(parseInt(path))
        except ValueError:
          discard
        if id.isSome and id.get in ctx.images:
          let (image, hash) = ctx.images[id.get]
          let etag = &"\"{hash}\""
          if etag == req.headers.getOrDefault("if-none-match"):
            await req.respond(Http304, "")
          else:
            # cast reinterprets the byte seq as the response body string
            await req.respond(Http200, cast[string](image), headers=newHttpHeaders([
              ("Content-Type", "image/png"), ("ETag", etag)]))
        else: await req.respond(Http404, "not found")
      else: await req.respond(Http404, "not found")
    else:
      await req.respond(Http405, "GET only")
proc pollTarget(ctx: Ctx, s: string): Future[Response] {.async.} =
  ## Fetches URL `s` once and classifies the outcome.
  ##
  ## * status 418 -> `HttpTeapot`; other 4xx/5xx -> `HttpError`; anything
  ##   else -> `Ok`. These carry the measured latency in microseconds.
  ## * no response within 10 seconds -> `Timeout` (latency 0).
  ## * the fetch raised -> `FetchError` (latency 0).
  let client = newAsyncHttpClient()
  # Stays Timeout unless doFetch completes and overwrites it.
  var x = Response(rtype: ResponseType.Timeout, latency: 0)
  proc doFetch() {.async.} =
    let ts = now().utc
    let res = await client.get(s)
    let latency = (now().utc - ts).inMicroseconds
    if res.code.int == 418: x = Response(rtype: ResponseType.HttpTeapot, latency: latency)
    elif res.code.is4xx or res.code.is5xx: x = Response(rtype: ResponseType.HttpError, latency: latency)
    else: x = Response(rtype: ResponseType.Ok, latency: latency)
  try:
    discard await withTimeout(doFetch(), 10000)
  except CatchableError:
    # Any recoverable failure counts as a failed fetch. Defects
    # (programming errors) are deliberately not swallowed.
    x = Response(rtype: ResponseType.FetchError, latency: 0)
  client.close()
  return x
proc pollTargets(ctx: Ctx) {.async.} =
  ## Polls every row of `sites` once, sequentially. For each site it updates
  ## the rolling one-week counters (rc_total, rc_success, rc_latency,
  ## rc_data_since) and appends the new result to `reqs`.
  # NOTE(review): the counters are read here, before the awaits below. If two
  # runs ever overlap, the later write would overwrite the earlier one -
  # confirm that runs cannot overlap.
  for row in ctx.db.all("SELECT * FROM sites"):
    var (id, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int64, string, int64, int64, int64, Option[Time]))
    let res = await ctx.pollTarget(url)
    let threshold = getTime() + initTimeInterval(weeks= -1)

    # drop old data from rolling counters
    # The window being removed is [rc_data_since, threshold). The upper bound
    # is exclusive because rc_data_since becomes `threshold` below: a row
    # stamped exactly `threshold` would otherwise be subtracted now and again
    # on the next run.
    if rollingDataSince.isSome:
      for expired in ctx.db.iterate("SELECT status, latency FROM reqs WHERE timestamp >= ? AND timestamp < ? AND site = ?", rollingDataSince.get, threshold, id):
        let (statusRaw, latency) = expired.unpack((int, int))
        rollingTotalPings -= 1
        rollingLatency -= latency
        if statusRaw <= 0:
          rollingSuccessfulPings -= 1

    # add new data
    rollingTotalPings += 1
    rollingLatency += res.latency
    if int(res.rtype) <= 0:
      rollingSuccessfulPings += 1

    # Counter update and new request row are written in one transaction.
    ctx.db.transaction:
      ctx.db.exec("UPDATE sites SET rc_total = ?, rc_success = ?, rc_latency = ?, rc_data_since = ? WHERE sid = ?", rollingTotalPings, rollingSuccessfulPings, rollingLatency, threshold, id)
      ctx.db.exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
proc drawLatencyImage(db: DbConn, site: int, interval: int): seq[byte] =
  ## Renders the most recent requests of `site` as a PNG, one pixel per
  ## request, newest first, and returns the encoded bytes.
  ##
  ## Colours: HTTP error orange, timeout black, fetch error red, any other
  ## status green (brightness falls as latency grows, clamped to 0.2..1.0).
  ## When two consecutive rows are more than `interval` + 10000 ms apart, the
  ## gap is filled with purple pixels, one per `interval` ms.
  const width = 120 * 6
  const height = 168 * 2
  var image = initImage[ColorRGBU](width, height)
  var count = 0
  var lastTs = getTime()
  # The named block makes a full buffer end the whole scan, including when
  # it fills up in the middle of a gap, so `count` is never used as an index
  # once it reaches image.data.len.
  block fill:
    for row in db.iterate("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT ?", site, width * height):
      let (ts, statusRaw, latency) = row.unpack((Time, int, int))
      let timeGap = lastTs - ts
      if timeGap > initDuration(milliseconds = interval + 10000):
        let pixels = timeGap.inMilliseconds div interval
        for _ in 1..pixels:
          image.data[count] = ColorRGBU([0x7Eu8, 0x1E, 0x9C])
          count += 1
          if count >= image.data.len: break fill
      else:
        let status = ResponseType(statusRaw)
        case status
        of ResponseType.HttpError:
          image.data[count] = ColorRGBU([255u8, 127, 0])
        of ResponseType.Timeout:
          image.data[count] = ColorRGBU([0u8, 0, 0])
        of ResponseType.FetchError:
          image.data[count] = ColorRGBU([255u8, 0, 0])
        else:
          let latencyMultiplier = max(min(pow(10.0, 1.1) / pow(float(latency), 0.25), 1.0), 0.2)
          image.data[count] = ColorRGBU([0u8, uint8(latencyMultiplier * 255.0), 0])
        count += 1
        if count >= image.data.len: break fill
      lastTs = ts
  writePNG(image, compression=6)
proc generateImages(args: (string, int)) =
  ## Thread entry point. `args` is (database path, polling interval).
  ## Opens its own connection, renders the image of every site and sends
  ## each (site id, image bytes) pair over `imageReturnChannel`.
  let conn = openDatabase(args[0])
  conn.exec("PRAGMA journal_mode = WAL")
  for siteRow in conn.all("SELECT sid FROM sites"):
    let siteId = fromDbValue(siteRow[0], int)
    let png = drawLatencyImage(conn, siteId, args[1])
    imageReturnChannel.send((siteId, png))
  conn.close()
proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) =
  ## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this.
  let database = openDatabase(dbPath)
  database.exec("PRAGMA journal_mode = WAL")
  migrate(database)
  for url in urls:
    echo &"Adding {url}"
    database.exec("INSERT INTO sites (url) VALUES (?)", url)

  # The connection is handed to the context and used by the poller and the
  # HTTP handlers, so it must stay open for the lifetime of the process.
  let ctx = Ctx(db: database, dbPath: dbPath, images: newTable[int, (seq[byte], int)](), interval: interval)

  echo "Starting up"
  asyncCheck pollTargets(ctx)
  # Images are rendered on a separate thread and handed back via the channel.
  imageReturnChannel.open()
  var thread: Thread[(string, int)]
  createThread(thread, generateImages, (dbPath, interval))
  echo "Ready"
  # Poll all sites every `interval` ms.
  addTimer(interval, false, proc(fd: AsyncFD): bool =
    asyncCheck pollTargets(ctx)
    false)
  # Regenerate the images every 60 polling intervals.
  addTimer(interval * 60, false, proc(fd: AsyncFD): bool =
    # Skip this round while the previous render is still running; otherwise
    # join the finished thread before the Thread object is reused.
    if not thread.running:
      joinThread(thread)
      createThread(thread, generateImages, (dbPath, interval))
      let fut = sleepAsync(10000)
      fut.addCallback(() => readIntoContext(ctx))
      asyncCheck fut
    false)
  var server = newAsyncHttpServer()
  waitFor server.serve(Port(port), onRequest(ctx))
dispatch(run, help={
"dbPath": "path to SQLite3 database for historical data logging",
"port": "port to serve HTTP on",

View File

@ -1,5 +1,4 @@
/* Page-wide defaults. */
body {
max-width: 40em;
font-family: sans-serif;
}
@ -9,6 +8,7 @@ body {
/* Headings: normal weight, no margin, never wider than their container. */
h1, h2 {
font-weight: normal;
max-width: 100%;
margin: 0;
}
@ -18,20 +18,29 @@ h1 {
/* One card per site: a flex row whose children are pushed to opposite
   ends and allowed to wrap onto a new line. */
.card {
margin-bottom: 1em;
display: flex;
justify-content: space-between;
flex-wrap: wrap;
}
/* Heading colour by last response. The second class on each card is the
   ResponseType member name emitted by the page (Ok, HttpError, ...), so the
   selectors use the unprefixed names. */
.card.Ok h2 {
    color: green;
}

.card.HttpError h2 {
    color: orange;
}

.card.HttpTeapot h2 {
    color: blue;
}

.card.FetchError h2 {
    color: red;
}

.card.Timeout h2 {
    color: red;
}
/* Ask for nearest-neighbour style scaling of images. image-rendering is
   declared twice with different keywords, plus an -ms- prefixed property,
   presumably so that browsers supporting only one form still get it. */
img {
image-rendering: pixelated;
-ms-interpolation-mode: nearest-neighbor;
image-rendering: crisp-edges;
}