forked from osmarks/onstat
Big refactors, performance, status images
This commit is contained in:
parent
dfbda82731
commit
65b19dbd96
@ -12,4 +12,5 @@ bin = @["onstat"]
|
||||
requires "nim >= 1.4.2"
|
||||
requires "https://github.com/GULPF/tiny_sqlite#8fe760d9"
|
||||
requires "karax >= 1.2.1"
|
||||
requires "cligen >= 1"
|
||||
requires "cligen >= 1"
|
||||
requires "imageman >= 0.8"
|
37
src/db.nim
37
src/db.nim
@ -3,21 +3,34 @@ import options
|
||||
|
||||
let migrations: seq[string] = @[
|
||||
"""
|
||||
CREATE TABLE sites (
|
||||
sid INTEGER PRIMARY KEY,
|
||||
url TEXT NOT NULL
|
||||
);
|
||||
CREATE TABLE sites (
|
||||
sid INTEGER PRIMARY KEY,
|
||||
url TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE reqs (
|
||||
rid INTEGER PRIMARY KEY,
|
||||
site INTEGER NOT NULL REFERENCES sites(sid),
|
||||
timestamp INTEGER NOT NULL,
|
||||
status INTEGER NOT NULL,
|
||||
latency INTEGER NOT NULL
|
||||
);
|
||||
CREATE TABLE reqs (
|
||||
rid INTEGER PRIMARY KEY,
|
||||
site INTEGER NOT NULL REFERENCES sites(sid),
|
||||
timestamp INTEGER NOT NULL,
|
||||
status INTEGER NOT NULL,
|
||||
latency INTEGER NOT NULL
|
||||
);
|
||||
""",
|
||||
"""
|
||||
CREATE INDEX req_ts_idx ON reqs (timestamp);
|
||||
CREATE INDEX req_ts_idx ON reqs (timestamp);
|
||||
""",
|
||||
# Migration 3: rolling counters stored on each `sites` row.
# rc_total / rc_success / rc_latency hold the total ping count, the count of
# successful pings (status <= 0) and the summed latency for the current window.
# rc_data_since holds the older end of the interval the counters are from.
# Existing data is backfilled using a hardcoded 1 week window; timestamps in
# `reqs` are microseconds since the Unix epoch, hence the `* 1000000`.
# SUM() over zero rows yields NULL, which would violate the NOT NULL
# constraints for any site without pings in that window, so both sums are
# wrapped in COALESCE(..., 0).
"""
ALTER TABLE sites ADD COLUMN rc_total INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_success INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_latency INTEGER NOT NULL DEFAULT 0;
ALTER TABLE sites ADD COLUMN rc_data_since INTEGER;
UPDATE sites SET rc_total = (SELECT COUNT(*) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_success = (SELECT COALESCE(SUM(status <= 0), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_latency = (SELECT COALESCE(SUM(latency), 0) FROM reqs WHERE site = sid AND timestamp >= (strftime('%s') - (86400*7)) * 1000000);
UPDATE sites SET rc_data_since = (strftime('%s') - (86400*7)) * 1000000;
"""
|
||||
]
|
||||
|
||||
|
248
src/onstat.nim
248
src/onstat.nim
@ -10,8 +10,12 @@ import sugar
|
||||
import net
|
||||
import sequtils
|
||||
import strformat
|
||||
import std/exitprocs
|
||||
import strutils
|
||||
import cligen
|
||||
import imageman
|
||||
import math
|
||||
import hashes
|
||||
import tables
|
||||
|
||||
import ./db
|
||||
|
||||
@ -19,19 +23,6 @@ macro includeFile(x: string): string = newStrLitNode(readFile(x.strVal))
|
||||
|
||||
const css = includeFile("./src/style.css")
|
||||
|
||||
var threadDB {.threadvar.}: Option[DbConn]
|
||||
proc getDB(): DbConn {.gcsafe.} =
  ## Returns this thread's SQLite connection, opening it on first use.
  ##
  ## The connection is cached in the `threadDB` thread-local, switched to WAL
  ## journaling, and registered for closing at process exit (and at thread
  ## teardown where `onThreadDestroy` is declared).
  if isNone threadDB:
    let x = openDatabase("./monitoring.sqlite3")
    x.exec("PRAGMA journal_mode=WAL")
    proc closeDB() =
      # Best-effort cleanup: a failed close must not abort shutdown, but only
      # recoverable errors are swallowed so that Defects still surface.
      try: close(x)
      except CatchableError: discard
    addExitProc(closeDB)
    when declared(onThreadDestroy): onThreadDestroy(closeDB)
    threadDB = some x
  get threadDB
|
||||
|
||||
func timeToTimestamp*(t: Time): int64 =
  ## Converts `t` into an integer number of microseconds since the Unix epoch.
  ## Precision finer than a microsecond is truncated.
  let wholeSeconds = t.toUnix
  let microPart = t.nanosecond div 1000
  wholeSeconds * 1000000 + microPart
|
||||
func timestampToTime*(ts: int64): Time =
  ## Converts a number of microseconds since the Unix epoch into a `Time`.
  ##
  ## Floored division is used so that timestamps before the epoch still yield
  ## a non-negative sub-second part: `initTime` only accepts nanoseconds in
  ## 0..999_999_999, and truncating `div`/`mod` would hand it a negative value.
  var seconds = ts div 1000000
  var micros = ts mod 1000000
  if micros < 0:
    micros += 1000000
    seconds -= 1
  initTime(seconds, int(micros * 1000))
|
||||
|
||||
@ -39,37 +30,42 @@ proc toDbValue(t: Time): DbValue = DbValue(kind: sqliteInteger, intVal: timeToTi
|
||||
proc fromDbValue(value: DbValue, T: typedesc[Time]): Time =
  ## Decodes a `Time` from the integer held in `value`, interpreting it as a
  ## microsecond timestamp via `timestampToTime`.
  let micros = value.intVal
  result = timestampToTime(micros)
|
||||
|
||||
type
|
||||
ResponseType = enum
|
||||
rtHttpTeapot = -1
|
||||
rtOk = 0
|
||||
rtHttpError = 1
|
||||
rtTimeout = 2
|
||||
rtFetchError = 3
|
||||
ResponseType {.pure.} = enum
|
||||
HttpTeapot = -1
|
||||
Ok = 0
|
||||
HttpError = 1
|
||||
Timeout = 2
|
||||
FetchError = 3
|
||||
Response = object
|
||||
rtype: ResponseType
|
||||
latency: int64 # microseconds
|
||||
SiteStatus = object
|
||||
id: int
|
||||
url: string
|
||||
lastPing: Time
|
||||
lastResponse: ResponseType
|
||||
lastLatency: float
|
||||
uptimePercent: float
|
||||
averageLatency: float
|
||||
Ctx = object
|
||||
db: DbConn
|
||||
dbPath: string
|
||||
images: TableRef[int, (seq[byte], int)]
|
||||
interval: int
|
||||
|
||||
proc uptimeSince(sid: int, time: Time): float =
  ## Fraction of pings to site `sid` recorded at or after `time` whose status
  ## was <= 0 (the statuses this file counts as "up").
  ##
  ## Returns 0.0 when no pings fall in the window, instead of the NaN that
  ## dividing 0 by 0 would produce.
  let db = getDB()
  let totalPings = fromDbValue(get db.value("SELECT COUNT(*) FROM reqs WHERE site = ? AND timestamp >= ?", sid, time), int)
  if totalPings == 0:
    return 0.0
  let okPings = fromDbValue(get db.value("SELECT COUNT(*) FROM reqs WHERE site = ? AND status <= 0 AND timestamp >= ?", sid, time), int)
  okPings / totalPings
|
||||
|
||||
proc fetchLatest(row: ResultRow): Option[SiteStatus] =
|
||||
proc fetchLatest(ctx: Ctx, row: ResultRow): Option[SiteStatus] =
|
||||
let weekAgo = getTime() + initTimeInterval(weeks= -1)
|
||||
let (site, url) = row.unpack((int, string))
|
||||
let row = getDB().one("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT 1", site)
|
||||
let (site, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int, string, int64, int64, int64, int64))
|
||||
# work around bizarre SQLite query planner issue - it appears that if it has a literal value to compare site against it generates very fast VM code
|
||||
# but if it has a prepared state parameter it somehow refuses to use the index
|
||||
let row = ctx.db.one("SELECT timestamp, status, latency FROM reqs WHERE site = -1 OR site = ? ORDER BY timestamp DESC LIMIT 1", site)
|
||||
if isNone row: return none(SiteStatus)
|
||||
let (ts, status, latency) = (get row).unpack((Time, int, int))
|
||||
some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float64(latency) / 1e3, uptimePercent: uptimeSince(site, weekAgo))
|
||||
some SiteStatus(url: url, lastPing: ts, lastResponse: ResponseType(status), lastLatency: float(latency) / 1e3, id: site,
|
||||
uptimePercent: float(rollingSuccessfulPings) / float(rollingTotalPings), averageLatency: float(rollingLatency) / float(rollingTotalPings) / 1e3)
|
||||
|
||||
proc mainPage(): string =
|
||||
let sites = getDB().all("SELECT * FROM sites ORDER BY sid").map(fetchLatest).filter(x => isSome x).map(x => get x)
|
||||
proc mainPage(ctx: Ctx): string =
|
||||
let sites = ctx.db.all("SELECT * FROM sites ORDER BY sid").map(x => ctx.fetchLatest(x)).filter(x => x.isSome).map(x => x.get)
|
||||
let up = sites.filter(x => int(x.lastResponse) <= 0).len()
|
||||
let vnode = buildHtml(html()):
|
||||
head:
|
||||
@ -83,23 +79,25 @@ proc mainPage(): string =
|
||||
h2(class="title"): text &"{up}/{sites.len} up"
|
||||
for site in sites:
|
||||
tdiv(class="card " & $site.lastResponse):
|
||||
h2:
|
||||
case site.lastResponse
|
||||
of rtOk: text "✓ "
|
||||
of rtHttpError: text "⚠ "
|
||||
of rtTimeout: text "✕ "
|
||||
of rtFetchError: text "✕ "
|
||||
of rtHttpTeapot: text "🫖 "
|
||||
text site.url
|
||||
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
|
||||
tdiv:
|
||||
case site.lastResponse
|
||||
of rtOk: text &"Latency {site.lastLatency}ms"
|
||||
of rtHttpError: text "HTTP error"
|
||||
of rtHttpTeapot: text &"Teapot, latency {site.lastLatency}ms"
|
||||
of rtTimeout: text "Timed out"
|
||||
of rtFetchError: text "Fetch failed"
|
||||
tdiv: text &"{site.uptimePercent * 100}% up in last week"
|
||||
tdiv(class="left"):
|
||||
h2:
|
||||
case site.lastResponse
|
||||
of ResponseType.Ok: text "✓ "
|
||||
of ResponseType.HttpError: text "⚠ "
|
||||
of ResponseType.Timeout: text "✕ "
|
||||
of ResponseType.FetchError: text "✕ "
|
||||
of ResponseType.HttpTeapot: text "🫖 "
|
||||
text site.url
|
||||
tdiv: text("Last pinged " & format(site.lastPing, "HH:mm:ss dd-MM-yyyy"))
|
||||
tdiv:
|
||||
case site.lastResponse
|
||||
of ResponseType.Ok: text &"Latency {site.lastLatency}ms"
|
||||
of ResponseType.HttpError: text "HTTP error"
|
||||
of ResponseType.HttpTeapot: text &"Teapot, latency {site.lastLatency:.5f}ms"
|
||||
of ResponseType.Timeout: text "Timed out"
|
||||
of ResponseType.FetchError: text "Fetch failed"
|
||||
tdiv: text &"{site.uptimePercent * 100:.5f}% up, {site.averageLatency:.5f}ms latency in last week"
|
||||
# The strformat `&` prefix has to precede the opening quote; placed inside the literal, the title was emitted verbatim instead of interpolating the URL.
if site.id in ctx.images: img(src= &"/vis/{site.id}", class="right", title= &"{site.url} 12-week status visualization")
|
||||
hr()
|
||||
small:
|
||||
text "made by "
|
||||
@ -109,54 +107,158 @@ proc mainPage(): string =
|
||||
text "."
|
||||
$vnode
|
||||
|
||||
proc onRequest(req: Request) {.async.} =
  ## Serves the HTML status page for `GET /`; every other method or path is
  ## answered with 404 "not found".
  let wantsMainPage = req.reqMethod == HttpGet and req.url.path == "/"
  if not wantsMainPage:
    await req.respond(Http404, "not found")
    return
  let page = mainPage()
  let htmlHeaders = newHttpHeaders([("Content-Type", "text/html")])
  await req.respond(Http200, page, headers=htmlHeaders)
|
||||
var imageReturnChannel: Channel[(int, seq[byte])]
|
||||
|
||||
proc pollTarget(s: string): Future[Response] {.async.} =
|
||||
proc readIntoContext(ctx: Ctx) =
  ## Drains `imageReturnChannel` without blocking and stores every received
  ## image in `ctx.images`, keyed by site id and paired with a hash of its bytes.
  # Images arrive through a channel as a workaround for not having a shared
  # hash table between threads.
  while true:
    let received = imageReturnChannel.tryRecv()
    if not received.dataAvailable:
      break
    let (siteId, image) = received.msg
    ctx.images[siteId] = (image, image.hash)
|
||||
|
||||
proc onRequest(ctx: Ctx): (proc(req: Request): Future[void] {.gcsafe.}) =
  ## Builds the HTTP request handler bound to `ctx`.
  ##
  ## The handler first moves any newly rendered images into `ctx`, then:
  ## * `GET /` serves the HTML status page;
  ## * `GET /vis/<id>` serves the cached image for site `<id>` as PNG with an
  ##   ETag, or 304 when `If-None-Match` equals that ETag; ids that are not
  ##   integers or have no cached image get 404;
  ## * any other path gets 404, and any other method gets 405.
  result = proc(req: Request) {.async.} =
    readIntoContext(ctx)
    if req.reqMethod == HttpGet:
      var path = req.url.path
      if path == "/":
        await req.respond(Http200, mainPage(ctx), headers=newHttpHeaders([("Content-Type", "text/html")]))
      elif path.startsWith("/vis/"):
        path.removePrefix("/vis/")
        var id = 0
        try:
          id = parseInt path
        except ValueError:
          # Only a malformed id means "not found"; a bare `except` would also
          # have swallowed Defects raised for unrelated reasons.
          await req.respond(Http404, "not found")
          return
        if id in ctx.images:
          let (image, hash) = ctx.images[id]
          let etag = &"\"{hash}\""
          if etag == req.headers.getOrDefault("if-none-match"):
            # A 304 repeats the validator that the 200 response would carry.
            await req.respond(Http304, "", headers=newHttpHeaders([("ETag", etag)]))
          else:
            # The image bytes are reinterpreted as a string to form the body.
            await req.respond(Http200, cast[string](image), headers=newHttpHeaders([
              ("Content-Type", "image/png"), ("ETag", etag)]))
        else: await req.respond(Http404, "not found")
      else: await req.respond(Http404, "not found")
    else:
      await req.respond(Http405, "GET only")
|
||||
|
||||
proc pollTarget(ctx: Ctx, s: string): Future[Response] {.async.} =
|
||||
var client = newAsyncHttpClient()
|
||||
var x = Response(rtype: rtTimeout, latency: 0)
|
||||
var x = Response(rtype: ResponseType.Timeout, latency: 0)
|
||||
proc doFetch() {.async.} =
|
||||
let ts = now().utc
|
||||
let res = await client.get(s)
|
||||
let latency = (now().utc - ts).inMicroseconds
|
||||
if res.code.int == 418: x = Response(rtype: rtHttpTeapot, latency: latency)
|
||||
elif res.code.is4xx or res.code.is5xx: x = Response(rtype: rtHttpError, latency: latency)
|
||||
else: x = Response(rtype: rtOk, latency: latency)
|
||||
if res.code.int == 418: x = Response(rtype: ResponseType.HttpTeapot, latency: latency)
|
||||
elif res.code.is4xx or res.code.is5xx: x = Response(rtype: ResponseType.HttpError, latency: latency)
|
||||
else: x = Response(rtype: ResponseType.Ok, latency: latency)
|
||||
try:
|
||||
discard await withTimeout(doFetch(), 10000)
|
||||
except:
|
||||
x = Response(rtype: rtFetchError, latency: 0)
|
||||
x = Response(rtype: ResponseType.FetchError, latency: 0)
|
||||
client.close()
|
||||
return x
|
||||
|
||||
proc pollTargets() {.async.} =
  ## Polls every row of `sites` one after another and appends each outcome
  ## (site id, current time, status code, latency) to `reqs`.
  let sites = getDB().all("SELECT * FROM sites")
  for site in sites:
    let (siteId, target) = site.unpack((int64, string))
    let outcome = await pollTarget(target)
    getDB().exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)",
      siteId, getTime(), int(outcome.rtype), outcome.latency)
|
||||
proc pollTargets(ctx: Ctx) {.async.} =
|
||||
for row in ctx.db.all("SELECT * FROM sites"):
|
||||
var (id, url, rollingTotalPings, rollingSuccessfulPings, rollingLatency, rollingDataSince) = row.unpack((int64, string, int64, int64, int64, Option[Time]))
|
||||
let res = await ctx.pollTarget(url)
|
||||
let threshold = getTime() + initTimeInterval(weeks= -1)
|
||||
|
||||
proc timerCallback(fd: AsyncFD): bool =
|
||||
asyncCheck pollTargets()
|
||||
false
|
||||
# drop old data from rolling counters
|
||||
if rollingDataSince.isSome:
|
||||
for row in ctx.db.iterate("SELECT status, latency FROM reqs WHERE timestamp >= ? AND timestamp <= ? AND site = ?", rollingDataSince.get, threshold, id):
|
||||
let (statusRaw, latency) = row.unpack((int, int))
|
||||
rollingTotalPings -= 1
|
||||
rollingLatency -= latency
|
||||
if statusRaw <= 0:
|
||||
rollingSuccessfulPings -= 1
|
||||
|
||||
# add new data
|
||||
rollingTotalPings += 1
|
||||
rollingLatency += res.latency
|
||||
if int(res.rtype) <= 0:
|
||||
rollingSuccessfulPings += 1
|
||||
|
||||
ctx.db.transaction:
|
||||
ctx.db.exec("UPDATE sites SET rc_total = ?, rc_success = ?, rc_latency = ?, rc_data_since = ? WHERE sid = ?", rollingTotalPings, rollingSuccessfulPings, rollingLatency, threshold, id)
|
||||
ctx.db.exec("INSERT INTO reqs (site, timestamp, status, latency) VALUES (?, ?, ?, ?)", id, getTime(), int(res.rtype), res.latency)
|
||||
|
||||
proc drawLatencyImage(db: DbConn, site: int, interval: int): seq[byte] =
## Renders the most recent pings of `site` into a 720x336 image, one pixel
## per ping walking from newest to oldest, and returns the `writePNG` output.
## `interval` is compared against gaps measured in milliseconds.
## NOTE(review): indentation is not visible in this view, so the exact nesting
## of the trailing `count += 1` / bounds check below cannot be confirmed here.
|
||||
const width = 120 * 6
|
||||
const height = 168 * 2
|
||||
var image = initImage[ColorRGBU](width, height)
|
||||
# index of the next pixel to write in image.data
var count = 0
|
||||
# timestamp of the previously visited (newer) row; starts at the current time
var lastTs = getTime()
|
||||
# newest rows first, fetching at most one row per pixel of the image
for row in db.iterate("SELECT timestamp, status, latency FROM reqs WHERE site = ? ORDER BY timestamp DESC LIMIT ?", site, width * height):
|
||||
let (ts, statusRaw, latency) = row.unpack((Time, int, int))
|
||||
let timeGap = lastTs - ts
|
||||
# a gap longer than one interval plus 10 seconds is treated as missing data
if timeGap > initDuration(milliseconds = interval + 10000):
|
||||
# number of whole intervals covered by the gap
let pixels = timeGap.inMilliseconds div interval
|
||||
for _ in 1..pixels:
|
||||
# missing-data pixels are drawn in purple (0x7E1E9C)
image.data[count] = ColorRGBU([0x7Eu8, 0x1E, 0x9C])
|
||||
count += 1
|
||||
# stop filling once the pixel buffer is full
if count >= image.data.len: break
|
||||
else:
|
||||
let status = ResponseType(statusRaw)
|
||||
case status
|
||||
of ResponseType.HttpError:
|
||||
# HTTP error: orange
image.data[count] = ColorRGBU([255u8, 127, 0])
|
||||
of ResponseType.Timeout:
|
||||
# timeout: black
image.data[count] = ColorRGBU([0u8, 0, 0])
|
||||
of ResponseType.FetchError:
|
||||
# fetch failure: red
image.data[count] = ColorRGBU([255u8, 0, 0])
|
||||
else:
|
||||
# remaining statuses: green whose brightness falls with latency,
# 10^1.1 / latency^0.25 clamped to the range 0.2..1.0
let latencyMultiplier = max(min(pow(10.0, 1.1) / pow(float(latency), 0.25), 1.0), 0.2)
|
||||
image.data[count] = ColorRGBU([0u8, uint8(latencyMultiplier * 255.0), 0])
|
||||
|
||||
count += 1
|
||||
if count >= image.data.len: break
|
||||
lastTs = ts
|
||||
# encode the finished image as PNG at compression level 6
writePNG(image, compression=6)
|
||||
|
||||
proc generateImages(args: (string, int)) =
  ## Renders the status image of every site and sends each
  ## `(site id, image bytes)` pair over `imageReturnChannel`.
  ##
  ## `args` is `(dbPath, interval)`. The proc opens its own connection to the
  ## database at `dbPath` in WAL mode and passes `interval` on to
  ## `drawLatencyImage`.
  let (dbPath, interval) = args
  let db = openDatabase(dbPath)
  # Close the connection even when a query or the rendering raises, so that a
  # failed run does not leak the database handle.
  defer: close(db)
  db.exec("PRAGMA journal_mode = WAL")
  for row in db.all("SELECT sid FROM sites"):
    let id = row[0].fromDbValue(int)
    imageReturnChannel.send((id, drawLatencyImage(db, id, interval)))
|
||||
|
||||
proc run(dbPath="./monitoring.sqlite3", port=7800, interval=30000, urls: seq[string]) =
|
||||
## Run onstat. Note that the URLs you configure will be persisted in the monitoring database. To remove them, you must manually update this.
|
||||
let database = openDatabase(dbPath)
|
||||
database.exec("PRAGMA journal_mode = WAL")
|
||||
migrate(database)
|
||||
for url in urls:
|
||||
echo &"Adding {url}"
|
||||
database.exec("INSERT INTO sites (url) VALUES (?)", url)
|
||||
close(database)
|
||||
|
||||
var ctx = Ctx(db: database, dbPath: dbPath, images: newTable[int, (seq[byte], int)](), interval: interval)
|
||||
|
||||
echo "Starting up"
|
||||
asyncCheck pollTargets()
|
||||
addTimer(interval, false, timerCallback)
|
||||
asyncCheck pollTargets(ctx)
|
||||
imageReturnChannel.open()
|
||||
var thread: Thread[(string, int)]
|
||||
createThread(thread, generateImages, (dbPath, interval))
|
||||
echo "Ready"
|
||||
addTimer(interval, false, proc(fd: AsyncFD): bool =
|
||||
asyncCheck pollTargets(ctx)
|
||||
false)
|
||||
addTimer(interval * 60, false, proc(fd: AsyncFD): bool =
|
||||
createThread(thread, generateImages, (dbPath, interval))
|
||||
let fut = sleepAsync(10000)
|
||||
fut.addCallback(() => readIntoContext(ctx))
|
||||
asyncCheck fut
|
||||
false)
|
||||
var server = newAsyncHttpServer()
|
||||
waitFor server.serve(Port(port), onRequest)
|
||||
waitFor server.serve(Port(port), onRequest(ctx))
|
||||
dispatch(run, help={
|
||||
"dbPath": "path to SQLite3 database for historical data logging",
|
||||
"port": "port to serve HTTP on",
|
||||
|
@ -1,5 +1,4 @@
|
||||
body {
|
||||
max-width: 40em;
|
||||
font-family: sans-serif;
|
||||
}
|
||||
|
||||
@ -9,6 +8,7 @@ body {
|
||||
|
||||
h1, h2 {
|
||||
font-weight: normal;
|
||||
max-width: 100%;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
@ -18,20 +18,29 @@ h1 {
|
||||
|
||||
.card {
|
||||
margin-bottom: 1em;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.card.rtOk h2 {
|
||||
.card.Ok h2 {
|
||||
color: green;
|
||||
}
|
||||
.card.rtHttpError h2 {
|
||||
.card.HttpError h2 {
|
||||
color: orange;
|
||||
}
|
||||
.card.rtHttpTeapot h2 {
|
||||
.card.HttpTeapot h2 {
|
||||
color: blue;
|
||||
}
|
||||
.card.rtFetchError h2 {
|
||||
.card.FetchError h2 {
|
||||
color: red;
|
||||
}
|
||||
.card.rtTimeout h2 {
|
||||
.card.Timeout h2 {
|
||||
color: red;
|
||||
}
|
||||
|
||||
/* Ask for scaled images to stay blocky rather than smoothed; the same
   intent is declared in several spellings, presumably for browser
   compatibility. */
img {
|
||||
image-rendering: pixelated;
|
||||
-ms-interpolation-mode: nearest-neighbor;
|
||||
image-rendering: crisp-edges;
|
||||
}
|
Loading…
Reference in New Issue
Block a user