base benchmark

This commit is contained in:
2025-09-04 23:00:01 -06:00
parent 1114c02b7c
commit bcb31d84e6
10 changed files with 375 additions and 14 deletions

View File

@@ -1 +1,59 @@
local fs = require("@lune/fs")
local serde = require("@lune/serde")
local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")
local visualizer = require("./src/visualizer")
-- Adds up the lengths of all values stored in `dataset` and returns the sum.
-- Every value is measured with the `#` operator, so each one is expected to
-- be an array-like table (or a string); keys are ignored.
local function countQuestions(dataset): number
    local total = 0
    for _, group in dataset do
        total = total + #group
    end
    return total
end
-- Builds a progress callback that prints "<label>: current/total (pct%)".
-- A line is printed only when the whole-number percentage differs from the
-- previously printed one, which keeps long runs from flooding the log.
--   label: text placed before the counters in every progress line.
-- Returns a function(current, total, ctx); the third argument is accepted
-- for signature compatibility and ignored.
local function makeProgressReporter(label: string)
    local lastPct = -1
    return function(current: number, total: number, _ctx)
        -- Guard the division: a zero total would otherwise yield a NaN
        -- percentage. An empty workload is reported as complete.
        local pct = 100
        if total > 0 then
            pct = math.floor((current / total) * 100)
        end
        if pct ~= lastPct then
            print(string.format("%s: %d/%d (%d%%)", label, current, total, pct))
            lastPct = pct
        end
    end
end

-- Writes `contents` to `path` on a best-effort basis: a failure is logged as
-- a warning instead of being raised, so the caller still gets its results.
-- Returns true when the write succeeded, false otherwise.
local function tryWriteFile(path: string, contents): boolean
    local ok, err = pcall(fs.writeFile, path, contents)
    if not ok then
        print(string.format("Warning: failed to write %s: %s", path, tostring(err)))
    end
    return ok
end

-- Runs the whole benchmark pipeline:
--   1. ask.askAll(config, progressCallback) collects the answers,
--   2. evaluator.evaluateAll(config.evaluatingModel, answers, progressCallback)
--      scores them,
--   3. every scored record is JSON-encoded onto its own line of
--      out/results.jsonl and visualizer.toMarkdown(scored) is saved as
--      out/summary.md.
-- Returns the value produced by evaluator.evaluateAll. File output is
-- best-effort; errors from asking, evaluating, encoding or rendering still
-- propagate to the caller.
local function run()
    local answers = ask.askAll(config, makeProgressReporter("Asking models"))
    local scored = evaluator.evaluateAll(
        config.evaluatingModel,
        answers,
        makeProgressReporter("Evaluating answers")
    )

    -- The result is deliberately ignored: presumably this fails when "out"
    -- already exists (confirm against the Lune fs docs). A directory that is
    -- genuinely unusable surfaces through the write warnings below.
    pcall(fs.writeDir, "out")

    local lines = {}
    for _, record in scored do
        lines[#lines + 1] = serde.encode("json", record)
    end
    tryWriteFile("out/results.jsonl", table.concat(lines, "\n"))

    local markdown = visualizer.toMarkdown(scored)
    tryWriteFile("out/summary.md", markdown)

    return scored
end
-- Script entry point: execute the benchmark as soon as this file is loaded
-- and pass whatever run() returns back to the code that loaded it.
return run()