60 lines
1.8 KiB
Lua
60 lines
1.8 KiB
Lua
local fs = require("@lune/fs")
|
|
local serde = require("@lune/serde")
|
|
|
|
local config = require("@config")
|
|
local ask = require("./src/benchmark/aggregate")
|
|
local evaluator = require("./src/benchmark/evaluate")
|
|
local visualizer = require("./src/visualizer")
|
|
|
|
--- Returns the combined length of every group held in `dataset`.
-- Each value produced by iterating `dataset` is measured with `#` and the
-- lengths are summed; an empty dataset yields 0.
local function countQuestions(dataset): number
	local total = 0
	for _, group in dataset do
		total = total + #group
	end
	return total
end
|
|
|
|
--- Builds a progress callback that prints "<label>: current/total (pct%)".
-- A line is printed only when the whole-number percentage changes, which keeps
-- the log short on large runs. Any extra arguments passed by the caller are ignored.
local function makeProgressReporter(label: string)
	local lastPct = -1
	return function(current: number, total: number)
		-- With total == 0 the division would produce NaN/inf, which cannot be
		-- formatted with %d; treat an empty workload as already complete.
		local pct = 100
		if total > 0 then
			pct = math.floor((current / total) * 100)
		end
		if pct ~= lastPct then
			print(string.format("%s: %d/%d (%d%%)", label, current, total, pct))
			lastPct = pct
		end
	end
end

--- Best-effort write of `contents` to `path`.
-- A failure is reported on the log instead of being raised, so the caller
-- still receives the in-memory results.
local function writeOutput(path: string, contents: string)
	local ok, err = pcall(fs.writeFile, path, contents)
	if not ok then
		print(string.format("Warning: could not write %s: %s", path, tostring(err)))
	end
end

--- Runs the benchmark end to end.
-- Asks the models via `ask.askAll`, scores the answers via
-- `evaluator.evaluateAll`, then writes `out/results.jsonl` (one JSON-encoded
-- record per line) and `out/summary.md`.
-- @return the value returned by `evaluator.evaluateAll`
local function run()
	-- Counting up front raises here, before any model is queried, if
	-- config.dataset cannot be iterated.
	local totalQuestions = countQuestions(config.dataset)
	print(string.format("Dataset questions: %d", totalQuestions))

	local answers = ask.askAll(config, makeProgressReporter("Asking models"))
	local scored = evaluator.evaluateAll(
		config.evaluatingModel,
		answers,
		makeProgressReporter("Evaluating answers")
	)

	-- Only create the directory when it is missing, so that a failure reported
	-- below is a real one rather than "already exists".
	if not fs.isDir("out") then
		local ok, err = pcall(fs.writeDir, "out")
		if not ok then
			print(string.format("Warning: could not create directory out: %s", tostring(err)))
		end
	end

	local lines = {}
	for _, r in scored do
		table.insert(lines, serde.encode("json", r))
	end
	writeOutput("out/results.jsonl", table.concat(lines, "\n"))
	writeOutput("out/summary.md", visualizer.toMarkdown(scored))

	return scored
end
|
|
|
|
-- Execute the benchmark as soon as this script is loaded and return the value
-- produced by run() to whoever required or invoked it.
return run()
|