base benchmark
This commit is contained in:
@@ -1 +1,59 @@
|
||||
local fs = require("@lune/fs")
|
||||
local serde = require("@lune/serde")
|
||||
|
||||
local config = require("@config")
|
||||
local ask = require("./src/benchmark/aggregate")
|
||||
local evaluator = require("./src/benchmark/evaluate")
|
||||
local visualizer = require("./src/visualizer")
|
||||
|
||||
-- Adds up the lengths of every entry in `dataset` and returns the total.
-- Each value yielded by iterating `dataset` is measured with `#`.
local function countQuestions(dataset): number
	local total = 0
	for _, group in dataset do
		total = total + #group
	end
	return total
end
|
||||
|
||||
-- Runs the whole benchmark and writes the reports.
--
-- Steps, in order:
--   1. `ask.askAll(config, progressCallback)` produces the answers.
--   2. `evaluator.evaluateAll(config.evaluatingModel, answers, progressCallback)`
--      produces the scored entries.
--   3. Each scored entry is JSON-encoded onto its own line of
--      "out/results.jsonl".
--   4. `visualizer.toMarkdown(scored)` is written to "out/summary.md".
--
-- Report writing is best-effort: a failed write is reported with `warn`
-- and does not abort the run.
--
-- Returns the value produced by `evaluator.evaluateAll`.
local function run()
	-- Builds a progress callback that prints "<label>: current/total (pct%)".
	-- A line is printed only when the whole-number percentage changes, which
	-- keeps the log from being flooded. Each reporter tracks its own state.
	local function makeProgressReporter(label: string)
		local lastPct = -1
		return function(current: number, total: number, _ctx)
			-- Guard the division: with a zero total, current / total is
			-- NaN or infinite and cannot be shown as a percentage.
			local pct = 0
			if total > 0 then
				pct = math.floor((current / total) * 100)
			end
			if pct ~= lastPct then
				print(string.format("%s: %d/%d (%d%%)", label, current, total, pct))
				lastPct = pct
			end
		end
	end

	-- Writes one report file; a failure is surfaced as a warning instead of
	-- being swallowed, but it never stops the benchmark.
	local function writeReport(path: string, contents)
		local ok, err = pcall(fs.writeFile, path, contents)
		if not ok then
			warn(string.format("Could not write %s: %s", path, tostring(err)))
		end
	end

	local answers = ask.askAll(config, makeProgressReporter("Asking models"))
	local scored = evaluator.evaluateAll(
		config.evaluatingModel,
		answers,
		makeProgressReporter("Evaluating answers")
	)

	-- Errors are ignored here on purpose (presumably the directory may
	-- already exist); a genuinely unusable directory shows up as a write
	-- warning below.
	pcall(fs.writeDir, "out")

	local lines = {}
	for _, r in scored do
		table.insert(lines, serde.encode("json", r))
	end
	writeReport("out/results.jsonl", table.concat(lines, "\n"))

	local md = visualizer.toMarkdown(scored)
	writeReport("out/summary.md", md)

	return scored
end
|
||||
|
||||
-- Execute the benchmark as soon as this script is loaded and hand the
-- value returned by `run` (the scored entries) back to whoever loaded it.
return run()
|
||||
|
Reference in New Issue
Block a user