local fs = require("@lune/fs")
local serde = require("@lune/serde")
local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")
local visualizer = require("./src/visualizer")

--- Best-effort file write: keeps the original non-crashing behavior but
--- logs a warning instead of silently discarding the pcall error.
local function tryWrite(path: string, contents: string)
	local ok, err = pcall(fs.writeFile, path, contents)
	if not ok then
		print(string.format("warning: failed to write %s: %s", path, tostring(err)))
	end
end

--- Builds a progress callback that prints "<label>: current/total (pct%)".
--- Prints only when the integer percentage changes, to avoid noisy logs.
--- The third callback argument (ctx) is accepted but unused, matching the
--- signature expected by askAll/evaluateAll.
local function makeProgressReporter(label: string)
	local lastPct = -1
	return function(current: number, total: number, _ctx)
		local pct = math.floor((current / total) * 100)
		if pct ~= lastPct then
			print(string.format("%s: %d/%d (%d%%)", label, current, total, pct))
			lastPct = pct
		end
	end
end

--- Runs the full benchmark pipeline:
---   1. ask every configured model (ask.askAll),
---   2. score the answers with the evaluating model (evaluator.evaluateAll),
---   3. persist results as JSONL plus a Markdown summary under out/.
--- Returns the scored results table.
local function run()
	local answers = ask.askAll(config, makeProgressReporter("Asking models"))
	local scored = evaluator.evaluateAll(
		config.evaluatingModel,
		answers,
		makeProgressReporter("Evaluating answers")
	)

	-- Output is best-effort: a failed write should not discard the in-memory
	-- results, but it must be visible (previously these pcalls were silent).
	local okDir, dirErr = pcall(fs.writeDir, "out")
	if not okDir then
		print(string.format("warning: could not create out/: %s", tostring(dirErr)))
	end

	-- One JSON document per line (JSONL).
	local lines = {}
	for _, r in scored do
		table.insert(lines, serde.encode("json", r))
	end
	tryWrite("out/results.jsonl", table.concat(lines, "\n"))
	tryWrite("out/summary.md", visualizer.toMarkdown(scored))

	return scored
end

return run()