-- lsfbench/src/init.luau

local fs = require("@lune/fs")
local serde = require("@lune/serde")

local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")
local visualizer = require("./src/visualizer")

-- Sums the lengths of every value stored in `dataset`.
-- Each value is measured with `#`, so it is expected to be an array-like
-- table; the keys of `dataset` themselves are ignored.
-- Returns the combined element count (0 for an empty dataset).
local function countQuestions(dataset): number
    local total = 0
    for _, group in dataset do
        total = total + #group
    end
    return total
end

-- Builds a progress callback that prints "<label>: current/total (pct%)".
-- A line is printed only when the whole-number percentage changes, which
-- keeps the log short on large runs. The third callback argument is accepted
-- for signature compatibility and ignored.
local function makeProgressReporter(label: string)
    local lastPct = -1
    return function(current: number, total: number, _ctx)
        -- With total == 0 the division would produce NaN/inf, which "%d"
        -- cannot render meaningfully; treat an empty workload as complete.
        local pct = 100
        if total > 0 then
            pct = math.floor((current / total) * 100)
        end
        if pct ~= lastPct then
            print(string.format("%s: %d/%d (%d%%)", label, current, total, pct))
            lastPct = pct
        end
    end
end

-- Creates directory `path` unless it already exists.
-- Failures are reported on stdout instead of being raised, so the caller can
-- still attempt its writes (which will report their own failures).
local function ensureDir(path: string)
    if fs.isDir(path) then
        return
    end
    local ok, err = pcall(fs.writeDir, path)
    if not ok then
        print(string.format("Warning: could not create directory %s: %s", path, tostring(err)))
    end
end

-- Writes `contents` to `path`, reporting (rather than raising) any failure so
-- that one failed artifact neither aborts the run nor goes unnoticed.
local function writeArtifact(path: string, contents: string)
    local ok, err = pcall(fs.writeFile, path, contents)
    if not ok then
        print(string.format("Warning: could not write %s: %s", path, tostring(err)))
    end
end

-- Runs the benchmark end to end:
--   1. collects answers via `ask.askAll(config, progressCallback)`,
--   2. scores them via `evaluator.evaluateAll(config.evaluatingModel, ...)`,
--   3. writes one JSON-encoded record per line to out/results.jsonl,
--   4. writes `visualizer.toMarkdown(scored)` to out/summary.md.
-- Output files are written best-effort: a filesystem failure prints a warning
-- and the scored results are still returned.
-- Returns the value produced by `evaluator.evaluateAll`.
local function run()
    local answers = ask.askAll(config, makeProgressReporter("Asking models"))
    local scored = evaluator.evaluateAll(
        config.evaluatingModel,
        answers,
        makeProgressReporter("Evaluating answers")
    )

    ensureDir("out")

    local lines = {}
    for _, r in scored do
        table.insert(lines, serde.encode("json", r))
    end
    writeArtifact("out/results.jsonl", table.concat(lines, "\n"))

    writeArtifact("out/summary.md", visualizer.toMarkdown(scored))

    return scored
end

-- Execute the pipeline as soon as this file is loaded and return whatever
-- `run` returns (the scored results) as the module's value.
return run()