base benchmark
.gitignore (vendored)
@@ -1 +1,2 @@
 AGENTS.md
+out/
.luaurc
@@ -1,6 +1,7 @@
 {
     "aliases": {
         "lune": "~/.lune/.typedefs/0.10.2/",
-        "ollama": "./src/ollama"
+        "ollama": "./src/ollama",
+        "config": "./config"
     }
 }
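The new `config` alias lets modules resolve the shared configuration with `require("@config")`, alongside the existing `@ollama` alias. A minimal illustration of how these alias-based requires resolve (both appear verbatim in the modules below):

```luau
-- Requires resolved through the .luaurc aliases above
local ollama = require("@ollama") -- resolves to ./src/ollama
local config = require("@config") -- resolves to ./config
```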
README.md (new file)
@@ -0,0 +1,10 @@
+# LSFBench
+Minimal Luau/Lune benchmark to evaluate LLMs: one model answers questions, another model scores the answers against the reference key.
+
+## Quick Start
+Prereqs:
+- Install Lune (0.10.x)
+- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (e.g. `qwen3:4b`)
+
+## Notice
+The evaluator model must support structured JSON outputs.
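The flow wired up by this commit is: load the config, have each model in `modelsToEvaluate` answer every dataset question, then have `evaluatingModel` grade the answers. Pull the model first (e.g. `ollama pull qwen3:4b`) and start the entry script with `lune run <entrypoint>`; the entry file's name is not visible in this extract. A sketch of the two phases, using the module APIs added below:

```luau
-- Two-phase flow: ask, then grade (mirrors the run script in this commit)
local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")

local answers = ask.askAll(config)                                    -- phase 1: candidates answer
local scored = evaluator.evaluateAll(config.evaluatingModel, answers) -- phase 2: evaluator grades
```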
config.luau
@@ -22,8 +22,8 @@ export type config = {
 }
 
 local config: config = {
-    modelsToEvaluate = { 'qwen3:4b' },
-    evaluatingModel = 'qwen3:4b',
+    modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+    evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
     dataset = {
         syntax = {
             {
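Both the candidate list and the evaluator now point at a Hugging Face GGUF build (`hf.co/...:Q8_0`), which Ollama can pull directly by that identifier; the README's `qwen3:4b` example predates this switch. Dataset entries under categories like `syntax` follow the `QuestionItem` shape used by the benchmark modules. A hypothetical entry (the question text here is made up for illustration):

```luau
-- Hypothetical QuestionItem; fields match the shape the modules expect
local entry = {
    question = "Which keyword declares a local variable in Luau?",
    answer = "local",
    explanation = "Luau uses `local` for lexically scoped bindings.", -- optional field
    pointReward = 1,
}
print(entry.question)
```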
src/benchmark/aggregate.luau (new file)
@@ -0,0 +1,86 @@
local ollama = require("@ollama")

export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

local function askModel(modelName: string, question: string): string
    local client = ollama.serve()
    local system = "You are a concise Luau expert. Answer precisely."
    local prompt = (
        "Question:\n" .. question .. "\n\n" ..
        "Respond with the answer."
    )

    local r = client:generateCompletion({
        model = modelName,
        prompt = prompt,
        system = system,
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return ""
    end
    return r.response
end

local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
    local results: { AnswerRecord } = {}
    local models: { string } = config.modelsToEvaluate

    -- Precompute total questions
    local total = 0
    for _, items in config.dataset do
        total += #items
    end
    total *= #models
    local current = 0

    for _, modelName in models do
        for categoryName, items in config.dataset do
            local list = items :: { QuestionItem }
            for i, q in list do
                local okAns, candidate = pcall(askModel, modelName, q.question)
                if not okAns then candidate = "" end

                table.insert(results, {
                    model = modelName,
                    category = categoryName,
                    index = i,
                    question = q.question,
                    candidate = candidate,
                    reference = q.answer,
                    pointReward = q.pointReward,
                })

                current += 1
                if onProgress then
                    onProgress(current, total, { model = modelName, category = categoryName, index = i })
                end
            end
        end
    end

    return results
end

return {
    askAll = askAll,
}
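`askAll` walks every model over every category and never throws: a failed request (non-nil `statusCode`) and a `pcall` error both degrade to an empty candidate answer, so one flaky generation cannot abort the run. A standalone usage sketch with an inline config (the config table here is a made-up stand-in for `@config`, and a local Ollama server is assumed to be running):

```luau
-- Hypothetical standalone run of askAll with a one-question dataset
local ask = require("./src/benchmark/aggregate")

local answers = ask.askAll({
    modelsToEvaluate = { "qwen3:4b" },
    dataset = {
        syntax = {
            { question = "What does `#t` return for an array t?", answer = "its length", pointReward = 1 },
        },
    },
}, function(current, total, ctx)
    print(("asked %d/%d (%s / %s #%d)"):format(current, total, ctx.model, ctx.category, ctx.index))
end)

print(#answers, "answer records collected")
```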
src/benchmark/evaluate.luau (new file)
@@ -0,0 +1,126 @@
local serde = require("@lune/serde")
local ollama = require("@ollama")

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

local function clamp(n: number, lo: number, hi: number): number
    if n < lo then return lo end
    if n > hi then return hi end
    return n
end

local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    local client = ollama.serve()

    local system = table.concat({
        "You are a strict grader.",
        "Return ONLY valid JSON complying with the given schema.",
        "No prose, no markdown, no code fences.",
    }, " ")

    local schema = [[{"score": number, "rationale": string, "correct": boolean}]]

    local instructions = string.format([[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s

Question:
"""
%s
"""

Reference Answer:
"""
%s
"""

Candidate Answer:
"""
%s
"""
]], maxPoints, schema, question, reference, candidate)

    local r = client:generateCompletion({
        model = evaluatorModel,
        prompt = instructions,
        system = system,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return { score = 0, rationale = "evaluator request failed", correct = false }
    end

    local ok, obj = pcall(function()
        return serde.decode("json", r.response)
    end)

    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end

    local bounded = clamp(math.floor(obj.score), 0, maxPoints)
    local rationale = tostring(obj.rationale or "")
    return { score = bounded, rationale = rationale, correct = not not obj.correct }
end

local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
    return {
        model = ar.model,
        category = ar.category,
        index = ar.index,
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
        score = res.score,
        rationale = res.rationale,
    }
end

local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local out: { ScoredRecord } = {}
    local total = #answers
    local current = 0
    for _, ar in answers do
        table.insert(out, evaluateOne(evaluatorModel, ar))
        current += 1
        if onProgress then
            onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
        end
    end
    return out
end

return {
    evaluateOne = evaluateOne,
    evaluateAll = evaluateAll,
}
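The grader requests `format = "json"` and still defends against malformed output: anything that fails to decode, or decodes without a numeric `score`, is scored 0, and valid scores are floored and clamped into `[0, maxPoints]`. The same guard in isolation (the sample payloads are made up; Luau's built-in `math.clamp` stands in for the module's local `clamp`):

```luau
-- Standalone sketch of the decode -> validate -> clamp guard above
local serde = require("@lune/serde")

local function parseEval(raw: string, maxPoints: number)
    local ok, obj = pcall(serde.decode, "json", raw)
    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end
    return {
        score = math.clamp(math.floor(obj.score), 0, maxPoints),
        rationale = tostring(obj.rationale or ""),
        correct = not not obj.correct,
    }
end

-- An out-of-range score is pulled back into [0, maxPoints]:
print(parseEval([[{"score": 7, "rationale": "close", "correct": true}]], 5).score) --> 5
print(parseEval("not json at all", 5).rationale) --> invalid or non-JSON evaluator output
```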
@@ -0,0 +1,31 @@
export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type CategorySet = { QuestionItem }

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

return {}
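This module exists purely for its exported types (hence `return {}`); the same shapes are currently re-declared locally in the aggregate and evaluate modules. A value satisfying `ScoredRecord` looks like this (every field value is illustrative):

```luau
-- Illustrative ScoredRecord literal; the values are made up
local record = {
    model = "qwen3:4b",
    category = "syntax",
    index = 1,
    question = "Which keyword declares a local variable?",
    candidate = "local",
    reference = "local",
    pointReward = 5,
    score = 3,
    rationale = "Correct keyword, missing explanation.",
}
print(record.score, "/", record.pointReward)
```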
@@ -1 +1,59 @@
 local fs = require("@lune/fs")
+local serde = require("@lune/serde")
+
+local config = require("@config")
+local ask = require("./src/benchmark/aggregate")
+local evaluator = require("./src/benchmark/evaluate")
+local visualizer = require("./src/visualizer")
+
+local function countQuestions(dataset): number
+    local n = 0
+    for _, items in dataset do n += #items end
+    return n
+end
+
+local function run()
+    local models: { string } = config.modelsToEvaluate
+    local totalQuestions = countQuestions(config.dataset)
+
+    local asked = 0
+    local lastAskPct = -1
+    local function onAskProgress(current: number, total: number, ctx)
+        -- Avoid noisy logs; print only on percentage change
+        local pct = math.floor((current / total) * 100)
+        if pct ~= lastAskPct then
+            print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
+            lastAskPct = pct
+        end
+        asked = current
+    end
+
+    local answers = ask.askAll(config, onAskProgress)
+
+    local evaluated = 0
+    local lastEvalPct = -1
+    local function onEvalProgress(current: number, total: number, ctx)
+        local pct = math.floor((current / total) * 100)
+        if pct ~= lastEvalPct then
+            print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
+            lastEvalPct = pct
+        end
+        evaluated = current
+    end
+
+    local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)
+
+    pcall(fs.writeDir, "out")
+    local lines = {}
+    for _, r in scored do
+        table.insert(lines, serde.encode("json", r))
+    end
+    pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))
+
+    local md = visualizer.toMarkdown(scored)
+    pcall(fs.writeFile, "out/summary.md", md)
+
+    return scored
+end
+
+return run()
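The run script writes two artifacts: `out/results.jsonl` (one JSON object per scored record) and `out/summary.md` (the visualizer's markdown tables); write failures are swallowed by `pcall`, and the scored table is returned either way. Reading the JSONL back for ad-hoc analysis (assumes a prior run has produced the file):

```luau
-- Post-hoc: decode out/results.jsonl line by line
local fs = require("@lune/fs")
local serde = require("@lune/serde")

for _, line in string.split(fs.readFile("out/results.jsonl"), "\n") do
    if line ~= "" then
        local r = serde.decode("json", line)
        print(r.model, r.category, r.score, "/", r.pointReward)
    end
end
```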
src/visualizer.luau
@@ -1,9 +1,57 @@
---[[
-returns text outputs like so:
-|-----------------------------------------|
-| 1. model one 90% |
-| 1. model two 85% |
-| 1. model three 80% |
-| 1. model four 60% |
-|-----------------------------------------|
-]]
+local function summarizeOverall(results)
+    local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
+    for _, r in results do
+        totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
+        local t = totalsByModel[r.model]
+        t.total += r.score
+        t.max += r.pointReward
+        t.count += 1
+    end
+    return totalsByModel
+end
+
+local function summarizeByCategory(results)
+    local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
+    for _, r in results do
+        byModel[r.model] = byModel[r.model] or {}
+        byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
+        local t = byModel[r.model][r.category]
+        t.total += r.score
+        t.max += r.pointReward
+        t.count += 1
+    end
+    return byModel
+end
+
+local function toMarkdown(results): string
+    local overall = summarizeOverall(results)
+    local byCat = summarizeByCategory(results)
+    local lines = {}
+
+    table.insert(lines, "# LSFBench Summary")
+    table.insert(lines, "")
+    table.insert(lines, "| Model | Total | Max | Percent | Count |")
+    table.insert(lines, "|-------|-------|-----|---------|-------|")
+    for model, t in overall do
+        local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+        table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
+    end
+
+    table.insert(lines, "")
+    table.insert(lines, "## Per-Category")
+
+    for model, cats in byCat do
+        table.insert(lines, string.format("### %s", model))
+        table.insert(lines, "| Category | Total | Max | Percent | Count |")
+        table.insert(lines, "|----------|-------|-----|---------|-------|")
+        for category, t in cats do
+            local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+            table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
+        end
+        table.insert(lines, "")
+    end
+
+    return table.concat(lines, "\n")
+end
+
+return { toMarkdown = toMarkdown }
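Percent is computed as `total / max * 100`, so a record worth 5 points that earns a score of 3 reports 60.0%. A minimal check with a single made-up record:

```luau
local visualizer = require("./src/visualizer")

-- One hypothetical scored record: 3 of 5 points -> 60.0% in the summary table
print(visualizer.toMarkdown({
    {
        model = "qwen3:4b",
        category = "syntax",
        index = 1,
        question = "q",
        candidate = "a",
        reference = "a",
        pointReward = 5,
        score = 3,
        rationale = "partial credit",
    },
}))
```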