From bcb31d84e6ae35e2b8a7d713d5cec15357ee0a56 Mon Sep 17 00:00:00 2001
From: cyclic
Date: Thu, 4 Sep 2025 23:00:01 -0600
Subject: [PATCH] base benchmark

---
 .gitignore                   |   3 +-
 .luaurc                      |   5 +-
 README.md                    |  10 +++
 config.luau                  |   4 +-
 config.yaml                  |   0
 src/benchmark/aggregate.luau |  86 ++++++++++++++++++++++++
 src/benchmark/evaluate.luau  | 126 +++++++++++++++++++++++++++++++++++
 src/benchmark/types.luau     |  31 +++++++++
 src/init.luau                |  58 ++++++++++++++++
 src/visualizer.luau          |  66 +++++++++++++++---
 10 files changed, 375 insertions(+), 14 deletions(-)
 create mode 100644 README.md
 delete mode 100644 config.yaml

diff --git a/.gitignore b/.gitignore
index 47dc3e3..2868ba2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-AGENTS.md
\ No newline at end of file
+AGENTS.md
+out/
\ No newline at end of file
diff --git a/.luaurc b/.luaurc
index 99bf373..8a5b629 100644
--- a/.luaurc
+++ b/.luaurc
@@ -1,6 +1,7 @@
 {
 	"aliases": {
 		"lune": "~/.lune/.typedefs/0.10.2/",
-		"ollama": "./src/ollama"
+		"ollama": "./src/ollama",
+		"config": "./config"
 	}
-}
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..49908f7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# LSFBench
+A minimal Luau/Lune benchmark for evaluating LLMs: one model answers questions, another model scores the answers against the reference key.
+
+## Quick Start
+- Install Lune (0.10.x)
+- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (by default `hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0`)
+- Run the benchmark with `lune run src/init`; results are written to `out/`
+
+## Notice
+The evaluator model must support structured JSON output.
\ No newline at end of file
diff --git a/config.luau b/config.luau
index 5a4c4a8..97ea69b 100644
--- a/config.luau
+++ b/config.luau
@@ -22,8 +22,8 @@ export type config = {
 }
 
 local config: config = {
-	modelsToEvaluate = { 'qwen3:4b' },
-	evaluatingModel = 'qwen3:4b',
+	modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+	evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
 	dataset = {
 		syntax = {
 			{
diff --git a/config.yaml b/config.yaml
deleted file mode 100644
index e69de29..0000000
diff --git a/src/benchmark/aggregate.luau b/src/benchmark/aggregate.luau
index e69de29..c4a1b7b 100644
--- a/src/benchmark/aggregate.luau
+++ b/src/benchmark/aggregate.luau
@@ -0,0 +1,86 @@
+local ollama = require("@ollama")
+
+export type QuestionItem = {
+	question: string,
+	answer: string,
+	explanation: string?,
+	pointReward: number,
+}
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+local function askModel(modelName: string, question: string): string
+	local client = ollama.serve()
+	local system = "You are a concise Luau expert. Answer precisely."
+	local prompt = (
+		"Question:\n" .. question .. "\n\n" ..
+		"Respond with the answer."
+	)
+
+	local r = client:generateCompletion({
+		model = modelName,
+		prompt = prompt,
+		system = system,
+		keep_alive = "5m",
+		options = {
+			num_ctx = 32000
+		}
+	})
+
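+	-- Assumption: the @ollama wrapper reports transport failures by setting
+	-- `statusCode` on the result, so its presence here means the request
+	-- failed and we fall back to an empty answer.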
+	if r.statusCode then
+		return ""
+	end
+	return r.response
+end
+
+local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
+	local results: { AnswerRecord } = {}
+	local models: { string } = config.modelsToEvaluate
+
+	-- Precompute total questions
+	local total = 0
+	for _, items in config.dataset do
+		total += #items
+	end
+	total *= #models
+	local current = 0
+
+	for _, modelName in models do
+		for categoryName, items in config.dataset do
+			local list = items :: { QuestionItem }
+			for i, q in list do
+				local okAns, candidate = pcall(askModel, modelName, q.question)
+				if not okAns then candidate = "" end
+
+				table.insert(results, {
+					model = modelName,
+					category = categoryName,
+					index = i,
+					question = q.question,
+					candidate = candidate,
+					reference = q.answer,
+					pointReward = q.pointReward,
+				})
+
+				current += 1
+				if onProgress then
+					onProgress(current, total, { model = modelName, category = categoryName, index = i })
+				end
+			end
+		end
+	end
+
+	return results
+end
+
+return {
+	askAll = askAll,
+}
diff --git a/src/benchmark/evaluate.luau b/src/benchmark/evaluate.luau
index e69de29..1312f32 100644
--- a/src/benchmark/evaluate.luau
+++ b/src/benchmark/evaluate.luau
@@ -0,0 +1,126 @@
+local serde = require("@lune/serde")
+local ollama = require("@ollama")
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+export type EvalJSON = {
+	score: number,
+	rationale: string,
+	correct: boolean?,
+}
+
+export type ScoredRecord = AnswerRecord & {
+	score: number,
+	rationale: string,
+}
+
+local function clamp(n: number, lo: number, hi: number): number
+	if n < lo then return lo end
+	if n > hi then return hi end
+	return n
+end
+
+local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
+	local client = ollama.serve()
+
+	local system = table.concat({
+		"You are a strict grader.",
+		"Return ONLY valid JSON complying with the given schema.",
+		"No prose, no markdown, no code fences.",
+	}, " ")
+
+	local schema = [[{"score": number, "rationale": string, "correct": boolean}]]
+
+	local instructions = string.format([[You will grade a candidate answer.
+Constraints:
+- Award an integer score from 0 to %d.
+- Keep the rationale to 1-2 short sentences.
+- Set correct=true if the candidate meaningfully matches the reference answer, else false.
+Output:
+- Return ONLY a single JSON object matching this schema: %s
+
+Question:
+"""
+%s
+"""
+
+Reference Answer:
+"""
+%s
+"""
+
+Candidate Answer:
+"""
+%s
+"""
+]], maxPoints, schema, question, reference, candidate)
+
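+	-- Ollama's JSON mode (`format = "json"`) constrains generation to valid
+	-- JSON; if the grader model still returns something unparsable, the
+	-- decode/type checks below fall back to a zero score.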
+	local r = client:generateCompletion({
+		model = evaluatorModel,
+		prompt = instructions,
+		system = system,
+		format = "json",
+		keep_alive = "5m",
+		options = {
+			num_ctx = 32000
+		}
+	})
+
+	if r.statusCode then
+		return { score = 0, rationale = "evaluator request failed", correct = false }
+	end
+
+	local ok, obj = pcall(function()
+		return serde.decode("json", r.response)
+	end)
+
+	if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
+		return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
+	end
+
+	local bounded = clamp(math.floor(obj.score), 0, maxPoints)
+	local rationale = tostring(obj.rationale or "")
+	return { score = bounded, rationale = rationale, correct = not not obj.correct }
+end
+
+local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
+	local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
+	return {
+		model = ar.model,
+		category = ar.category,
+		index = ar.index,
+		question = ar.question,
+		candidate = ar.candidate,
+		reference = ar.reference,
+		pointReward = ar.pointReward,
+		score = res.score,
+		rationale = res.rationale,
+	}
+end
+
+local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
+	local out: { ScoredRecord } = {}
+	local total = #answers
+	local current = 0
+	for _, ar in answers do
+		table.insert(out, evaluateOne(evaluatorModel, ar))
+		current += 1
+		if onProgress then
+			onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
+		end
+	end
+	return out
+end
+
+return {
+	evaluateOne = evaluateOne,
+	evaluateAll = evaluateAll,
+}
diff --git a/src/benchmark/types.luau b/src/benchmark/types.luau
index e69de29..a1f0219 100644
--- a/src/benchmark/types.luau
+++ b/src/benchmark/types.luau
@@ -0,0 +1,31 @@
+export type QuestionItem = {
+	question: string,
+	answer: string,
+	explanation: string?,
+	pointReward: number,
+}
+
+export type CategorySet = { QuestionItem }
+
+export type EvalJSON = {
+	score: number,
+	rationale: string,
+	correct: boolean?,
+}
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+export type ScoredRecord = AnswerRecord & {
+	score: number,
+	rationale: string,
+}
+
+return {}
diff --git a/src/init.luau b/src/init.luau
index 2a26029..922199a 100644
--- a/src/init.luau
+++ b/src/init.luau
@@ -1 +1,59 @@
 local fs = require("@lune/fs")
+local serde = require("@lune/serde")
+
+local config = require("@config")
+local ask = require("./benchmark/aggregate")
+local evaluator = require("./benchmark/evaluate")
+local visualizer = require("./visualizer")
+
+local function countQuestions(dataset): number
+	local n = 0
+	for _, items in dataset do n += #items end
+	return n
+end
+
+local function run()
+	local models: { string } = config.modelsToEvaluate
+	local totalQuestions = countQuestions(config.dataset)
+	print(string.format("Benchmarking %d model(s) on %d question(s)", #models, totalQuestions))
+
+	local lastAskPct = -1
+	local function onAskProgress(current: number, total: number, _ctx)
+		-- Avoid noisy logs; print only on percentage change
+		local pct = math.floor((current / total) * 100)
+		if pct ~= lastAskPct then
+			print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
+			lastAskPct = pct
+		end
+	end
+
+	local answers = ask.askAll(config, onAskProgress)
+
+	local lastEvalPct = -1
+	local function onEvalProgress(current: number, total: number, _ctx)
+		local pct = math.floor((current / total) * 100)
+		if pct ~= lastEvalPct then
+			print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
+			lastEvalPct = pct
+		end
+	end
+
+	local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)
+
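+	-- Persist results: out/results.jsonl gets one JSON object per scored
+	-- answer (JSON Lines); out/summary.md gets the rendered tables. Writes
+	-- are wrapped in pcall so an unwritable disk degrades gracefully.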
+	pcall(fs.writeDir, "out")
+	local lines = {}
+	for _, r in scored do
+		table.insert(lines, serde.encode("json", r))
+	end
+	pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))
+
+	local md = visualizer.toMarkdown(scored)
+	pcall(fs.writeFile, "out/summary.md", md)
+
+	return scored
+end
+
+return run()
diff --git a/src/visualizer.luau b/src/visualizer.luau
index c4b5558..e986396 100644
--- a/src/visualizer.luau
+++ b/src/visualizer.luau
@@ -1,9 +1,57 @@
---[[
-returns text outputs like so:
-|-----------------------------------------|
-| 1. model one 90% |
-| 1. model two 85% |
-| 1. model three 80% |
-| 1. model four 60% |
-|-----------------------------------------|
-]]
\ No newline at end of file
+local function summarizeOverall(results)
+	local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
+	for _, r in results do
+		totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
+		local t = totalsByModel[r.model]
+		t.total += r.score
+		t.max += r.pointReward
+		t.count += 1
+	end
+	return totalsByModel
+end
+
+local function summarizeByCategory(results)
+	local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
+	for _, r in results do
+		byModel[r.model] = byModel[r.model] or {}
+		byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
+		local t = byModel[r.model][r.category]
+		t.total += r.score
+		t.max += r.pointReward
+		t.count += 1
+	end
+	return byModel
+end
+
+local function toMarkdown(results): string
+	local overall = summarizeOverall(results)
+	local byCat = summarizeByCategory(results)
+	local lines = {}
+
+	table.insert(lines, "# LSFBench Summary")
+	table.insert(lines, "")
+	table.insert(lines, "| Model | Total | Max | Percent | Count |")
+	table.insert(lines, "|-------|-------|-----|---------|-------|")
+	for model, t in overall do
+		local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+		table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
+	end
+
+	table.insert(lines, "")
+	table.insert(lines, "## Per-Category")
+
+	for model, cats in byCat do
+		table.insert(lines, string.format("### %s", model))
+		table.insert(lines, "| Category | Total | Max | Percent | Count |")
+		table.insert(lines, "|----------|-------|-----|---------|-------|")
+		for category, t in cats do
+			local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+			table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
+		end
+		table.insert(lines, "")
+	end
+
+	return table.concat(lines, "\n")
+end
+
+return { toMarkdown = toMarkdown }
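+
+--[[
+	summary.md layout (values come from the run): a "# LSFBench Summary"
+	header, one overall | Model | Total | Max | Percent | Count | table,
+	then a "## Per-Category" section with one table per model. Percent is
+	total/max, where max sums pointReward over the counted answers.
+]]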