From bcb31d84e6ae35e2b8a7d713d5cec15357ee0a56 Mon Sep 17 00:00:00 2001
From: cyclic
Date: Thu, 4 Sep 2025 23:00:01 -0600
Subject: [PATCH] base benchmark

---
 .gitignore                   |   3 +-
 .luaurc                      |   5 +-
 README.md                    |  10 +++
 config.luau                  |   4 +-
 config.yaml                  |   0
 src/benchmark/aggregate.luau |  86 ++++++++++++++++++++++++
 src/benchmark/evaluate.luau  | 126 +++++++++++++++++++++++++++++++++++
 src/benchmark/types.luau     |  31 +++++++++
 src/init.luau                |  58 ++++++++++++++++
 src/visualizer.luau          |  66 +++++++++++++++---
 10 files changed, 375 insertions(+), 14 deletions(-)
 create mode 100644 README.md
 delete mode 100644 config.yaml

diff --git a/.gitignore b/.gitignore
index 47dc3e3..2868ba2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-AGENTS.md
\ No newline at end of file
+AGENTS.md
+out/
\ No newline at end of file
diff --git a/.luaurc b/.luaurc
index 99bf373..8a5b629 100644
--- a/.luaurc
+++ b/.luaurc
@@ -1,6 +1,7 @@
 {
 	"aliases": {
 		"lune": "~/.lune/.typedefs/0.10.2/",
-		"ollama": "./src/ollama"
+		"ollama": "./src/ollama",
+		"config": "./config"
 	}
-}
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..49908f7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# LSFBench
+A minimal Luau/Lune benchmark for evaluating LLMs: one model answers questions, another model scores the answers against the reference key.
+
+## Quick Start
+- Install Lune (0.10.x)
+- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (by default `hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0`)
+- Run the benchmark with `lune run src/init`; results are written to `out/`
+
+## Notice
+The evaluator model must support structured JSON output.
\ No newline at end of file
diff --git a/config.luau b/config.luau
index 5a4c4a8..97ea69b 100644
--- a/config.luau
+++ b/config.luau
@@ -22,8 +22,8 @@ export type config = {
 }
 
 local config: config = {
-	modelsToEvaluate = { 'qwen3:4b' },
-	evaluatingModel = 'qwen3:4b',
+	modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+	evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
 	dataset = {
 		syntax = {
 			{
diff --git a/config.yaml b/config.yaml
deleted file mode 100644
index e69de29..0000000
diff --git a/src/benchmark/aggregate.luau b/src/benchmark/aggregate.luau
index e69de29..c4a1b7b 100644
--- a/src/benchmark/aggregate.luau
+++ b/src/benchmark/aggregate.luau
@@ -0,0 +1,86 @@
+local ollama = require("@ollama")
+
+export type QuestionItem = {
+	question: string,
+	answer: string,
+	explanation: string?,
+	pointReward: number,
+}
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+local function askModel(modelName: string, question: string): string
+	local client = ollama.serve()
+	local system = "You are a concise Luau expert. Answer precisely."
+	local prompt = (
+		"Question:\n" .. question .. "\n\n" ..
+		"Respond with the answer."
+	)
+
+	local r = client:generateCompletion({
+		model = modelName,
+		prompt = prompt,
+		system = system,
+		keep_alive = "5m",
+		options = {
+			num_ctx = 32000
+		}
+	})
+
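+	-- Assumption: the @ollama wrapper reports transport failures by setting
+	-- `statusCode` on the result, so its presence here means the request
+	-- failed and we fall back to an empty answer.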
+	if r.statusCode then
+		return ""
+	end
+	return r.response
+end
+
+local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
+	local results: { AnswerRecord } = {}
+	local models: { string } = config.modelsToEvaluate
+
+	-- Precompute total questions
+	local total = 0
+	for _, items in config.dataset do
+		total += #items
+	end
+	total *= #models
+	local current = 0
+
+	for _, modelName in models do
+		for categoryName, items in config.dataset do
+			local list = items :: { QuestionItem }
+			for i, q in list do
+				local okAns, candidate = pcall(askModel, modelName, q.question)
+				if not okAns then candidate = "" end
+
+				table.insert(results, {
+					model = modelName,
+					category = categoryName,
+					index = i,
+					question = q.question,
+					candidate = candidate,
+					reference = q.answer,
+					pointReward = q.pointReward,
+				})
+
+				current += 1
+				if onProgress then
+					onProgress(current, total, { model = modelName, category = categoryName, index = i })
+				end
+			end
+		end
+	end
+
+	return results
+end
+
+return {
+	askAll = askAll,
+}
diff --git a/src/benchmark/evaluate.luau b/src/benchmark/evaluate.luau
index e69de29..1312f32 100644
--- a/src/benchmark/evaluate.luau
+++ b/src/benchmark/evaluate.luau
@@ -0,0 +1,126 @@
+local serde = require("@lune/serde")
+local ollama = require("@ollama")
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+export type EvalJSON = {
+	score: number,
+	rationale: string,
+	correct: boolean?,
+}
+
+export type ScoredRecord = AnswerRecord & {
+	score: number,
+	rationale: string,
+}
+
+local function clamp(n: number, lo: number, hi: number): number
+	if n < lo then return lo end
+	if n > hi then return hi end
+	return n
+end
+
+local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
+	local client = ollama.serve()
+
+	local system = table.concat({
+		"You are a strict grader.",
+		"Return ONLY valid JSON complying with the given schema.",
+		"No prose, no markdown, no code fences.",
+	}, " ")
+
+	local schema = [[{"score": number, "rationale": string, "correct": boolean}]]
+
+	local instructions = string.format([[You will grade a candidate answer.
+Constraints:
+- Award an integer score from 0 to %d.
+- Keep the rationale to 1-2 short sentences.
+- Set correct=true if the candidate meaningfully matches the reference answer, else false.
+Output:
+- Return ONLY a single JSON object matching this schema: %s
+
+Question:
+"""
+%s
+"""
+
+Reference Answer:
+"""
+%s
+"""
+
+Candidate Answer:
+"""
+%s
+"""
+]], maxPoints, schema, question, reference, candidate)
+
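+	-- Ollama's JSON mode (`format = "json"`) constrains generation to valid
+	-- JSON; if the grader model still returns something unparsable, the
+	-- decode/type checks below fall back to a zero score.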
+	local r = client:generateCompletion({
+		model = evaluatorModel,
+		prompt = instructions,
+		system = system,
+		format = "json",
+		keep_alive = "5m",
+		options = {
+			num_ctx = 32000
+		}
+	})
+
+	if r.statusCode then
+		return { score = 0, rationale = "evaluator request failed", correct = false }
+	end
+
+	local ok, obj = pcall(function()
+		return serde.decode("json", r.response)
+	end)
+
+	if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
+		return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
+	end
+
+	local bounded = clamp(math.floor(obj.score), 0, maxPoints)
+	local rationale = tostring(obj.rationale or "")
+	return { score = bounded, rationale = rationale, correct = not not obj.correct }
+end
+
+local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
+	local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
+	return {
+		model = ar.model,
+		category = ar.category,
+		index = ar.index,
+		question = ar.question,
+		candidate = ar.candidate,
+		reference = ar.reference,
+		pointReward = ar.pointReward,
+		score = res.score,
+		rationale = res.rationale,
+	}
+end
+
+local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
+	local out: { ScoredRecord } = {}
+	local total = #answers
+	local current = 0
+	for _, ar in answers do
+		table.insert(out, evaluateOne(evaluatorModel, ar))
+		current += 1
+		if onProgress then
+			onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
+		end
+	end
+	return out
+end
+
+return {
+	evaluateOne = evaluateOne,
+	evaluateAll = evaluateAll,
+}
diff --git a/src/benchmark/types.luau b/src/benchmark/types.luau
index e69de29..a1f0219 100644
--- a/src/benchmark/types.luau
+++ b/src/benchmark/types.luau
@@ -0,0 +1,31 @@
+export type QuestionItem = {
+	question: string,
+	answer: string,
+	explanation: string?,
+	pointReward: number,
+}
+
+export type CategorySet = { QuestionItem }
+
+export type EvalJSON = {
+	score: number,
+	rationale: string,
+	correct: boolean?,
+}
+
+export type AnswerRecord = {
+	model: string,
+	category: string,
+	index: number,
+	question: string,
+	candidate: string,
+	reference: string,
+	pointReward: number,
+}
+
+export type ScoredRecord = AnswerRecord & {
+	score: number,
+	rationale: string,
+}
+
+return {}
diff --git a/src/init.luau b/src/init.luau
index 2a26029..922199a 100644
--- a/src/init.luau
+++ b/src/init.luau
@@ -1 +1,59 @@
 local fs = require("@lune/fs")
+local serde = require("@lune/serde")
+
+local config = require("@config")
+local ask = require("./benchmark/aggregate")
+local evaluator = require("./benchmark/evaluate")
+local visualizer = require("./visualizer")
+
+local function countQuestions(dataset): number
+	local n = 0
+	for _, items in dataset do n += #items end
+	return n
+end
+
+local function run()
+	local models: { string } = config.modelsToEvaluate
+	local totalQuestions = countQuestions(config.dataset)
+	print(string.format("Benchmarking %d model(s) on %d question(s)", #models, totalQuestions))
+
+	local lastAskPct = -1
+	local function onAskProgress(current: number, total: number, _ctx)
+		-- Avoid noisy logs; print only on percentage change
+		local pct = math.floor((current / total) * 100)
+		if pct ~= lastAskPct then
+			print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
+			lastAskPct = pct
+		end
+	end
+
+	local answers = ask.askAll(config, onAskProgress)
+
+	local lastEvalPct = -1
+	local function onEvalProgress(current: number, total: number, _ctx)
+		local pct = math.floor((current / total) * 100)
+		if pct ~= lastEvalPct then
+			print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
+			lastEvalPct = pct
+		end
+	end
+
+	local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)
+
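+	-- Persist results: out/results.jsonl gets one JSON object per scored
+	-- answer (JSON Lines); out/summary.md gets the rendered tables. Writes
+	-- are wrapped in pcall so an unwritable disk degrades gracefully.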
+	pcall(fs.writeDir, "out")
+	local lines = {}
+	for _, r in scored do
+		table.insert(lines, serde.encode("json", r))
+	end
+	pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))
+
+	local md = visualizer.toMarkdown(scored)
+	pcall(fs.writeFile, "out/summary.md", md)
+
+	return scored
+end
+
+return run()
diff --git a/src/visualizer.luau b/src/visualizer.luau
index c4b5558..e986396 100644
--- a/src/visualizer.luau
+++ b/src/visualizer.luau
@@ -1,9 +1,57 @@
---[[
-returns text outputs like so:
-|-----------------------------------------|
-| 1. model one 90% |
-| 1. model two 85% |
-| 1. model three 80% |
-| 1. model four 60% |
-|-----------------------------------------|
-]]
\ No newline at end of file
+local function summarizeOverall(results)
+	local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
+	for _, r in results do
+		totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
+		local t = totalsByModel[r.model]
+		t.total += r.score
+		t.max += r.pointReward
+		t.count += 1
+	end
+	return totalsByModel
+end
+
+local function summarizeByCategory(results)
+	local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
+	for _, r in results do
+		byModel[r.model] = byModel[r.model] or {}
+		byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
+		local t = byModel[r.model][r.category]
+		t.total += r.score
+		t.max += r.pointReward
+		t.count += 1
+	end
+	return byModel
+end
+
+local function toMarkdown(results): string
+	local overall = summarizeOverall(results)
+	local byCat = summarizeByCategory(results)
+	local lines = {}
+
+	table.insert(lines, "# LSFBench Summary")
+	table.insert(lines, "")
+	table.insert(lines, "| Model | Total | Max | Percent | Count |")
+	table.insert(lines, "|-------|-------|-----|---------|-------|")
+	for model, t in overall do
+		local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+		table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
+	end
+
+	table.insert(lines, "")
+	table.insert(lines, "## Per-Category")
+
+	for model, cats in byCat do
+		table.insert(lines, string.format("### %s", model))
+		table.insert(lines, "| Category | Total | Max | Percent | Count |")
+		table.insert(lines, "|----------|-------|-----|---------|-------|")
+		for category, t in cats do
+			local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+			table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
+		end
+		table.insert(lines, "")
+	end
+
+	return table.concat(lines, "\n")
+end
+
+return { toMarkdown = toMarkdown }
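+
+--[[
+	summary.md layout (values come from the run): a "# LSFBench Summary"
+	header, one overall | Model | Total | Max | Percent | Count | table,
+	then a "## Per-Category" section with one table per model. Percent is
+	total/max, where max sums pointReward over the counted answers.
+]]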