base benchmark

2025-09-04 23:00:01 -06:00
parent 1114c02b7c
commit bcb31d84e6
10 changed files with 375 additions and 14 deletions

.gitignore vendored

@@ -1 +1,2 @@
 AGENTS.md
+out/


@@ -1,6 +1,7 @@
 {
     "aliases": {
         "lune": "~/.lune/.typedefs/0.10.2/",
-        "ollama": "./src/ollama"
+        "ollama": "./src/ollama",
+        "config": "./config"
     }
 }

README.md Normal file

@@ -0,0 +1,10 @@
# LSFBench
Minimal Luau/Lune benchmark to evaluate LLMs: one model answers questions, another model scores the answers against the reference key.
## Quick Start
Prereqs
- Install Lune (0.10.x)
- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (both the models under test and the evaluating model)
## Notice
The evaluator model must support structured JSON outputs.

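The grading module added later in this commit asks the evaluator for a single JSON object with `score`, `rationale`, and `correct` fields. As an illustration of the structured output the evaluator model must be able to produce (the values here are made up):

```json
{ "score": 3, "rationale": "Names the correct keyword and matches the reference answer.", "correct": true }
```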

@@ -22,8 +22,8 @@ export type config = {
 }
 local config: config = {
-    modelsToEvaluate = { 'qwen3:4b' },
-    evaluatingModel = 'qwen3:4b',
+    modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+    evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
     dataset = {
         syntax = {
             {

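The rest of the dataset sits outside this hunk. For orientation, an entry under a category such as `syntax` follows the `QuestionItem` shape used by the benchmark modules below; a minimal sketch with made-up content and a hypothetical variable name:

```luau
-- Illustrative only: a hypothetical category list with one entry, following the
-- QuestionItem shape (question, answer, optional explanation, pointReward).
local syntaxQuestions = {
    {
        question = "Which keyword declares a block-scoped variable in Luau?",
        answer = "local",
        explanation = "Variables introduced with `local` are scoped to the enclosing block.",
        pointReward = 1,
    },
}
```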

@@ -0,0 +1,86 @@
local ollama = require("@ollama")

export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

-- Ask a single model a single question and return its raw text answer.
local function askModel(modelName: string, question: string): string
    local client = ollama.serve()
    local system = "You are a concise Luau expert. Answer precisely."
    local prompt = (
        "Question:\n" .. question .. "\n\n" ..
        "Respond with the answer."
    )
    local r = client:generateCompletion({
        model = modelName,
        prompt = prompt,
        system = system,
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })
    -- A populated statusCode signals a failed request; fall back to an empty answer.
    if r.statusCode then
        return ""
    end
    return r.response
end

-- Ask every configured model every question in the dataset, reporting progress along the way.
local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
    local results: { AnswerRecord } = {}
    local models: { string } = config.modelsToEvaluate

    -- Precompute total questions
    local total = 0
    for _, items in config.dataset do
        total += #items
    end
    total *= #models

    local current = 0
    for _, modelName in models do
        for categoryName, items in config.dataset do
            local list = items :: { QuestionItem }
            for i, q in list do
                local okAns, candidate = pcall(askModel, modelName, q.question)
                if not okAns then candidate = "" end
                table.insert(results, {
                    model = modelName,
                    category = categoryName,
                    index = i,
                    question = q.question,
                    candidate = candidate,
                    reference = q.answer,
                    pointReward = q.pointReward,
                })
                current += 1
                if onProgress then
                    onProgress(current, total, { model = modelName, category = categoryName, index = i })
                end
            end
        end
    end
    return results
end

return {
    askAll = askAll,
}

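A minimal standalone usage sketch for the module above, assuming it lives at `./src/benchmark/aggregate` as in the entrypoint; the inline config, question, and model name are placeholders:

```luau
local aggregate = require("./src/benchmark/aggregate")

-- Hypothetical inline config mirroring the fields askAll reads:
-- modelsToEvaluate plus a dataset mapping category names to QuestionItem lists.
local answers = aggregate.askAll({
    modelsToEvaluate = { "qwen3:4b" },
    dataset = {
        syntax = {
            { question = "What does the `::` operator do in Luau?", answer = "It is a type assertion.", pointReward = 1 },
        },
    },
}, function(current, total, ctx)
    print(string.format("asked %d/%d (%s / %s)", current, total, ctx.model, ctx.category))
end)

print(#answers, "answers collected")
```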

@@ -0,0 +1,126 @@
local serde = require("@lune/serde")
local ollama = require("@ollama")

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

local function clamp(n: number, lo: number, hi: number): number
    if n < lo then return lo end
    if n > hi then return hi end
    return n
end

-- Grade one candidate answer with the evaluator model and parse its JSON verdict.
local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    local client = ollama.serve()
    local system = table.concat({
        "You are a strict grader.",
        "Return ONLY valid JSON complying with the given schema.",
        "No prose, no markdown, no code fences.",
    }, " ")
    local schema = [[{"score": number, "rationale": string, "correct": boolean}]]
    local instructions = string.format([[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s
Question:
"""
%s
"""
Reference Answer:
"""
%s
"""
Candidate Answer:
"""
%s
"""
]], maxPoints, schema, question, reference, candidate)
    local r = client:generateCompletion({
        model = evaluatorModel,
        prompt = instructions,
        system = system,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })
    -- A populated statusCode signals a failed request.
    if r.statusCode then
        return { score = 0, rationale = "evaluator request failed", correct = false }
    end
    local ok, obj = pcall(function()
        return serde.decode("json", r.response)
    end)
    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end
    -- Clamp to the question's point budget and coerce to an integer score.
    local bounded = clamp(math.floor(obj.score), 0, maxPoints)
    local rationale = tostring(obj.rationale or "")
    return { score = bounded, rationale = rationale, correct = not not obj.correct }
end

local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
    return {
        model = ar.model,
        category = ar.category,
        index = ar.index,
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
        score = res.score,
        rationale = res.rationale,
    }
end

local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local out: { ScoredRecord } = {}
    local total = #answers
    local current = 0
    for _, ar in answers do
        table.insert(out, evaluateOne(evaluatorModel, ar))
        current += 1
        if onProgress then
            onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
        end
    end
    return out
end

return {
    evaluateOne = evaluateOne,
    evaluateAll = evaluateAll,
}

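For reference, a sketch of grading a single hand-built record with `evaluateOne`; the record values and model name are placeholders:

```luau
local evaluate = require("./src/benchmark/evaluate")

-- Hypothetical AnswerRecord, shaped like the output of askAll.
local scored = evaluate.evaluateOne("qwen3:4b", {
    model = "qwen3:4b",
    category = "syntax",
    index = 1,
    question = "Which keyword declares a block-scoped variable in Luau?",
    candidate = "local",
    reference = "local",
    pointReward = 1,
})

print(scored.score, scored.rationale)
```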

@@ -0,0 +1,31 @@
export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type CategorySet = { QuestionItem }

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

return {}


@@ -1 +1,59 @@
local fs = require("@lune/fs")
local serde = require("@lune/serde")
local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")
local visualizer = require("./src/visualizer")

local function countQuestions(dataset): number
    local n = 0
    for _, items in dataset do n += #items end
    return n
end

local function run()
    local models: { string } = config.modelsToEvaluate
    local totalQuestions = countQuestions(config.dataset)

    local asked = 0
    local lastAskPct = -1
    local function onAskProgress(current: number, total: number, ctx)
        -- Avoid noisy logs; print only on percentage change
        local pct = math.floor((current / total) * 100)
        if pct ~= lastAskPct then
            print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
            lastAskPct = pct
        end
        asked = current
    end
    local answers = ask.askAll(config, onAskProgress)

    local evaluated = 0
    local lastEvalPct = -1
    local function onEvalProgress(current: number, total: number, ctx)
        local pct = math.floor((current / total) * 100)
        if pct ~= lastEvalPct then
            print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
            lastEvalPct = pct
        end
        evaluated = current
    end
    local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)

    -- Persist raw results as JSON Lines plus a Markdown summary under out/.
    pcall(fs.writeDir, "out")
    local lines = {}
    for _, r in scored do
        table.insert(lines, serde.encode("json", r))
    end
    pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))

    local md = visualizer.toMarkdown(scored)
    pcall(fs.writeFile, "out/summary.md", md)

    return scored
end

return run()

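Each line of `out/results.jsonl` is one JSON-encoded `ScoredRecord`. A sketch of reading the file back with Lune's `fs` and `serde`, assuming the entrypoint above has already produced it:

```luau
local fs = require("@lune/fs")
local serde = require("@lune/serde")

-- One ScoredRecord per non-empty line.
local records = {}
for _, line in string.split(fs.readFile("out/results.jsonl"), "\n") do
    if line ~= "" then
        table.insert(records, serde.decode("json", line))
    end
end
print(#records, "scored records loaded")
```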

@@ -1,9 +1,57 @@
--[[
returns text outputs like so:
|-----------------------------------------|
| 1. model one 90% |
| 1. model two 85% |
| 1. model three 80% |
| 1. model four 60% |
|-----------------------------------------|
]]
-- Accumulate total score, maximum possible points, and question count per model.
local function summarizeOverall(results)
    local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
    for _, r in results do
        totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
        local t = totalsByModel[r.model]
        t.total += r.score
        t.max += r.pointReward
        t.count += 1
    end
    return totalsByModel
end

-- Same accumulation, broken down per model and per dataset category.
local function summarizeByCategory(results)
    local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
    for _, r in results do
        byModel[r.model] = byModel[r.model] or {}
        byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
        local t = byModel[r.model][r.category]
        t.total += r.score
        t.max += r.pointReward
        t.count += 1
    end
    return byModel
end

-- Render the overall and per-category summaries as Markdown tables.
local function toMarkdown(results): string
    local overall = summarizeOverall(results)
    local byCat = summarizeByCategory(results)
    local lines = {}

    table.insert(lines, "# LSFBench Summary")
    table.insert(lines, "")
    table.insert(lines, "| Model | Total | Max | Percent | Count |")
    table.insert(lines, "|-------|-------|-----|---------|-------|")
    for model, t in overall do
        local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
        table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
    end

    table.insert(lines, "")
    table.insert(lines, "## Per-Category")
    for model, cats in byCat do
        table.insert(lines, string.format("### %s", model))
        table.insert(lines, "| Category | Total | Max | Percent | Count |")
        table.insert(lines, "|----------|-------|-----|---------|-------|")
        for category, t in cats do
            local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
            table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
        end
        table.insert(lines, "")
    end

    return table.concat(lines, "\n")
end

return { toMarkdown = toMarkdown }
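`toMarkdown` only reads the `model`, `category`, `score`, and `pointReward` fields of each record, so it can be exercised on its own; a sketch with placeholder records:

```luau
local visualizer = require("./src/visualizer")

-- Placeholder records; values are made up for illustration.
local md = visualizer.toMarkdown({
    { model = "qwen3:4b", category = "syntax", score = 1, pointReward = 1 },
    { model = "qwen3:4b", category = "syntax", score = 0, pointReward = 2 },
})
print(md)
```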