base benchmark
.gitignore (vendored)
@@ -1 +1,2 @@
 AGENTS.md
+out/
.luaurc
@@ -1,6 +1,7 @@
 {
     "aliases": {
         "lune": "~/.lune/.typedefs/0.10.2/",
-        "ollama": "./src/ollama"
+        "ollama": "./src/ollama",
+        "config": "./config"
     }
 }
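The new `config` alias lets any module resolve the benchmark configuration without a relative path; a minimal sketch of the resulting require (the entry script later in this commit uses the same form):

```luau
-- Resolve the shared configuration through the .luaurc alias added above.
local config = require("@config")
print(`Evaluating {#config.modelsToEvaluate} model(s), graded by {config.evaluatingModel}`)
```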
README.md (new file)
@@ -0,0 +1,10 @@
# LSFBench

A minimal Luau/Lune benchmark for evaluating LLMs: one model answers the questions, and another model scores those answers against the reference key.

## Quick Start

Prerequisites:

- Install Lune (0.10.x)
- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (e.g. `qwen3:4b`)

## Notice

The evaluator model must support structured JSON output.
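As an illustration (hypothetical values, but matching the schema the evaluator module sends to the grader), a well-formed grader reply decodes cleanly with `@lune/serde`:

```luau
local serde = require("@lune/serde")

-- Hypothetical grader reply; the required shape is {"score": number, "rationale": string, "correct": boolean}.
local reply = serde.decode("json", [[{"score": 3, "rationale": "Matches the reference answer.", "correct": true}]])
print(reply.score, reply.correct) --> 3 true
```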
@@ -22,8 +22,8 @@ export type config = {
 }
 
 local config: config = {
-    modelsToEvaluate = { 'qwen3:4b' },
-    evaluatingModel = 'qwen3:4b',
+    modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+    evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
     dataset = {
         syntax = {
             {
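Each entry under a category such as `syntax` is a `QuestionItem`; a hypothetical entry (illustrative values only, not taken from the real dataset) would look like this:

```luau
-- Hypothetical dataset entry; the field names follow the QuestionItem type introduced in this commit.
local sampleItem = {
    question = "Which keyword declares a block-scoped variable in Luau?",
    answer = "local",
    explanation = "Variables are introduced with the `local` keyword.", -- optional field
    pointReward = 1,
}
print(sampleItem.pointReward) --> 1
```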
@@ -0,0 +1,86 @@
local ollama = require("@ollama")

export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

local function askModel(modelName: string, question: string): string
    local client = ollama.serve()
    local system = "You are a concise Luau expert. Answer precisely."
    local prompt = (
        "Question:\n" .. question .. "\n\n" ..
        "Respond with the answer."
    )

    local r = client:generateCompletion({
        model = modelName,
        prompt = prompt,
        system = system,
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return ""
    end
    return r.response
end

local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
    local results: { AnswerRecord } = {}
    local models: { string } = config.modelsToEvaluate

    -- Precompute total questions
    local total = 0
    for _, items in config.dataset do
        total += #items
    end
    total *= #models
    local current = 0

    for _, modelName in models do
        for categoryName, items in config.dataset do
            local list = items :: { QuestionItem }
            for i, q in list do
                local okAns, candidate = pcall(askModel, modelName, q.question)
                if not okAns then candidate = "" end

                table.insert(results, {
                    model = modelName,
                    category = categoryName,
                    index = i,
                    question = q.question,
                    candidate = candidate,
                    reference = q.answer,
                    pointReward = q.pointReward,
                })

                current += 1
                if onProgress then
                    onProgress(current, total, { model = modelName, category = categoryName, index = i })
                end
            end
        end
    end

    return results
end

return {
    askAll = askAll,
}
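A short usage sketch for the module above, mirroring how the entry script drives it (the progress callback is optional):

```luau
-- Assumes this module is reachable as ./src/benchmark/aggregate and that @config resolves via .luaurc.
local ask = require("./src/benchmark/aggregate")
local config = require("@config")

local answers = ask.askAll(config, function(current, total, ctx)
    print(`{current}/{total}: {ctx.model} / {ctx.category} #{ctx.index}`)
end)
print(#answers, "answers collected")
```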
@@ -0,0 +1,126 @@
local serde = require("@lune/serde")
local ollama = require("@ollama")

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

local function clamp(n: number, lo: number, hi: number): number
    if n < lo then return lo end
    if n > hi then return hi end
    return n
end

local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    local client = ollama.serve()

    local system = table.concat({
        "You are a strict grader.",
        "Return ONLY valid JSON complying with the given schema.",
        "No prose, no markdown, no code fences.",
    }, " ")

    local schema = [[{"score": number, "rationale": string, "correct": boolean}]]

    local instructions = string.format([[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s

Question:
"""
%s
"""

Reference Answer:
"""
%s
"""

Candidate Answer:
"""
%s
"""
]], maxPoints, schema, question, reference, candidate)

    local r = client:generateCompletion({
        model = evaluatorModel,
        prompt = instructions,
        system = system,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return { score = 0, rationale = "evaluator request failed", correct = false }
    end

    local ok, obj = pcall(function()
        return serde.decode("json", r.response)
    end)

    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end

    local bounded = clamp(math.floor(obj.score), 0, maxPoints)
    local rationale = tostring(obj.rationale or "")
    return { score = bounded, rationale = rationale, correct = not not obj.correct }
end

local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
    return {
        model = ar.model,
        category = ar.category,
        index = ar.index,
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
        score = res.score,
        rationale = res.rationale,
    }
end

local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local out: { ScoredRecord } = {}
    local total = #answers
    local current = 0
    for _, ar in answers do
        table.insert(out, evaluateOne(evaluatorModel, ar))
        current += 1
        if onProgress then
            onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
        end
    end
    return out
end

return {
    evaluateOne = evaluateOne,
    evaluateAll = evaluateAll,
}
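A minimal sketch of grading a single hand-built record with the module above (all values hypothetical; Ollama must be running and the model pulled):

```luau
-- Assumes this module is reachable as ./src/benchmark/evaluate.
local evaluator = require("./src/benchmark/evaluate")

local record = {
    model = "qwen3:4b",
    category = "syntax",
    index = 1,
    question = "Which keyword declares a block-scoped variable in Luau?",
    candidate = "local",
    reference = "local",
    pointReward = 1,
}

local scored = evaluator.evaluateOne("qwen3:4b", record)
print(scored.score, scored.rationale)
```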
@@ -0,0 +1,31 @@
export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type CategorySet = { QuestionItem }

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

return {}
@@ -1 +1,59 @@
local fs = require("@lune/fs")
local serde = require("@lune/serde")

local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")
local visualizer = require("./src/visualizer")

local function countQuestions(dataset): number
    local n = 0
    for _, items in dataset do n += #items end
    return n
end

local function run()
    local models: { string } = config.modelsToEvaluate
    local totalQuestions = countQuestions(config.dataset)

    local asked = 0
    local lastAskPct = -1
    local function onAskProgress(current: number, total: number, ctx)
        -- Avoid noisy logs; print only on percentage change
        local pct = math.floor((current / total) * 100)
        if pct ~= lastAskPct then
            print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
            lastAskPct = pct
        end
        asked = current
    end

    local answers = ask.askAll(config, onAskProgress)

    local evaluated = 0
    local lastEvalPct = -1
    local function onEvalProgress(current: number, total: number, ctx)
        local pct = math.floor((current / total) * 100)
        if pct ~= lastEvalPct then
            print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
            lastEvalPct = pct
        end
        evaluated = current
    end

    local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)

    pcall(fs.writeDir, "out")
    local lines = {}
    for _, r in scored do
        table.insert(lines, serde.encode("json", r))
    end
    pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))

    local md = visualizer.toMarkdown(scored)
    pcall(fs.writeFile, "out/summary.md", md)

    return scored
end

return run()
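Each line of `out/results.jsonl` is one JSON-encoded `ScoredRecord`; a hypothetical line (illustrative values, long fields elided) looks like:

    {"model":"qwen3:4b","category":"syntax","index":1,"question":"...","candidate":"...","reference":"...","pointReward":1,"score":1,"rationale":"Matches the reference answer."}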
@@ -1,9 +1,57 @@
--[[
    returns text outputs like so:
    |-----------------------------------------|
    | 1. model one    90% |
    | 1. model two    85% |
    | 1. model three  80% |
    | 1. model four   60% |
    |-----------------------------------------|
]]
local function summarizeOverall(results)
    local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
    for _, r in results do
        totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
        local t = totalsByModel[r.model]
        t.total += r.score
        t.max += r.pointReward
        t.count += 1
    end
    return totalsByModel
end

local function summarizeByCategory(results)
    local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
    for _, r in results do
        byModel[r.model] = byModel[r.model] or {}
        byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
        local t = byModel[r.model][r.category]
        t.total += r.score
        t.max += r.pointReward
        t.count += 1
    end
    return byModel
end

local function toMarkdown(results): string
    local overall = summarizeOverall(results)
    local byCat = summarizeByCategory(results)
    local lines = {}

    table.insert(lines, "# LSFBench Summary")
    table.insert(lines, "")
    table.insert(lines, "| Model | Total | Max | Percent | Count |")
    table.insert(lines, "|-------|-------|-----|---------|-------|")
    for model, t in overall do
        local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
        table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
    end

    table.insert(lines, "")
    table.insert(lines, "## Per-Category")

    for model, cats in byCat do
        table.insert(lines, string.format("### %s", model))
        table.insert(lines, "| Category | Total | Max | Percent | Count |")
        table.insert(lines, "|----------|-------|-----|---------|-------|")
        for category, t in cats do
            local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
            table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
        end
        table.insert(lines, "")
    end

    return table.concat(lines, "\n")
end

return { toMarkdown = toMarkdown }
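For a sense of the result, `out/summary.md` contains tables in this shape (model name taken from the default config; the numbers are purely illustrative):

    # LSFBench Summary

    | Model | Total | Max | Percent | Count |
    |-------|-------|-----|---------|-------|
    | qwen3:4b | 18 | 24 | 75.0% | 12 |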