base benchmark
.gitignore (vendored)
@@ -1 +1,2 @@
 AGENTS.md
+out/
.luaurc
@@ -1,6 +1,7 @@
 {
     "aliases": {
         "lune": "~/.lune/.typedefs/0.10.2/",
-        "ollama": "./src/ollama"
+        "ollama": "./src/ollama",
+        "config": "./config"
     }
 }
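The new `config` alias lets modules resolve the shared configuration with `require("@config")`, alongside the existing `@ollama` alias. A minimal illustration of how these alias-based requires resolve (both appear verbatim in the modules below):

```luau
-- Requires resolved through the .luaurc aliases above
local ollama = require("@ollama") -- resolves to ./src/ollama
local config = require("@config") -- resolves to ./config
```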
README.md (new file)
@@ -0,0 +1,10 @@
+# LSFBench
+Minimal Luau/Lune benchmark to evaluate LLMs: one model answers questions, another model scores the answers against the reference key.
+
+## Quick Start
+Prereqs:
+- Install Lune (0.10.x)
+- Start Ollama at `http://localhost:11434` and pull the models referenced in `config.luau` (e.g. `qwen3:4b`)
+
+## Notice
+The evaluator model must support structured JSON outputs.
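The flow wired up by this commit is: load the config, have each model in `modelsToEvaluate` answer every dataset question, then have `evaluatingModel` grade the answers. Pull the model first (e.g. `ollama pull qwen3:4b`) and start the entry script with `lune run <entrypoint>`; the entry file's name is not visible in this extract. A sketch of the two phases, using the module APIs added below:

```luau
-- Two-phase flow: ask, then grade (mirrors the run script in this commit)
local config = require("@config")
local ask = require("./src/benchmark/aggregate")
local evaluator = require("./src/benchmark/evaluate")

local answers = ask.askAll(config)                                    -- phase 1: candidates answer
local scored = evaluator.evaluateAll(config.evaluatingModel, answers) -- phase 2: evaluator grades
```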
config.luau
@@ -22,8 +22,8 @@ export type config = {
 }
 
 local config: config = {
-    modelsToEvaluate = { 'qwen3:4b' },
-    evaluatingModel = 'qwen3:4b',
+    modelsToEvaluate = { 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0' },
+    evaluatingModel = 'hf.co/unsloth/Qwen3-4B-Instruct-2507-GGUF:Q8_0',
     dataset = {
         syntax = {
             {
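Both the candidate list and the evaluator now point at a Hugging Face GGUF build (`hf.co/...:Q8_0`), which Ollama can pull directly by that identifier; the README's `qwen3:4b` example predates this switch. Dataset entries under categories like `syntax` follow the `QuestionItem` shape used by the benchmark modules. A hypothetical entry (the question text here is made up for illustration):

```luau
-- Hypothetical QuestionItem; fields match the shape the modules expect
local entry = {
    question = "Which keyword declares a local variable in Luau?",
    answer = "local",
    explanation = "Luau uses `local` for lexically scoped bindings.", -- optional field
    pointReward = 1,
}
print(entry.question)
```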
src/benchmark/aggregate.luau (new file)
@@ -0,0 +1,86 @@
local ollama = require("@ollama")

export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

local function askModel(modelName: string, question: string): string
    local client = ollama.serve()
    local system = "You are a concise Luau expert. Answer precisely."
    local prompt = (
        "Question:\n" .. question .. "\n\n" ..
        "Respond with the answer."
    )

    local r = client:generateCompletion({
        model = modelName,
        prompt = prompt,
        system = system,
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return ""
    end
    return r.response
end

local function askAll(config, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { AnswerRecord }
    local results: { AnswerRecord } = {}
    local models: { string } = config.modelsToEvaluate

    -- Precompute total questions
    local total = 0
    for _, items in config.dataset do
        total += #items
    end
    total *= #models
    local current = 0

    for _, modelName in models do
        for categoryName, items in config.dataset do
            local list = items :: { QuestionItem }
            for i, q in list do
                local okAns, candidate = pcall(askModel, modelName, q.question)
                if not okAns then candidate = "" end

                table.insert(results, {
                    model = modelName,
                    category = categoryName,
                    index = i,
                    question = q.question,
                    candidate = candidate,
                    reference = q.answer,
                    pointReward = q.pointReward,
                })

                current += 1
                if onProgress then
                    onProgress(current, total, { model = modelName, category = categoryName, index = i })
                end
            end
        end
    end

    return results
end

return {
    askAll = askAll,
}
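`askAll` walks every model over every category and never throws: a failed request (non-nil `statusCode`) and a `pcall` error both degrade to an empty candidate answer, so one flaky generation cannot abort the run. A standalone usage sketch with an inline config (the config table here is a made-up stand-in for `@config`, and a local Ollama server is assumed to be running):

```luau
-- Hypothetical standalone run of askAll with a one-question dataset
local ask = require("./src/benchmark/aggregate")

local answers = ask.askAll({
    modelsToEvaluate = { "qwen3:4b" },
    dataset = {
        syntax = {
            { question = "What does `#t` return for an array t?", answer = "its length", pointReward = 1 },
        },
    },
}, function(current, total, ctx)
    print(("asked %d/%d (%s / %s #%d)"):format(current, total, ctx.model, ctx.category, ctx.index))
end)

print(#answers, "answer records collected")
```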
src/benchmark/evaluate.luau (new file)
@@ -0,0 +1,126 @@
local serde = require("@lune/serde")
local ollama = require("@ollama")

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

local function clamp(n: number, lo: number, hi: number): number
    if n < lo then return lo end
    if n > hi then return hi end
    return n
end

local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    local client = ollama.serve()

    local system = table.concat({
        "You are a strict grader.",
        "Return ONLY valid JSON complying with the given schema.",
        "No prose, no markdown, no code fences.",
    }, " ")

    local schema = [[{"score": number, "rationale": string, "correct": boolean}]]

    local instructions = string.format([[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s

Question:
"""
%s
"""

Reference Answer:
"""
%s
"""

Candidate Answer:
"""
%s
"""
]], maxPoints, schema, question, reference, candidate)

    local r = client:generateCompletion({
        model = evaluatorModel,
        prompt = instructions,
        system = system,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })

    if r.statusCode then
        return { score = 0, rationale = "evaluator request failed", correct = false }
    end

    local ok, obj = pcall(function()
        return serde.decode("json", r.response)
    end)

    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end

    local bounded = clamp(math.floor(obj.score), 0, maxPoints)
    local rationale = tostring(obj.rationale or "")
    return { score = bounded, rationale = rationale, correct = not not obj.correct }
end

local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local res = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)
    return {
        model = ar.model,
        category = ar.category,
        index = ar.index,
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
        score = res.score,
        rationale = res.rationale,
    }
end

local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local out: { ScoredRecord } = {}
    local total = #answers
    local current = 0
    for _, ar in answers do
        table.insert(out, evaluateOne(evaluatorModel, ar))
        current += 1
        if onProgress then
            onProgress(current, total, { model = ar.model, category = ar.category, index = ar.index })
        end
    end
    return out
end

return {
    evaluateOne = evaluateOne,
    evaluateAll = evaluateAll,
}
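The grader requests `format = "json"` and still defends against malformed output: anything that fails to decode, or decodes without a numeric `score`, is scored 0, and valid scores are floored and clamped into `[0, maxPoints]`. The same guard in isolation (the sample payloads are made up; Luau's built-in `math.clamp` stands in for the module's local `clamp`):

```luau
-- Standalone sketch of the decode -> validate -> clamp guard above
local serde = require("@lune/serde")

local function parseEval(raw: string, maxPoints: number)
    local ok, obj = pcall(serde.decode, "json", raw)
    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end
    return {
        score = math.clamp(math.floor(obj.score), 0, maxPoints),
        rationale = tostring(obj.rationale or ""),
        correct = not not obj.correct,
    }
end

-- An out-of-range score is pulled back into [0, maxPoints]:
print(parseEval([[{"score": 7, "rationale": "close", "correct": true}]], 5).score) --> 5
print(parseEval("not json at all", 5).rationale) --> invalid or non-JSON evaluator output
```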
@@ -0,0 +1,31 @@
export type QuestionItem = {
    question: string,
    answer: string,
    explanation: string?,
    pointReward: number,
}

export type CategorySet = { QuestionItem }

export type EvalJSON = {
    score: number,
    rationale: string,
    correct: boolean?,
}

export type AnswerRecord = {
    model: string,
    category: string,
    index: number,
    question: string,
    candidate: string,
    reference: string,
    pointReward: number,
}

export type ScoredRecord = AnswerRecord & {
    score: number,
    rationale: string,
}

return {}
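This module exists purely for its exported types (hence `return {}`); the same shapes are currently re-declared locally in the aggregate and evaluate modules. A value satisfying `ScoredRecord` looks like this (every field value is illustrative):

```luau
-- Illustrative ScoredRecord literal; the values are made up
local record = {
    model = "qwen3:4b",
    category = "syntax",
    index = 1,
    question = "Which keyword declares a local variable?",
    candidate = "local",
    reference = "local",
    pointReward = 5,
    score = 3,
    rationale = "Correct keyword, missing explanation.",
}
print(record.score, "/", record.pointReward)
```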
@@ -1 +1,59 @@
 local fs = require("@lune/fs")
+local serde = require("@lune/serde")
+
+local config = require("@config")
+local ask = require("./src/benchmark/aggregate")
+local evaluator = require("./src/benchmark/evaluate")
+local visualizer = require("./src/visualizer")
+
+local function countQuestions(dataset): number
+    local n = 0
+    for _, items in dataset do n += #items end
+    return n
+end
+
+local function run()
+    local models: { string } = config.modelsToEvaluate
+    local totalQuestions = countQuestions(config.dataset)
+
+    local asked = 0
+    local lastAskPct = -1
+    local function onAskProgress(current: number, total: number, ctx)
+        -- Avoid noisy logs; print only on percentage change
+        local pct = math.floor((current / total) * 100)
+        if pct ~= lastAskPct then
+            print(string.format("Asking models: %d/%d (%d%%)", current, total, pct))
+            lastAskPct = pct
+        end
+        asked = current
+    end
+
+    local answers = ask.askAll(config, onAskProgress)
+
+    local evaluated = 0
+    local lastEvalPct = -1
+    local function onEvalProgress(current: number, total: number, ctx)
+        local pct = math.floor((current / total) * 100)
+        if pct ~= lastEvalPct then
+            print(string.format("Evaluating answers: %d/%d (%d%%)", current, total, pct))
+            lastEvalPct = pct
+        end
+        evaluated = current
+    end
+
+    local scored = evaluator.evaluateAll(config.evaluatingModel, answers, onEvalProgress)
+
+    pcall(fs.writeDir, "out")
+    local lines = {}
+    for _, r in scored do
+        table.insert(lines, serde.encode("json", r))
+    end
+    pcall(fs.writeFile, "out/results.jsonl", table.concat(lines, "\n"))
+
+    local md = visualizer.toMarkdown(scored)
+    pcall(fs.writeFile, "out/summary.md", md)
+
+    return scored
+end
+
+return run()
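The run script writes two artifacts: `out/results.jsonl` (one JSON object per scored record) and `out/summary.md` (the visualizer's markdown tables); write failures are swallowed by `pcall`, and the scored table is returned either way. Reading the JSONL back for ad-hoc analysis (assumes a prior run has produced the file):

```luau
-- Post-hoc: decode out/results.jsonl line by line
local fs = require("@lune/fs")
local serde = require("@lune/serde")

for _, line in string.split(fs.readFile("out/results.jsonl"), "\n") do
    if line ~= "" then
        local r = serde.decode("json", line)
        print(r.model, r.category, r.score, "/", r.pointReward)
    end
end
```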
src/visualizer.luau
@@ -1,9 +1,57 @@
---[[
-returns text outputs like so:
-|-----------------------------------------|
-| 1. model one 90% |
-| 1. model two 85% |
-| 1. model three 80% |
-| 1. model four 60% |
-|-----------------------------------------|
-]]
+local function summarizeOverall(results)
+    local totalsByModel: { [string]: { total: number, max: number, count: number } } = {}
+    for _, r in results do
+        totalsByModel[r.model] = totalsByModel[r.model] or { total = 0, max = 0, count = 0 }
+        local t = totalsByModel[r.model]
+        t.total += r.score
+        t.max += r.pointReward
+        t.count += 1
+    end
+    return totalsByModel
+end
+
+local function summarizeByCategory(results)
+    local byModel: { [string]: { [string]: { total: number, max: number, count: number } } } = {}
+    for _, r in results do
+        byModel[r.model] = byModel[r.model] or {}
+        byModel[r.model][r.category] = byModel[r.model][r.category] or { total = 0, max = 0, count = 0 }
+        local t = byModel[r.model][r.category]
+        t.total += r.score
+        t.max += r.pointReward
+        t.count += 1
+    end
+    return byModel
+end
+
+local function toMarkdown(results): string
+    local overall = summarizeOverall(results)
+    local byCat = summarizeByCategory(results)
+    local lines = {}
+
+    table.insert(lines, "# LSFBench Summary")
+    table.insert(lines, "")
+    table.insert(lines, "| Model | Total | Max | Percent | Count |")
+    table.insert(lines, "|-------|-------|-----|---------|-------|")
+    for model, t in overall do
+        local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+        table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", model, t.total, t.max, pct, t.count))
+    end
+
+    table.insert(lines, "")
+    table.insert(lines, "## Per-Category")
+
+    for model, cats in byCat do
+        table.insert(lines, string.format("### %s", model))
+        table.insert(lines, "| Category | Total | Max | Percent | Count |")
+        table.insert(lines, "|----------|-------|-----|---------|-------|")
+        for category, t in cats do
+            local pct = (t.max > 0) and (t.total / t.max * 100.0) or 0
+            table.insert(lines, string.format("| %s | %d | %d | %.1f%% | %d |", category, t.total, t.max, pct, t.count))
+        end
+        table.insert(lines, "")
+    end
+
+    return table.concat(lines, "\n")
+end
+
+return { toMarkdown = toMarkdown }
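Percent is computed as `total / max * 100`, so a record worth 5 points that earns a score of 3 reports 60.0%. A minimal check with a single made-up record:

```luau
local visualizer = require("./src/visualizer")

-- One hypothetical scored record: 3 of 5 points -> 60.0% in the summary table
print(visualizer.toMarkdown({
    {
        model = "qwen3:4b",
        category = "syntax",
        index = 1,
        question = "q",
        candidate = "a",
        reference = "a",
        pointReward = 5,
        score = 3,
        rationale = "partial credit",
    },
}))
```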