base benchmark

This commit is contained in:
2025-09-04 23:00:01 -06:00
parent 1114c02b7c
commit bcb31d84e6
10 changed files with 375 additions and 14 deletions

View File

@@ -0,0 +1,126 @@
local serde = require("@lune/serde")
local ollama = require("@ollama")
-- One benchmark answer awaiting grading.
-- `model`, `category` and `index` identify the record; evaluateAll forwards
-- them to its progress callback. `question`, `reference` and `candidate` are
-- interpolated into the grading prompt, and `pointReward` is passed to the
-- grader as the maximum score that may be awarded.
export type AnswerRecord = {
model: string,
category: string,
index: number,
question: string,
candidate: string,
reference: string,
pointReward: number,
}
-- Result produced by the grader for a single answer.
-- `score` is the awarded points, `rationale` is the grader's explanation (or
-- a fixed failure message when the request or JSON decoding failed), and
-- `correct` is the grader's optional match verdict.
export type EvalJSON = {
score: number,
rationale: string,
correct: boolean?,
}
-- An AnswerRecord extended with the grader's `score` and `rationale`.
-- Note: the grader's `correct` flag is not part of this type.
export type ScoredRecord = AnswerRecord & {
score: number,
rationale: string,
}
-- Restrict `n` to the closed interval [lo, hi].
-- The lower bound is tested first, so when `lo > hi` any value below `lo`
-- yields `lo` and every other value yields `hi`.
local function clamp(n: number, lo: number, hi: number): number
    return if n < lo then lo elseif n > hi then hi else n
end
-- Ask the evaluator model to grade `candidate` against `reference` for the
-- given `question`, awarding at most `maxPoints`.
--
-- Returns an EvalJSON. Two situations produce a zero-score fallback with
-- `correct = false` instead of the model's verdict:
--   * the completion result carries a truthy `statusCode`
--     (NOTE(review): presumably an error indicator from @ollama — confirm);
--   * the response text cannot be decoded as JSON, is not a table, or has a
--     non-numeric `score`.
-- Otherwise the score is floored and clamped to [0, maxPoints], the rationale
-- is stringified, and `correct` is coerced to a strict boolean.
local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    -- Shared shape for every failure path.
    local function failed(reason: string): EvalJSON
        return { score = 0, rationale = reason, correct = false }
    end

    local grader = ollama.serve()

    local systemPrompt = "You are a strict grader."
        .. " Return ONLY valid JSON complying with the given schema."
        .. " No prose, no markdown, no code fences."
    local schemaText = '{"score": number, "rationale": string, "correct": boolean}'

    -- The template lines must stay at column 0: they are part of the prompt.
    local template = [[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s
Question:
"""
%s
"""
Reference Answer:
"""
%s
"""
Candidate Answer:
"""
%s
"""
]]
    local userPrompt = string.format(template, maxPoints, schemaText, question, reference, candidate)

    local reply = grader:generateCompletion({
        model = evaluatorModel,
        prompt = userPrompt,
        system = systemPrompt,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000
        }
    })
    if reply.statusCode then
        return failed("evaluator request failed")
    end

    -- Decoding runs under pcall so malformed model output cannot abort the run.
    local function decodeReply()
        return serde.decode("json", reply.response)
    end
    local decodedOk, payload = pcall(decodeReply)

    local usable = decodedOk and type(payload) == "table" and type(payload.score) == "number"
    if not usable then
        return failed("invalid or non-JSON evaluator output")
    end

    local verdict: EvalJSON = {
        score = clamp(math.floor(payload.score), 0, maxPoints),
        rationale = tostring(payload.rationale or ""),
        correct = if payload.correct then true else false,
    }
    return verdict
end
-- Grade a single answer record with `evaluatorModel`.
-- Returns a new table holding the record's seven fields plus the grader's
-- `score` and `rationale`; the input record is not modified and the grader's
-- `correct` flag is not carried over.
local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local verdict = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)

    local scored: ScoredRecord = {
        -- grader output
        score = verdict.score,
        rationale = verdict.rationale,
        -- identity of the answer
        model = ar.model,
        category = ar.category,
        index = ar.index,
        -- graded content
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
    }
    return scored
end
-- Grade every record in `answers` sequentially and return the scored records
-- in iteration order.
-- When `onProgress` is supplied it is invoked after each record with the
-- number finished so far, the total (`#answers`), and a context table naming
-- the record's model, category and index.
local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local total = #answers
    local results: { ScoredRecord } = {}
    local done = 0

    for _, answer in answers do
        local scored = evaluateOne(evaluatorModel, answer)
        done += 1
        results[done] = scored

        if not onProgress then
            continue
        end
        onProgress(done, total, {
            model = answer.model,
            category = answer.category,
            index = answer.index,
        })
    end

    return results
end
-- Public API of this module: single-record and batch grading.
local exports = {}
exports.evaluateOne = evaluateOne
exports.evaluateAll = evaluateAll
return exports