base benchmark
This commit is contained in:
@@ -0,0 +1,126 @@
|
||||
local serde = require("@lune/serde")
|
||||
local ollama = require("@ollama")
|
||||
|
||||
-- One answer awaiting grading. All fields are copied unchanged into the
-- ScoredRecord built by evaluateOne.
export type AnswerRecord = {
    -- Forwarded, together with `category` and `index`, to the progress callback of evaluateAll.
    model: string,
    category: string,
    index: number,

    -- Texts embedded into the grading prompt.
    question: string,
    candidate: string,
    reference: string,

    -- Passed to the grader as the maximum number of points that may be awarded.
    pointReward: number,
}
|
||||
|
||||
-- Result of grading a single answer, as returned by evalAnswer.
export type EvalJSON = {
    -- Points awarded to the candidate answer.
    score: number,
    -- Short explanation of the score, or a failure description when grading did not succeed.
    rationale: string,
    -- Optional flag telling whether the candidate was judged to match the reference.
    correct: boolean?,
}
|
||||
|
||||
-- An AnswerRecord extended with the grading outcome filled in by evaluateOne.
export type ScoredRecord = AnswerRecord & {
    -- Points awarded by the evaluator.
    score: number,
    -- Evaluator's explanation for the score.
    rationale: string,
}
|
||||
|
||||
-- Restricts `n` to the range [lo, hi].
-- The lower bound is tested first, so when `lo > hi` a value below `lo`
-- yields `lo` and any other value yields `hi`.
local function clamp(n: number, lo: number, hi: number): number
    if n < lo then
        return lo
    elseif n > hi then
        return hi
    end
    return n
end
|
||||
|
||||
-- Grades one candidate answer against a reference answer with an evaluator model.
--
-- A grading prompt is built from the question, the reference and the candidate,
-- sent through the `@ollama` client with JSON output requested, and the reply is
-- decoded and sanitised.
--
-- Parameters:
--   evaluatorModel  model name sent as `model` in the completion request
--   question        question text embedded in the prompt
--   reference       reference answer embedded in the prompt
--   candidate       candidate answer embedded in the prompt
--   maxPoints       upper bound for the awarded score (the lower bound is 0)
--
-- Returns an EvalJSON. The score is floored and clamped to [0, maxPoints].
-- When the request fails or the reply cannot be used, the result is
-- { score = 0, correct = false } with a rationale describing the failure.
local function evalAnswer(evaluatorModel: string, question: string, reference: string, candidate: string, maxPoints: number): EvalJSON
    -- NOTE(review): a client is requested from ollama.serve() on every call; whether
    -- that is cheap depends on the @ollama module, which is not visible here - confirm.
    local client = ollama.serve()

    local system = table.concat({
        "You are a strict grader.",
        "Return ONLY valid JSON complying with the given schema.",
        "No prose, no markdown, no code fences.",
    }, " ")

    local schema = [[{"score": number, "rationale": string, "correct": boolean}]]

    -- The template is kept flush-left so that no source indentation leaks into the prompt.
    local instructions = string.format([[You will grade a candidate answer.
Constraints:
- Award an integer score from 0 to %d.
- Keep rationale 1-2 short sentences.
- Set correct=true if the candidate meaningfully matches the reference answer, else false.
Output:
- Return ONLY a single JSON object matching this schema: %s

Question:
"""
%s
"""

Reference Answer:
"""
%s
"""

Candidate Answer:
"""
%s
"""
]], maxPoints, schema, question, reference, candidate)

    local r = client:generateCompletion({
        model = evaluatorModel,
        prompt = instructions,
        system = system,
        format = "json",
        keep_alive = "5m",
        options = {
            num_ctx = 32000,
        },
    })

    -- A `statusCode` field on the result is treated as a failed request.
    -- NOTE(review): presumably only error results carry it - verify against @ollama.
    if r.statusCode then
        return { score = 0, rationale = "evaluator request failed", correct = false }
    end

    -- Decoding raises on malformed input, so it runs under pcall.
    local ok, obj = pcall(function()
        return serde.decode("json", r.response)
    end)

    -- Reject replies that are not an object with a usable numeric score.
    -- `obj.score ~= obj.score` is true only for NaN, which would otherwise pass
    -- through clamp unchanged because every comparison with NaN is false.
    if not ok or type(obj) ~= "table" or type(obj.score) ~= "number" or obj.score ~= obj.score then
        return { score = 0, rationale = "invalid or non-JSON evaluator output", correct = false }
    end

    local bounded = clamp(math.floor(obj.score), 0, maxPoints)
    local rationale = tostring(obj.rationale or "")

    -- Only a real JSON `true` counts as correct. Values such as 0, "" or "false"
    -- are truthy in Lua and must not be read as a positive verdict.
    return { score = bounded, rationale = rationale, correct = obj.correct == true }
end
|
||||
|
||||
-- Grades a single answer record with the given evaluator model.
--
-- The record's question, reference and candidate are sent to evalAnswer with
-- `pointReward` as the maximum score. The returned ScoredRecord repeats every
-- field of `ar` and adds the awarded `score` and its `rationale`; the grader's
-- `correct` flag is not carried over.
local function evaluateOne(evaluatorModel: string, ar: AnswerRecord): ScoredRecord
    local verdict = evalAnswer(evaluatorModel, ar.question, ar.reference, ar.candidate, ar.pointReward)

    local scored: ScoredRecord = {
        -- identification
        model = ar.model,
        category = ar.category,
        index = ar.index,
        -- graded material
        question = ar.question,
        candidate = ar.candidate,
        reference = ar.reference,
        pointReward = ar.pointReward,
        -- grading outcome
        score = verdict.score,
        rationale = verdict.rationale,
    }
    return scored
end
|
||||
|
||||
-- Grades every answer in `answers` and returns the scored records in iteration order.
--
-- When `onProgress` is supplied it is called after each record has been scored,
-- receiving the number of records finished so far, the total taken from `#answers`,
-- and a small context table naming the record (model, category, index).
local function evaluateAll(evaluatorModel: string, answers: { AnswerRecord }, onProgress: ((current: number, total: number, ctx: { model: string, category: string, index: number }) -> ())?): { ScoredRecord }
    local scored: { ScoredRecord } = {}
    local total = #answers
    local finished = 0

    for _, ar in answers do
        scored[#scored + 1] = evaluateOne(evaluatorModel, ar)
        finished = finished + 1

        if onProgress ~= nil then
            local ctx = { model = ar.model, category = ar.category, index = ar.index }
            onProgress(finished, total, ctx)
        end
    end

    return scored
end
|
||||
|
||||
-- Public surface of the module: grading of a single record and of a whole list.
local exports = {}
exports.evaluateOne = evaluateOne
exports.evaluateAll = evaluateAll
return exports
|
||||
|
Reference in New Issue
Block a user