---
# test-modelgraded.yaml — forked from openai/evals
# (GitHub file-viewer chrome and line-number gutter removed; YAML content follows)
# a simple modelgraded eval checking whether a completion is funny or not
joke-animals:
  id: joke-animals.dev.v0
  metrics:
    - accuracy

joke-animals.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
    # map the battle-format sample fields onto the names the grader expects
    samples_renamings:
      input1: input
      completion1: completion
    # chain-of-thought reasoning first, then the final classification choice
    eval_type: cot_classify
    modelgraded_spec_file: humor
# same eval as joke-animals, but graded on a 1-5 likert scale
joke-animals-likert:
  id: joke-animals-likert.dev.v0
  metrics:
    - accuracy

joke-animals-likert.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
    # map the battle-format sample fields onto the names the grader expects
    samples_renamings:
      input1: input
      completion1: completion
    # chain-of-thought reasoning first, then the final classification choice
    eval_type: cot_classify
    modelgraded_spec_file: humor_likert
# a simple modelgraded eval checking whether a completion is funny or not;
# the dataset is labeled, but its "completion" and "choice" fields are ignored
joke-fruits:
  id: joke-fruits.dev.v0
  metrics:
    - accuracy

joke-fruits.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # chain-of-thought reasoning first, then the final classification choice
    eval_type: cot_classify
    modelgraded_spec_file: humor
# meta-evaluation of the humor modelgraded eval: the labeled dataset supplies
# "completion" and "choice", and the grader's verdict is scored against them
joke-fruits-meta:
  id: joke-fruits-meta.dev.v0
  metrics:
    - accuracy

joke-fruits-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # chain-of-thought reasoning first, then the final classification choice
    eval_type: cot_classify
    modelgraded_spec_file: humor
    metaeval: true
# same meta-evaluation, but the grader answers first and explains afterwards
# (instead of reasoning first, then answering)
joke-fruits-expl-meta:
  id: joke-fruits-expl-meta.dev.v0
  metrics:
    - accuracy

joke-fruits-expl-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # answer first, chain-of-thought explanation second
    eval_type: classify_cot
    modelgraded_spec_file: humor
    metaeval: true
# same meta-evaluation, but the grader gives the answer only, no explanation
joke-fruits-ans-meta:
  id: joke-fruits-ans-meta.dev.v0
  metrics:
    - accuracy

joke-fruits-ans-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # bare classification verdict, no chain-of-thought
    eval_type: classify
    modelgraded_spec_file: humor
    metaeval: true
# a simple modelgraded eval checking whether 4 completions to the same prompt
# are diverse; the dataset is labeled, but "completion" and "choice" are ignored
diversity:
  id: diversity.dev.v0
  metrics:
    - accuracy

diversity.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
    # chain-of-thought reasoning first, then the final classification choice
    eval_type: cot_classify
    modelgraded_spec_file: diversity
    # sample 4 completions per prompt at temperature 0.4, then grade diversity
    multicomp_n: 4
    multicomp_temperature: 0.4