#!/usr/bin/env python3
"""
nugget_scorer.py
webannoparser
4/5/19
Copyright (c) Gilles Jacobs. All rights reserved.

This code is an implementation of the Liu et al. (2015) event nugget scorer [1].
It currently scores at the document level and scores are macro-averaged over documents.
N.B.: selecting the match with the maximum dice score is already done by the Matcher,
i.e. the argmax steps are precomputed before the Scorer steps in Algorithms 3 and 4 of Liu et al. (2015).

[1] Liu, Z., Mitamura, T., & Hovy, E. (2015). Evaluation Algorithms for Event Nugget Detection: A Pilot Study. Proceedings of the 3rd Workshop on EVENTS at the NAACL-HLT, 53–57.

Implementation approach:
- Score a nugget in 2 steps: 1. matching and 2. scoring; different settings can be set for both phases.
- 1. Matching: a two-step process that matches token spans by:
  - 1.1. Candidate search: match based on token-span dice score > 0.0 for ERE; other search criteria can be supplied for custom matching.
  - 1.2. Selection of candidates: select the final candidates using heuristics; important for obtaining good matches.
- 2. Scoring: compute span and attribute metrics over the selected matches.

TODO:
- Check accuracy counts as in [1] Algorithm 3.
- Make the matching (but not the scoring) less strict to account for differences with looser matching.
- Handle preprocessing of coordinated and "multiple option" gold annotations in Matching.
"""
from parser import *
from parse_project import parse_project, parse_corpus
from functools import partial
import re
import util
from pathlib import Path
from itertools import groupby, product
from collections import Counter
import settings
import pandas as pd
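

# Minimal usage sketch of the two-step match-then-score pipeline described in the
# module docstring (gold_doc and system_doc are placeholder parsed annotation
# documents with an .events attribute; strategy names are the ones defined below):
#
#   mapper = MentionMapper()
#   scorer = NuggetScorer()
#   gold_matched = mapper.match(gold_doc, system_doc,
#                               match_strategy="ere", select_criteria="ere")
#   doc_scores = scorer.score(gold_matched, system_doc,
#                             score_criteria="all_attributes")
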
class MentionMapper:
    """
    MentionMapper for matching annotations using several strategies.
    Generates candidate matches for an annotation unit in a system document using a matching strategy in the 'match' method.
    The 'select' method selects the final match from all candidates for an annotation based on several criteria.
    """

    def match_candidates(self, gold, system, strategy):
        """
        Generate candidate matches between a gold and a system document.
        :param gold: Gold standard document.
        :param system: System document.
        :param strategy: name of the matching strategy (e.g. "ere").
        :return: list of (gold_event, system_event, dice_score) candidate tuples.
        """
        matcher = get_matcher(strategy)
        return matcher(gold, system)

    def select(self, candidates, criteria):
        selector = get_selector(criteria)
        return selector(candidates)

    def match(self, gold, system, match_strategy="ere", select_criteria="ere"):
        """
        Match system annotations to gold annotations and attach the selected
        matches to each gold unit as the 'ere_mapping' attribute.
        :param gold: Gold standard document.
        :param system: System document.
        :param match_strategy: name of the candidate matching strategy.
        :param select_criteria: name of the candidate selection criteria.
        :return: the gold document with matches attached.
        """
        matcher = get_matcher(match_strategy)
        candidates = matcher(gold, system)  # gold annotations with mapped attribute
        collect_match_metadata(candidates, len(gold.events), len(system.events))
        selector = get_selector(select_criteria)
        matches = selector(candidates)
        # set the matched system unit as an attribute on the gold unit, simplifies calls
        for gold_ann in gold.events:
            gold_ann.ere_mapping = []
            for (gold_m, system_ann, dice_score) in matches:
                if gold_ann == gold_m:
                    gold_ann.ere_mapping.append((system_ann, dice_score))
        return gold

def collect_match_metadata(mappings, n_gold, n_system):
    # a system annotation cannot be mapped to multiple gold, but 1 gold can be
    # mapped to multiple system annotations (1_g-n_s)
    sys_l = [x[1] for x in mappings]
    gold_l = [x[0] for x in mappings]
    not_mapped_sys = n_system - len(sys_l)
    not_mapped_gold = n_gold - len(set(gold_l))  # count unique gold: 1 gold can map to n system
    print(
        f"{len(mappings)} mappings made between {n_gold} gold and {n_system} system annotations. "
        f"{not_mapped_sys} system and {not_mapped_gold} gold annotations not mapped."
    )

def get_matcher(strategy):
    """
    Factory method for fetching the matcher func.
    :param strategy: name of the matching strategy ("ere" or "ere_sentivent").
    :return: matcher function.
    """
    if strategy == "ere":
        return _match_ere
    elif strategy == "ere_sentivent":
        return _match_ere_sentivent
    else:
        raise ValueError(strategy)

def get_selector(criteria):
    if criteria == "ere":
        return _select_ere
    else:
        raise ValueError(criteria)

def _match_ere_sentivent(gold, system):
    """
    Variant of the ERE nugget scoring matching strategy [1] accounting for exactly overlapping token spans.
    Exactly overlapping span annotations are allowed in SENTiVENT but not in TAC KBP.
    1. "Liu, Z., Mitamura, T., & Hovy, E. (2015).
    Evaluation Algorithms for Event Nugget Detection: A Pilot Study.
    Proceedings of the 3rd Workshop on EVENTS at the NAACL-HLT, 53–57."
    :param gold: Gold standard document.
    :param system: System document.
    :return: list of (gold_event, system_event, dice_score) candidate mappings.
    """
    mappings = []
    # score all event pairs with the dice coefficient
    for gold_event in gold.events:
        gold_token_ids = gold_event.get_extent_token_ids(
            extent=["discontiguous_triggers"]
        )
        for system_event in system.events:
            system_token_ids = system_event.get_extent_token_ids(
                extent=["discontiguous_triggers"]
            )
            dice_score = dice_coefficient(gold_token_ids, system_token_ids)
            if dice_score > 0:
                mappings.append((gold_event, system_event, dice_score))
    return mappings

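
# Illustrative worked example of the dice-based candidate search above, with
# hypothetical token ids: a gold trigger over tokens (12, 13) and a system
# trigger over tokens (13, 14) share one token, so
# dice = 2 * 1 / (2 + 2) = 0.5 > 0 and the pair becomes a candidate mapping.
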
def _match_ere(gold, system):
    """
    ERE nugget scoring matching strategy as explained in "Liu, Z., Mitamura, T., & Hovy, E. (2015).
    Evaluation Algorithms for Event Nugget Detection: A Pilot Study.
    Proceedings of the 3rd Workshop on EVENTS at the NAACL-HLT, 53–57."
    Note: non-general search approach for mapping candidates.
    This approach differs from our own because it considers all mention combinations at once in the full document,
    whereas our own custom mapper uses constrained search for each gold unit.
    :param gold: Gold standard document.
    :param system: System document.
    :return: list of (gold_event, system_event, dice_score) candidate mappings.
    """
    mappings = []
    seen_system = set()
    # score all event pairs with the dice coefficient
    for gold_event in gold.events:
        gold_token_ids = gold_event.get_extent_token_ids(
            extent=["discontiguous_triggers"]
        )
        for system_event in system.events:
            system_token_ids = system_event.get_extent_token_ids(
                extent=["discontiguous_triggers"]
            )
            dice_score = dice_coefficient(gold_token_ids, system_token_ids)
            if system_event not in seen_system and dice_score > 0.0:
                mappings.append((gold_event, system_event, dice_score))
                seen_system.add(system_event)
    # make sure each system annotation is only mapped once; multiple gold
    # mappings per system annotation are not allowed
    sys_l = [x[1] for x in mappings]
    assert len(set(sys_l)) == len(sys_l)
    return mappings

def dice_coefficient(a, b):
    """
    Compute the Sørensen-Dice coefficient between two token-id lists.
    :param a: list of items
    :param b: list of items
    :return: dice coefficient score
    """
    if not len(a) or not len(b):
        return 0.0
    a.sort()
    b.sort()
    # quick case for true duplicates
    if a == b:
        return 1.0
    # assignments to save function calls
    lena = len(a)
    lenb = len(b)
    # count matching items
    matches = len(set(a) & set(b))
    score = 2 * float(matches) / float(lena + lenb)
    return score

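
# Doctest-style sanity check for dice_coefficient (assumed token-id lists):
# two 3-token spans sharing 2 tokens score 2 * 2 / (3 + 3):
#
#   >>> dice_coefficient([1, 2, 3], [2, 3, 4])
#   0.6666666666666666
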
def _argmax_dicescore(ann_score_list):
    """
    Selects all candidate matches with the highest dice span similarity score.
    This is needed because the same span can have multiple annotations in our implementation.
    For Liu et al. (2015) this only occurs when gold unit nuggets are mistakenly split up into multiple other nuggets of the same length.
    Python's built-in max selects only the first item with the max value, so it cannot be used here.
    :param ann_score_list: list of tuples (gold_annotation, system_annotation, dice_similarity_score)
    :return: list of matches with the max similarity score value
    """
    max_sim = max(m[2] for m in ann_score_list)
    matches = [x for x in ann_score_list if x[2] == max_sim]
    print(
        f"Selected {len(matches)} from {len(ann_score_list)} based on dice score {max_sim}"
    )
    return matches

def _select_ere(candidates):
    def select_max(candidates):
        """
        Select candidates with the max dice score. Can return multiple with the same dice score.
        :param candidates: candidate matches.
        :return: selected matches.
        """
        selected_matches = []
        # if multiple system annotations match one gold: take the highest dice
        # span overlap, but if they overlap by the same amount keep them all.
        # NOTE: groupby assumes candidates are already grouped by gold annotation,
        # which the matchers guarantee by iterating gold in the outer loop.
        for gold_event, matches in groupby(candidates, key=lambda x: x[0]):
            matches = list(matches)
            if len(matches) > 1:
                matches = _argmax_dicescore(matches)
            selected_matches.extend(matches)
        return selected_matches

    selection = select_max(candidates)
    # check that each system unit is only matched once
    if selection:
        c = Counter(m[1].element_id for m in selection)
        if c.most_common(1)[0][1] != 1:
            print(c)
            raise ValueError("Something went wrong in selection: a system unit was matched more than once.")
    return selection

class NuggetScorer:
    def score(self, gold_doc, system_doc, score_criteria="all_attributes"):
        scorer, attributes = get_scorer(score_criteria)
        return scorer(gold_doc, system_doc, attributes=attributes)

def get_scorer(criteria):
    if criteria == "all_attributes":
        return (
            _score_ere_nugget,
            ["event_type", "event_subtype", "modality", "polarity_negation", "realis"],
        )
    elif criteria == "ere_liu":
        return _score_ere_nugget, ["event_type", "realis"]
    else:
        raise ValueError(criteria)

def compute_prf(metrics):
    """
    Compute precision, recall and F1, plus the alternative precision variants
    of Liu et al. (2015), from the tp/fp counts in the metrics dict.
    """
    metrics["r"] = metrics["tp"] / metrics["n_gold"] if metrics["n_gold"] else 0.0
    tp_fp = metrics["tp"] + metrics["fp"]
    metrics["p"] = metrics["tp"] / tp_fp if tp_fp else 0.0
    try:
        metrics["f1"] = (2 * metrics["p"] * metrics["r"]) / (
            metrics["p"] + metrics["r"]
        )
    except ZeroDivisionError:
        metrics["f1"] = 0.0
    # group keys by type of metric denoted by _suffix (relaxed, attrib, etc.)
    metric_suffixes = [
        re.split(r"(tp|fp)_", n)[-1] for n in metrics.keys() if "tp_" in n or "fp_" in n
    ]
    # compute alternative span precision P (tp / n_system) as proposed in Liu et al. 2015
    metrics["p_alt"] = (
        metrics["tp"] / metrics["n_system"] if metrics["n_system"] else 0.0
    )
    try:
        metrics["f1_alt"] = (2 * metrics["p_alt"] * metrics["r"]) / (
            metrics["p_alt"] + metrics["r"]
        )
    except ZeroDivisionError:
        metrics["f1_alt"] = 0.0
    # compute the same metrics, incl. the alternative precision tp / n_system, for each suffixed variant
    for m in metric_suffixes:
        tp = metrics[f"tp_{m}"]
        # NOTE: the base span fp is used for all variants (not every variant has its own fp_ key)
        fp = metrics["fp"]
        # compute precision, recall and alternative precision
        metrics[f"p_{m}"] = tp / (tp + fp) if tp or fp else 0.0
        metrics[f"r_{m}"] = tp / metrics["n_gold"] if metrics["n_gold"] else 0.0
        metrics[f"p_{m}_alt"] = tp / metrics["n_system"] if metrics["n_system"] else 0.0
        try:
            metrics[f"f1_{m}"] = (2 * metrics[f"p_{m}"] * metrics[f"r_{m}"]) / (
                metrics[f"p_{m}"] + metrics[f"r_{m}"]
            )
        except ZeroDivisionError:
            metrics[f"f1_{m}"] = 0.0
        try:
            metrics[f"f1_{m}_alt"] = (2 * metrics[f"p_{m}_alt"] * metrics[f"r_{m}"]) / (
                metrics[f"p_{m}_alt"] + metrics[f"r_{m}"]
            )
        except ZeroDivisionError:
            metrics[f"f1_{m}_alt"] = 0.0
    return metrics

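
# Worked example of the two precision variants (illustrative numbers): with a
# dice-weighted tp = 1.5, fp = 1.0, n_gold = 3 and n_system = 4,
#   p     = 1.5 / (1.5 + 1.0) = 0.6    (precision over matching decisions)
#   p_alt = 1.5 / 4           = 0.375  (Liu et al. 2015: tp over system count)
#   r     = 1.5 / 3           = 0.5
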
def compute_acc(scores):
    """
    Compute attribute accuracy based on a metrics dict with "acc_" keys and the total number of system annotations,
    in accordance with [1] Algorithm 3.
    :param scores: metrics dict with raw "acc_" counts.
    :return: metrics dict with normalized accuracies.
    """
    for k, v in scores.items():
        if k.startswith("acc_"):
            acc = v / scores["n_system"] if scores["n_system"] else 0.0
            scores[k] = acc
    return scores

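
# For example (illustrative numbers): a raw count of 3.0 accumulated in
# acc_realis over n_system = 4 system annotations normalizes to acc_realis = 0.75.
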
def check_matched(ann):
    """
    Check that the annotation has been matched, i.e. carries a mapping/match attribute.
    :param ann: annotation object.
    :return: True if ok, else raises ValueError.
    """
    if any("mapping" in attr_n for attr_n in dir(ann)) or any(
        "match" in attr_n for attr_n in dir(ann)
    ):
        return True
    else:
        raise ValueError("Gold-system matches need to be made before scoring.")

def is_attribute_match(gold, match, attribute_name):
    if attribute_name == "event_subtype":
        # the main type must be included because subtypes with the same name
        # can stem from different main types
        gold_val = (gold.event_type, gold.event_subtype)
        match_val = (match.event_type, match.event_subtype)
    else:
        gold_val = getattr(gold, attribute_name)
        match_val = getattr(match, attribute_name)
    return gold_val == match_val

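
# Illustrative example with hypothetical type labels: a gold event with
# (event_type, event_subtype) = ("Revenue", "Increase") and a system event with
# ("Profit", "Increase") share the subtype name but not the main type, so
# is_attribute_match(gold, match, "event_subtype") returns False.
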
def _score_ere_nugget(
    gold_doc, system_doc, attributes=("event_type", "polarity_negation", "modality")
):
    """
    ERE nugget scoring strategy as explained in [1].
    N.B.: selecting the match with the maximum dice score is already done by the Matcher
    (i.e. the argmax in Algorithms 3 and 4 in Liu et al. 2015).
    Multiple matches arise from the case described in [1] where one gold annotation is erroneously split
    into multiple annotations.
    Computes all variants mentioned in the above paper:
    - f1: span (dice score) F1 of unit triggers.
    - f1_alt: uses the alternative precision ("p_alt", tp / n_system) of [1] pp. 55 sec. 2.4.
    - f1_attrib_alt: as f1_alt, but tp only counts matches where all attributes are correct.
    :param gold_doc: gold standard document.
    :param system_doc: system document.
    :param attributes: attribute names to check for the attribute-level scores.
    :return: dict of all metrics and scores
    """
    check_matched(
        gold_doc.events[-1]
    )  # sanity check that the gold-system matching step has completed
    metrics = {
        "tp": 0.0,  # default span tp where the one match with argmax dice score is selected
        # when there are multiple system candidate matches for a gold annotation
        "fp": 0.0,  # default span fp
        # alternative scoring that differs from "Evaluation Algorithms for Event Nugget Detection", Liu et al. 2015
        "tp_relaxed": 0.0,  # if token spans overlap, count the full score instead of the dice coefficient score
        # attribute-matched metrics which are counted when all attributes match
        "fp_attrib": 0.0,  # fp when not all attributes match (very strict)
        "tp_attrib": 0.0,  # only count tp if all attributes are accurate, and add the dice coefficient score (very strict)
        "tp_attrib_relaxed": 0.0,  # as above, but add 1 instead of the dice coefficient score
        # counts
        "n_gold": len(gold_doc.events),  # total number of gold annotations
        "n_system": len(system_doc.events),  # total number of system annotations
        "n_match": 0.0,  # total number of gold to potentially multiple system (1-n) mappings for attribute accuracy
        "acc_allattrib": 0.0,
    }
    # add attribute accuracy, fp, tp keys
    metrics.update({"acc_" + attrib_n: 0.0 for attrib_n in attributes})
    metrics.update(  # tp for individual attributes
        {"tp_" + attrib_n: 0.0 for attrib_n in attributes}
    )
    metrics.update(  # fp for individual attributes
        {"fp_" + attrib_n: 0.0 for attrib_n in attributes}
    )
    # check matches on gold
    for gold_ev in gold_doc.events:
        if gold_ev.ere_mapping:
            # [1] Algorithm 2 pp. 55: compute span F1
            sys_ev_max, max_dice_score = max(gold_ev.ere_mapping, key=lambda x: x[1])
            metrics["tp"] += max_dice_score
            # different from [1]: count every span overlap as a full tp, no penalty for long annotations
            metrics["tp_relaxed"] += 1.0
            # [1] Algorithm 4 pp. 55: compute true positives with all attributes
            if all(
                is_attribute_match(gold_ev, sys_ev_max, attrib_n)
                for attrib_n in attributes
            ):  # [1] Algorithm 4 pp. 55 line 3: check if all attributes match for the max hit
                metrics["tp_attrib"] += max_dice_score  # increment by dice score
                metrics["tp_attrib_relaxed"] += 1.0  # deviation from [1]: own relaxation
            else:
                metrics["fp_attrib"] += 1.0
            for (sys_ev, dice_score) in gold_ev.ere_mapping:
                # [1] Algorithm 3 line 3: extract attributes and check accuracy individually
                attrib_match = []
                for attrib_n in attributes:  # check if each attribute matches
                    attribute_is_accurate = is_attribute_match(
                        gold_ev, sys_ev, attrib_n
                    )
                    attrib_match.append(attribute_is_accurate)
                    if attribute_is_accurate:
                        # [1] Algorithm 3 line 3: increment the accuracy score for
                        # INDIVIDUAL attributes; differs from Algorithm 3, which
                        # considers all attributes at once
                        metrics["acc_" + attrib_n] += 1.0 / len(gold_ev.ere_mapping)
                        metrics["tp_" + attrib_n] += dice_score  # [1] Algorithm 4
                    else:
                        metrics["fp_" + attrib_n] += 1.0
                # attribute tp: all attributes match
                if all(
                    attrib_match
                ):  # [1] Algorithm 3 line 3: check if all attributes match
                    metrics["acc_allattrib"] += 1.0 / len(gold_ev.ere_mapping)
        else:
            metrics["fp"] += 1.0
            metrics["fp_attrib"] += 1.0
            for attrib_n in attributes:
                metrics["fp_" + attrib_n] += 1.0
    # compute P, R, F1 and accuracies
    metrics = compute_prf(metrics)
    metrics = compute_acc(metrics)
    return metrics

def find_exact_overlap(events):
    """
    In a list of annotations, return the ones that exactly overlap, i.e. have the exact same boundaries, e.g.:
    "The [[report]] about oil prices comes in."
    :param events: list of unit annotation objects.
    :return: list of lists of exactly overlapping events.
    """
    token_span_ids = [
        tuple(ev.get_extent_token_ids(extent=["discontiguous_triggers"]))
        for ev in events
    ]
    # use a counter to find overlapping spans: if token position ids are the same,
    # there is exact unit trigger overlap
    counter = Counter(token_span_ids)
    overlap_token_span = [k for k, v in counter.items() if v > 1]
    overlap_events = [
        [
            ev
            for ev in events
            if tuple(ev.get_extent_token_ids(extent=["discontiguous_triggers"])) == d
        ]
        for d in overlap_token_span
    ]
    return overlap_events

def compute_overlap_stats(proj):
    exact_overlaps = [find_exact_overlap(d.events) for d in proj.annotation_documents]
    exact_overlaps = [x for x in exact_overlaps if x]
    exact_overlaps = util.flatten(exact_overlaps)
    overlaps_cnt = len(exact_overlaps)
    overlap_events_cnt = sum(len(x) for x in exact_overlaps)
    all_ev = [ev for d in proj.annotation_documents for ev in d.events]
    all_ev_cnt = len(all_ev)
    overlap_pct = round(100 * overlap_events_cnt / all_ev_cnt, 1)
    return {
        "overlap_pct": overlap_pct,
        "overlaps_cnt": overlaps_cnt,
        "overlapping_event_cnt": overlap_events_cnt,
    }

if __name__ == "__main__":
    # load the final IAA study project and set the gold standard
    iaa_proj = parse_project(settings.IAA_XMI_DIRP)
    # load the final clean project
    gold_proj = parse_project(settings.CLEAN_XMI_DIRP)
    gold_docs = [
        doc
        for doc in gold_proj.annotation_documents
        if doc.title in settings.IA_IDS and doc.annotator_id == settings.MOD_ID
    ]
    # get some stats on overlapping annotations
    overlap_stats_iaa = compute_overlap_stats(iaa_proj)
    overlap_stats_gold = compute_overlap_stats(gold_proj)
    # set the matching, selection, and scoring criteria
    scorer_settings = {
        "match_strategy": "ere",
        "select_criteria": "ere",
        "score_criteria": "all_attributes",
    }
    # use moderator_id to determine the gold file and system files
    system_docs = list(
        filter(
            lambda d: d.annotator_id != settings.MOD_ID, iaa_proj.annotation_documents
        )
    )
    system_docs.sort(key=lambda x: x.annotator_id)
    system_docs_dict = {
        anno_id: list(sys_docs)
        for anno_id, sys_docs in groupby(system_docs, key=lambda x: x.annotator_id)
    }
    mapper = MentionMapper()
    scorer = NuggetScorer()
    records = []
    for gold_doc in gold_docs:
        for system_anno_id, anno_system_docs in system_docs_dict.items():
            system_doc = next(
                sys_doc for sys_doc in anno_system_docs if sys_doc.title == gold_doc.title
            )
            print(f"Starting mention mapping for {system_anno_id} {system_doc.title}.")
            gold_matched = mapper.match(
                gold_doc,
                system_doc,
                match_strategy=scorer_settings["match_strategy"],
                select_criteria=scorer_settings["select_criteria"],
            )
            # preprocess for multiple gold annotations on the same string:
            ## find gold_anns on the same exact token span (for gold and system)
            ## join them into one annotation with attribs
            ## TODO check how this can be compared
            doc_scores = scorer.score(
                gold_matched,
                system_doc,
                score_criteria=scorer_settings["score_criteria"],
            )
            record = {"doc_id": system_doc.title, "anno_id": system_doc.annotator_id}
            record.update(doc_scores)
            records.append(record)
    # remove docs that were not annotated: bac05 by anno_03 and duk06 by anno_01
    remove = [
        {
            "anno_id": "anno_03",
            "doc_id": "bac05_bofa-includes-bitcoin-trust-in-broader-ban-on-investments.txt",
        },
        {
            "anno_id": "anno_01",
            "doc_id": "duk06_like-many-of-its-peers-duk-is-trading-in-the-oversold-zone.txt",
        },
    ]
    records_filt = [
        r for r in records if not any(rm.items() <= r.items() for rm in remove)
    ]
    # make a pandas DataFrame and export to Excel
    all_score_df = pd.DataFrame.from_records(records_filt)
    # round to 4 decimals
    all_score_df = all_score_df.round(decimals=4)
    # mean over columns to get the final scores, then group
    all_mean_df = all_score_df.mean(axis=0)
    anno_mean_df = all_score_df.groupby(
        ["anno_id"]
    ).mean()  # scores by annotator, transpose for readability
    docs_mean_df = all_score_df.groupby(
        ["doc_id"]
    ).mean()  # scores by doc, transpose for readability
    # write scores to an Excel file
    xl_fn = (
        "nuggetscores-"
        + "-".join(f"{k}={v}" for k, v in scorer_settings.items())
        + "_test3.xlsx"
    )
    xl_fp = Path(settings.OPT_DIRP, xl_fn)
    xl_writer = pd.ExcelWriter(xl_fp, engine="xlsxwriter")
    # write each dataframe to a different worksheet
    all_mean_df.to_excel(xl_writer, sheet_name="All mean scores")
    anno_mean_df.to_excel(xl_writer, sheet_name="Annotator mean scores")
    docs_mean_df.to_excel(xl_writer, sheet_name="Document mean scores")
    # column names to sort the columns in Excel
    important_cols = [
        "doc_id",
        "anno_id",
        "f1_alt",
        "f1_relaxed_alt",
        "f1_attrib_alt",
        "f1_attrib_relaxed_alt",
    ]
    cols_sorted = important_cols + [
        cn for cn in all_score_df.columns.to_list() if cn not in important_cols
    ]
    # reorder the columns and write all scores
    all_score_df = all_score_df.reindex(columns=cols_sorted)
    all_score_df.to_excel(xl_writer, index=False, sheet_name="All scores")
    xl_writer.save()  # close the pandas Excel writer and output the Excel file