Skip to content

Commit

Permalink
Fix inconsistency when notes are on the same page
Browse files (browse the repository at this point in the history)
  • Loading branch information
lfoppiano committed May 17, 2023
1 parent d57c82c commit 9adb8d8
Showing 1 changed file with 10 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -1398,7 +1398,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
curDiv.appendChild(note);
} else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) {
List<LayoutToken> clusterTokens = cluster.concatTokens();
int clusterPage = Iterables.getLast(clusterTokens).getPage();
List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens);
int clusterPage = Iterables.getLast(dehyphenized).getPage();

List<Note> notesSamePage = null;
if (notes != null && notes.size() > 0) {
Expand All @@ -1408,7 +1409,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}

if (notesSamePage == null) {
List<LayoutToken> dehyphenized = LayoutTokensUtil.dehyphenize(clusterTokens);

String text = LayoutTokensUtil.toText(dehyphenized).replace("\n", " ");

if (isNewParagraph(lastClusterLabel, curParagraph)) {
Expand Down Expand Up @@ -1460,13 +1461,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
List<Pair<String,OffsetPosition>> matchedLabelPosition = new ArrayList<>();

for (Note note : notesSamePage) {
Optional<LayoutToken> matching = clusterTokens
Optional<LayoutToken> matching = dehyphenized
.stream()
.filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
.findFirst();

if (matching.isPresent()) {
int idx = clusterTokens.indexOf(matching.get());
int idx = dehyphenized.indexOf(matching.get());
note.setIgnored(true);
OffsetPosition matchingPosition = new OffsetPosition();
matchingPosition.start = idx;
Expand All @@ -1490,8 +1491,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
Note note = labels2Notes.get(matching.getLeft());
OffsetPosition matchingPosition = matching.getRight();

List<LayoutToken> before = clusterTokens.subList(pos, matchingPosition.start);
String clusterContentBefore = LayoutTokensUtil.normalizeDehyphenizeText(before);
List<LayoutToken> before = dehyphenized.subList(pos, matchingPosition.start);
String clusterContentBefore = LayoutTokensUtil.toText(before);

if (CollectionUtils.isNotEmpty(before) && before.get(0).getText().equals(" ")) {
curParagraph.appendChild(new Text(" "));
Expand All @@ -1506,7 +1507,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}
curParagraphTokens.addAll(cluster.concatTokens());

List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
List<LayoutToken> calloutTokens = dehyphenized.subList(matchingPosition.start, matchingPosition.end);

Element ref = teiElement("ref");
ref.addAttribute(new Attribute("type", "foot"));
Expand All @@ -1526,8 +1527,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}

// add last chunk of paragraph stuff (or whole paragraph if no note callout matching)
List<LayoutToken> remaining = clusterTokens.subList(pos, clusterTokens.size());
String remainingClusterContent = LayoutTokensUtil.normalizeDehyphenizeText(remaining);
List<LayoutToken> remaining = dehyphenized.subList(pos, dehyphenized.size());
String remainingClusterContent = LayoutTokensUtil.toText(remaining);

if (CollectionUtils.isNotEmpty(remaining) && remaining.get(0).getText().equals(" ")) {
curParagraph.appendChild(new Text(" "));
Expand Down

0 comments on commit 9adb8d8

Please sign in to comment.