Skip to content

Commit

Permalink
Fix corner case
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed May 27, 2024
1 parent 6370de2 commit 4d4c1e3
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
15 changes: 15 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,13 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
int startTokenIndex = tokenPositions.start;
int endTokensIndex = tokenPositions.end;

// No token matches the character offsets. This may happen, rarely, when
// the character offset falls in the middle of a token, which is likely due to a badly
// constructed PDF document
if (startTokenIndex < 0 || endTokensIndex < 0) {
continue;
}

List<LayoutToken> urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex+1));

String urlString = LayoutTokensUtil.toText(urlTokens);
Expand Down Expand Up @@ -1360,6 +1367,14 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
String difference = urlString.substring(startCharDifference);
OffsetPosition newTokenPositions = getTokenPositions(startCharDifference, urlString.length(), urlTokens);

if (newTokenPositions.end < 0) {
// The difference is within the last token, even if we split the layout tokens, here,
// it won't solve the problem so we limit collateral damage.
// At some point we could return the destination containing the clean URL to fill up the
// "target" attribute in the TEI
newTokenPositions.end = urlTokens.size() - 1;
}

urlTokens = urlTokens.subList(0, newTokenPositions.end);
endPos = startPos + LayoutTokensUtil.toText(urlTokens).length();
} else {
Expand Down
17 changes: 17 additions & 0 deletions grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -435,5 +435,22 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
assertThat(input.substring(url0.start, url0.end), is("https://uhslc.soest.hawaii.edu/stations/?stn=057#levels"));
}

@Test
public void testGetTokenPosition() throws Exception {
    // NOTE LF: as things stand, getTokenPositions reports -1 for both bounds when the
    // requested character offsets do not line up with the tokens.
    // The actual URL is https://paperpile.com/c/QlNkzH/Hj7c+4D5e, but the word `Lameness`
    // is glued to its end, so the trailing token runs past the end of the URL; the
    // offsets 40-48 requested below cover only that trailing `Lameness` part.
    final String text = "https://paperpile.com/c/QlNkzH/Hj7c+4D5eLameness";
    final List<LayoutToken> layoutTokens =
        GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    final OffsetPosition result = Lexicon.getTokenPositions(40, 48, layoutTokens);

    // Neither bound could be resolved to a token index.
    assertThat(result.start, is(-1));
    assertThat(result.end, is(-1));
}

}

0 comments on commit 4d4c1e3

Please sign in to comment.