From 4d4c1e388479985e927d2c838020fda46558bd90 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Mon, 27 May 2024 10:22:59 +0900
Subject: [PATCH] Fix corner case

---
 .../java/org/grobid/core/lexicon/Lexicon.java   | 15 +++++++++++++++
 .../org/grobid/core/lexicon/LexiconTest.java    | 17 +++++++++++++++++
 2 files changed, 32 insertions(+)
diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
index ee63b539cc..0bd8af981e 100755
--- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
+++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
@@ -1265,6 +1265,13 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
             int startTokenIndex = tokenPositions.start;
             int endTokensIndex = tokenPositions.end;
 
+            // No token matches the character offsets. This happens rarely, when a character
+            // offset falls in the middle of a token, and is likely due to a badly
+            // constructed PDF document.
+            if (startTokenIndex < 0 || endTokensIndex < 0) {
+                continue;
+            }
+
             List<LayoutToken> urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex+1));
 
             String urlString = LayoutTokensUtil.toText(urlTokens);
@@ -1360,6 +1367,14 @@ public static List<OffsetPosition> characterPositionsUrlPatternWithPdfAnnotation
                     String difference = urlString.substring(startCharDifference);
                     OffsetPosition newTokenPositions = getTokenPositions(startCharDifference, urlString.length(), urlTokens);
 
+                    if (newTokenPositions.end < 0) {
+                        // The difference lies within the last token. Even if we split the layout tokens here,
+                        // it would not solve the problem, so we limit the collateral damage.
+                        // At some point we could return the destination containing the clean URL to fill in the
+                        // "target" attribute in the TEI.
+                        newTokenPositions.end = urlTokens.size() - 1;
+                    }
+
                     urlTokens = urlTokens.subList(0, newTokenPositions.end);
                     endPos = startPos + LayoutTokensUtil.toText(urlTokens).length();
                 } else {
diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
index b957d12b90..11d11cfcda 100644
--- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
+++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java
@@ -435,5 +435,22 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC
         assertThat(input.substring(url0.start, url0.end), is("https://uhslc.soest.hawaii.edu/stations/?stn=057#levels"));
     }
 
+    @Test
+    public void testGetTokenPosition() throws Exception {
+        // Pins the current behaviour of Lexicon.getTokenPositions for a character span that the
+        // test expects not to line up with the token boundaries.
+        //NOTE LF: The current behaviour will return -1 if the tokens do not match the positions
+        // of the characters. Here the URL is https://paperpile.com/c/QlNkzH/Hj7c+4D5e, but because
+        // `Lameness` is attached to it, the last token is `Hj7c+4D5eLameness`, which causes trouble.
+
+        String input = "https://paperpile.com/c/QlNkzH/Hj7c+4D5eLameness";
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
+        // Characters [40, 48) of the input are the trailing "Lameness", i.e. the text right after the URL.
+        OffsetPosition tokenPositions = Lexicon.getTokenPositions(40, 48, tokens);
+
+        assertThat(tokenPositions.start, is(-1));
+        assertThat(tokenPositions.end, is(-1));
+
+    }
 
 }