Merge pull request #1099 from kermitt2/feature/identify-urls

Identify URLs and output them in TEI
kermitt2 · Jun 9, 2024 · cb7118d · cb7118d
2 parents 76fd16f + 4d4c1e3
commit cb7118d
Show file tree

Hide file tree

Showing 7 changed files with 520 additions and 133 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Note.java b/grobid-core/src/main/java/org/grobid/core/data/Note.java
@@ -1,8 +1,8 @@
 package org.grobid.core.data;
 
 import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.grobid.core.layout.LayoutToken;
-import org.grobid.core.layout.Page;
 import org.grobid.core.utilities.*;
 
 import java.util.List;
@@ -127,10 +127,6 @@ public void setNoteType(NoteType noteType) {
     }
 
     public String getNoteTypeName() {
-        if (this.noteType == NoteType.FOOT) {
-            return "foot";
-        } else {
-            return "margin";
-        }
+        return StringUtils.lowerCase(noteType.name());
     }
 }
diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -5,19 +5,19 @@
 import com.google.common.collect.Sets;
 
 import org.apache.commons.collections4.CollectionUtils;
-import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.lang3.StringUtils;
 
 import nu.xom.Attribute;
 import nu.xom.Element;
 import nu.xom.Node;
 import nu.xom.Text;
 
+import org.apache.commons.lang3.tuple.Triple;
 import org.grobid.core.GrobidModels;
+import org.grobid.core.data.*;
 import org.grobid.core.data.CopyrightsLicense.License;
 import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner;
 import org.grobid.core.data.Date;
-import org.grobid.core.data.*;
 import org.grobid.core.document.xml.XmlBuilderUtils;
 import org.grobid.core.engines.Engine;
 import org.grobid.core.engines.FullTextParser;
@@ -29,10 +29,9 @@
 import org.grobid.core.lang.Language;
 import org.grobid.core.layout.*;
 import org.grobid.core.lexicon.Lexicon;
-import org.grobid.core.utilities.SentenceUtilities;
+import org.grobid.core.utilities.*;
 import org.grobid.core.tokenization.TaggingTokenCluster;
 import org.grobid.core.tokenization.TaggingTokenClusteror;
-import org.grobid.core.utilities.*;
 import org.grobid.core.utilities.matching.EntityMatcherException;
 import org.grobid.core.utilities.matching.ReferenceMarkerMatcher;
 import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;
@@ -1513,29 +1512,78 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 int clusterPage = Iterables.getLast(clusterTokens).getPage();
 
                 List<Note> notesSamePage = null;
+                List<Triple<String,String, OffsetPosition>> matchedLabelPosition = new ArrayList<>();
+
+                // map the matched note labels to their corresponding note objects
+                Map<String, Note> labels2Notes = new TreeMap<>();
                 if (CollectionUtils.isNotEmpty(notes)) {
                     notesSamePage = notes.stream()
                                 .filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage)
                                 .collect(Collectors.toList());
+
+                    // we need to cover several footnote callouts in the same paragraph segment
+
+                    // we also can't assume that the note callouts appear in the text in the same order as
+                    // the notes are defined in the note areas - this might not always be the case in
+                    // ill-formed documents
+
+                    // map a note label (string) to a valid matching position in the sequence of Layout Tokens
+                    // of the paragraph segment
+
+                    for (Note note : notesSamePage) {
+                        Optional<LayoutToken> matching = clusterTokens
+                            .stream()
+                            .filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
+                            .findFirst();
+
+                        if (matching.isPresent()) {
+                            int idx = clusterTokens.indexOf(matching.get());
+                            note.setIgnored(true);
+                            OffsetPosition matchingPosition = new OffsetPosition();
+                            matchingPosition.start = idx;
+                            matchingPosition.end = idx+1; // to be reviewed: the label might span more than one layout token
+                            matchedLabelPosition.add(Triple.of(note.getLabel(), "note", matchingPosition));
+                            labels2Notes.put(note.getLabel(), note);
+                        }
+                    }
+
                 }
 
-                if (notesSamePage == null) {
+                // Identify URLs and attach references to them in the text
+                List<OffsetPosition> offsetPositionsUrls = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(clusterTokens, doc.getPDFAnnotations());
+                offsetPositionsUrls.stream()
+                    .forEach(opu -> {
+                            // We extend the end offset by one here, since the shared code below takes a
+                            // sublist with it and we cannot add the +1 there.
+                        matchedLabelPosition.add(
+                            Triple.of(LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(opu.start, opu.end)),
+                                "url",
+                                new OffsetPosition(opu.start, opu.end + 1)
+                            )
+                        );
+                    }
+                    );
+
+                // More kinds of elements to be extracted from the paragraphs can be added here. Each labelPosition
+                // is a Triple with three elements: the text of the item, its type, and its offset position.
+
+                if (CollectionUtils.isEmpty(matchedLabelPosition)){
                     String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
                     if (isNewParagraph(lastClusterLabel, curParagraph)) {
                         if (curParagraph != null && config.isWithSentenceSegmentation()) {
-                            segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
+                            segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
                         }
                         curParagraph = teiElement("p");
                         if (config.isGenerateTeiIds()) {
                             String divID = KeyGen.getKey().substring(0, 7);
                             addXmlId(curParagraph, "_" + divID);
                         }
-                        
+
                         if (config.isGenerateTeiCoordinates("p")) {
                             String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
                             curParagraph.addAttribute(new Attribute("coords", coords));
                         }
-                        
+
                         curDiv.appendChild(curParagraph);
                         curParagraphTokens = new ArrayList<>();
                     } else {
@@ -1563,41 +1611,11 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                             String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
                             curParagraph.addAttribute(new Attribute("coords", coords));
                         }
-                        
+
                         curDiv.appendChild(curParagraph);
                         curParagraphTokens = new ArrayList<>();
                     }
 
-                    // we need to cover several footnote callouts in the same paragraph segment
-
-                    // we also can't assume notes are sorted and will appear first in the text as the same order 
-                    // they are defined in the note areas - this might not always be the case in 
-                    // ill-formed documents
-
-                    // map the matched note labels to their corresponding note objects
-                    Map<String, Note> labels2Notes = new TreeMap<>();
-
-                    // map a note label (string) to a valid matching position in the sequence of Layout Tokens
-                    // of the paragraph segment
-                    List<Pair<String,OffsetPosition>> matchedLabelPosition = new ArrayList<>();
-
-                    for (Note note : notesSamePage) {
-                        Optional<LayoutToken> matching = clusterTokens
-                            .stream()
-                            .filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
-                            .findFirst();
-
-                        if (matching.isPresent()) {
-                            int idx = clusterTokens.indexOf(matching.get());
-                            note.setIgnored(true);
-                            OffsetPosition matchingPosition = new OffsetPosition();
-                            matchingPosition.start = idx;
-                            matchingPosition.end = idx+1; // to be review, might be more than one layout token
-                            matchedLabelPosition.add(Pair.of(note.getLabel(), matchingPosition));
-                            labels2Notes.put(note.getLabel(), note);
-                        }
-                    }
-
                     // sort the matches by position
                     Collections.sort(matchedLabelPosition, (m1, m2) -> {
                             return m1.getRight().start - m2.getRight().start;
@@ -1608,9 +1626,9 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     int pos = 0;
 
                     // build the paragraph segment, match by match
-                    for(Pair<String,OffsetPosition> matching : matchedLabelPosition) {
-                        Note note = labels2Notes.get(matching.getLeft());
-                        OffsetPosition matchingPosition = matching.getRight();
+                    for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPosition) {
+                        String type = referenceInformation.getMiddle();
+                        OffsetPosition matchingPosition = referenceInformation.getRight();
 
                         if (pos >= matchingPosition.start)
                             break;
@@ -1629,26 +1647,27 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                                 curParagraph.addAttribute(new Attribute("coords", curParagraph.getAttributeValue("coords") + ";" + coords));
                             }
                         }
-
-                        curParagraphTokens.addAll(before);
 
-                        List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
+                        curParagraphTokens.addAll(before);
 
-                        Element ref = teiElement("ref");
-                        ref.addAttribute(new Attribute("type", "foot"));
 
-                        if (config.isGenerateTeiCoordinates("ref") ) {
-                            String coords =  LayoutTokensUtil.getCoordsString(calloutTokens);
-                            if (coords != null) {
-                                ref.addAttribute(new Attribute("coords", coords));
+                        Element ref = null;
+                        List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
+                        if (type.equals("note")) {
+                            Note note = labels2Notes.get(referenceInformation.getLeft());
+                            ref = generateNoteRef(calloutTokens, referenceInformation.getLeft(), note, config);
+                        } else if (type.equals("url")) {
+                            String normalizeDehyphenizeText = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(matchingPosition.start, matchingPosition.end));
+                            ref = generateURLRef(normalizeDehyphenizeText, calloutTokens, config.isGenerateTeiCoordinates("ref"));
+
+                            // We may need to add a space when the tokens before the URL end with a space or a newline
+                            if (CollectionUtils.isNotEmpty(before) && StringUtils.equalsAnyIgnoreCase(Iterables.getLast(before).getText(), " ", "\n")) {
+                                curParagraph.appendChild(new Text(" "));
                             }
                         }
 
-                        ref.appendChild(matching.getLeft());
-                        ref.addAttribute(new Attribute("target", "#" + note.getIdentifier()));
+                        pos = matchingPosition.end;
                         curParagraph.appendChild(ref);
-
-                        pos = matchingPosition.end; 
                     }
 
                     // add last chunk of paragraph stuff (or whole paragraph if no note callout matching)
@@ -1725,18 +1744,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                                         if (chunkRefString.trim().equals(note.getLabel())) {
                                             footNoteCallout = true;
                                             note.setIgnored(true);
-
-                                            Element ref = teiElement("ref");
-                                            ref.addAttribute(new Attribute("type", "foot"));
-
-                                            if (config.isGenerateTeiCoordinates("ref") ) {
-                                                String coords =  LayoutTokensUtil.getCoordsString(refTokens);
-                                                if (coords != null) {
-                                                    ref.addAttribute(new Attribute("coords", coords));
-                                                }
-                                            }
-                                            ref.appendChild(chunkRefString.trim());
-                                            ref.addAttribute(new Attribute("target", "#" + note.getIdentifier()));
+
+                                            Element ref = generateNoteRef(refTokens, chunkRefString.trim(), note, config);
 
                                             parent.appendChild(ref);
 
@@ -1831,6 +1840,23 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
         return buffer;
     }
 
+    private static Element generateNoteRef(List<LayoutToken> noteTokens, String noteLabel,  Note note, GrobidAnalysisConfig config) {
+        Element ref = teiElement("ref");
+        // TODO: is it intended that the type is hardcoded to "foot"?
+        ref.addAttribute(new Attribute("type", "foot"));
+
+        if (config.isGenerateTeiCoordinates("ref")) {
+            String coords = LayoutTokensUtil.getCoordsString(noteTokens);
+            if (coords != null) {
+                ref.addAttribute(new Attribute("coords", coords));
+            }
+        }
+
+        ref.appendChild(noteLabel);
+        ref.addAttribute(new Attribute("target", "#" + note.getIdentifier()));
+        return ref;
+    }
+
     public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagraph) {
         return (!MARKER_LABELS.contains(lastClusterLabel) && lastClusterLabel != TaggingLabels.FIGURE
                 && lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null;
@@ -1876,6 +1902,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
             }
         }
 
+        // We add the URLs identified with the help of the PDF annotations to the forbidden positions; in this way
+        // we avoid splitting a URL across different sentences.
         List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
         forbiddenPositions.addAll(offsetPositionsUrls);
 
@@ -2509,6 +2537,33 @@ public List<Node> markReferencesEquationTEI(String text,
         return nodes;
     }
 
+    public Element generateURLRef(String text,
+                                  List<LayoutToken> refTokens,
+                                  boolean generateCoordinates) {
+        if (StringUtils.isEmpty(text)) {
+            return null;
+        }
+
+        // For URLs, we remove spaces
+        String cleanText = StringUtils.trim(text.replace("\n", " ").replace(" ", ""));
+
+        String coords = null;
+        if (generateCoordinates && refTokens != null) {
+            coords = LayoutTokensUtil.getCoordsString(refTokens);
+        }
+
+        Element ref = teiElement("ref");
+        ref.addAttribute(new Attribute("type", "url"));
+
+        if (coords != null) {
+            ref.addAttribute(new Attribute("coords", coords));
+        }
+        ref.appendChild(text);
+        ref.addAttribute(new Attribute("target", cleanText));
+
+        return ref;
+    }
+
     private String normalizeText(String localText) {
         localText = localText.trim();
         localText = TextUtilities.dehyphenize(localText);

diff --git a/grobid-core/src/main/java/org/grobid/core/layout/PDFAnnotation.java b/grobid-core/src/main/java/org/grobid/core/layout/PDFAnnotation.java
@@ -124,7 +124,7 @@ public boolean cover(LayoutToken token) {
 						break;
 					}
 					double areaToken = tokenBox.area();
-					// the bounding box of the insection 
+					// the bounding box of the intersection
 					BoundingBox intersectionBox = box.boundingBoxIntersection(tokenBox);
 					if (intersectionBox != null) {
 						double intersectionArea = intersectionBox.area();