Skip to content

Commit

Permalink
Merge pull request #1099 from kermitt2/feature/identify-urls
Browse files Browse the repository at this point in the history
Identify URLs and output them in TEI
  • Loading branch information
lfoppiano authored Jun 9, 2024
2 parents 76fd16f + 4d4c1e3 commit cb7118d
Show file tree
Hide file tree
Showing 7 changed files with 520 additions and 133 deletions.
8 changes: 2 additions & 6 deletions grobid-core/src/main/java/org/grobid/core/data/Note.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.Page;
import org.grobid.core.utilities.*;

import java.util.List;
Expand Down Expand Up @@ -127,10 +127,6 @@ public void setNoteType(NoteType noteType) {
}

/**
 * Returns the name of this note's type as a lower-case string.
 *
 * @return the lower-cased name of the {@code noteType} enum constant
 */
public String getNoteTypeName() {
if (this.noteType == NoteType.FOOT) {
return "foot";
} else {
return "margin";
}
// NOTE(review): noteType.name() throws a NullPointerException when noteType is null, whereas the
// if/else form yields "margin" for every value other than FOOT, null included - confirm that
// noteType is always set before this is called.
return StringUtils.lowerCase(noteType.name());
}
}
189 changes: 122 additions & 67 deletions grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@
import com.google.common.collect.Sets;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.StringUtils;

import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import org.apache.commons.lang3.tuple.Triple;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.*;
import org.grobid.core.data.CopyrightsLicense.License;
import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner;
import org.grobid.core.data.Date;
import org.grobid.core.data.*;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.FullTextParser;
Expand All @@ -29,10 +29,9 @@
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.utilities.SentenceUtilities;
import org.grobid.core.utilities.*;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.*;
import org.grobid.core.utilities.matching.EntityMatcherException;
import org.grobid.core.utilities.matching.ReferenceMarkerMatcher;
import org.grobid.core.engines.citations.CalloutAnalyzer.MarkerType;
Expand Down Expand Up @@ -1513,29 +1512,78 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int clusterPage = Iterables.getLast(clusterTokens).getPage();

List<Note> notesSamePage = null;
List<Triple<String,String, OffsetPosition>> matchedLabelPosition = new ArrayList<>();

// map the matched note labels to their corresponding note objects
Map<String, Note> labels2Notes = new TreeMap<>();
if (CollectionUtils.isNotEmpty(notes)) {
notesSamePage = notes.stream()
.filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage)
.collect(Collectors.toList());

// we need to cover several footnote callouts in the same paragraph segment

// we also can't assume notes are sorted and will appear first in the text as the same order
// they are defined in the note areas - this might not always be the case in
// ill-formed documents

// map a note label (string) to a valid matching position in the sequence of Layout Tokens
// of the paragraph segment

for (Note note : notesSamePage) {
Optional<LayoutToken> matching = clusterTokens
.stream()
.filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
.findFirst();

if (matching.isPresent()) {
int idx = clusterTokens.indexOf(matching.get());
note.setIgnored(true);
OffsetPosition matchingPosition = new OffsetPosition();
matchingPosition.start = idx;
matchingPosition.end = idx+1; // to be review, might be more than one layout token
matchedLabelPosition.add(Triple.of(note.getLabel(), "note", matchingPosition));
labels2Notes.put(note.getLabel(), note);
}
}

}

if (notesSamePage == null) {
//Identify URLs and attach reference in the text
List<OffsetPosition> offsetPositionsUrls = Lexicon.tokenPositionUrlPatternWithPdfAnnotations(clusterTokens, doc.getPDFAnnotations());
offsetPositionsUrls.stream()
.forEach(opu -> {
// We extend the end offset by one here, because the shared code below extracts the tokens with a
// sub-list ending at that offset and cannot add the +1 itself.
matchedLabelPosition.add(
Triple.of(LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(opu.start, opu.end)),
"url",
new OffsetPosition(opu.start, opu.end + 1)
)
);
}
);

// More kinds of elements to be extracted from the paragraphs can be added here. Each entry of
// matchedLabelPosition is a Triple holding the text of the item, its type, and its offset position.

if (CollectionUtils.isEmpty(matchedLabelPosition)){
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage(), doc.getPDFAnnotations());
segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(curParagraph, "_" + divID);
}

if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
curParagraph.addAttribute(new Attribute("coords", coords));
}

curDiv.appendChild(curParagraph);
curParagraphTokens = new ArrayList<>();
} else {
Expand Down Expand Up @@ -1563,41 +1611,11 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
curParagraph.addAttribute(new Attribute("coords", coords));
}

curDiv.appendChild(curParagraph);
curParagraphTokens = new ArrayList<>();
}

// we need to cover several footnote callouts in the same paragraph segment

// we also can't assume notes are sorted and will appear first in the text as the same order
// they are defined in the note areas - this might not always be the case in
// ill-formed documents

// map the matched note labels to their corresponding note objects
Map<String, Note> labels2Notes = new TreeMap<>();

// map a note label (string) to a valid matching position in the sequence of Layout Tokens
// of the paragraph segment
List<Pair<String,OffsetPosition>> matchedLabelPosition = new ArrayList<>();

for (Note note : notesSamePage) {
Optional<LayoutToken> matching = clusterTokens
.stream()
.filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
.findFirst();

if (matching.isPresent()) {
int idx = clusterTokens.indexOf(matching.get());
note.setIgnored(true);
OffsetPosition matchingPosition = new OffsetPosition();
matchingPosition.start = idx;
matchingPosition.end = idx+1; // to be review, might be more than one layout token
matchedLabelPosition.add(Pair.of(note.getLabel(), matchingPosition));
labels2Notes.put(note.getLabel(), note);
}
}

// sort the matches by position
Collections.sort(matchedLabelPosition, (m1, m2) -> {
return m1.getRight().start - m2.getRight().start;
Expand All @@ -1608,9 +1626,9 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int pos = 0;

// build the paragraph segment, match by match
for(Pair<String,OffsetPosition> matching : matchedLabelPosition) {
Note note = labels2Notes.get(matching.getLeft());
OffsetPosition matchingPosition = matching.getRight();
for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPosition) {
String type = referenceInformation.getMiddle();
OffsetPosition matchingPosition = referenceInformation.getRight();

if (pos >= matchingPosition.start)
break;
Expand All @@ -1629,26 +1647,27 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
curParagraph.addAttribute(new Attribute("coords", curParagraph.getAttributeValue("coords") + ";" + coords));
}
}

curParagraphTokens.addAll(before);

List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
curParagraphTokens.addAll(before);

Element ref = teiElement("ref");
ref.addAttribute(new Attribute("type", "foot"));

if (config.isGenerateTeiCoordinates("ref") ) {
String coords = LayoutTokensUtil.getCoordsString(calloutTokens);
if (coords != null) {
ref.addAttribute(new Attribute("coords", coords));
Element ref = null;
List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
if (type.equals("note")) {
Note note = labels2Notes.get(referenceInformation.getLeft());
ref = generateNoteRef(calloutTokens, referenceInformation.getLeft(), note, config);
} else if (type.equals("url")) {
String normalizeDehyphenizeText = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(matchingPosition.start, matchingPosition.end));
ref = generateURLRef(normalizeDehyphenizeText, calloutTokens, config.isGenerateTeiCoordinates("ref"));

//We might need to add a space if it's in the layout tokens
if (CollectionUtils.isNotEmpty(before) && StringUtils.equalsAnyIgnoreCase(Iterables.getLast(before).getText(), " ", "\n")) {
curParagraph.appendChild(new Text(" "));
}
}

ref.appendChild(matching.getLeft());
ref.addAttribute(new Attribute("target", "#" + note.getIdentifier()));
pos = matchingPosition.end;
curParagraph.appendChild(ref);

pos = matchingPosition.end;
}

// add last chunk of paragraph stuff (or whole paragraph if no note callout matching)
Expand Down Expand Up @@ -1725,18 +1744,8 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
if (chunkRefString.trim().equals(note.getLabel())) {
footNoteCallout = true;
note.setIgnored(true);

Element ref = teiElement("ref");
ref.addAttribute(new Attribute("type", "foot"));

if (config.isGenerateTeiCoordinates("ref") ) {
String coords = LayoutTokensUtil.getCoordsString(refTokens);
if (coords != null) {
ref.addAttribute(new Attribute("coords", coords));
}
}
ref.appendChild(chunkRefString.trim());
ref.addAttribute(new Attribute("target", "#" + note.getIdentifier()));

Element ref = generateNoteRef(refTokens, chunkRefString.trim(), note, config);

parent.appendChild(ref);

Expand Down Expand Up @@ -1831,6 +1840,23 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
return buffer;
}

/**
 * Builds the TEI {@code <ref>} element that represents a note callout in the text.
 *
 * @param noteTokens layout tokens of the callout, used only to compute the coordinates
 * @param noteLabel  text placed inside the element
 * @param note       note the callout points to; its identifier becomes the target
 * @param config     analysis configuration, consulted to know whether "ref" coordinates are wanted
 * @return the populated {@code <ref>} element
 */
private static Element generateNoteRef(List<LayoutToken> noteTokens, String noteLabel, Note note, GrobidAnalysisConfig config) {
    final Element calloutRef = teiElement("ref");
    // TODO: confirm whether the type should really stay hardcoded to "foot"
    calloutRef.addAttribute(new Attribute("type", "foot"));

    // Coordinates are only computed when they were requested for "ref" elements.
    final String calloutCoords = config.isGenerateTeiCoordinates("ref")
        ? LayoutTokensUtil.getCoordsString(noteTokens)
        : null;
    if (calloutCoords != null) {
        calloutRef.addAttribute(new Attribute("coords", calloutCoords));
    }

    calloutRef.appendChild(noteLabel);

    final String targetId = "#" + note.getIdentifier();
    calloutRef.addAttribute(new Attribute("target", targetId));
    return calloutRef;
}

public static boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagraph) {
return (!MARKER_LABELS.contains(lastClusterLabel) && lastClusterLabel != TaggingLabels.FIGURE
&& lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null;
Expand Down Expand Up @@ -1876,6 +1902,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}

// We add the URLs identified through the PDF annotations to the forbidden positions, so that a URL
// is not mangled by being split across different sentences.
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

Expand Down Expand Up @@ -2509,6 +2537,33 @@ public List<Node> markReferencesEquationTEI(String text,
return nodes;
}

/**
 * Builds a TEI {@code <ref type="url">} element for a URL found in the text.
 *
 * The element content is the text as given; the {@code target} attribute is the same text with
 * line breaks and spaces removed.
 *
 * @param text                the URL text as it appears in the document
 * @param refTokens           layout tokens of the URL, used only for the coordinates; may be null
 * @param generateCoordinates whether a {@code coords} attribute should be added
 * @return the populated element, or {@code null} when {@code text} is null or empty
 */
public Element generateURLRef(String text,
                              List<LayoutToken> refTokens,
                              boolean generateCoordinates) {
    if (StringUtils.isEmpty(text)) {
        return null;
    }

    Element urlRef = teiElement("ref");
    urlRef.addAttribute(new Attribute("type", "url"));

    // Coordinates are optional: they need both the flag and the tokens.
    if (generateCoordinates && refTokens != null) {
        String urlCoords = LayoutTokensUtil.getCoordsString(refTokens);
        if (urlCoords != null) {
            urlRef.addAttribute(new Attribute("coords", urlCoords));
        }
    }

    urlRef.appendChild(text);

    // For URLs, we remove spaces (line breaks are turned into spaces first)
    String withoutLineBreaks = text.replace("\n", " ");
    String withoutSpaces = withoutLineBreaks.replace(" ", "");
    urlRef.addAttribute(new Attribute("target", StringUtils.trim(withoutSpaces)));

    return urlRef;
}

private String normalizeText(String localText) {
localText = localText.trim();
localText = TextUtilities.dehyphenize(localText);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ public boolean cover(LayoutToken token) {
break;
}
double areaToken = tokenBox.area();
// the bounding box of the insection
// the bounding box of the intersection
BoundingBox intersectionBox = box.boundingBoxIntersection(tokenBox);
if (intersectionBox != null) {
double intersectionArea = intersectionBox.area();
Expand Down
Loading

0 comments on commit cb7118d

Please sign in to comment.