Skip to content

Commit

Permalink
Merge pull request #1068 from kermitt2/feature/paragraphs-coordinates
Browse files Browse the repository at this point in the history
Add paragraphs coordinates
  • Loading branch information
kermitt2 authored Dec 29, 2023
2 parents 2ca3f35 + a506eb1 commit b50c9aa
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 3 deletions.
3 changes: 2 additions & 1 deletion doc/Coordinates-in-PDF.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Since April 2017, GROBID version 0.4.2 and higher, coordinate areas can be obtai
* ```formula``` for mathematical equations,
* ```head``` for section titles,
* ```s``` for optional sentence structure (the GROBID fulltext service must be called with the `segmentSentences` parameter to provide the optional sentence-level elements),
* ```p``` for paragraph structure,
* ```note``` for footnote elements,
* ```title``` for the title elements (main article title and cited reference titles),
* ```affiliation``` for the affiliation and address part.
Expand Down Expand Up @@ -117,7 +118,7 @@ The GROBID console offers a reference implementation with PDF.js for dynamically

### Coordinates in TEI/XML results

Coordinates for a given structure appear via an extra attribute ```@coord```. This is part of the [customization to the TEI](TEI-encoding-of-results.md) used by GROBID.
Coordinates for a given structure appear via an extra attribute ```@coords```. This is part of the [customization to the TEI](TEI-encoding-of-results.md) used by GROBID.

* the list of page sizes is encoded under the TEI element `<facsimile>`. The dimension of each page is given successively by the TEI attributes `@lrx` and `@lry` of the element `<surface>`, to conform to the TEI (`@ulx` and `@uly` are used to set the origin coordinates, which are always `(0,0)` for us).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1202,7 +1202,12 @@ private StringBuilder toTEINote(StringBuilder tei,
String pID = KeyGen.getKey().substring(0, 7);
addXmlId(pNote, "_" + pID);
}


if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(note.getTokens());
desc.addAttribute(new Attribute("coords", coords));
}

// for labelling bibliographical references in notes
List<LayoutToken> noteTokens = note.getTokens();

Expand Down Expand Up @@ -1472,6 +1477,12 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(curParagraph, "_" + divID);
}

if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
curParagraph.addAttribute(new Attribute("coords", coords));
}

curDiv.appendChild(curParagraph);
curParagraphTokens = new ArrayList<>();
}
Expand All @@ -1487,6 +1498,12 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(curParagraph, "_" + divID);
}

if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(clusterTokens);
curParagraph.addAttribute(new Attribute("coords", coords));
}

curDiv.appendChild(curParagraph);
curParagraphTokens = new ArrayList<>();
}
Expand Down Expand Up @@ -1543,6 +1560,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}

curParagraph.appendChild(clusterContentBefore);
if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(before);
if (curParagraph.getAttribute("coords") != null && !curParagraph.getAttributeValue("coords").contains(coords)) {
curParagraph.addAttribute(new Attribute("coords", curParagraph.getAttributeValue("coords") + ";" + coords));
}
}

curParagraphTokens.addAll(before);

List<LayoutToken> calloutTokens = clusterTokens.subList(matchingPosition.start, matchingPosition.end);
Expand Down Expand Up @@ -1572,6 +1596,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
curParagraph.appendChild(new Text(" "));
}

if (config.isGenerateTeiCoordinates("p")) {
String coords = LayoutTokensUtil.getCoordsString(remaining);
if (curParagraph.getAttribute("coords") != null && !curParagraph.getAttributeValue("coords").contains(coords)) {
curParagraph.addAttribute(new Attribute("coords", curParagraph.getAttributeValue("coords") + ";" + coords));
}
}

curParagraph.appendChild(remainingClusterContent);
curParagraphTokens.addAll(remaining);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public void processFullText(final GrobidMainArgs pGbdArgs) throws Exception {
} else {
List<String> elementCoordinates = null;
if (pGbdArgs.getTeiCoordinates()) {
elementCoordinates = Arrays.asList("figure", "persName", "ref", "biblStruct", "formula", "s", "note", "title", "head", "affiliation");
elementCoordinates = Arrays.asList("figure", "persName", "ref", "biblStruct", "formula", "s", "note", "title", "head", "affiliation", "p");
}
processFullTextDirectory(files, pGbdArgs, pGbdArgs.getPath2Output(), pGbdArgs.getSaveAssets(),
elementCoordinates, pGbdArgs.getSegmentSentences(), pGbdArgs.getAddElementId());
Expand Down

0 comments on commit b50c9aa

Please sign in to comment.