Skip to content

Commit

Permalink
Merge branch 'master' into feature/paragraphs-coordinates
Browse files Browse the repository at this point in the history
# Conflicts:
#	grobid-service/src/main/resources/web/grobid/grobid.js
  • Loading branch information
lfoppiano committed Dec 26, 2023
2 parents 7f0e0ca + 2ca3f35 commit 3ef0915
Show file tree
Hide file tree
Showing 94 changed files with 165,276 additions and 161,069 deletions.
3 changes: 2 additions & 1 deletion doc/Coordinates-in-PDF.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ Since April 2017, GROBID version 0.4.2 and higher, coordinate areas can be obtai
* ```head``` for section titles,
* ```s``` for optional sentence structure (the GROBID fulltext service must be called with the `segmentSentences` parameter to provide the optional sentence-level elements),
* ```note``` for foot note elements,
* ```title``` for the title elements (main article title and cited reference titles).
* ```title``` for the title elements (main article title and cited reference titles),
* ```affiliation``` for the affiliation and address part.

However, there is normally no particular limitation on the type of structures which can have their coordinates in the results; the implementation is ongoing (see [issue #69](https://github.com/kermitt2/grobid/issues/69)), and it is expected that more or less any structure could be associated with its coordinates in the original PDF.

Expand Down
202 changes: 92 additions & 110 deletions grobid-core/src/main/java/org/grobid/core/data/Affiliation.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.utilities.LayoutTokensUtil;

import java.util.*;

Expand Down Expand Up @@ -154,6 +156,7 @@ public void setAddressString(String s) {
}

/**
 * Stores the country value after passing it through
 * {@code TextUtilities.removeLeadingAndTrailingChars} with the same character
 * set for both ends.
 * NOTE(review): presumably this strips brackets, punctuation, spaces and
 * newlines from both ends - confirm against TextUtilities.
 *
 * @param s raw country string
 */
public void setCountry(String s) {
    country = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

Expand All @@ -170,10 +173,12 @@ public void setPostBox(String s) {
}

/**
 * Stores the region value after passing it through
 * {@code TextUtilities.removeLeadingAndTrailingChars} with the same character
 * set for both ends.
 * NOTE(review): presumably this strips brackets, punctuation, spaces and
 * newlines from both ends - confirm against TextUtilities.
 *
 * @param s raw region string
 */
public void setRegion(String s) {
    region = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

/**
 * Stores the settlement value after passing it through
 * {@code TextUtilities.removeLeadingAndTrailingChars} with the same character
 * set for both ends.
 * NOTE(review): presumably this strips brackets, punctuation, spaces and
 * newlines from both ends - confirm against TextUtilities.
 *
 * @param s raw settlement string
 */
public void setSettlement(String s) {
    settlement = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

Expand All @@ -187,6 +192,7 @@ public void setAffiliationString(String s) {

/**
 * Stores the raw affiliation string, collapsing every run of consecutive
 * space characters into a single space.
 *
 * @param s raw affiliation text; {@code null} is stored as {@code null}
 *          instead of triggering a NullPointerException in the whitespace
 *          normalisation
 */
public void setRawAffiliationString(String s) {
    // Guard against null: the normalisation below dereferences the value.
    rawAffiliationString = (s == null) ? null : s.replaceAll("( )+", " ");
}

public void setInstitutions(List<String> affs) {
Expand Down Expand Up @@ -219,6 +225,9 @@ public void addLaboratory(String aff) {
laboratories.add(TextUtilities.cleanField(aff, true));
}

/**
* DEPRECATED
**/
public void extendFirstInstitution(String theExtend) {
if (institutions == null) {
institutions = new ArrayList<String>();
Expand All @@ -230,6 +239,9 @@ public void extendFirstInstitution(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastInstitution(String theExtend) {
if (institutions == null) {
institutions = new ArrayList<String>();
Expand All @@ -241,6 +253,9 @@ public void extendLastInstitution(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendFirstDepartment(String theExtend) {
if (departments == null) {
departments = new ArrayList<String>();
Expand All @@ -252,6 +267,9 @@ public void extendFirstDepartment(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastDepartment(String theExtend) {
if (departments == null) {
departments = new ArrayList<String>();
Expand All @@ -263,6 +281,9 @@ public void extendLastDepartment(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendFirstLaboratory(String theExtend) {
if (laboratories == null) {
laboratories = new ArrayList<String>();
Expand All @@ -274,6 +295,9 @@ public void extendFirstLaboratory(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastLaboratory(String theExtend) {
if (laboratories == null) {
laboratories = new ArrayList<String>();
Expand All @@ -285,20 +309,33 @@ public void extendLastLaboratory(String theExtend) {
}
}

public boolean notNull() {
return !((departments == null) &
(institutions == null) &
(laboratories == null) &
(country == null) &
(postCode == null) &
(postBox == null) &
(region == null) &
(settlement == null) &
(addrLine == null) &
(affiliationString == null) &
/**
 * Tells whether this affiliation carries any content.
 *
 * @return {@code true} when at least one of the organisation, address or
 *         string fields checked below is non-null, {@code false} when all
 *         of them are null
 */
public boolean isNotNull() {
    // De Morgan form of "not (everything is null)".
    return departments != null
        || institutions != null
        || laboratories != null
        || country != null
        || postCode != null
        || postBox != null
        || region != null
        || settlement != null
        || addrLine != null
        || affiliationString != null
        || addressString != null;
}

/**
 * Tells whether any address-related field is set.
 *
 * @return {@code true} when at least one of country, post code, post box,
 *         settlement, address line, region or address string is non-null
 */
public boolean hasAddress() {
    return country != null
        || postCode != null
        || postBox != null
        || settlement != null
        || addrLine != null
        || region != null
        || addressString != null;
}

public void setFailAffiliation(boolean b) {
failAffiliation = b;
}
Expand All @@ -321,7 +358,7 @@ public void setLayoutTokens(List<LayoutToken> tokens) {

/**
 * Appends the given layout tokens to this affiliation's token list, creating
 * the list on first use and keeping tokens added by earlier calls.
 *
 * @param tokens tokens to append; when {@code null} nothing is appended
 *               (the list is still created if it did not exist yet)
 */
public void appendLayoutTokens(List<LayoutToken> tokens) {
    // Create the list only when it does not exist yet; an unconditional
    // re-assignment would discard previously appended tokens.
    if (this.layoutTokens == null) {
        this.layoutTokens = new ArrayList<>();
    }
    if (tokens != null) {
        this.layoutTokens.addAll(tokens);
    }
}

Expand Down Expand Up @@ -448,101 +485,30 @@ public int nbStructures() {
return nbStruct;
}

@Deprecated
public String toTEI() {
StringBuilder tei = new StringBuilder();
if (!notNull()) {
return null;
} else {
tei.append("<affiliation");
if (key != null)
tei.append(" key=\"").append(key).append("\"");
tei.append(">");

if (departments != null) {
if (departments.size() == 1) {
tei.append("<orgName type=\"department\">").append(TextUtilities.HTMLEncode(departments.get(0))).append("</orgName>");
} else {
int q = 1;
for (String depa : departments) {
tei.append("<orgName type=\"department\" key=\"dep").append(q).append("\">").append(TextUtilities.HTMLEncode(depa)).append("</orgName>");
q++;
}
}
}

if (laboratories != null) {
if (laboratories.size() == 1) {
tei.append("<orgName type=\"laboratory\">").append(TextUtilities.HTMLEncode(laboratories.get(0))).append("</orgName>");
} else {
int q = 1;
for (String labo : laboratories) {
tei.append("<orgName type=\"laboratory\" key=\"lab").append(q).append("\">").append(TextUtilities.HTMLEncode(labo)).append("</orgName>");
q++;
}
}
}

if (institutions != null) {
if (institutions.size() == 1) {
tei.append("<orgName type=\"institution\">").append(TextUtilities.HTMLEncode(institutions.get(0))).append("</orgName>");
} else {
int q = 1;
for (String inst : institutions) {
tei.append("<orgName type=\"institution\" key=\"instit").append(q).append("\">").append(TextUtilities.HTMLEncode(inst)).append("</orgName>");
q++;
}
}
}

if ((getAddressString() != null) |
(getAddrLine() != null) |
(getPostBox() != null) |
(getPostCode() != null) |
(getSettlement() != null) |
(getRegion() != null) |
(getCountry() != null)) {
tei.append("<address>");
if (getAddressString() != null) {
tei.append("<addrLine>").append(TextUtilities.HTMLEncode(getAddressString())).append("</addrLine>");
}
if (getAddrLine() != null) {
tei.append("<addrLine>").append(TextUtilities.HTMLEncode(getAddrLine())).append("</addrLine>");
}
if (getPostBox() != null) {
tei.append("<postBox>").append(TextUtilities.HTMLEncode(getPostBox())).append("</postBox>");
}
if (getPostCode() != null) {
tei.append("<postCode>").append(TextUtilities.HTMLEncode(getPostCode())).append("</postCode>");
}
if (getSettlement() != null) {
tei.append("<settlement>").append(TextUtilities.HTMLEncode(getSettlement())).append("</settlement>");
}
if (getRegion() != null) {
tei.append("<region>").append(TextUtilities.HTMLEncode(getRegion())).append("</region>");
}
if (getCountry() != null) {
Lexicon lexicon = Lexicon.getInstance();
String code = lexicon.getCountryCode(getCountry());
tei.append("<country");
if (code != null)
tei.append(" key=\"").append(code).append("\"");
tei.append(">").append(TextUtilities.HTMLEncode(getCountry())).append("</country>");
}
tei.append("</address>");
}
tei.append("</affiliation>");
}

return tei.toString();
/**
 * Convenience overload that serialises an affiliation without an analysis
 * configuration: it delegates to the three-argument {@code toTEI} passing
 * {@code null} as the config.
 *
 * @param aff   the affiliation to serialise
 * @param nbTag value forwarded unchanged to the three-argument overload
 *              (NOTE(review): appears to be the tab indentation depth - confirm)
 * @return whatever the three-argument overload returns for a null config
 */
public static String toTEI(Affiliation aff, int nbTag) {
return toTEI(aff, nbTag, null);
}

public static String toTEI(Affiliation aff, int nbTag) {
public static String toTEI(Affiliation aff, int nbTag, GrobidAnalysisConfig config) {
StringBuffer tei = new StringBuffer();
TextUtilities.appendN(tei, '\t', nbTag + 1);

boolean withAffCoords = (config != null) &&
(config.getGenerateTeiCoordinates() != null) &&
(config.getGenerateTeiCoordinates().contains("affiliation"));
boolean orgNameCoords = (config != null) &&
(config.getGenerateTeiCoordinates() != null) &&
(config.getGenerateTeiCoordinates().contains("orgName"));

tei.append("<affiliation");
if (aff.getKey() != null)
tei.append(" key=\"").append(aff.getKey()).append("\"");
if (withAffCoords) {
String coords = LayoutTokensUtil.getCoordsString(aff.getLayoutTokens());
if (coords != null && coords.length()>0) {
tei.append(" coords=\"" + coords + "\"");
}
}
tei.append(">\n");

if (aff.getDepartments() != null) {
Expand Down Expand Up @@ -593,21 +559,22 @@ public static String toTEI(Affiliation aff, int nbTag) {
}
}

if ((aff.getAddressString() != null) ||
(aff.getAddrLine() != null) ||
(aff.getPostBox() != null) ||
(aff.getPostCode() != null) ||
(aff.getSettlement() != null) ||
(aff.getRegion() != null) ||
(aff.getCountry() != null)) {
if (
aff.getAddrLine() != null ||
aff.getPostBox() != null ||
aff.getPostCode() != null ||
aff.getSettlement() != null ||
aff.getRegion() != null ||
aff.getCountry() != null
) {
TextUtilities.appendN(tei, '\t', nbTag + 2);

tei.append("<address>\n");
if (aff.getAddressString() != null) {
/*if (aff.getAddressString() != null) {
TextUtilities.appendN(tei, '\t', nbTag + 3);
tei.append("<addrLine>" + TextUtilities.HTMLEncode(aff.getAddressString()) +
"</addrLine>\n");
}
}*/
if (aff.getAddrLine() != null) {
TextUtilities.appendN(tei, '\t', nbTag + 3);
tei.append("<addrLine>" + TextUtilities.HTMLEncode(aff.getAddrLine()) +
Expand Down Expand Up @@ -680,8 +647,23 @@ public void addLabeledResult(TaggingLabel label, List<LayoutToken> tokenizations
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

List<LayoutToken> theTokenList = tokenizations == null ? new ArrayList<>() : tokenizations;
List<LayoutToken> theTokenList = null;
if (tokenizations == null)
theTokenList = new ArrayList<>();
else
theTokenList = tokenizations;

List<LayoutToken> theExistingTokenList = labeledTokens.get(label.getLabel());
if (theExistingTokenList != null) {
theExistingTokenList.addAll(theTokenList);
theTokenList = theExistingTokenList;
}

labeledTokens.put(label.getLabel(), theTokenList);
}

/**
 * Returns the layout tokens recorded for the given label.
 *
 * @param label the tagging label whose tokens are requested
 * @return the token list stored under the label's name, or {@code null} when
 *         nothing has been recorded for it (including when no labeled result
 *         has been added at all)
 */
public List<LayoutToken> getLabeledResult(TaggingLabel label) {
    // The map is created lazily when the first labeled result is added,
    // so it may still be null here.
    if (labeledTokens == null) {
        return null;
    }
    return labeledTokens.get(label.getLabel());
}

}
Loading

0 comments on commit 3ef0915

Please sign in to comment.