From 268186dcb4a20cc5007749576866be9b1f97a503 Mon Sep 17 00:00:00 2001 From: lopez Date: Mon, 29 Jan 2024 18:31:02 +0100 Subject: [PATCH] copyrights+licenses models integrated; TEI serialization --- .../java/org/grobid/core/data/BiblioItem.java | 12 ++++++ .../grobid/core/data/CopyrightsLicense.java | 2 +- .../grobid/core/document/TEIFormatter.java | 42 +++++++++++++++---- .../org/grobid/core/engines/HeaderParser.java | 10 +++++ .../core/engines/LicenseClassifier.java | 6 ++- .../engines/ReferenceSegmenterParser.java | 2 +- .../core/engines/tagging/TaggerFactory.java | 2 +- .../core/utilities/GrobidProperties.java | 12 +++--- grobid-home/config/grobid.yaml | 6 ++- .../org/grobid/trainer/AbstractTrainer.java | 2 +- .../org/grobid/trainer/TrainerFactory.java | 4 +- 11 files changed, 78 insertions(+), 22 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index bf065d1f4a..7bd030f923 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -6,6 +6,7 @@ import org.grobid.core.data.util.AuthorEmailAssigner; import org.grobid.core.data.util.ClassicAuthorEmailAssigner; import org.grobid.core.data.util.EmailSanitizer; +import org.grobid.core.data.CopyrightsLicense; import org.grobid.core.document.*; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.exceptions.GrobidException; @@ -376,6 +377,9 @@ public String toString() { // Availability statement private String availabilityStmt = null; + // Copyrights/license information object + CopyrightsLicense copyrightsLicense = null; + public static final List confPrefixes = Arrays.asList("Proceedings of", "proceedings of", "In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of", "In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings", @@ -4477,4 +4481,12 @@ public void setAvailabilityStmt(String availabilityStmt) { public List> getAffiliationAddresslabeledTokens() { return affiliationAddresslabeledTokens; } + + public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) { + this.copyrightsLicense = copyrightsLicense; + } + + public CopyrightsLicense getCopyrightsLicense() { + return this.copyrightsLicense; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java index 0a21fdeaeb..56f3f36a92 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java +++ b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java @@ -39,7 +39,7 @@ public enum License { CCBYSA ("CC-BY-SA"), CCBYNCSA ("CC-BY-NC-SA"), CCBYND ("CC-BY-ND"), - COPYRIGHTS_STRICT ("copyrights"), + COPYRIGHTS ("strict-copyrights"), OTHER ("other"), UNDECIDED ("undecided"); diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 22390fe1f9..e2424bf3b8 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -14,6 +14,7 @@ import nu.xom.Text; import org.grobid.core.GrobidModels; +import org.grobid.core.data.CopyrightsLicense.License; import org.grobid.core.data.Date; import org.grobid.core.data.*; import org.grobid.core.document.xml.XmlBuilderUtils; @@ -250,23 +251,50 @@ public StringBuilder toTEIHeader(BiblioItem biblio, (biblio.getPublicationDate() != null) || (biblio.getNormalizedPublicationDate() != null)) { tei.append("\t\t\t\n"); + + CopyrightsLicense copyrightsLicense = biblio.getCopyrightsLicense(); + if (biblio.getPublisher() != null) { // publisher and date under for better TEI conformance tei.append("\t\t\t\t" + TextUtilities.HTMLEncode(biblio.getPublisher()) + "\n"); + } else { + // a dummy publicationStmt is still necessary according to TEI + tei.append("\t\t\t\t\n"); + } + - tei.append("\t\t\t\t"); - tei.append("

Copyright "); + // We introduce something more meaningful with TEI customization to encode copyrights information: + // - @copyrightsOwner with value "publisher", "authors", "unknown" + // - license related to copyrights exception is encoded via + // (note: I have no clue what can mean "free" as status for a document - there are always some sort of + // restrictions like moral rights even for public domain documents) + if (copyrightsLicense != null) { + tei.append("\t\t\t\t"); + tei.append(""+copyrightsLicense.getLicense().getName()+""); + } else { + tei.append(" status=\"unknown\">"); + } + + /*tei.append("

Copyright "); //if (biblio.getPublicationDate() != null) - tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "

\n"); + tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "

\n");*/ + tei.append("\t\t\t\t
\n"); } else { - // a dummy publicationStmt is still necessary according to TEI - tei.append("\t\t\t\t\n"); + tei.append("\t\t\t\t"); + tei.append(" status=\"unknown\">"); } else { - tei.append("\t\t\t\t

" + + tei.append(" status=\"unknown\">

" + TextUtilities.HTMLEncode(defaultPublicationStatement) + "

"); } tei.append("\n"); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 2db68597cc..185f3714d5 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -8,6 +8,7 @@ import org.grobid.core.data.Date; import org.grobid.core.data.Keyword; import org.grobid.core.data.Person; +import org.grobid.core.data.CopyrightsLicense; import org.grobid.core.document.*; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.label.SegmentationLabels; @@ -309,6 +310,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, } } + // copyrights/license identification + if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) { + if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) { + CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright()); + if (copyrightsLicense != null) + resHeader.setCopyrightsLicense(copyrightsLicense); + } + } + resHeader = consolidateHeader(resHeader, config.getConsolidateHeader()); // we don't need to serialize if we process the full text (it would be done 2 times) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java index 78c2c8d39d..2b8868fefa 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java @@ -128,7 +128,7 @@ public List classify(List texts) throws Exception { int rank = 0; for (Double scoreField : scoreFields) { if (scoreField>0.5 && scoreField >= bestProb) { - owner = CopyrightsOwner.valueOf(owners.get(rank)); + owner = CopyrightsOwner.valueOf(owners.get(rank).toUpperCase()); bestProb = scoreField; } scoreUndecided = scoreField; @@ -206,7 +206,9 @@ public List classify(List texts) throws Exception { rank = 0; for (Double scoreField : scoreFields) { if (scoreField>0.5 && scoreField >= bestProb) { - license = License.valueOf(licenses.get(rank)); + String valueLicense = licenses.get(rank); + valueLicense = valueLicense.replace("-", ""); + license = License.valueOf(valueLicense.toUpperCase()); bestProb = scoreField; } scoreUndecided = scoreField; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java b/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java index 147c5cf2bb..4323618f82 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java @@ -91,7 +91,7 @@ public List extract(Document doc, SortedSet