copyrights+licenses models integrated; TEI serialization

kermitt2 · Jan 29, 2024 · 268186d · 268186d
1 parent 75ec437
commit 268186d
Show file tree

Hide file tree

Showing 11 changed files with 78 additions and 22 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -6,6 +6,7 @@
 import org.grobid.core.data.util.AuthorEmailAssigner;
 import org.grobid.core.data.util.ClassicAuthorEmailAssigner;
 import org.grobid.core.data.util.EmailSanitizer;
+import org.grobid.core.data.CopyrightsLicense;
 import org.grobid.core.document.*;
 import org.grobid.core.engines.config.GrobidAnalysisConfig;
 import org.grobid.core.exceptions.GrobidException;
@@ -376,6 +377,9 @@ public String toString() {
     // Availability statement
     private String availabilityStmt = null;
 
+    // Copyrights/license information object
+    CopyrightsLicense copyrightsLicense = null;
+
     public static final List<String> confPrefixes = Arrays.asList("Proceedings of", "proceedings of",
             "In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of",
             "In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings",
@@ -4477,4 +4481,12 @@ public void setAvailabilityStmt(String availabilityStmt) {
     public List<List<LayoutToken>> getAffiliationAddresslabeledTokens() {
         return affiliationAddresslabeledTokens;
     }
+
+    public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) {
+        this.copyrightsLicense = copyrightsLicense;
+    }
+
+    public CopyrightsLicense getCopyrightsLicense() {
+        return this.copyrightsLicense;
+    }
 }
diff --git a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java
@@ -39,7 +39,7 @@ public enum License {
         CCBYSA  ("CC-BY-SA"),
         CCBYNCSA  ("CC-BY-NC-SA"),
         CCBYND  ("CC-BY-ND"),
-        COPYRIGHTS_STRICT ("copyrights"),
+        COPYRIGHTS ("strict-copyrights"),
         OTHER   ("other"),
         UNDECIDED   ("undecided");
 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -14,6 +14,7 @@
 import nu.xom.Text;
 
 import org.grobid.core.GrobidModels;
+import org.grobid.core.data.CopyrightsLicense.License;
 import org.grobid.core.data.Date;
 import org.grobid.core.data.*;
 import org.grobid.core.document.xml.XmlBuilderUtils;
@@ -250,23 +251,50 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
                 (biblio.getPublicationDate() != null) ||
                 (biblio.getNormalizedPublicationDate() != null)) {
             tei.append("\t\t\t<publicationStmt>\n");
+
+            CopyrightsLicense copyrightsLicense = biblio.getCopyrightsLicense();
+
             if (biblio.getPublisher() != null) {
                 // publisher and date under <publicationStmt> for better TEI conformance
                 tei.append("\t\t\t\t<publisher>" + TextUtilities.HTMLEncode(biblio.getPublisher()) +
                         "</publisher>\n");
+            } else {
+                // no publisher available: a dummy <publisher/> is still necessary in publicationStmt according to TEI
+                tei.append("\t\t\t\t<publisher/>\n");
+            }
+
 
-                tei.append("\t\t\t\t<availability status=\"unknown\">");
-                tei.append("<p>Copyright ");
+            // We use a TEI customization to encode copyrights information in a more meaningful way:
+            // - @copyrightsOwner with value "publisher", "authors" or "unknown"
+            // - the license related to copyrights exceptions is encoded via <licence>
+            // (note: it is unclear what "free" could mean as status for a document - there is always some sort of
+            // restriction, like moral rights, even for public domain documents)
+            if (copyrightsLicense != null) {
+                tei.append("\t\t\t\t<availability ");
+
+                if (copyrightsLicense.getCopyrightsOwner() != null) {
+                    tei.append("copyrightsOwner=\""+ copyrightsLicense.getCopyrightsOwner().getName() +"\" ");
+                }
+
+                if (copyrightsLicense.getLicense() != null && copyrightsLicense.getLicense() != License.UNDECIDED) {
+                    tei.append("status=\"restricted\">");
+                    tei.append("<licence>"+copyrightsLicense.getLicense().getName()+"</licence>");
+                } else {
+                    tei.append(" status=\"unknown\"><licence/>");
+                }
+
+                /*tei.append("<p>Copyright ");
                 //if (biblio.getPublicationDate() != null)
-                tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "</p>\n");
+                tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "</p>\n");*/
+
                 tei.append("\t\t\t\t</availability>\n");
             } else {
-                // a dummy publicationStmt is still necessary according to TEI
-                tei.append("\t\t\t\t<publisher/>\n");
+                tei.append("\t\t\t\t<availability ");
+
                 if (defaultPublicationStatement == null) {
-                    tei.append("\t\t\t\t<availability status=\"unknown\"><licence/></availability>");
+                    tei.append(" status=\"unknown\"><licence/></availability>");
                 } else {
-                    tei.append("\t\t\t\t<availability status=\"unknown\"><p>" +
+                    tei.append(" status=\"unknown\"><p>" +
                             TextUtilities.HTMLEncode(defaultPublicationStatement) + "</p></availability>");
                 }
                 tei.append("\n");

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java
@@ -8,6 +8,7 @@
 import org.grobid.core.data.Date;
 import org.grobid.core.data.Keyword;
 import org.grobid.core.data.Person;
+import org.grobid.core.data.CopyrightsLicense;
 import org.grobid.core.document.*;
 import org.grobid.core.engines.config.GrobidAnalysisConfig;
 import org.grobid.core.engines.label.SegmentationLabels;
@@ -309,6 +310,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
                     }
                 }
 
+                // copyrights/license identification
+                if (resHeader.getCopyright() != null && resHeader.getCopyright().length()>0) {
+                    if (GrobidProperties.getGrobidEngineName("copyright").equals("delft")) {
+                        CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright());
+                        if (copyrightsLicense != null) 
+                            resHeader.setCopyrightsLicense(copyrightsLicense);
+                    }
+                }
+
                 resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());
 
                 // we don't need to serialize if we process the full text (it would be done 2 times)

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java
@@ -128,7 +128,7 @@ public List<CopyrightsLicense> classify(List<String> texts) throws Exception {
                     int rank = 0;
                     for (Double scoreField : scoreFields) {
                         if (scoreField>0.5 && scoreField >= bestProb) {
-                            owner = CopyrightsOwner.valueOf(owners.get(rank));
+                            owner = CopyrightsOwner.valueOf(owners.get(rank).toUpperCase());
                             bestProb = scoreField;
                         }
                         scoreUndecided = scoreField;
@@ -206,7 +206,9 @@ public List<CopyrightsLicense> classify(List<String> texts) throws Exception {
                     rank = 0;
                     for (Double scoreField : scoreFields) {
                         if (scoreField>0.5 && scoreField >= bestProb) {
-                            license = License.valueOf(licenses.get(rank));
+                            String valueLicense = licenses.get(rank);
+                            valueLicense = valueLicense.replace("-", "");
+                            license = License.valueOf(valueLicense.toUpperCase());
                             bestProb = scoreField;
                         }
                         scoreUndecided = scoreField;

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java b/grobid-core/src/main/java/org/grobid/core/engines/ReferenceSegmenterParser.java
@@ -91,7 +91,7 @@ public List<LabeledReferenceResult> extract(Document doc, SortedSet<DocumentPiec
 			// this does not apply to CRF which can process "infinite" input sequence
 			// this is relevant to the reference segmenter RNN model, which is position-free in its 
 			// application, but could not be generalized to other RNN or transformer model long inputs
-			if (GrobidProperties.getGrobidCRFEngine(GrobidModels.REFERENCE_SEGMENTER) == GrobidCRFEngine.DELFT) {
+			if (GrobidProperties.getGrobidEngine(GrobidModels.REFERENCE_SEGMENTER) == GrobidCRFEngine.DELFT) {
 				String[] featureVectorLines = featureVector.split("\n");
 
 /*for(LayoutToken token : tokenizationsReferences) {

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java b/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java
@@ -25,7 +25,7 @@ public class TaggerFactory {
     private TaggerFactory() {}
 
     public static synchronized GenericTagger getTagger(GrobidModel model) {
-        return getTagger(model, GrobidProperties.getGrobidCRFEngine(model), GrobidProperties.getDelftArchitecture(model));
+        return getTagger(model, GrobidProperties.getGrobidEngine(model), GrobidProperties.getDelftArchitecture(model));
     }
 
     public static synchronized GenericTagger getTagger(GrobidModel model, GrobidCRFEngine engine) {

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java
@@ -588,7 +588,7 @@ public static File getPdfaltoPath() {
         return pathToPdfalto;
     }
 
-    private static String getGrobidCRFEngineName(final String modelName) {
+    public static String getGrobidEngineName(final String modelName) {
         ModelParameters param = modelMap.get(modelName);
         if (param == null) {
             LOGGER.debug("No configuration parameter defined for model " + modelName);
@@ -597,24 +597,24 @@ private static String getGrobidCRFEngineName(final String modelName) {
         return param.engine;
     }
 
-    public static GrobidCRFEngine getGrobidCRFEngine(final String modelName) {
-        String engineName = getGrobidCRFEngineName(modelName);
+    public static GrobidCRFEngine getGrobidEngine(final String modelName) {
+        String engineName = getGrobidEngineName(modelName);
         if (engineName == null)
             return null;
         else
             return GrobidCRFEngine.get(engineName);
     }
 
-    public static GrobidCRFEngine getGrobidCRFEngine(final GrobidModel model) {
-        return getGrobidCRFEngine(model.getModelName());
+    public static GrobidCRFEngine getGrobidEngine(final GrobidModel model) {
+        return getGrobidEngine(model.getModelName());
     }
 
     public static File getModelPath(final GrobidModel model) {
         if (modelMap.get(model.getModelName()) == null) {
             // model is not specified in the config, ignoring
             return null;
         }
-        String extension = getGrobidCRFEngine(model).getExt();
+        String extension = getGrobidEngine(model).getExt();
         return new File(getGrobidHome(), FOLDER_NAME_MODELS + File.separator
             + model.getFolderName() + File.separator
             + FILE_NAME_MODEL + "." + extension);

diff --git a/grobid-home/config/grobid.yaml b/grobid-home/config/grobid.yaml
@@ -261,8 +261,10 @@ grobid:
           batch_size: 40
 
     - name: "copyright"
-      # at this time, must always be DeLFT, not other implementation
+      # at this time, we only have a DeLFT implementation;
+      # use crf if the deep learning library JNI is not available, the model will then be ignored
       engine: "delft"
+      #engine: "crf"
       delft:
         # deep learning parameters
         architecture: "gru"
@@ -271,7 +273,9 @@ grobid:
 
     - name: "license"
       # at this time, must always be DeLFT, not other implementation
+      # use crf if the deep learning library JNI is not available, the model will then be ignored
       engine: "delft"
+      #engine: "crf"
       delft:
         # deep learning parameters
         architecture: "gru"

diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java
@@ -104,7 +104,7 @@ public void train(boolean incremental) {
         trainer.train(getTemplatePath(), dataPath, tempModelPath, GrobidProperties.getWapitiNbThreads(), model, incremental);
         // if we are here, that means that training succeeded
         // rename model for CRF sequence labellers (not with DeLFT deep learning models)
-        if (GrobidProperties.getGrobidCRFEngine(this.model) != GrobidCRFEngine.DELFT)
+        if (GrobidProperties.getGrobidEngine(this.model) != GrobidCRFEngine.DELFT)
             renameModels(oldModelPath, tempModelPath);
     }
 

diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java b/grobid-trainer/src/main/java/org/grobid/trainer/TrainerFactory.java
@@ -6,7 +6,7 @@
 
 public class TrainerFactory {
     public static GenericTrainer getTrainer(GrobidModel model) {
-        switch (GrobidProperties.getGrobidCRFEngine(model)) {
+        switch (GrobidProperties.getGrobidEngine(model)) {
             case CRFPP:
                 return new CRFPPGenericTrainer();
             case WAPITI:
@@ -16,7 +16,7 @@ public static GenericTrainer getTrainer(GrobidModel model) {
             case DUMMY:
                 return new DummyTrainer();
             default:
-                throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine(model));
+                throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidEngine(model));
         }
     }
 }