Skip to content

Commit

Permalink
copyrights+licenses models integrated; TEI serialization
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jan 29, 2024
1 parent 75ec437 commit 268186d
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 22 deletions.
12 changes: 12 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.grobid.core.data.util.AuthorEmailAssigner;
import org.grobid.core.data.util.ClassicAuthorEmailAssigner;
import org.grobid.core.data.util.EmailSanitizer;
import org.grobid.core.data.CopyrightsLicense;
import org.grobid.core.document.*;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.exceptions.GrobidException;
Expand Down Expand Up @@ -376,6 +377,9 @@ public String toString() {
// Availability statement
private String availabilityStmt = null;

// Copyrights/license information object
CopyrightsLicense copyrightsLicense = null;

public static final List<String> confPrefixes = Arrays.asList("Proceedings of", "proceedings of",
"In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of",
"In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings",
Expand Down Expand Up @@ -4477,4 +4481,12 @@ public void setAvailabilityStmt(String availabilityStmt) {
/**
 * Returns the affiliation/address labeled tokens held by this item.
 *
 * The stored list is handed back directly (no defensive copy), so changes made
 * by the caller are visible to this object.
 *
 * @return the nested list of layout tokens as currently stored
 */
public List<List<LayoutToken>> getAffiliationAddresslabeledTokens() {
    return this.affiliationAddresslabeledTokens;
}

/**
 * Attaches the copyrights/license information object to this bibliographic item,
 * replacing any previously stored value.
 *
 * @param copyrightsLicense the copyrights/license information to store
 */
public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) {
    this.copyrightsLicense = copyrightsLicense;
}

/**
 * Returns the copyrights/license information object of this bibliographic item.
 *
 * @return the stored copyrights/license information, or {@code null} when none has been set
 */
public CopyrightsLicense getCopyrightsLicense() {
    return copyrightsLicense;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public enum License {
CCBYSA ("CC-BY-SA"),
CCBYNCSA ("CC-BY-NC-SA"),
CCBYND ("CC-BY-ND"),
COPYRIGHTS_STRICT ("copyrights"),
COPYRIGHTS ("strict-copyrights"),
OTHER ("other"),
UNDECIDED ("undecided");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import nu.xom.Text;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.CopyrightsLicense.License;
import org.grobid.core.data.Date;
import org.grobid.core.data.*;
import org.grobid.core.document.xml.XmlBuilderUtils;
Expand Down Expand Up @@ -250,23 +251,50 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
(biblio.getPublicationDate() != null) ||
(biblio.getNormalizedPublicationDate() != null)) {
tei.append("\t\t\t<publicationStmt>\n");

CopyrightsLicense copyrightsLicense = biblio.getCopyrightsLicense();

if (biblio.getPublisher() != null) {
// publisher and date under <publicationStmt> for better TEI conformance
tei.append("\t\t\t\t<publisher>" + TextUtilities.HTMLEncode(biblio.getPublisher()) +
"</publisher>\n");
} else {
// a dummy publicationStmt is still necessary according to TEI
tei.append("\t\t\t\t<publisher/>\n");
}


tei.append("\t\t\t\t<availability status=\"unknown\">");
tei.append("<p>Copyright ");
// We introduce something more meaningful with TEI customization to encode copyrights information:
// - @copyrightsOwner with value "publisher", "authors", "unknown"
// - license related to copyrights exception is encoded via <licence>
// (note: it is unclear what "free" would mean as a status for a document - there are always some
// restrictions, such as moral rights, even for public domain documents)
if (copyrightsLicense != null) {
tei.append("\t\t\t\t<availability ");

if (copyrightsLicense.getCopyrightsOwner() != null) {
tei.append("copyrightsOwner=\""+ copyrightsLicense.getCopyrightsOwner().getName() +"\" ");
}

if (copyrightsLicense.getLicense() != null && copyrightsLicense.getLicense() != License.UNDECIDED) {
tei.append("status=\"restricted\">");
tei.append("<licence>"+copyrightsLicense.getLicense().getName()+"</licence>");
} else {
tei.append(" status=\"unknown\"><licence/>");
}

/*tei.append("<p>Copyright ");
//if (biblio.getPublicationDate() != null)
tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "</p>\n");
tei.append(TextUtilities.HTMLEncode(biblio.getPublisher()) + "</p>\n");*/

tei.append("\t\t\t\t</availability>\n");
} else {
// a dummy publicationStmt is still necessary according to TEI
tei.append("\t\t\t\t<publisher/>\n");
tei.append("\t\t\t\t<availability ");

if (defaultPublicationStatement == null) {
tei.append("\t\t\t\t<availability status=\"unknown\"><licence/></availability>");
tei.append(" status=\"unknown\"><licence/></availability>");
} else {
tei.append("\t\t\t\t<availability status=\"unknown\"><p>" +
tei.append(" status=\"unknown\"><p>" +
TextUtilities.HTMLEncode(defaultPublicationStatement) + "</p></availability>");
}
tei.append("\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.grobid.core.data.Date;
import org.grobid.core.data.Keyword;
import org.grobid.core.data.Person;
import org.grobid.core.data.CopyrightsLicense;
import org.grobid.core.document.*;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.engines.label.SegmentationLabels;
Expand Down Expand Up @@ -309,6 +310,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
}
}

// copyrights/license identification: only attempted when a non-empty copyright text is
// available on the header and the "copyright" model is configured with the "delft" engine
if (resHeader.getCopyright() != null && resHeader.getCopyright().length() > 0) {
    // constant-first comparison: the configured engine name may be null (the engine
    // lookup in GrobidProperties null-checks this same value), and calling equals()
    // on it directly would raise a NullPointerException
    if ("delft".equals(GrobidProperties.getGrobidEngineName("copyright"))) {
        CopyrightsLicense copyrightsLicense = LicenseClassifier.getInstance().classify(resHeader.getCopyright());
        // keep the header untouched when the classifier yields nothing
        if (copyrightsLicense != null) {
            resHeader.setCopyrightsLicense(copyrightsLicense);
        }
    }
}

resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());

// we don't need to serialize if we process the full text (it would be done 2 times)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ public List<CopyrightsLicense> classify(List<String> texts) throws Exception {
int rank = 0;
for (Double scoreField : scoreFields) {
if (scoreField>0.5 && scoreField >= bestProb) {
owner = CopyrightsOwner.valueOf(owners.get(rank));
owner = CopyrightsOwner.valueOf(owners.get(rank).toUpperCase());
bestProb = scoreField;
}
scoreUndecided = scoreField;
Expand Down Expand Up @@ -206,7 +206,9 @@ public List<CopyrightsLicense> classify(List<String> texts) throws Exception {
rank = 0;
for (Double scoreField : scoreFields) {
if (scoreField>0.5 && scoreField >= bestProb) {
license = License.valueOf(licenses.get(rank));
String valueLicense = licenses.get(rank);
valueLicense = valueLicense.replace("-", "");
license = License.valueOf(valueLicense.toUpperCase());
bestProb = scoreField;
}
scoreUndecided = scoreField;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public List<LabeledReferenceResult> extract(Document doc, SortedSet<DocumentPiec
// this does not apply to CRF which can process "infinite" input sequence
// this is relevant to the reference segmenter RNN model, which is position-free in its
// application, but could not be generalized to other RNN or transformer model long inputs
if (GrobidProperties.getGrobidCRFEngine(GrobidModels.REFERENCE_SEGMENTER) == GrobidCRFEngine.DELFT) {
if (GrobidProperties.getGrobidEngine(GrobidModels.REFERENCE_SEGMENTER) == GrobidCRFEngine.DELFT) {
String[] featureVectorLines = featureVector.split("\n");

/*for(LayoutToken token : tokenizationsReferences) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class TaggerFactory {
private TaggerFactory() {}

/**
 * Returns a tagger for the given model, delegating to the three-argument overload
 * with the engine and the DeLFT architecture that GrobidProperties reports for
 * that model.
 *
 * @param model the model for which a tagger is requested
 * @return the tagger produced by the three-argument overload
 */
public static synchronized GenericTagger getTagger(GrobidModel model) {
return getTagger(model, GrobidProperties.getGrobidCRFEngine(model), GrobidProperties.getDelftArchitecture(model));
return getTagger(model, GrobidProperties.getGrobidEngine(model), GrobidProperties.getDelftArchitecture(model));
}

public static synchronized GenericTagger getTagger(GrobidModel model, GrobidCRFEngine engine) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ public static File getPdfaltoPath() {
return pathToPdfalto;
}

private static String getGrobidCRFEngineName(final String modelName) {
public static String getGrobidEngineName(final String modelName) {
ModelParameters param = modelMap.get(modelName);
if (param == null) {
LOGGER.debug("No configuration parameter defined for model " + modelName);
Expand All @@ -597,24 +597,24 @@ private static String getGrobidCRFEngineName(final String modelName) {
return param.engine;
}

public static GrobidCRFEngine getGrobidCRFEngine(final String modelName) {
String engineName = getGrobidCRFEngineName(modelName);
public static GrobidCRFEngine getGrobidEngine(final String modelName) {
String engineName = getGrobidEngineName(modelName);
if (engineName == null)
return null;
else
return GrobidCRFEngine.get(engineName);
}

public static GrobidCRFEngine getGrobidCRFEngine(final GrobidModel model) {
return getGrobidCRFEngine(model.getModelName());
public static GrobidCRFEngine getGrobidEngine(final GrobidModel model) {
return getGrobidEngine(model.getModelName());
}

public static File getModelPath(final GrobidModel model) {
if (modelMap.get(model.getModelName()) == null) {
// model is not specified in the config, ignoring
return null;
}
String extension = getGrobidCRFEngine(model).getExt();
String extension = getGrobidEngine(model).getExt();
return new File(getGrobidHome(), FOLDER_NAME_MODELS + File.separator
+ model.getFolderName() + File.separator
+ FILE_NAME_MODEL + "." + extension);
Expand Down
6 changes: 5 additions & 1 deletion grobid-home/config/grobid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,10 @@ grobid:
batch_size: 40

- name: "copyright"
# at this time, must always be DeLFT, not other implementation
# at this time, only a DeLFT implementation is available;
# set the engine to crf if the deep learning JNI library is not available: the model will then be ignored
engine: "delft"
#engine: "crf"
delft:
# deep learning parameters
architecture: "gru"
Expand All @@ -271,7 +273,9 @@ grobid:

- name: "license"
# at this time, the engine must always be DeLFT, no other implementation is available;
# set the engine to crf if the deep learning JNI library is not available: the model will then be ignored
engine: "delft"
#engine: "crf"
delft:
# deep learning parameters
architecture: "gru"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public void train(boolean incremental) {
trainer.train(getTemplatePath(), dataPath, tempModelPath, GrobidProperties.getWapitiNbThreads(), model, incremental);
// if we are here, that means that training succeeded
// rename model for CRF sequence labellers (not with DeLFT deep learning models)
if (GrobidProperties.getGrobidCRFEngine(this.model) != GrobidCRFEngine.DELFT)
if (GrobidProperties.getGrobidEngine(this.model) != GrobidCRFEngine.DELFT)
renameModels(oldModelPath, tempModelPath);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

public class TrainerFactory {
public static GenericTrainer getTrainer(GrobidModel model) {
switch (GrobidProperties.getGrobidCRFEngine(model)) {
switch (GrobidProperties.getGrobidEngine(model)) {
case CRFPP:
return new CRFPPGenericTrainer();
case WAPITI:
Expand All @@ -16,7 +16,7 @@ public static GenericTrainer getTrainer(GrobidModel model) {
case DUMMY:
return new DummyTrainer();
default:
throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine(model));
throw new IllegalStateException("Unsupported GROBID sequence labelling engine: " + GrobidProperties.getGrobidEngine(model));
}
}
}

0 comments on commit 268186d

Please sign in to comment.