Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix affiliation missing when using DL affiliation-address model #1166

Merged
merged 4 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
package org.grobid.core.engines;

import org.chasen.crfpp.Tagger;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Affiliation;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.lexicon.Lexicon;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -24,8 +25,12 @@
public class AffiliationAddressParser extends AbstractParser {
public Lexicon lexicon = Lexicon.getInstance();

/**
 * Creates a parser bound to the supplied model by passing it straight to the
 * superclass constructor.
 *
 * Being {@code protected}, it is reachable from subclasses and from code in the
 * same package, which lets such callers choose a model other than the default one.
 *
 * @param model the model handed to the superclass constructor
 */
protected AffiliationAddressParser(GrobidModel model) {
super(model);
}

public AffiliationAddressParser() {
super(GrobidModels.AFFILIATION_ADDRESS);
this(GrobidModels.AFFILIATION_ADDRESS);
}

public List<Affiliation> processing(String input) {
Expand Down Expand Up @@ -78,22 +83,26 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio
return affiliationBlocks;
}

/**
* Separate affiliation blocks when they appear to be in separate sets of offsets.
*/
protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) {
ArrayList<String> affiliationBlocks = new ArrayList<String>();
ArrayList<String> affiliationBlocks = new ArrayList<>();
int end = 0;
for(List<LayoutToken> tokenizationSegment : tokenizations) {
if (tokenizationSegment == null || tokenizationSegment.size() == 0)
if (CollectionUtils.isEmpty(tokenizationSegment))
continue;

// if we have an offset shift, we introduce a segmentation of the affiliation block
LayoutToken startToken = tokenizationSegment.get(0);
int start = startToken.getOffset();
if (start-end > 2)
if (start-end > 2 && end > 0)
affiliationBlocks.add("\n");

for(LayoutToken tok : tokenizationSegment) {
if (tok.getText().length() == 0)
if (StringUtils.isEmpty(tok.getText())) {
continue;
}

if (!tok.getText().equals(" ")) {
if (tok.getText().equals("\n")) {
Expand Down Expand Up @@ -123,11 +132,11 @@ public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokeniza

//System.out.println(affiliationBlocks.toString());

List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> placesPositions = new ArrayList<>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
List<List<LayoutToken>> allTokens = new ArrayList<>();
allTokens.add(tokenizationsAffiliation);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,28 @@
package org.grobid.core.engines;

import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.AfterClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertThat;
import static org.hamcrest.CoreMatchers.is;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import com.google.common.base.Joiner;

import org.grobid.core.GrobidModels;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Affiliation;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.features.FeaturesVectorAffiliationAddress;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.main.LibraryLoader;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.OffsetPosition;
import org.junit.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.*;
import static org.junit.Assert.assertThat;

public class AffiliationAddressParserTest {

Expand All @@ -43,13 +38,13 @@ public class AffiliationAddressParserTest {

@Before
public void setUp() throws Exception {
this.target = new AffiliationAddressParser();
this.target = new AffiliationAddressParser(GrobidModels.DUMMY);
this.analyzer = GrobidAnalyzer.getInstance();
}

@BeforeClass
public static void init() {
LibraryLoader.load();
// LibraryLoader.load();
GrobidProperties.getInstance();
}

Expand Down Expand Up @@ -257,4 +252,109 @@ public void shouldExtractMultipleAffiliations() throws Exception {
is("University of Madness")
);
}

/**
 * Feeds a hard-coded labelled sequence to {@code resultExtractionLayoutTokens} and
 * expects a non-empty result. The test is currently ignored; per the {@code @Ignore}
 * message it exists to show the failing input data.
 *
 * In this fixture the sequence starts with two empty lines, and the {@code <other>}
 * labels carry no {@code I-} prefix.
 */
@Test
@Ignore("This test is used to show the failing input data")
public void testResultExtractionLayoutTokensFromDLOutput() throws Exception {
// Labelled sequence: one token per line, tab-separated columns, label in the last column.
String result = "\n" +
"\n" +
"Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<department>\n" +
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t<affiliation>\t<department>\n" +
"Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
"Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t<affiliation>\tI-<institution>\n" +
"Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<institution>\n" +
"Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<addrLine>\n" +
"Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t<affiliation>\tI-<addrLine>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<addrLine>\n" +
"00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<settlement>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t<affiliation>\t<other>\n" +
"Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t<affiliation>\tI-<country>\n" +
";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t<affiliation>\t<country>\n";

// Build one LayoutToken per row, using the first tab-separated column as its text.
// The two leading empty rows therefore produce tokens whose text is the empty string.
List<LayoutToken> tokenizations = Arrays.stream(result.split("\n"))
.map(row -> new LayoutToken(row.split("\t")[0]))
.collect(Collectors.toList());

// Only checks that something was extracted; the content of the result is not inspected.
assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
}


/**
 * Feeds a hard-coded labelled sequence to {@code resultExtractionLayoutTokens} and
 * expects a non-empty result.
 *
 * In this fixture every label segment, including each {@code <other>} token, opens
 * with an {@code I-} prefix, and the sequence has no leading empty lines.
 */
@Test
public void testResultExtractionLayoutTokensFromCRFOutput() throws Exception {
// Labelled sequence: one token per line, tab-separated columns, label in the last column.
String result = "MD\tmd\tM\tMD\tMD\tMD\tD\tMD\tMD\tMD\tLINESTART\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXX\t<affiliation>\tI-<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<department>\n" +
"of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t<affiliation>\t<department>\n" +
"Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
"Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<department>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t<affiliation>\tI-<institution>\n" +
"Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<institution>\n" +
"Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
"Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<institution>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<addrLine>\n" +
"Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t<affiliation>\t<addrLine>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t<affiliation>\tI-<postCode>\n" +
"-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t<affiliation>\t<postCode>\n" +
"00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t<affiliation>\t<postCode>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t<affiliation>\tI-<settlement>\n" +
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t<affiliation>\tI-<other>\n" +
"Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t<affiliation>\tI-<country>\n" +
";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t<affiliation>\t<country>";

// Build one LayoutToken per row, using the first tab-separated column as its text.
List<LayoutToken> tokenizations = Arrays.stream(result.split("\n"))
.map(row -> new LayoutToken(row.split("\t")[0]))
.collect(Collectors.toList());

// Only checks that something was extracted; the content of the result is not inspected.
assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0)));
}

/**
 * Two segments whose offsets are far apart (shifted by 100 and by 500): the
 * assertions expect 22 entries, no line break at the head of the first entry,
 * and a "\n" entry at index 11.
 */
@Test
public void testGetAffiliationBlocksFromSegments_1() throws Exception {
    // First segment, every token offset shifted by 100.
    String firstText = "Department of science, University of Science, University of Madness";
    List<LayoutToken> firstSegment = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(firstText);
    for (LayoutToken token : firstSegment) {
        token.setOffset(token.getOffset() + 100);
    }

    // Second segment, every token offset shifted by 500.
    String secondText = "Department of mental health, University of happyness, Italy";
    List<LayoutToken> secondSegment = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(secondText);
    for (LayoutToken token : secondSegment) {
        token.setOffset(token.getOffset() + 500);
    }

    List<List<LayoutToken>> segments = Arrays.asList(firstSegment, secondSegment);
    List<String> blocks = AffiliationAddressParser.getAffiliationBlocksFromSegments(segments);

    assertThat(blocks, hasSize(22));
    assertThat(blocks.get(0), is(not(startsWith("\n"))));
    assertThat(blocks.get(11), is("\n"));
}

/**
 * Two segments whose offsets are close together (the second is shifted by 100
 * plus the first segment's token count): the assertions expect 21 entries, no
 * line break at the head of the first entry, and no "@newline" entry at index 11.
 */
@Test
public void testGetAffiliationBlocksFromSegments_2() throws Exception {
    // First segment, every token offset shifted by 100.
    String firstText = "Department of science, University of Science, University of Madness";
    List<LayoutToken> firstSegment = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(firstText);
    for (LayoutToken token : firstSegment) {
        token.setOffset(token.getOffset() + 100);
    }

    // Second segment, shifted by 100 plus the number of tokens in the first segment.
    String secondText = "Department of mental health, University of happyness, Italy";
    List<LayoutToken> secondSegment = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(secondText);
    int secondShift = 100 + firstSegment.size();
    for (LayoutToken token : secondSegment) {
        token.setOffset(token.getOffset() + secondShift);
    }

    List<List<LayoutToken>> segments = Arrays.asList(firstSegment, secondSegment);
    List<String> blocks = AffiliationAddressParser.getAffiliationBlocksFromSegments(segments);

    assertThat(blocks, hasSize(21));
    assertThat(blocks.get(0), is(not(startsWith("\n"))));
    assertThat(blocks.get(11), is(not("@newline")));
}
}
Loading