From 02aaed0d42f601f02fdab763f3f1b35499814151 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 12 Apr 2024 09:28:57 +0900 Subject: [PATCH 01/31] add kotlin and kotlin-test --- build.gradle | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/build.gradle b/build.gradle index 352a283f8b..4a57f048c9 100644 --- a/build.gradle +++ b/build.gradle @@ -10,15 +10,20 @@ buildscript { classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0' classpath "gradle.plugin.com.github.jengelman.gradle.plugins:shadow:7.0.0" classpath 'com.adarshr:gradle-test-logger-plugin:2.0.0' + classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.21" } } repositories { mavenLocal() mavenCentral() + maven { + url = uri("https://plugins.gradle.org/m2/") + } } apply plugin: 'jacoco' +apply plugin: 'org.jetbrains.kotlin.jvm' jacoco { toolVersion = '0.8.8' @@ -55,8 +60,18 @@ subprojects { } } - sourceCompatibility = 1.11 - targetCompatibility = 1.11 +// sourceCompatibility = 1.11 +// targetCompatibility = 1.11 + + kotlin { + jvmToolchain(17) + } + + java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } + } repositories { mavenCentral() @@ -86,14 +101,20 @@ subprojects { // packaging local libs inside grobid-core.jar implementation fileTree(dir: new File(rootProject.rootDir, 'grobid-core/localLibs'), include: localLibs) - testRuntimeOnly 'org.junit.vintage:junit-vintage-engine:5.9.3' - testImplementation(platform('org.junit:junit-bom:5.9.3')) + testRuntimeOnly "org.junit.jupiter:junit-jupiter-engine" + testRuntimeOnly "org.junit.vintage:junit-vintage-engine" + testImplementation(platform('org.junit:junit-bom:5.10.2')) + testRuntimeOnly("org.junit.platform:junit-platform-launcher") { + because("Only needed to run tests in a version of IntelliJ IDEA that bundles older versions") + } testImplementation('org.junit.jupiter:junit-jupiter') testImplementation 'org.easymock:easymock:5.1.0' testImplementation "org.powermock:powermock-api-easymock:2.0.7" testImplementation "org.powermock:powermock-module-junit4:2.0.7" testImplementation "xmlunit:xmlunit:1.6" testImplementation "org.hamcrest:hamcrest-all:1.3" + testImplementation 'org.jetbrains.kotlin:kotlin-test' + testImplementation "io.mockk:mockk:1.13.9" implementation "com.cybozu.labs:langdetect:1.1-20120112" implementation "com.rockymadden.stringmetric:stringmetric-core_2.11:0.27.4" From 84ef8020a6d382fa769e8fe68a5dc75830f1d9e5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:00:33 +0800 Subject: [PATCH 02/31] fix build --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 4a57f048c9..421879fb6c 100644 --- a/build.gradle +++ b/build.gradle @@ -23,7 +23,6 @@ repositories { } apply plugin: 'jacoco' -apply plugin: 'org.jetbrains.kotlin.jvm' jacoco { toolVersion = '0.8.8' @@ -34,6 +33,7 @@ allprojects { apply plugin: 'base' apply plugin: 'com.github.kt3k.coveralls' apply plugin: 'com.adarshr.test-logger' + apply plugin: 'org.jetbrains.kotlin.jvm' group = "org.grobid" From 1ebcf3a59abd932640c698830a6943e456eeb38d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:01:22 +0800 Subject: [PATCH 03/31] add kotlin test --- .../src/test/kotlin/org/grobid/core/test.kt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/test.kt diff --git a/grobid-core/src/test/kotlin/org/grobid/core/test.kt b/grobid-core/src/test/kotlin/org/grobid/core/test.kt new file mode 100644 index 0000000000..282420c3be --- /dev/null +++ b/grobid-core/src/test/kotlin/org/grobid/core/test.kt @@ -0,0 +1,15 @@ +package org.grobid.core + +import org.junit.Test +import java.nio.charset.StandardCharsets + +class TextParserTest { + + @Test + fun testConvertFractions6Numeric() { + val byteArray = byteArrayOf(-3, -1, -73, 0, 103, 0, 47, 0, 109, 0, 108, 0); + val input = String(byteArray, StandardCharsets.UTF_16LE) + + print("toto") + } +} From 8443e6d04bcb174a491568304522802aa36d6537 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:02:14 +0800 Subject: [PATCH 04/31] update action's component version --- .github/workflows/ci-build-unstable.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index cdfa981049..9d56158542 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -12,9 +12,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 - name: Set up JDK 17 - uses: actions/setup-java@v1 + uses: actions/setup-java@v4 with: java-version: 1.17 - name: Build with Gradle @@ -38,10 +38,10 @@ jobs: steps: - name: Create more disk space run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build and push id: docker_build - uses: mr-smithers-excellent/docker-build-push@v5 + uses: mr-smithers-excellent/docker-build-push@v6 with: username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }} password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }} From 7fd419470641b34b491f3b57e7b0bfd4150cc5bc Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:06:54 +0800 Subject: [PATCH 05/31] fix actions --- .github/workflows/ci-build-unstable.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index 9d56158542..cf69c53314 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -16,7 +16,9 @@ jobs: - name: Set up JDK 17 uses: actions/setup-java@v4 with: - java-version: 1.17 + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' - name: Build with Gradle run: ./gradlew clean assemble --info --stacktrace --no-daemon From 2dc07a8d8ff68f127d4989c71a8741c569529785 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:36:06 +0800 Subject: [PATCH 06/31] cleanup --- .../src/test/kotlin/org/grobid/core/test.kt | 15 --------- .../core/utilities/SentenceUtilitiesKTest.kt | 32 +++++++++++++++++++ 2 files changed, 32 insertions(+), 15 deletions(-) delete mode 100644 grobid-core/src/test/kotlin/org/grobid/core/test.kt create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt diff --git a/grobid-core/src/test/kotlin/org/grobid/core/test.kt b/grobid-core/src/test/kotlin/org/grobid/core/test.kt deleted file mode 100644 index 282420c3be..0000000000 --- a/grobid-core/src/test/kotlin/org/grobid/core/test.kt +++ /dev/null @@ -1,15 +0,0 @@ -package org.grobid.core - -import org.junit.Test -import java.nio.charset.StandardCharsets - -class TextParserTest { - - @Test - fun testConvertFractions6Numeric() { - val byteArray = byteArrayOf(-3, -1, -73, 0, 103, 0, 47, 0, 109, 0, 108, 0); - val input = String(byteArray, StandardCharsets.UTF_16LE) - - print("toto") - } -} diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt new file mode 100644 index 0000000000..e26efb79a2 --- /dev/null +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt @@ -0,0 +1,32 @@ +package org.grobid.core.utilities + +import org.junit.Test +import kotlin.test.assertFalse +import kotlin.test.assertTrue + +class SentenceUtilitiesKTest { + + @Test + fun testToSkipToken_shouldReturnTrue() { + val tokens = arrayOf("-", " ", "\n", "\t") + + tokens.forEach { token -> + assertTrue(SentenceUtilities.toSkipToken(token)) + } + + } + + @Test + fun testToSkipTokenNoHypen_shouldReturnTrue() { + val tokens = arrayOf(" ", "\n", "\t") + + tokens.forEach { token -> + assertTrue(SentenceUtilities.toSkipToken(token)) + } + + assertFalse { SentenceUtilities.toSkipToken("-") } + + } + + +} From 9db86676a132542b0dda29e7c7651d52960529a3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:37:18 +0800 Subject: [PATCH 07/31] cleanup and fix test --- .../java/org/grobid/core/utilities/SentenceUtilities.java | 2 +- .../org/grobid/core/utilities/SentenceUtilitiesKTest.kt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index a424e5e808..c0b4498835 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -297,7 +297,7 @@ public static boolean toSkipToken(String tok) { return false; } - private static boolean toSkipTokenNoHyphen(String tok) { + static boolean toSkipTokenNoHyphen(String tok) { if (tok.equals(" ") || tok.equals("\n") || tok.equals("\t")) return true; else diff --git a/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt index e26efb79a2..a9fdeca5e6 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/utilities/SentenceUtilitiesKTest.kt @@ -21,10 +21,10 @@ class SentenceUtilitiesKTest { val tokens = arrayOf(" ", "\n", "\t") tokens.forEach { token -> - assertTrue(SentenceUtilities.toSkipToken(token)) + assertTrue(SentenceUtilities.toSkipTokenNoHyphen(token)) } - assertFalse { SentenceUtilities.toSkipToken("-") } + assertFalse { SentenceUtilities.toSkipTokenNoHyphen("-") } } From d4a82614b3da94d4dfef7d937d2fc42de8713de0 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 09:52:39 +0800 Subject: [PATCH 08/31] add tests on the current code --- .../engines/FundingAcknowledgementParser.java | 2 +- .../FundingAcknowledgementParserTest.kt | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index c92b270ff1..4db6af52d2 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -102,7 +102,7 @@ public MutablePair,List,List Date: Sun, 28 Apr 2024 10:35:07 +0800 Subject: [PATCH 09/31] fix missing of last person in the acknowledgment / funding --- .../engines/FundingAcknowledgementParser.java | 17 +++- .../FundingAcknowledgementParserTest.kt | 88 ++++++++++++++++++- 2 files changed, 100 insertions(+), 5 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 4db6af52d2..0c11294c28 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -69,6 +69,10 @@ protected FundingAcknowledgementParser() { super(GrobidModels.FUNDING_ACKNOWLEDGEMENT); } + FundingAcknowledgementParser(GrobidModel model) { + super(model); + } + private MutablePair,List,List>> processing(List tokenizationFunding, GrobidAnalysisConfig config) { if (tokenizationFunding == null || tokenizationFunding.size() == 0) @@ -189,7 +193,7 @@ public MutablePair,List,List,List,List>> + protected MutablePair,List,List>> getExtractionResult(List tokenizations, String result) { List fundings = new ArrayList<>(); List persons = new ArrayList<>(); @@ -451,8 +455,14 @@ public MutablePair,List,List,List,List 0) + if (CollectionUtils.isNotEmpty(institutions)) { affiliations.addAll(institutions); + } for(Funding localFunding : fundings) { localFunding.inferAcronyms(); diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt index 01ee60cdf9..5cd4693c78 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt @@ -1,16 +1,21 @@ package org.grobid.core.engines import org.grobid.core.GrobidModels +import org.grobid.core.analyzers.GrobidAnalyzer +import org.grobid.core.layout.LayoutToken import org.grobid.core.lexicon.Lexicon import org.grobid.core.utilities.GrobidConfig import org.grobid.core.utilities.GrobidProperties +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.hasSize import org.junit.Before import org.junit.Test import org.powermock.api.easymock.PowerMock class FundingAcknowledgementParserTest { - private lateinit var target: DateParser + private lateinit var target: FundingAcknowledgementParser @Before @Throws(Exception::class) @@ -19,11 +24,90 @@ class FundingAcknowledgementParserTest { val modelParameters = GrobidConfig.ModelParameters() modelParameters.name = "bao" GrobidProperties.addModel(modelParameters) - target = DateParser(GrobidModels.DUMMY) + target = FundingAcknowledgementParser(GrobidModels.DUMMY) } @Test fun testGetExtractionResult() { + val input: String = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; + + val results: String = "Our\tour\tO\tOu\tOur\tOur\tr\tur\tOur\tOur\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "warmest\twarmest\tw\twa\twar\twarm\tt\tst\test\tmest\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thanks\tthanks\tt\tth\ttha\tthan\ts\tks\tnks\tanks\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "to\tto\tt\tto\tto\tto\to\tto\tto\tto\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Patrice\tpatrice\tP\tPa\tPat\tPatr\te\tce\tice\trice\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Lopez\tlopez\tL\tLo\tLop\tLope\tz\tez\tpez\topez\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "author\tauthor\ta\tau\taut\tauth\tr\tor\thor\tthor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Grobid\tgrobid\tG\tGr\tGro\tGrob\td\tid\tbid\tobid\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "22\t22\t2\t22\t22\t22\t2\t22\t22\t22\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "DeLFT\tdelft\tD\tDe\tDeL\tDeLF\tT\tFT\tLFT\teLFT\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "20\t20\t2\t20\t20\t20\t0\t20\t20\t20\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "other\tother\to\tot\toth\tothe\tr\ter\ther\tther\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "open\topen\to\top\tope\topen\tn\ten\tpen\topen\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tHYPHEN\t0\t\n" + + "source\tsource\ts\tso\tsou\tsour\te\tce\trce\turce\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "projects\tprojects\tp\tpr\tpro\tproj\ts\tts\tcts\tects\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "his\this\th\thi\this\this\ts\tis\this\this\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "continuous\tcontinuous\tc\tco\tcon\tcont\ts\tus\tous\tuous\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "support\tsupport\ts\tsu\tsup\tsupp\tt\trt\tort\tport\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "inspiration\tinspiration\ti\tin\tins\tinsp\tn\ton\tion\ttion\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "with\twith\tw\twi\twit\twith\th\tth\tith\twith\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "ideas\tideas\ti\tid\tide\tidea\ts\tas\teas\tdeas\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "suggestions\tsuggestions\ts\tsu\tsug\tsugg\ts\tns\tons\tions\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "fruitful\tfruitful\tf\tfr\tfru\tfrui\tl\tul\tful\ttful\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "discussions\tdiscussions\td\tdi\tdis\tdisc\ts\tns\tons\tions\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t\n" + + "We\twe\tW\tWe\tWe\tWe\te\tWe\tWe\tWe\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thank\tthank\tt\tth\ttha\tthan\tk\tnk\tank\thank\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Pedro\tpedro\tP\tPe\tPed\tPedr\to\tro\tdro\tedro\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Baptista\tbaptista\tB\tBa\tBap\tBapt\ta\tta\tsta\tista\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "de\tde\td\tde\tde\tde\te\tde\tde\tde\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Castro\tcastro\tC\tCa\tCas\tCast\to\tro\ttro\tstro\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "his\this\th\thi\this\this\ts\tis\this\this\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "support\tsupport\ts\tsu\tsup\tsupp\tt\trt\tort\tport\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "during\tduring\td\tdu\tdur\tduri\tg\tng\ting\tring\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "this\tthis\tt\tth\tthi\tthis\ts\tis\this\tthis\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t\n" + + "Special\tspecial\tS\tSp\tSpe\tSpec\tl\tal\tial\tcial\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thanks\tthanks\tt\tth\ttha\tthan\ts\tks\tnks\tanks\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "to\tto\tt\tto\tto\tto\to\tto\tto\tto\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Erina\terina\tE\tEr\tEri\tErin\ta\tna\tina\trina\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Fujita\tfujita\tF\tFu\tFuj\tFuji\ta\tta\tita\tjita\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "useful\tuseful\tu\tus\tuse\tusef\tl\tul\tful\teful\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "tips\ttips\tt\tti\ttip\ttips\ts\tps\tips\ttips\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "on\ton\to\ton\ton\ton\tn\ton\ton\ton\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "manuscript\tmanuscript\tm\tma\tman\tmanu\tt\tpt\tipt\tript\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + val (element, mutableTriple) = target.getExtractionResult(tokens, results) + + assertThat(mutableTriple.left, hasSize(0)) + assertThat(mutableTriple.middle, hasSize(3)) + assertThat(mutableTriple.middle.get(0).rawName, `is`("Patrice Lopez")) + assertThat(mutableTriple.middle.get(1).rawName, `is`("Pedro Baptista de Castro")) + assertThat(mutableTriple.middle.get(2).rawName, `is`("Erina Fujita")) + assertThat(mutableTriple.right, hasSize(0)) } } \ No newline at end of file From ea1245a26a5cfc33f942b2413b609d8b0792c6d2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 28 Apr 2024 10:58:05 +0800 Subject: [PATCH 10/31] add more tests and add MEXT abbreviation --- .../java/org/grobid/core/data/Funder.java | 2 + .../FundingAcknowledgementParserTest.kt | 59 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/Funder.java b/grobid-core/src/main/java/org/grobid/core/data/Funder.java index e4e847ea45..465b05a5da 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Funder.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Funder.java @@ -60,6 +60,8 @@ public class Funder { prefixFounders.put("NSF", "National Science Foundation"); prefixFounders.put("NIH", "National Institutes of Health"); prefixFounders.put("ERC", "European Research Council"); + //Japanese government + prefixFounders.put("MEXT", "Ministry of Education, Culture, Sports, Science and Technology"); } public Funder() { diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt index 5cd4693c78..565fb78594 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt @@ -2,6 +2,8 @@ package org.grobid.core.engines import org.grobid.core.GrobidModels import org.grobid.core.analyzers.GrobidAnalyzer +import org.grobid.core.data.Funder +import org.grobid.core.data.Funding import org.grobid.core.layout.LayoutToken import org.grobid.core.lexicon.Lexicon import org.grobid.core.utilities.GrobidConfig @@ -30,7 +32,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResult() { - val input: String = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; + val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; val results: String = "Our\tour\tO\tOu\tOur\tOur\tr\tur\tOur\tOur\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "warmest\twarmest\tw\twa\twar\twarm\tt\tst\test\tmest\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -110,4 +112,59 @@ class FundingAcknowledgementParserTest { assertThat(mutableTriple.middle.get(2).rawName, `is`("Erina Fujita")) assertThat(mutableTriple.right, hasSize(0)) } + + @Test + fun testGetExtractionResult2() { + + val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]."; + + val results: String = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "partly\tpartly\tp\tpa\tpar\tpart\ty\tly\ttly\trtly\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "MEXT\tmext\tM\tME\tMEX\tMEXT\tT\tXT\tEXT\tMEXT\tLINEIN\tALLCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\tI-\n" + + "Program\tprogram\tP\tPr\tPro\tProg\tm\tam\tram\tgram\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tPUNCT\t0\t\n" + + "Data\tdata\tD\tDa\tDat\tData\ta\tta\tata\tData\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Creation\tcreation\tC\tCr\tCre\tCrea\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Utilization\tutilization\tU\tUt\tUti\tUtil\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tHYPHEN\t0\t\n" + + "Type\ttype\tT\tTy\tTyp\tType\te\tpe\type\tType\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Material\tmaterial\tM\tMa\tMat\tMate\tl\tal\tial\trial\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "Development\tdevelopment\tD\tDe\tDev\tDeve\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "Project\tproject\tP\tPr\tPro\tProj\tt\tct\tect\tject\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "Digital\tdigital\tD\tDi\tDig\tDigi\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Transformation\ttransformation\tT\tTr\tTra\tTran\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Initiative\tinitiative\tI\tIn\tIni\tInit\te\tve\tive\ttive\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Center\tcenter\tC\tCe\tCen\tCent\tr\ter\tter\tnter\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Magnetic\tmagnetic\tM\tMa\tMag\tMagn\tc\tic\ttic\tetic\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Materials\tmaterials\tM\tMa\tMat\tMate\ts\tls\tals\tials\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Number\tnumber\tN\tNu\tNum\tNumb\tr\ter\tber\tmber\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "JPMXP1122715503\tjpmxp1122715503\tJ\tJP\tJPM\tJPMX\t3\t03\t503\t5503\tLINEIN\tALLCAP\tCONTAINSDIGITS\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\tI-\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + val (element, mutableTriple) = target.getExtractionResult(tokens, results) + + assertThat(mutableTriple.left, hasSize(1)) + val funding1: Funding = mutableTriple.left.get(0) + val funder1: Funder = funding1.funder +// assertThat(funder1.fullName, `is`("MEXT")) + assertThat(funding1.programFullName, `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) + assertThat(funder1.fullName, `is`("Ministry of Education, Culture, Sports, Science and Technology")) + assertThat(mutableTriple.middle, hasSize(0)) + assertThat(mutableTriple.right, hasSize(0)) + } } \ No newline at end of file From f74466ebe89d9ea4e2febedaa947ff73e177f4c6 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:07:02 +0900 Subject: [PATCH 11/31] cosmetics --- .../java/org/grobid/core/engines/Engine.java | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index 949e2d63a5..296b685114 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -1,18 +1,10 @@ package org.grobid.core.engines; -import org.apache.commons.lang3.tuple.Pair; +import nu.xom.Element; import org.apache.commons.lang3.tuple.MutablePair; import org.apache.commons.lang3.tuple.MutableTriple; - -import nu.xom.Element; - -import org.grobid.core.data.Affiliation; -import org.grobid.core.data.BibDataSet; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.data.ChemicalEntity; -import org.grobid.core.data.PatentItem; -import org.grobid.core.data.Person; -import org.grobid.core.data.Funding; +import org.apache.commons.lang3.tuple.Pair; +import org.grobid.core.data.*; import org.grobid.core.document.Document; import org.grobid.core.document.DocumentSource; import org.grobid.core.engines.config.GrobidAnalysisConfig; @@ -24,14 +16,15 @@ import org.grobid.core.utilities.Utilities; import org.grobid.core.utilities.counters.CntManager; import org.grobid.core.utilities.counters.impl.CntManagerFactory; - import org.grobid.core.utilities.crossref.CrossrefClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; -import java.util.*; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; /** * Class for managing the extraction of bibliographical information from PDF @@ -1184,7 +1177,7 @@ public String processFundingAcknowledgement(String text, GrobidAnalysisConfig co result.append(localResult.getLeft().toXML()); } catch (final Exception exp) { - throw new GrobidException("An exception occured while running Grobid funding-acknowledgement model.", exp); + throw new GrobidException("An exception occurred while running Grobid funding-acknowledgement model.", exp); } return result.toString(); From 047af5bdd0a3c4c0ff61e02866805d058323a1eb Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:07:30 +0900 Subject: [PATCH 12/31] add transformation from token to character position --- .../grobid/core/utilities/TextUtilities.java | 49 ++++++++++++++ .../core/utilities/TextUtilitiesTest.java | 64 +++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 06f69bcdee..73ec73b352 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1556,4 +1556,53 @@ public static org.apache.commons.lang3.tuple.Pair matchTokenAndString(List layoutTokens, String text, List positions) { + List newPositions = new ArrayList<>(); + StringBuilder accumulator = new StringBuilder(); + int pos = 0; + + for (OffsetPosition position : positions) { + List urlTokens = layoutTokens.subList(position.start, position.end); + boolean first = true; + accumulator = new StringBuilder(); + for (int i = 0; i < urlTokens.size(); i++) { + LayoutToken token = urlTokens.get(i); + if (StringUtils.isEmpty(token.getText())) + continue; + int newPos = text.indexOf(token.getText(), pos); + if (newPos != -1) { + //We update pos only at the first token of the annotation positions + if (first) { + pos = newPos; + first = false; + } + accumulator.append(token); + } else { + if (SentenceUtilities.toSkipToken(token.getText())) { + continue; + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + pos = newPos; + break; + } + pos = newPos; + } + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + accumulator = new StringBuilder(); + } + + } + if (StringUtils.isNotEmpty(accumulator)) { + int start = text.indexOf(accumulator.toString(), pos); + newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + } + + return newPositions; + } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index ff5ac7467b..4df8704ae9 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -8,12 +8,14 @@ import org.junit.Test; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -407,4 +409,66 @@ public void testOrcidPattern() { } } } + + @Test + public void testMatchTokenAndString() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + List urlTokens = Arrays.asList(new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url1 = offsetPositions.get(0); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } + + + @Test + public void testMatchTokenAndString_twoElements() throws Exception { + final String input = "This work is available at https://github.com/lfoppiano/ \n" + + "supercon2. The repository contains the code of the \n" + + "SuperCon 2 interface, the curation workflow, and the \n" + + "\n" + + "Table 2. Data support, the number of entities for each label in \n" + + "each of the datasets used for evaluating the ML models. The \n" + + "base dataset is the original dataset described in [18], and the \n" + + "curation dataset is automatically collected based on the data-\n" + + "base corrections by the interface and manually corrected. \n" + + "\n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + final String inputReal = "This work is available at https://github.com/lfoppiano/ supercon2. The repository contains the code of the SuperCon 2 interface, the curation workflow, and the Table 2. Data support, the number of entities for each label in each of the datasets used for evaluating the ML models. The base dataset is the original dataset described in [18], and the curation dataset is automatically collected based on the database corrections by the interface and manually corrected. "; + List urlTokens = Arrays.asList(new OffsetPosition(0, 3), new OffsetPosition(10, 23)); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, inputReal, urlTokens); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(9)); + + assertThat(inputReal.substring(url0.start, url0.end), is("This work")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(26)); + assertThat(url1.end, is(65)); + + assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); + + } } From 9f2edb6aeae31bf2a25f07813bc5c16117ff511a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:07:54 +0900 Subject: [PATCH 13/31] add class to represent the parse of a funding and acknowledgement statement --- .../core/data/FundingAcknowledgmentParse.java | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 grobid-core/src/main/java/org/grobid/core/data/FundingAcknowledgmentParse.java diff --git a/grobid-core/src/main/java/org/grobid/core/data/FundingAcknowledgmentParse.java b/grobid-core/src/main/java/org/grobid/core/data/FundingAcknowledgmentParse.java new file mode 100644 index 0000000000..a44e930922 --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/data/FundingAcknowledgmentParse.java @@ -0,0 +1,46 @@ +package org.grobid.core.data; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class represent the funding / acknowledgement statement + */ +public class FundingAcknowledgmentParse { + List fundingList = new ArrayList<>(); + List personList = new ArrayList<>(); + List affiliations = new ArrayList<>(); +// List statementAnnotations = new ArrayList<>(); + + public List getFundings() { + return fundingList; + } + + public void setFundings(List fundingList) { + this.fundingList = fundingList; + } + + public List getPersons() { + return personList; + } + + public void setPersons(List personList) { + this.personList = personList; + } + + public List getAffiliations() { + return affiliations; + } + + public void setAffiliations(List fundingBodies) { + this.affiliations = fundingBodies; + } + +// public List getStatementAnnotations() { +// return statementAnnotations; +// } + +// public void setStatementAnnotations(List statementAnnotations) { +// this.statementAnnotations = statementAnnotations; +// } +} From 4b3a763269df5fccb11ba82282edbebcac2d66bd Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:18:08 +0900 Subject: [PATCH 14/31] fix the funding and acknowledgement parser to preserve the sentence segmentation and the reference markers --- .../engines/FundingAcknowledgementParser.java | 475 +++++++++++++----- ...ingAcknowledgementParserIntegrationTest.kt | 117 +++++ .../FundingAcknowledgementParserTest.kt | 237 ++++++++- 3 files changed, 675 insertions(+), 154 deletions(-) create mode 100644 grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 0c11294c28..160e84854f 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -1,65 +1,38 @@ package org.grobid.core.engines; +import com.google.common.collect.Iterables; +import nu.xom.*; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.MutablePair; +import org.apache.commons.lang3.tuple.MutableTriple; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.GrobidModel; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.GrobidAnalyzer; -import org.grobid.core.data.Funding; -import org.grobid.core.data.Funder; -import org.grobid.core.data.Person; -import org.grobid.core.data.Affiliation; +import org.grobid.core.data.*; +import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.engines.tagging.GenericTaggerUtils; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.features.FeaturesVectorFunding; -import org.grobid.core.features.FeatureFactory; -import org.grobid.core.lang.Language; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.lexicon.Lexicon; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.OffsetPosition; +import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.UnicodeUtil; -import org.grobid.core.engines.config.GrobidAnalysisConfig; - -import java.util.ArrayList; -import java.util.Calendar; -import java.util.List; -import java.util.StringTokenizer; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import nu.xom.Attribute; -import nu.xom.Element; -import nu.xom.Elements; -import nu.xom.Node; -import nu.xom.Nodes; -import nu.xom.Text; -import nu.xom.Document; -import nu.xom.ParsingException; -import nu.xom.ValidityException; -import nu.xom.Builder; - import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.lang3.tuple.MutablePair; -import org.apache.commons.lang3.tuple.Pair; -import org.apache.commons.lang3.tuple.MutableTriple; +import java.util.ArrayList; +import java.util.List; -import static org.apache.commons.lang3.StringUtils.isNotBlank; -import static org.grobid.core.engines.label.TaggingLabels.*; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; -import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; -import static org.grobid.core.document.xml.XmlBuilderUtils.textNode; +import static org.grobid.core.engines.label.TaggingLabels.*; public class FundingAcknowledgementParser extends AbstractParser { @@ -73,15 +46,16 @@ protected FundingAcknowledgementParser() { super(model); } - private MutablePair,List,List>> + private MutablePair>, FundingAcknowledgmentParse> processing(List tokenizationFunding, GrobidAnalysisConfig config) { - if (tokenizationFunding == null || tokenizationFunding.size() == 0) + if (CollectionUtils.isEmpty(tokenizationFunding)) { return null; + } String res; try { String featureVector = FeaturesVectorFunding.addFeatures(tokenizationFunding, null); res = label(featureVector); -//System.out.println(res); + } catch (Exception e) { throw new GrobidException("CRF labeling with table model fails.", e); } @@ -93,14 +67,59 @@ protected FundingAcknowledgementParser() { } /** - * For convenience, a processing method taking a raw string as input. - * Tokenization is done with the default Grobid analyzer triggered by the identified language. + * For convenience, a processing method taking a raw string as input. + * Tokenization is done with the default Grobid analyzer triggered by the identified language. + * + * TODO: implement the sentence segmentation **/ public MutablePair,List,List>> processing(String text, - GrobidAnalysisConfig config) { + GrobidAnalysisConfig config) { text = UnicodeUtil.normaliseText(text); List tokenizationFunding = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - return processing(tokenizationFunding, config); + MutablePair>, FundingAcknowledgmentParse> results = processing(tokenizationFunding, config); + MutableTriple, List, List> entities = MutableTriple.of(results.getRight().getFundings(), results.getRight().getPersons(), results.getRight().getAffiliations()); + List> annotations = results.getLeft(); + + Element outputParagraph = injectedAnnotationsInNode(tokenizationFunding, annotations, teiElement("p")); + + return MutablePair.of(outputParagraph, entities); + } + + /** + * This method takes in input a tokenized text, a set of annotations and a root element and attach a list of nodes + * under the root where the text is combined with the annotations + */ + protected static Element injectedAnnotationsInNode(List tokenizationFunding, List> annotations, Element rootElement) { + + int pos = 0; + for(Pair annotation: annotations) { + OffsetPosition annotationPosition = annotation.getLeft(); + Element annotationContentElement = annotation.getRight(); + + List before = tokenizationFunding.subList(pos, annotationPosition.start); + String clusterContentBefore = LayoutTokensUtil.toText(before); + + if (CollectionUtils.isNotEmpty(before) && before.get(0).getText().equals(" ")) { + rootElement.appendChild(new Text(" ")); + } + + rootElement.appendChild(clusterContentBefore); + + pos = annotationPosition.end; + rootElement.appendChild(annotationContentElement); + } + + // add last chunk of paragraph stuff (or whole paragraph if no note callout matching) + List remaining = tokenizationFunding.subList(pos, tokenizationFunding.size()); + String remainingClusterContent = LayoutTokensUtil.normalizeDehyphenizeText(remaining); + + if (CollectionUtils.isNotEmpty(remaining) && remaining.get(0).getText().equals(" ")) { + rootElement.appendChild(new Text(" ")); + } + + rootElement.appendChild(remainingClusterContent); + + return rootElement; } /** @@ -121,49 +140,129 @@ public MutablePair,List,List tokenizationFunding = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(paragraphText); - - MutablePair,List,List>> localResult = processing(tokenizationFunding, config); - - // replace paragraph content - if (localResult.getLeft() != null && localResult.getLeft().getChildCount()>0) { - ((Element) paragraph).removeChildren(); - for (int i = localResult.getLeft().getChildCount()-1; i >=0; i--) { - Node localNode = localResult.getLeft().getChild(i); - localNode.detach(); - ((Element) paragraph).insertChild(localNode, 0); + GrobidAnalyzer analyzer = GrobidAnalyzer.getInstance(); + List tokenizationFunding = analyzer.tokenizeWithLayoutToken(paragraphText); + + StringBuilder sb = new StringBuilder(); + + MutablePair>, FundingAcknowledgmentParse> localResult = processing(tokenizationFunding, config); + + List> annotations = localResult.left; + FundingAcknowledgmentParse localEntities = localResult.right; + + if (CollectionUtils.isEmpty(annotations)) { + continue; + } + + List list = annotations.stream().map(a -> a.getLeft()).toList(); + List annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, list); + List> annotationsWithPosRefToText = new ArrayList<>(); + for (int i = 0; i < annotationsPositionText.size(); i++) { + annotationsWithPosRefToText.add(Pair.of(annotationsPositionText.get(i), annotations.get(i).getRight())); + } + + annotations = annotationsWithPosRefToText; + + if (sentenceSegmentation) { +// Pair, List> sentenceInformation = extractSentencesAndPositionsFromParagraphElement(rootElementStatement); +// +// List sentencesList = sentenceInformation.getLeft(); +// List offsetPositionList = sentenceInformation.getRight(); +// +// List> sentenceLayoutTokens = sentencesList.stream() +// .map(analyzer::tokenizeWithLayoutToken) +// .toList(); +// +// List sentenceTokenPositions = new ArrayList<>(); +// int pos = 0; +// for (List sentenceLayoutToken : sentenceLayoutTokens) { +// offsetPositionList.add(new OffsetPosition(pos, pos + sentenceLayoutToken.size())); +// pos += sentenceLayoutToken.size(); +// } + int pos = 0; + int sentenceStartOffset = 0; + Nodes sentences = paragraph.query("//s"); + + if(sentences.size() == 0) { + // Overly careful - we should never end up here. + LOGGER.warn("While the configuration claim that paragraphs must be segmented, we did not find any sentence. "); + + List nodes = getNodesAnnotationsInTextNode(paragraph, annotations); + + for (int i = 0; i < paragraph.getChildCount(); i++) { + paragraph.getChild(i).detach(); + } + for (Node node: nodes) { + node.detach(); + ((Element) paragraph).appendChild(node); + } + } + + for (Node sentence : sentences) { + String sentenceText = sentence.getValue(); + List newChildren = new ArrayList<>(); + for (int i = 0; i < sentence.getChildCount(); i++) { + //Assumption here is that the structure is flat to maximum one level down + Node currentNode = sentence.getChild(i); + if (currentNode instanceof Text) { + String text = currentNode.getValue(); + int finalPos = pos; + List> annotationsInThisChunk = annotations.stream() + .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) + .toList(); + + if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { + List nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); + newChildren.addAll(nodes); + } else { + newChildren.add(currentNode); + } + pos += text.length(); + } else if (currentNode instanceof Element) { + newChildren.add(currentNode); + pos += currentNode.getValue().length(); + } /*else { + System.out.println(currentNode); + }*/ + } + + for (int i = 0; i < sentence.getChildCount(); i++) { + sentence.getChild(i).detach(); + } + for (Node node: newChildren) { + node.detach(); + ((Element) sentence).appendChild(node); + } + + sentenceStartOffset += sentenceText.length(); + } + } else { + List nodes = getNodesAnnotationsInTextNode(paragraph, annotations); + + for (int i = 0; i < paragraph.getChildCount(); i++) { + paragraph.getChild(i).detach(); + } + for (Node node: nodes) { + node.detach(); + ((Element) paragraph).appendChild(node); } } + // update extracted entities if (globalResult == null) { - globalResult = MutablePair.of(root, localResult.getRight()); + globalResult = MutablePair.of(rootElementStatement, MutableTriple.of(localEntities.getFundings(), localEntities.getPersons(), localEntities.getAffiliations())); } else { // concatenate members of the local results to the global ones - MutableTriple,List,List> localEntities = localResult.getRight(); - MutableTriple,List,List> globalEntities = globalResult.getRight(); - - List localFundings = localEntities.getLeft(); - List globalFundings = globalEntities.getLeft(); - globalFundings.addAll(localFundings); - globalEntities.setLeft(globalFundings); - - List localPersons = localEntities.getMiddle(); - List globalPersons = globalEntities.getMiddle(); - globalPersons.addAll(localPersons); - globalEntities.setMiddle(globalPersons); - - List localAffiliation = localEntities.getRight(); - List globalAffiliations = globalEntities.getRight(); - globalAffiliations.addAll(localAffiliation); - globalEntities.setRight(globalAffiliations); - - globalResult.setRight(globalEntities); + globalResult = aggregateResults(MutableTriple.of(localEntities.getFundings(), localEntities.getPersons(), localEntities.getAffiliations()), globalResult); } + } //System.out.println(globalResult.getLeft().toXML()); @@ -173,11 +272,95 @@ public MutablePair,List,List getNodesAnnotationsInTextNode(Node targetNode, List> annotations) { + return getNodesAnnotationsInTextNode(targetNode, annotations, 0); + } + + /** + * The sentence offset allow to calculate the position relative to the sentence of annotations that + * have been calculated in relation with the paragraph. + */ + protected static List getNodesAnnotationsInTextNode(Node targetNode, List> annotations, int sentenceOffset) { + String text = targetNode.getValue(); + + List outputNodes = new ArrayList<>(); + + int pos = 0; + for (Pair annotation : annotations) { + OffsetPosition annotationPosition = annotation.getLeft(); + Element annotationContentElement = annotation.getRight(); + + String before = text.substring(pos, annotationPosition.start - sentenceOffset); + +// if (StringUtils.isNotEmpty(before) && before.startsWith(" ")) { +// outputNodes.add(new Text(" ")); +// } + + outputNodes.add(new Text(before)); + pos = annotationPosition.end - sentenceOffset; + outputNodes.add(annotationContentElement); + } + + String remaining = text.substring(pos); + +// if (StringUtils.isNotEmpty(remaining) && remaining.startsWith(" ")) { +// outputNodes.add(new Text(" ")); +// } + + outputNodes.add(new Text(remaining)); + + return outputNodes; + } + + private static @NotNull MutablePair, List, List>> aggregateResults(MutableTriple, List, List> localEntities, MutablePair, List, List>> globalResult) { + MutableTriple,List,List> globalEntities = globalResult.getRight(); + + List localFundings = localEntities.getLeft(); + List globalFundings = globalEntities.getLeft(); + globalFundings.addAll(localFundings); + globalEntities.setLeft(globalFundings); + + List localPersons = localEntities.getMiddle(); + List globalPersons = globalEntities.getMiddle(); + globalPersons.addAll(localPersons); + globalEntities.setMiddle(globalPersons); + + List localAffiliation = localEntities.getRight(); + List globalAffiliations = globalEntities.getRight(); + globalAffiliations.addAll(localAffiliation); + globalEntities.setRight(globalAffiliations); + + globalResult.setRight(globalEntities); + return globalResult; } + protected static Pair, List> extractSentencesAndPositionsFromParagraphElement(Element paragraphElement) { + int offset = 0; + List sentenceOffsetPositions = new ArrayList<>(); + + Nodes sentences = paragraphElement.query("//s"); + List sentencesAsString = new ArrayList<>(); + for (Node sentence : sentences) { + String sentenceText = sentence.getValue(); + sentenceOffsetPositions.add(new OffsetPosition(offset, offset + sentenceText.length())); + sentencesAsString.add(sentence.getValue()); + offset += sentence.getValue().length(); + } + + return Pair.of(sentencesAsString, sentenceOffsetPositions); + } + /** * The processing here is called from the header and/or full text parser in cascade @@ -188,35 +371,44 @@ public MutablePair,List,List,List,List>> - getExtractionResult(List tokenizations, String result) { + protected MutablePair>, FundingAcknowledgmentParse> getExtractionResult(List tokensParagraph, String labellingResult) { List fundings = new ArrayList<>(); List persons = new ArrayList<>(); List affiliations = new ArrayList<>(); List institutions = new ArrayList<>(); + FundingAcknowledgmentParse parsedStatement = new FundingAcknowledgmentParse(); + parsedStatement.setFundings(fundings); + parsedStatement.setPersons(persons); + parsedStatement.setAffiliations(affiliations); + // current funding Funding funding = new Funding(); // current person Person person = new Person(); - + // current organization Affiliation affiliation = new Affiliation(); Affiliation institution = new Affiliation(); - TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FUNDING_ACKNOWLEDGEMENT, result, tokenizations); + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FUNDING_ACKNOWLEDGEMENT, labellingResult, tokensParagraph); List clusters = clusteror.cluster(); TaggingLabel previousLabel = null; - Element curParagraph = teiElement("p"); - List curParagraphNodes = new ArrayList<>(); + List elements = new ArrayList<>(); + List positions = new ArrayList<>(); + int posTokenization = 0; + int posCharacters = 0; for (TaggingTokenCluster cluster : clusters) { if (cluster == null) { @@ -224,7 +416,9 @@ public MutablePair,List,List 0 && tokenizations.size()>=posTokenization && tokenizations.get(posTokenization-1).getText().equals(" ")) { + if (posTokenization > 0 + && tokensParagraph.size()>=posTokenization + && tokensParagraph.get(posTokenization-1).getText().equals(" ")) { spaceBefore = true; } @@ -232,7 +426,24 @@ public MutablePair,List,List tokens = cluster.concatTokens(); - String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(tokens)); + String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(tokens)); + + if (clusterLabel.equals(FUNDING_OTHER)) { + posTokenization += tokens.size(); + posCharacters += clusterContent.length(); + continue; + } + + // We adjust the end position when the entity ends with a space + int endPosTokenization = posTokenization + tokens.size(); + if (Iterables.getLast(tokens).getText().equals(" ")) { + endPosTokenization -= 1; + } + + int endPosCharacters = posCharacters + clusterContent.length(); + if (Iterables.getLast(tokens).getText().equals(" ")) { + endPosCharacters -= 1; + } if (clusterLabel.equals(FUNDING_FUNDER_NAME)) { Funder localFunder = funding.getFunder(); @@ -259,11 +470,9 @@ public MutablePair,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List,List> entities = MutableTriple.of(fundings, persons, affiliations); + List> annotations = new ArrayList<>(); + + for (int i = 0; i < elements.size(); i++) { + annotations.add(Pair.of(positions.get(i), elements.get(i))); + } - return MutablePair.of(curParagraph, entities); + return MutablePair.of(annotations, parsedStatement); } /** diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt new file mode 100644 index 0000000000..72011fcdc2 --- /dev/null +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -0,0 +1,117 @@ +package org.grobid.core.engines + +import org.grobid.core.engines.config.GrobidAnalysisConfig +import org.grobid.core.factory.AbstractEngineFactory +import org.grobid.core.utilities.GrobidConfig +import org.grobid.core.utilities.GrobidProperties +import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.MatcherAssert.assertThat +import org.hamcrest.Matchers.hasSize +import org.junit.Before +import org.junit.BeforeClass +import org.junit.Test + +class FundingAcknowledgementParserIntegrationTest { + + private lateinit var target: FundingAcknowledgementParser + + @Before + @Throws(Exception::class) + fun setUp() { + val modelParameters = GrobidConfig.ModelParameters() + modelParameters.name = "bao" + GrobidProperties.addModel(modelParameters) + target = FundingAcknowledgementParser() + } + + @Test + fun testXmlFragmentProcessing_withoutSentenceSegmentation_shouldReturnSameXML() { + + val input = "\n\t\t\t
\n
Acknowledgments

This research was " + + "funded by the NASA Land-Cover and Land-Use Change Program (Grant Number: 80NSSC18K0315), the NASA " + + "Carbon Monitoring System (Grant Number: 80NSSC20K0022), and

\n\t\t\t
\n\n" + + + // Expected +// val output = "\n\t\t\t
\n
Acknowledgments

This research was " + +// "funded by the NASA " + +// "Land-Cover and Land-Use Change Program " + +// "(Grant Number: 80NSSC18K0315), " + +// "the NASA Carbon Monitoring System " + +// "(Grant Number: 80NSSC20K0022), " + +// "and

\n\t\t\t
\n\n" + + // Current version output + val output = "
\n
Acknowledgments

This research was " + + "funded by the NASA " + + "Land-Cover and Land-Use Change Program " + + "(Grant Number: 80NSSC18K0315), " + + "the NASA Carbon Monitoring System " + + "(Grant Number: 80NSSC20K0022), " + + "and

\n\t\t\t
" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(false) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), `is`(output)) + assertThat(mutableTriple.left, hasSize(2)) + } + + @Test + fun testXmlFragmentProcessing2_withoutSentenceSegmentation_shouldReturnSameXML() { + val input ="\n" + + "\t\t\t
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + // Expected +// val output = "\n\t\t\t
\n" + +// "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + +// "\t\t\t
\n\n" + + // Current version output + val output = "
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(false) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), `is`(output)) + } + + @Test + fun testXmlFragmentProcessing2_withSentenceSegmentation_shouldWork() { + val input ="\n" + + "\t\t\t
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + val output = "\n\t\t\t
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), `is`(output)) + } + + companion object { + @JvmStatic + @BeforeClass + @Throws(java.lang.Exception::class) + fun setInitialContext(): Unit { + AbstractEngineFactory.init() + } + } +} \ No newline at end of file diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt index 565fb78594..cc636b4aa7 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt @@ -1,19 +1,21 @@ package org.grobid.core.engines +import nu.xom.Builder +import nu.xom.Document +import nu.xom.Element import org.grobid.core.GrobidModels import org.grobid.core.analyzers.GrobidAnalyzer import org.grobid.core.data.Funder import org.grobid.core.data.Funding import org.grobid.core.layout.LayoutToken -import org.grobid.core.lexicon.Lexicon import org.grobid.core.utilities.GrobidConfig import org.grobid.core.utilities.GrobidProperties +import org.grobid.core.utilities.LayoutTokensUtil import org.hamcrest.CoreMatchers.`is` import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.hasSize import org.junit.Before import org.junit.Test -import org.powermock.api.easymock.PowerMock class FundingAcknowledgementParserTest { @@ -22,7 +24,6 @@ class FundingAcknowledgementParserTest { @Before @Throws(Exception::class) fun setUp() { - PowerMock.mockStatic(Lexicon::class.java) val modelParameters = GrobidConfig.ModelParameters() modelParameters.name = "bao" GrobidProperties.addModel(modelParameters) @@ -103,14 +104,14 @@ class FundingAcknowledgementParserTest { val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - val (element, mutableTriple) = target.getExtractionResult(tokens, results) + val (element, fundingAcknowledgmentParse) = target.getExtractionResult(tokens, results) - assertThat(mutableTriple.left, hasSize(0)) - assertThat(mutableTriple.middle, hasSize(3)) - assertThat(mutableTriple.middle.get(0).rawName, `is`("Patrice Lopez")) - assertThat(mutableTriple.middle.get(1).rawName, `is`("Pedro Baptista de Castro")) - assertThat(mutableTriple.middle.get(2).rawName, `is`("Erina Fujita")) - assertThat(mutableTriple.right, hasSize(0)) + assertThat(fundingAcknowledgmentParse.fundings, hasSize(0)) + assertThat(fundingAcknowledgmentParse.persons, hasSize(3)) + assertThat(fundingAcknowledgmentParse.persons.get(0).rawName, `is`("Patrice Lopez")) + assertThat(fundingAcknowledgmentParse.persons.get(1).rawName, `is`("Pedro Baptista de Castro")) + assertThat(fundingAcknowledgmentParse.persons.get(2).rawName, `is`("Erina Fujita")) + assertThat(fundingAcknowledgmentParse.affiliations, hasSize(0)) } @Test @@ -156,15 +157,221 @@ class FundingAcknowledgementParserTest { val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); - val (element, mutableTriple) = target.getExtractionResult(tokens, results) + val (element, fundingAcknowledgmentParse) = target.getExtractionResult(tokens, results) - assertThat(mutableTriple.left, hasSize(1)) - val funding1: Funding = mutableTriple.left.get(0) + assertThat(fundingAcknowledgmentParse.fundings, hasSize(1)) + val funding1: Funding = fundingAcknowledgmentParse.fundings.get(0) val funder1: Funder = funding1.funder // assertThat(funder1.fullName, `is`("MEXT")) assertThat(funding1.programFullName, `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) assertThat(funder1.fullName, `is`("Ministry of Education, Culture, Sports, Science and Technology")) - assertThat(mutableTriple.middle, hasSize(0)) - assertThat(mutableTriple.right, hasSize(0)) + assertThat(fundingAcknowledgmentParse.persons, hasSize(0)) + assertThat(fundingAcknowledgmentParse.affiliations, hasSize(0)) + } + + @Test + fun extractSentencesAndPositionsFromParagraphElement_shouldReturnValidIntervals() { + //Here the namespace is already removed as it must be removed when the node arrives at the method we are testing + val input ="\n" + + "\t\t\t
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + val parser = Builder() + val localDoc: Document = parser.build(input, null) + val root = localDoc.rootElement + val paragraphs = root.query("//p") + + val firstParagraphText = paragraphs[0].value + + val (strings, offsetPositions) = FundingAcknowledgementParser.extractSentencesAndPositionsFromParagraphElement( + paragraphs[0] as Element? + ) + + assertThat(strings, hasSize(3)) + assertThat(offsetPositions, hasSize(3)) + assertThat(firstParagraphText.substring(offsetPositions[0].start, offsetPositions[0].end), + `is`("Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.")) + assertThat(firstParagraphText.substring(offsetPositions[1].start, offsetPositions[1].end), + `is`("We thank Pedro Baptista de Castro for his support during this work.")) + assertThat(firstParagraphText.substring(offsetPositions[2].start, offsetPositions[2].end), + `is`("Special thanks to Erina Fujita for useful tips on the manuscript.")) + } + + @Test + fun testGetExtractionResultNew1_ShouldReturnCorrectElementsAndPositions() { + + val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; + + val results: String = "Our\tour\tO\tOu\tOur\tOur\tr\tur\tOur\tOur\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "warmest\twarmest\tw\twa\twar\twarm\tt\tst\test\tmest\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thanks\tthanks\tt\tth\ttha\tthan\ts\tks\tnks\tanks\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "to\tto\tt\tto\tto\tto\to\tto\tto\tto\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Patrice\tpatrice\tP\tPa\tPat\tPatr\te\tce\tice\trice\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Lopez\tlopez\tL\tLo\tLop\tLope\tz\tez\tpez\topez\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "author\tauthor\ta\tau\taut\tauth\tr\tor\thor\tthor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Grobid\tgrobid\tG\tGr\tGro\tGrob\td\tid\tbid\tobid\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "22\t22\t2\t22\t22\t22\t2\t22\t22\t22\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "DeLFT\tdelft\tD\tDe\tDeL\tDeLF\tT\tFT\tLFT\teLFT\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "20\t20\t2\t20\t20\t20\t0\t20\t20\t20\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "other\tother\to\tot\toth\tothe\tr\ter\ther\tther\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "open\topen\to\top\tope\topen\tn\ten\tpen\topen\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tHYPHEN\t0\t\n" + + "source\tsource\ts\tso\tsou\tsour\te\tce\trce\turce\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "projects\tprojects\tp\tpr\tpro\tproj\ts\tts\tcts\tects\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "his\this\th\thi\this\this\ts\tis\this\this\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "continuous\tcontinuous\tc\tco\tcon\tcont\ts\tus\tous\tuous\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "support\tsupport\ts\tsu\tsup\tsupp\tt\trt\tort\tport\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "inspiration\tinspiration\ti\tin\tins\tinsp\tn\ton\tion\ttion\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "with\twith\tw\twi\twit\twith\th\tth\tith\twith\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "ideas\tideas\ti\tid\tide\tidea\ts\tas\teas\tdeas\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "suggestions\tsuggestions\ts\tsu\tsug\tsugg\ts\tns\tons\tions\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "fruitful\tfruitful\tf\tfr\tfru\tfrui\tl\tul\tful\ttful\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "discussions\tdiscussions\td\tdi\tdis\tdisc\ts\tns\tons\tions\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t\n" + + "We\twe\tW\tWe\tWe\tWe\te\tWe\tWe\tWe\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thank\tthank\tt\tth\ttha\tthan\tk\tnk\tank\thank\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Pedro\tpedro\tP\tPe\tPed\tPedr\to\tro\tdro\tedro\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Baptista\tbaptista\tB\tBa\tBap\tBapt\ta\tta\tsta\tista\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "de\tde\td\tde\tde\tde\te\tde\tde\tde\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Castro\tcastro\tC\tCa\tCas\tCast\to\tro\ttro\tstro\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "his\this\th\thi\this\this\ts\tis\this\this\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "support\tsupport\ts\tsu\tsup\tsupp\tt\trt\tort\tport\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "during\tduring\td\tdu\tdur\tduri\tg\tng\ting\tring\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "this\tthis\tt\tth\tthi\tthis\ts\tis\this\tthis\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t\n" + + "Special\tspecial\tS\tSp\tSpe\tSpec\tl\tal\tial\tcial\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "thanks\tthanks\tt\tth\ttha\tthan\ts\tks\tnks\tanks\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "to\tto\tt\tto\tto\tto\to\tto\tto\tto\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Erina\terina\tE\tEr\tEri\tErin\ta\tna\tina\trina\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Fujita\tfujita\tF\tFu\tFuj\tFuji\ta\tta\tita\tjita\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "useful\tuseful\tu\tus\tuse\tusef\tl\tul\tful\teful\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "tips\ttips\tt\tti\ttip\ttips\ts\tps\tips\ttips\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "on\ton\to\ton\ton\ton\tn\ton\ton\ton\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "manuscript\tmanuscript\tm\tma\tman\tmanu\tt\tpt\tipt\tript\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + val (spans, statement) = target.getExtractionResult(tokens, results) + + assertThat(statement.fundings, hasSize(0)) + assertThat(statement.persons, hasSize(3)) + assertThat(statement.persons[0].rawName, `is`("Patrice Lopez")) + assertThat(statement.persons[1].rawName, `is`("Pedro Baptista de Castro")) + assertThat(statement.persons[2].rawName, `is`("Erina Fujita")) + assertThat(statement.affiliations, hasSize(0)) + + assertThat(spans, hasSize(3)) + val span0 = spans[0] + val offsetPosition0 = span0.left + val element0 = span0.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("Patrice Lopez")) + assertThat(element0.toXML(), `is`("Patrice Lopez")) + + val span1 = spans[1] + val offsetPosition1 = span1.left + val element1 = span1.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Pedro Baptista de Castro")) + assertThat(element1.toXML(), `is`("Pedro Baptista de Castro")) + + val span2 = spans[2] + val offsetPosition2 = span2.left + val element2 = span2.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("Erina Fujita")) + assertThat(element2.toXML(), `is`("Erina Fujita")) + } + + @Test + fun testGetExtractionResultNew2_ShouldReturnCorrectElementsAndPositions() { + val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]."; + + val results: String = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "was\twas\tw\twa\twas\twas\ts\tas\twas\twas\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "partly\tpartly\tp\tpa\tpar\tpart\ty\tly\ttly\trtly\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "supported\tsupported\ts\tsu\tsup\tsupp\td\ted\tted\trted\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "by\tby\tb\tby\tby\tby\ty\tby\tby\tby\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "MEXT\tmext\tM\tME\tMEX\tMEXT\tT\tXT\tEXT\tMEXT\tLINEIN\tALLCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\tI-\n" + + "Program\tprogram\tP\tPr\tPro\tProg\tm\tam\tram\tgram\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + ":\t:\t:\t:\t:\t:\t:\t:\t:\t:\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tPUNCT\t0\t\n" + + "Data\tdata\tD\tDa\tDat\tData\ta\tta\tata\tData\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Creation\tcreation\tC\tCr\tCre\tCrea\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Utilization\tutilization\tU\tUt\tUti\tUtil\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tHYPHEN\t0\t\n" + + "Type\ttype\tT\tTy\tTyp\tType\te\tpe\type\tType\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Material\tmaterial\tM\tMa\tMat\tMate\tl\tal\tial\trial\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Research\tresearch\tR\tRe\tRes\tRese\th\tch\trch\tarch\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "and\tand\ta\tan\tand\tand\td\tnd\tand\tand\tLINEIN\tNOCAPS\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "Development\tdevelopment\tD\tDe\tDev\tDeve\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\tNOPUNCT\t0\t\n" + + "Project\tproject\tP\tPr\tPro\tProj\tt\tct\tect\tject\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "(\t(\t(\t(\t(\t(\t(\t(\t(\t(\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "Digital\tdigital\tD\tDi\tDig\tDigi\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Transformation\ttransformation\tT\tTr\tTra\tTran\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Initiative\tinitiative\tI\tIn\tIni\tInit\te\tve\tive\ttive\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Center\tcenter\tC\tCe\tCen\tCent\tr\ter\tter\tnter\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "for\tfor\tf\tfo\tfor\tfor\tr\tor\tfor\tfor\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Magnetic\tmagnetic\tM\tMa\tMag\tMagn\tc\tic\ttic\tetic\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "Materials\tmaterials\tM\tMa\tMat\tMate\ts\tls\tals\tials\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ")\t)\t)\t)\t)\t)\t)\t)\t)\t)\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\t\n" + + "Grant\tgrant\tG\tGr\tGra\tGran\tt\tnt\tant\trant\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Number\tnumber\tN\tNu\tNum\tNumb\tr\ter\tber\tmber\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + + "JPMXP1122715503\tjpmxp1122715503\tJ\tJP\tJPM\tJPMX\t3\t03\t503\t5503\tLINEIN\tALLCAP\tCONTAINSDIGITS\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\tI-\n" + + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + val (spans, statement) = target.getExtractionResult(tokens, results) + + assertThat(statement.fundings, hasSize(1)) + assertThat(statement.persons, hasSize(0)) + assertThat(statement.affiliations, hasSize(0)) + + assertThat(spans, hasSize(3)) + val span0 = spans[0] + val offsetPosition0 = span0.left + val element0 = span0.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("MEXT")) + assertThat(element0.toXML(), `is`("MEXT")) + + val span1 = spans[1] + val offsetPosition1 = span1.left + val element1 = span1.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) + assertThat(element1.toXML(), `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) + + val span2 = spans[2] + val offsetPosition2 = span2.left + val element2 = span2.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("JPMXP1122715503")) + assertThat(element2.toXML(), `is`("JPMXP1122715503")) } } \ No newline at end of file From 7628f4099e305551cb784c5c2d72f4d785ae1d76 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:34:14 +0900 Subject: [PATCH 15/31] publish tests results on github actions --- .github/workflows/ci-build-unstable.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index cf69c53314..444c527e00 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -25,6 +25,15 @@ jobs: - name: Test with Gradle Jacoco and Coveralls run: ./gradlew test jacocoTestReport coveralls --no-daemon + - name: Publish Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + files: | + test-results/**/*.xml + test-results/**/*.trx + test-results/**/*.json + - name: Coveralls GitHub Action uses: coverallsapp/github-action@v2 with: From 83416a92c9b26ea13bad590e83c1f37769bdd2e3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 17:44:27 +0900 Subject: [PATCH 16/31] fix test path --- .github/workflows/ci-build-unstable.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index 444c527e00..19cb5afcda 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -30,9 +30,9 @@ jobs: if: always() with: files: | - test-results/**/*.xml - test-results/**/*.trx - test-results/**/*.json + build/test-results/**/*.xml + build/test-results/**/*.trx + build/test-results/**/*.json - name: Coveralls GitHub Action uses: coverallsapp/github-action@v2 From 364176da2b5b71568f0211cc7d9cd7b0a9c8cf47 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 18:12:27 +0900 Subject: [PATCH 17/31] Fix incorrect offsets when processing paragraphs and update tests --- .../engines/FundingAcknowledgementParser.java | 142 ++++++++++-------- ...ingAcknowledgementParserIntegrationTest.kt | 10 +- 2 files changed, 86 insertions(+), 66 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 160e84854f..6be01ec4b5 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -150,8 +150,6 @@ public MutablePair,List,List tokenizationFunding = analyzer.tokenizeWithLayoutToken(paragraphText); - StringBuilder sb = new StringBuilder(); - MutablePair>, FundingAcknowledgmentParse> localResult = processing(tokenizationFunding, config); List> annotations = localResult.left; @@ -186,73 +184,17 @@ public MutablePair,List,List nodes = getNodesAnnotationsInTextNode(paragraph, annotations); - - for (int i = 0; i < paragraph.getChildCount(); i++) { - paragraph.getChild(i).detach(); - } - for (Node node: nodes) { - node.detach(); - ((Element) paragraph).appendChild(node); - } + updateParagraphNodeWithAnnotations(paragraph, annotations); } - for (Node sentence : sentences) { - String sentenceText = sentence.getValue(); - List newChildren = new ArrayList<>(); - for (int i = 0; i < sentence.getChildCount(); i++) { - //Assumption here is that the structure is flat to maximum one level down - Node currentNode = sentence.getChild(i); - if (currentNode instanceof Text) { - String text = currentNode.getValue(); - int finalPos = pos; - List> annotationsInThisChunk = annotations.stream() - .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) - .toList(); - - if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { - List nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); - newChildren.addAll(nodes); - } else { - newChildren.add(currentNode); - } - pos += text.length(); - } else if (currentNode instanceof Element) { - newChildren.add(currentNode); - pos += currentNode.getValue().length(); - } /*else { - System.out.println(currentNode); - }*/ - } - - for (int i = 0; i < sentence.getChildCount(); i++) { - sentence.getChild(i).detach(); - } - for (Node node: newChildren) { - node.detach(); - ((Element) sentence).appendChild(node); - } - - sentenceStartOffset += sentenceText.length(); - } + updateNodes(sentences, annotations); } else { - List nodes = getNodesAnnotationsInTextNode(paragraph, annotations); - - for (int i = 0; i < paragraph.getChildCount(); i++) { - paragraph.getChild(i).detach(); - } - for (Node node: nodes) { - node.detach(); - ((Element) paragraph).appendChild(node); - } + updateParagraphNodeWithAnnotations(paragraph, annotations); } // update extracted entities @@ -277,6 +219,84 @@ public MutablePair,List,List> annotations) { + int pos = 0; + List newChildren = new ArrayList<>(); + for (int i = 0; i < paragraph.getChildCount(); i++) { + //Assumption here is that the structure is flat to maximum one level down + Node currentNode = paragraph.getChild(i); + if (currentNode instanceof Text) { + String text = currentNode.getValue(); + int finalPos = pos; + List> annotationsInThisChunk = annotations.stream() + .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) + .toList(); + + if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { + List nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); + newChildren.addAll(nodes); + } else { + newChildren.add(currentNode); + } + pos += text.length(); + } else if (currentNode instanceof Element) { + newChildren.add(currentNode); + pos += currentNode.getValue().length(); + } + } + + for (int i = 0; i < paragraph.getChildCount(); i++) { + paragraph.getChild(i).detach(); + } + for (Node node: newChildren) { + node.detach(); + ((Element) paragraph).appendChild(node); + } + } + + private static void updateNodes(Nodes sentences, List> annotations) { + int pos = 0; + int sentenceStartOffset = 0; + for (Node sentence : sentences) { + String sentenceText = sentence.getValue(); + List newChildren = new ArrayList<>(); + for (int i = 0; i < sentence.getChildCount(); i++) { + //Assumption here is that the structure is flat to maximum one level down + Node currentNode = sentence.getChild(i); + if (currentNode instanceof Text) { + String text = currentNode.getValue(); + int finalPos = pos; + List> annotationsInThisChunk = annotations.stream() + .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) + .toList(); + + if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { + List nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); + newChildren.addAll(nodes); + } else { + newChildren.add(currentNode); + } + pos += text.length(); + } else if (currentNode instanceof Element) { + newChildren.add(currentNode); + pos += currentNode.getValue().length(); + } /*else { + System.out.println(currentNode); + }*/ + } + + for (int i = 0; i < sentence.getChildCount(); i++) { + sentence.getChild(i).detach(); + } + for (Node node: newChildren) { + node.detach(); + ((Element) sentence).appendChild(node); + } + + sentenceStartOffset += sentenceText.length(); + } + } + /** * This method return a list of nodes corresponding to the annotations as they are positioned in * the text content of the target node. If the node is empty, should be used @see injectedAnnotationsInNode diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index 72011fcdc2..c7413c3f24 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -48,7 +48,7 @@ class FundingAcknowledgementParserIntegrationTest { "(Grant Number: 80NSSC18K0315), " + "the NASA Carbon Monitoring System " + "(Grant Number: 80NSSC20K0022), " + - "and

\n\t\t\t" + "and

\n\t\t\t" val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .withSentenceSegmentation(false) @@ -74,7 +74,7 @@ class FundingAcknowledgementParserIntegrationTest { // Current version output val output = "
\n" + - "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + "\t\t\t
" val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() @@ -93,9 +93,9 @@ class FundingAcknowledgementParserIntegrationTest { "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + "\t\t\t\n\n" - val output = "\n\t\t\t
\n" + - "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + - "\t\t\t
\n\n" + val output = "
\n" + + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
" val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .withSentenceSegmentation(true) From 9dc767f3224ffaaed2d332aaf51a194529ef119f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 18:17:44 +0900 Subject: [PATCH 18/31] report on test failure/success --- .github/workflows/ci-build-unstable.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index 19cb5afcda..f5d7721f60 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -25,14 +25,13 @@ jobs: - name: Test with Gradle Jacoco and Coveralls run: ./gradlew test jacocoTestReport coveralls --no-daemon - - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action@v2 - if: always() + - name: Test Results + uses: dorny/test-reporter@v1 with: - files: | - build/test-results/**/*.xml - build/test-results/**/*.trx - build/test-results/**/*.json + artifact: test-results + name: JUNIT Tests + path: '*.xml' + reporter: java-junit - name: Coveralls GitHub Action uses: coverallsapp/github-action@v2 From 753a73ecbe20a591050f0accb58e9d9607e162d2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 18:42:03 +0900 Subject: [PATCH 19/31] report on test failure/success --- .github/workflows/ci-build-unstable.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index f5d7721f60..8ed302e156 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -25,13 +25,11 @@ jobs: - name: Test with Gradle Jacoco and Coveralls run: ./gradlew test jacocoTestReport coveralls --no-daemon - - name: Test Results - uses: dorny/test-reporter@v1 + - name: Publish Test Report + uses: mikepenz/action-junit-report@v4 + if: success() || failure() # always run even if the previous step fails with: - artifact: test-results - name: JUNIT Tests - path: '*.xml' - reporter: java-junit + report_paths: '**/build/test-results/test/TEST-*.xml' - name: Coveralls GitHub Action uses: coverallsapp/github-action@v2 From b2873bd473a6fdc121816d1c7698becf72771500 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 1 May 2024 19:06:30 +0900 Subject: [PATCH 20/31] enable sentence segmentation in the processing of a text chunk --- .../engines/FundingAcknowledgementParser.java | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 6be01ec4b5..3b20d704fa 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -11,6 +11,7 @@ import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.*; +import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.engines.tagging.GenericTaggerUtils; @@ -19,10 +20,8 @@ import org.grobid.core.layout.LayoutToken; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.OffsetPosition; -import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.UnicodeUtil; +import org.grobid.core.utilities.*; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,9 +69,8 @@ protected FundingAcknowledgementParser() { * For convenience, a processing method taking a raw string as input. * Tokenization is done with the default Grobid analyzer triggered by the identified language. * - * TODO: implement the sentence segmentation **/ - public MutablePair,List,List>> processing(String text, + public MutablePair, List, List>> processing(String text, GrobidAnalysisConfig config) { text = UnicodeUtil.normaliseText(text); List tokenizationFunding = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); @@ -80,9 +78,42 @@ public MutablePair,List,List, List, List> entities = MutableTriple.of(results.getRight().getFundings(), results.getRight().getPersons(), results.getRight().getAffiliations()); List> annotations = results.getLeft(); - Element outputParagraph = injectedAnnotationsInNode(tokenizationFunding, annotations, teiElement("p")); + Element outputParagraph = teiElement("p"); + outputParagraph.appendChild(text); + + if (config.isWithSentenceSegmentation()) { + List theSentences = + SentenceUtilities.getInstance().runSentenceDetection(text); + + // update the xml paragraph element + int pos = 0; + int posInSentence = 0; + for(int i=0; i=0; i--) { + Node theNode = outputParagraph.getChild(i); + if (theNode instanceof Text) { + outputParagraph.removeChild(theNode); + } else if (theNode instanceof Element) { + if (!((Element) theNode).getLocalName().equals("s")) { + outputParagraph.removeChild(theNode); + } + } + } + } - return MutablePair.of(outputParagraph, entities); + return processingXmlFragment(outputParagraph.toXML(), config); } /** From 097ca9371790d5887cc02399d197a93b2d21880b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 08:09:38 +0900 Subject: [PATCH 21/31] update xmlunit library --- build.gradle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 421879fb6c..89546fb8ac 100644 --- a/build.gradle +++ b/build.gradle @@ -111,7 +111,8 @@ subprojects { testImplementation 'org.easymock:easymock:5.1.0' testImplementation "org.powermock:powermock-api-easymock:2.0.7" testImplementation "org.powermock:powermock-module-junit4:2.0.7" - testImplementation "xmlunit:xmlunit:1.6" + testImplementation "org.xmlunit:xmlunit-matchers:2.10.0" + testImplementation "org.xmlunit:xmlunit-legacy:2.10.0" testImplementation "org.hamcrest:hamcrest-all:1.3" testImplementation 'org.jetbrains.kotlin:kotlin-test' testImplementation "io.mockk:mockk:1.13.9" From cedee649c9abc624507b80a5b8bcea29ce8cef6f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 08:10:10 +0900 Subject: [PATCH 22/31] Fix bug in the transformation of the intervals from token-based to character-based when the same tokens occur subsequently --- .../grobid/core/utilities/TextUtilities.java | 18 +++--- .../core/utilities/TextUtilitiesTest.java | 42 +++++++++++++ ...ingAcknowledgementParserIntegrationTest.kt | 32 +++++++++- .../FundingAcknowledgementParserTest.kt | 63 +++++++++++++++++++ 4 files changed, 144 insertions(+), 11 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 73ec73b352..26d520fe73 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1561,20 +1561,21 @@ public static List matchTokenAndString(List layoutT List newPositions = new ArrayList<>(); StringBuilder accumulator = new StringBuilder(); int pos = 0; + int textPositionOfToken = 0; for (OffsetPosition position : positions) { - List urlTokens = layoutTokens.subList(position.start, position.end); + List annotationTokens = layoutTokens.subList(position.start, position.end); boolean first = true; accumulator = new StringBuilder(); - for (int i = 0; i < urlTokens.size(); i++) { - LayoutToken token = urlTokens.get(i); + for (int i = 0; i < annotationTokens.size(); i++) { + LayoutToken token = annotationTokens.get(i); if (StringUtils.isEmpty(token.getText())) continue; - int newPos = text.indexOf(token.getText(), pos); - if (newPos != -1) { + textPositionOfToken = text.indexOf(token.getText(), pos); + if (textPositionOfToken != -1) { //We update pos only at the first token of the annotation positions if (first) { - pos = newPos; + pos = textPositionOfToken; first = false; } accumulator.append(token); @@ -1585,16 +1586,17 @@ public static List matchTokenAndString(List layoutT if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - pos = newPos; + pos = textPositionOfToken; break; } - pos = newPos; + pos = textPositionOfToken; } } if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); accumulator = new StringBuilder(); + pos = textPositionOfToken; } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index 4df8704ae9..4db3914aca 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -471,4 +471,46 @@ public void testMatchTokenAndString_twoElements() throws Exception { assertThat(inputReal.substring(url1.start, url1.end), is("https://github.com/lfoppiano/ supercon2")); } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception { + final String input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List urlTokens = Arrays.asList( + new OffsetPosition(0, 3), + new OffsetPosition(5, 8), + new OffsetPosition(10, 13), + new OffsetPosition(15, 18) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, urlTokens); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url0 = offsetPositions.get(0); + assertThat(url0.start, is(0)); + assertThat(url0.end, is(19)); + + assertThat(input.substring(url0.start, url0.end), is("Christophe Castagne")); + + OffsetPosition url1 = offsetPositions.get(1); + assertThat(url1.start, is(21)); + assertThat(url1.end, is(34)); + + assertThat(input.substring(url1.start, url1.end), is("Claudie Marec")); + + OffsetPosition url2 = offsetPositions.get(2); + assertThat(url2.start, is(36)); + assertThat(url2.end, is(49)); + + assertThat(input.substring(url2.start, url2.end), is("Claudie Marec")); + + OffsetPosition url3 = offsetPositions.get(3); + assertThat(url3.start, is(51)); + assertThat(url3.end, is(66)); + + assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder")); + + } } diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index c7413c3f24..eead71bbd4 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -10,6 +10,7 @@ import org.hamcrest.Matchers.hasSize import org.junit.Before import org.junit.BeforeClass import org.junit.Test +import org.xmlunit.matchers.CompareMatcher class FundingAcknowledgementParserIntegrationTest { @@ -56,7 +57,7 @@ class FundingAcknowledgementParserIntegrationTest { val (element, mutableTriple) = target.processingXmlFragment(input, config) - assertThat(element.toXML(), `is`(output)) + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) assertThat(mutableTriple.left, hasSize(2)) } @@ -83,7 +84,7 @@ class FundingAcknowledgementParserIntegrationTest { val (element, mutableTriple) = target.processingXmlFragment(input, config) - assertThat(element.toXML(), `is`(output)) + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) } @Test @@ -103,7 +104,32 @@ class FundingAcknowledgementParserIntegrationTest { val (element, mutableTriple) = target.processingXmlFragment(input, config) - assertThat(element.toXML(), `is`(output)) + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + + @Test + fun testXmlFragmentProcessing_ErrorCase_withSentenceSegmentation_shouldWork() { + val input =""" +
+

Florentina Münzner, Lucy Schlicht, Adrian Tanara, Sany Tchanra and Marie-Jeanne Pesant for the manual curation of logsheets and archiving data at PANGAEA.We also acknowledge the work of Andree Behnken who developed the dds-fdp web service.All authors approved the final manuscript.This article is contribution number 26 of the Tara Oceans Consortium.The collection of Tara Oceans data was made possible by those who contributed to sampling and to logistics during the Tara Oceans Expedition: Alain Giese, Alan Deidun, Alban Lazar, Aldine Amiel, Ali Chase, Aline Tribollet, Ameer Abdullah, Amélie Betus, André Abreu, Andres Peyrot, Andrew Baker, Anna Deniaud, Anne Doye, Anne Ghuysen Watrin, Anne Royer, Anne Thompson, Annie McGrother, Antoine Sciandra, Antoine Triller, Aurélie Chambouvet, Baptiste Bernard, Baptiste Regnier, Beatriz Fernandez, Benedetto Barone, Bertrand Manzano, Bianca Silva, Brett Grant, Brigitte Sabard, Bruno Dunckel, Camille Clérissi, Catarina Marcolin, Cédric Guigand, Céline Bachelier, Céline Blanchard, Céline Dimier-Hugueney, Céline Rottier, Chris Bowler, Christian Rouvière, Christian Sardet, Christophe Boutte, Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder, Colomban De Vargas, Cornelia Maier, Cyril Tricot, Dana Sardet, Daniel Bayley, Daniel Cron, Daniele Iudicone, David Mountain, David Obura, David Sauveur, Defne Arslan, Denis Dausse, Denis de La Broise, Diana Ruiz Pino, Didier Zoccola, Édouard Leymarie, Éloïse Fontaine, Émilie Sauvage, Emilie Villar, Emmanuel Boss, Emmanuel G. Reynaud, Éric Béraud, Eric Karsenti, Eric Pelletier, Éric Roettinger, Erica Goetz, Fabien Perault, Fabiola Canard, Fabrice Not, Fabrizio D'Ortenzio, Fabrizio Limena, Floriane Desprez, Franck Prejger, François Aurat, François Noël, Franscisco Cornejo, Gabriel Gorsky, Gabriele Procaccini, Gabriella Gilkes, Gipsi Lima-Mendez, Grigor Obolensky, Guillaume Bracq, Guillem Salazar, Halldor Stefansson, Hélène Santener, Hervé Bourmaud, Hervé Le Goff, Hiroyuki Ogata, Hubert Gautier, Hugo Sarmento, Ian Probert, Isabel Ferrera, Isabelle Taupier-Letage, Jan Wengers, Jarred Swalwell, Javier del Campo, Jean-Baptiste Romagnan, Jean-Claude Gascard, Jean-Jacques Kerdraon, Jean-Louis Jamet, Jean-Michel Grisoni, Jennifer Gillette, Jérémie Capoulade, Jérôme Bastion, Jérôme Teigné, Joannie Ferland, Johan Decelle, Judith Prihoda, Julie Poulain, Julien Daniel, Julien Girardot, Juliette Chatelin, Lars Stemmann, Laurence Garczarek, Laurent Beguery, Lee Karp-Boss, Leila Tirichine, Linda Mollestan, Lionel Bigot, Loïc Vallette, Lucie Bittner, Lucie Subirana, Luis Gutiérrez, Lydiane Mattio, Magali Puiseux, Marc Domingos, Marc Picheral, Marc Wessner, Marcela Cornejo, Margaux Carmichael, Marion Lauters, Martin Hertau, Martina Sailerova, Mathilde Ménard, Matthieu Labaste, Matthieu Oriot, Matthieu Bretaud, Mattias Ormestad, Maya Dolan, Melissa Duhaime, Michael Pitiot, Mike Lunn, Mike Sieracki, Montse Coll, Myriam Thomas, Nadine Lebois, Nicole Poulton, Nigel Grimsley, Noan Le Bescot, Oleg Simakov, Olivier Broutin, Olivier Desprez, Olivier Jaillon, Olivier Marien, Olivier Poirot, Olivier Quesnel, Pamela Labbe-Ibanez, Pascal Hingamp, Pascal Morin, Pascale Joannot, Patrick Chang, Patrick Wincker, Paul Muir, Philippe Clais, Philippe Koubbi, Pierre Testor, Rachel Moreau, Raphaël Morard, Roland Heilig, Romain Troublé, Roxana Di Mauro, Roxanne Boonstra, Ruby Pillay, Sabrina Speich, Sacha Bollet, Samuel Audrain, Sandra Da Costa, Sarah Searson, Sasha Tozzi, Sébastien Colin, Sergey Pisarev, Shirley Falcone, Sibylle Le Barrois d'Orgeval, Silvia G. Acinas, Simon Morisset, Sophie Marinesque, Sophie Nicaud, Stefanie Kandels-Lewis, Stéphane Audic, Stephane Pesant, Stéphanie Reynaud, Thierry Mansir, Thomas Lefort, Uros Krzic, Valérian Morzadec, Vincent Hilaire, Vincent Le Pennec, Vincent Taillandier, Xavier Bailly, Xavier Bougeard, Xavier Durrieu de Madron, Yann Chavance, Yann Depays, Yohann Mucherie.

+
+ +""" + + val output = """ +
+

Florentina Münzner, Lucy Schlicht, Adrian Tanara, Sany Tchanra and Marie-Jeanne Pesant for the manual curation of logsheets and archiving data at PANGAEA.We also acknowledge the work of Andree Behnken who developed the dds-fdp web service.All authors approved the final manuscript.This article is contribution number 26 of the Tara Oceans Consortium.The collection of Tara Oceans data was made possible by those who contributed to sampling and to logistics during the Tara Oceans Expedition: Alain Giese, Alan Deidun, Alban Lazar, Aldine Amiel, Ali Chase, Aline Tribollet, Ameer Abdullah, Amélie Betus, André Abreu, Andres Peyrot, Andrew Baker, Anna Deniaud, Anne Doye, Anne Ghuysen Watrin, Anne Royer, Anne Thompson, Annie McGrother, Antoine Sciandra, Antoine Triller, Aurélie Chambouvet, Baptiste Bernard, Baptiste Regnier, Beatriz Fernandez, Benedetto Barone, Bertrand Manzano, Bianca Silva, Brett Grant, Brigitte Sabard, Bruno Dunckel, Camille Clérissi, Catarina Marcolin, Cédric Guigand, Céline Bachelier, Céline Blanchard, Céline Dimier-Hugueney, Céline Rottier, Chris Bowler, Christian Rouvière, Christian Sardet, Christophe Boutte, Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder, Colomban De Vargas, Cornelia Maier, Cyril Tricot, Dana Sardet, Daniel Bayley, Daniel Cron, Daniele Iudicone, David Mountain, David Obura, David Sauveur, Defne Arslan, Denis Dausse, Denis de La Broise, Diana Ruiz Pino, Didier Zoccola, Édouard Leymarie, Éloïse Fontaine, Émilie Sauvage, Emilie Villar, Emmanuel Boss, Emmanuel G. Reynaud, Éric Béraud, Eric Karsenti, Eric Pelletier, Éric Roettinger, Erica Goetz, Fabien Perault, Fabiola Canard, Fabrice Not, Fabrizio D'Ortenzio, Fabrizio Limena, Floriane Desprez, Franck Prejger, François Aurat, François Noël, Franscisco Cornejo, Gabriel Gorsky, Gabriele Procaccini, Gabriella Gilkes, Gipsi Lima-Mendez, Grigor Obolensky, Guillaume Bracq, Guillem Salazar, Halldor Stefansson, Hélène Santener, Hervé Bourmaud, Hervé Le Goff, Hiroyuki Ogata, Hubert Gautier, Hugo Sarmento, Ian Probert, Isabel Ferrera, Isabelle Taupier-Letage, Jan Wengers, Jarred Swalwell, Javier del Campo, Jean-Baptiste Romagnan, Jean-Claude Gascard, Jean-Jacques Kerdraon, Jean-Louis Jamet, Jean-Michel Grisoni, Jennifer Gillette, Jérémie Capoulade, Jérôme Bastion, Jérôme Teigné, Joannie Ferland, Johan Decelle, Judith Prihoda, Julie Poulain, Julien Daniel, Julien Girardot, Juliette Chatelin, Lars Stemmann, Laurence Garczarek, Laurent Beguery, Lee Karp-Boss, Leila Tirichine, Linda Mollestan, Lionel Bigot, Loïc Vallette, Lucie Bittner, Lucie Subirana, Luis Gutiérrez, Lydiane Mattio, Magali Puiseux, Marc Domingos, Marc Picheral, Marc Wessner, Marcela Cornejo, Margaux Carmichael, Marion Lauters, Martin Hertau, Martina Sailerova, Mathilde Ménard, Matthieu Labaste, Matthieu Oriot, Matthieu Bretaud, Mattias Ormestad, Maya Dolan, Melissa Duhaime, Michael Pitiot, Mike Lunn, Mike Sieracki, Montse Coll, Myriam Thomas, Nadine Lebois, Nicole Poulton, Nigel Grimsley, Noan Le Bescot, Oleg Simakov, Olivier Broutin, Olivier Desprez, Olivier Jaillon, Olivier Marien, Olivier Poirot, Olivier Quesnel, Pamela Labbe-Ibanez, Pascal Hingamp, Pascal Morin, Pascale Joannot, Patrick Chang, Patrick Wincker, Paul Muir, Philippe Clais, Philippe Koubbi, Pierre Testor, Rachel Moreau, Raphaël Morard, Roland Heilig, Romain Troublé, Roxana Di Mauro, Roxanne Boonstra, Ruby Pillay, Sabrina Speich, Sacha Bollet, Samuel Audrain, Sandra Da Costa, Sarah Searson, Sasha Tozzi, Sébastien Colin, Sergey Pisarev, Shirley Falcone, Sibylle Le Barrois d'Orgeval, Silvia G. Acinas, Simon Morisset, Sophie Marinesque, Sophie Nicaud, Stefanie Kandels-Lewis, Stéphane Audic, Stephane Pesant, Stéphanie Reynaud, Thierry Mansir, Thomas Lefort, Uros Krzic, Valérian Morzadec, Vincent Hilaire, Vincent Le Pennec, Vincent Taillandier, Xavier Bailly, Xavier Bougeard, Xavier Durrieu de Madron, Yann Chavance, Yann Depays, Yohann Mucherie.

+
+ +""" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) } companion object { diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt index cc636b4aa7..b3aa7227ec 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt @@ -12,6 +12,7 @@ import org.grobid.core.utilities.GrobidConfig import org.grobid.core.utilities.GrobidProperties import org.grobid.core.utilities.LayoutTokensUtil import org.hamcrest.CoreMatchers.`is` +import org.hamcrest.CoreMatchers.not import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.hasSize import org.junit.Before @@ -374,4 +375,66 @@ class FundingAcknowledgementParserTest { assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("JPMXP1122715503")) assertThat(element2.toXML(), `is`("JPMXP1122715503")) } + + @Test + fun testGetExtractionResult_ErrorCase_ShouldReturnCorrectElementsAndPositions() { + val input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + + val results: String = "Christophe\tchristophe\tC\tCh\tChr\tChri\te\the\tphe\tophe\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Castagne\tcastagne\tC\tCa\tCas\tCast\te\tne\tgne\tagne\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + "Claudie\tclaudie\tC\tCl\tCla\tClau\te\tie\tdie\tudie\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Marec\tmarec\tM\tMa\tMar\tMare\tc\tec\trec\tarec\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + "Claudie\tclaudie\tC\tCl\tCla\tClau\te\tie\tdie\tudie\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Marec\tmarec\tM\tMa\tMar\tMare\tc\tec\trec\tarec\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + "Claudio\tclaudio\tC\tCl\tCla\tClau\to\tio\tdio\tudio\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + + "Stalder\tstalder\tS\tSt\tSta\tStal\tr\ter\tder\tlder\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" + + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + val (spans, statement) = target.getExtractionResult(tokens, results) + + assertThat(statement.fundings, hasSize(0)) + assertThat(statement.persons, hasSize(4)) + assertThat(statement.affiliations, hasSize(0)) + + assertThat(spans, hasSize(4)) + val span0 = spans[0] + val offsetPosition0 = span0.left + val element0 = span0.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("Christophe Castagne")) + assertThat(element0.toXML(), `is`("Christophe Castagne")) + + val span1 = spans[1] + val offsetPosition1 = span1.left + val element1 = span1.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Claudie Marec")) + assertThat(element1.toXML(), `is`("Claudie Marec")) + + val span2 = spans[2] + val offsetPosition2 = span2.left + val element2 = span2.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("Claudie Marec")) + assertThat(element2.toXML(), `is`("Claudie Marec")) + + // The name is the same, but the offset should be different + assertThat(offsetPosition2.start, `is`(not(offsetPosition1.start))) + assertThat(offsetPosition2.end, `is`(not(offsetPosition1.end))) + + val span3 = spans[3] + val offsetPosition3 = span3.left + val element3 = span3.right + + assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition3.start, offsetPosition3.end)), `is`("Claudio Stalder")) + assertThat(element3.toXML(), `is`("Claudio Stalder")) + } + + + } \ No newline at end of file From 83c7a1015814a69ac4e46d5a6919dded950ec8b8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 10:04:57 +0900 Subject: [PATCH 23/31] Fix bug in the transformation of the intervals from token-based to character-based when the same tokens occur subsequently and the annotation is composed by a single token --- .../grobid/core/utilities/TextUtilities.java | 4 +-- .../core/utilities/TextUtilitiesTest.java | 36 ++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 26d520fe73..163c296046 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1586,7 +1586,7 @@ public static List matchTokenAndString(List layoutT if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - pos = textPositionOfToken; + pos = textPositionOfToken + 1; break; } pos = textPositionOfToken; @@ -1596,7 +1596,7 @@ public static List matchTokenAndString(List layoutT int start = text.indexOf(accumulator.toString(), pos); newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); accumulator = new StringBuilder(); - pos = textPositionOfToken; + pos = textPositionOfToken + 1; } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index 4db3914aca..f0eaaa2887 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -13,7 +13,6 @@ import java.util.regex.Matcher; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.CoreMatchers.startsWith; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.*; @@ -513,4 +512,39 @@ public void testMatchTokenAndString_twoElementsWithEqualValue() throws Exception assertThat(input.substring(url3.start, url3.end), is("Claudio Stalder")); } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exception { + final String input = "We thank Felix Randow, Shigeki Higashiyama and Feng Zhang for plasmids.We thank Florian Steinberg for discussions and disclosure of unpublished results.We thank Matthew Freeman for helpful discussions.We express our deep gratitude to Moises Mallo for advice concerning CRISPR plus CRISPR reagents.We are grateful for the assistance of Ana Nóvoa and IGC's transgenics and mouse facilities.We thank IGC's cell sorting/flow cytometry, sequencing, and histopathology facilities."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(9, 12), + new OffsetPosition(15, 18), + new OffsetPosition(27, 30), + new OffsetPosition(49, 52), + new OffsetPosition(71, 74), + new OffsetPosition(103, 106), + new OffsetPosition(109, 110), + new OffsetPosition(125, 126) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(9)); + + OffsetPosition url7 = offsetPositions.get(7); + assertThat(url7.start, is(349)); + assertThat(url7.end, is(352)); + + assertThat(input.substring(url7.start, url7.end), is("IGC")); + + OffsetPosition url8 = offsetPositions.get(8); + assertThat(url8.start, is(397)); + assertThat(url8.end, is(400)); + + assertThat(input.substring(url8.start, url8.end), is("IGC")); + + } } From 39892ff5f636143bd331c06c05e07ef37e18a4e4 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 11:18:34 +0900 Subject: [PATCH 24/31] Fix wrong Xpath expression --- .../engines/FundingAcknowledgementParser.java | 6 ++--- .../grobid/core/utilities/TextUtilities.java | 2 +- ...ingAcknowledgementParserIntegrationTest.kt | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 3b20d704fa..768855af7b 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -215,7 +215,7 @@ public MutablePair,List,List,List,List> annotations) { + private static void updateSentencesNodes(Nodes sentences, List> annotations) { int pos = 0; int sentenceStartOffset = 0; for (Node sentence : sentences) { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 163c296046..a8ea6a7c3e 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1586,7 +1586,7 @@ public static List matchTokenAndString(List layoutT if (StringUtils.isNotEmpty(accumulator)) { int start = text.indexOf(accumulator.toString(), pos); newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - pos = textPositionOfToken + 1; + pos = textPositionOfToken; break; } pos = textPositionOfToken; diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index eead71bbd4..32c96f868c 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -132,6 +132,32 @@ class FundingAcknowledgementParserIntegrationTest { assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) } + @Test + fun testXmlFragmentProcessing_ErrorCase2_withSentenceSegmentation_shouldWork() { + val input =""" +
+
Acknowledgements

The authors would like to acknowledge Lucy Popplewell in the preparation of EMR notes for this study.

+
The authors would like to acknowledge Keele University's Prognosis and Consultation Epidemiology

Research Group who have given us permission to utilise the morbidity definitions (©2014).The copyright of the morbidity definitions/categorization lists (©2014) used in this publication is owned by Keele University, the development of which was supported by the Primary Care Research Consortium; For access/details relating to the morbidity definitions/categorisation lists (©2014) please go to www.keele.ac.uk/mrr.

+
+ +""" + + val output =""" +
+
Acknowledgements

The authors would like to acknowledge Lucy Popplewell in the preparation of EMR notes for this study.

+
The authors would like to acknowledge Keele University's Prognosis and Consultation Epidemiology

Research Group who have given us permission to utilise the morbidity definitions (©2014).The copyright of the morbidity definitions/categorization lists (©2014) used in this publication is owned by Keele University, the development of which was supported by the Primary Care Research Consortium; For access/details relating to the morbidity definitions/categorisation lists (©2014) please go to www.keele.ac.uk/mrr.

+
+ +""" + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + companion object { @JvmStatic @BeforeClass From 48779a2c1e22952aded0ef0a81986af180c7d861 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 12:50:26 +0900 Subject: [PATCH 25/31] Fix another corner case --- .../grobid/core/utilities/TextUtilities.java | 12 ++++++---- .../core/utilities/TextUtilitiesTest.java | 23 +++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index a8ea6a7c3e..f0e6cf03af 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -1584,19 +1584,23 @@ public static List matchTokenAndString(List layoutT continue; } if (StringUtils.isNotEmpty(accumulator)) { + int accumulatorTextLength = accumulator.toString().length(); int start = text.indexOf(accumulator.toString(), pos); - newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); - pos = textPositionOfToken; + int end = start + accumulatorTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; break; } pos = textPositionOfToken; } } if (StringUtils.isNotEmpty(accumulator)) { + int annotationTextLength = accumulator.toString().length(); int start = text.indexOf(accumulator.toString(), pos); - newPositions.add(new OffsetPosition(start, start + accumulator.toString().length())); + int end = start + annotationTextLength; + newPositions.add(new OffsetPosition(start, end)); + pos = end; accumulator = new StringBuilder(); - pos = textPositionOfToken + 1; } } diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java index f0eaaa2887..8b53cc263e 100644 --- a/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java +++ b/grobid-core/src/test/java/org/grobid/core/utilities/TextUtilitiesTest.java @@ -547,4 +547,27 @@ public void testMatchTokenAndString_twoElementsWithEqualValue2() throws Exceptio assertThat(input.substring(url8.start, url8.end), is("IGC")); } + + @Test + public void testMatchTokenAndString_twoElementsWithEqualValue3() throws Exception { + final String input = "We thank Benoit Demars for providing reaeration data and comments that signficantly improved the manuscript.This study was supported a NERC Case studentship awarded to DP, GYD and SJ, an ERC starting grant awarded to GYD, and the University of Exeter."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + List annotationTokenPositions = Arrays.asList( + new OffsetPosition(4, 7), + new OffsetPosition(40, 41), + new OffsetPosition(62, 63), + new OffsetPosition(79, 84) + ); + + List offsetPositions = TextUtilities.matchTokenAndString(tokenisedInput, input, annotationTokenPositions); + + assertThat(offsetPositions, hasSize(4)); + + OffsetPosition url7 = offsetPositions.get(1); + assertThat(input.substring(url7.start, url7.end), is("NERC")); + + OffsetPosition url8 = offsetPositions.get(2); + assertThat(input.substring(url8.start, url8.end), is("ERC")); + } } From e15416733f72a948267815197d29b0447407632f Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 13:25:08 +0900 Subject: [PATCH 26/31] cleanup --- .../engines/FundingAcknowledgementParser.java | 28 ++++--------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 768855af7b..72be438603 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -183,14 +183,13 @@ public MutablePair,List,List>, FundingAcknowledgmentParse> localResult = processing(tokenizationFunding, config); - List> annotations = localResult.left; - FundingAcknowledgmentParse localEntities = localResult.right; - - if (CollectionUtils.isEmpty(annotations)) { + if (localResult == null || CollectionUtils.isEmpty(localResult.left)) { continue; } + List> annotations = localResult.left; + FundingAcknowledgmentParse localEntities = localResult.right; - List list = annotations.stream().map(a -> a.getLeft()).toList(); + List list = annotations.stream().map(Pair::getLeft).toList(); List annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, list); List> annotationsWithPosRefToText = new ArrayList<>(); for (int i = 0; i < annotationsPositionText.size(); i++) { @@ -200,21 +199,6 @@ public MutablePair,List,List, List> sentenceInformation = extractSentencesAndPositionsFromParagraphElement(rootElementStatement); -// -// List sentencesList = sentenceInformation.getLeft(); -// List offsetPositionList = sentenceInformation.getRight(); -// -// List> sentenceLayoutTokens = sentencesList.stream() -// .map(analyzer::tokenizeWithLayoutToken) -// .toList(); -// -// List sentenceTokenPositions = new ArrayList<>(); -// int pos = 0; -// for (List sentenceLayoutToken : sentenceLayoutTokens) { -// offsetPositionList.add(new OffsetPosition(pos, pos + sentenceLayoutToken.size())); -// pos += sentenceLayoutToken.size(); -// } Nodes sentences = paragraph.query(".//s"); if(sentences.size() == 0) { @@ -223,7 +207,7 @@ public MutablePair,List,List> annotations) { + private static void updateSentencesNodesWithAnnotations(Nodes sentences, List> annotations) { int pos = 0; int sentenceStartOffset = 0; for (Node sentence : sentences) { From 21a0cdd7d50faa24704cf55375dc37b212e08729 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sat, 4 May 2024 13:25:58 +0900 Subject: [PATCH 27/31] add --open of java.base/java.io (warn from huggingface spaces) --- Dockerfile.crf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.crf b/Dockerfile.crf index 55d8fdca21..6b2383f362 100644 --- a/Dockerfile.crf +++ b/Dockerfile.crf @@ -78,7 +78,7 @@ WORKDIR /opt/grobid COPY --from=builder /opt/grobid . -ENV GROBID_SERVICE_OPTS "-Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep --add-opens java.base/java.lang=ALL-UNNAMED" +ENV GROBID_SERVICE_OPTS "-Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep --add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED" CMD ["./grobid-service/bin/grobid-service"] From fb17eece22f25f27184a6cbcfa66542f71483fab Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 5 May 2024 14:08:41 +0900 Subject: [PATCH 28/31] fix lost of the last entity that was sharing boundary with the sentence --- .../engines/FundingAcknowledgementParser.java | 7 +++--- ...ingAcknowledgementParserIntegrationTest.kt | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 72be438603..2b202ea4cb 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -22,7 +22,6 @@ import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.UnicodeUtil; import org.grobid.core.utilities.*; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -244,7 +243,7 @@ private static void updateParagraphNodeWithAnnotations(Node paragraph, List> annotationsInThisChunk = annotations.stream() - .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) + .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end <= finalPos + text.length()) .toList(); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { @@ -282,7 +281,7 @@ private static void updateSentencesNodesWithAnnotations(Nodes sentences, List> annotationsInThisChunk = annotations.stream() - .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end < finalPos + text.length()) + .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end <= finalPos + text.length()) .toList(); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { @@ -357,7 +356,7 @@ protected static List getNodesAnnotationsInTextNode(Node targetNode, List< return outputNodes; } - private static @NotNull MutablePair, List, List>> aggregateResults(MutableTriple, List, List> localEntities, MutablePair, List, List>> globalResult) { + private static MutablePair, List, List>> aggregateResults(MutableTriple, List, List> localEntities, MutablePair, List, List>> globalResult) { MutableTriple,List,List> globalEntities = globalResult.getRight(); List localFundings = localEntities.getLeft(); diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index 32c96f868c..04fefa973b 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -148,6 +148,30 @@ class FundingAcknowledgementParserIntegrationTest {
The authors would like to acknowledge Keele University's Prognosis and Consultation Epidemiology

Research Group who have given us permission to utilise the morbidity definitions (©2014).The copyright of the morbidity definitions/categorization lists (©2014) used in this publication is owned by Keele University, the development of which was supported by the Primary Care Research Consortium; For access/details relating to the morbidity definitions/categorisation lists (©2014) please go to www.keele.ac.uk/mrr.

+""" + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + + @Test + fun testXmlFragmentProcessing_ErrorCase3_withSentenceSegmentation_shouldWork() { + val input =""" +
+
Funding

This work was supported by European Molecular Biology Laboratory, the NSF award "BIGDATA: Mid-Scale: DA: ESCE: Collaborative Research: Scalable Statistical Computing for Emerging Omics Data Streams" and Genentech Inc.

+
+ +""" + + val output =""" +
+
Funding

This work was supported by European Molecular Biology Laboratory, the NSF award "BIGDATA: Mid-Scale: DA: ESCE: Collaborative Research: Scalable Statistical Computing for Emerging Omics Data Streams" and Genentech Inc.

+
+ """ val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() .withSentenceSegmentation(true) From 633651209b7bf69509b100fb6c7cbb1e33948938 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 5 May 2024 20:57:05 +0900 Subject: [PATCH 29/31] merge sentences whose boundaries are clashing with the annotations from the funding-acknowledgment --- .../engines/FundingAcknowledgementParser.java | 148 ++++++++++++++++-- .../core/utilities/SentenceUtilities.java | 47 +++--- ...ingAcknowledgementParserIntegrationTest.kt | 20 +++ 3 files changed, 180 insertions(+), 35 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 2b202ea4cb..d1199fbbb8 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -17,20 +17,26 @@ import org.grobid.core.engines.tagging.GenericTaggerUtils; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.features.FeaturesVectorFunding; +import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.UnicodeUtil; import org.grobid.core.utilities.*; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; import java.util.List; +import java.util.stream.Collectors; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.engines.label.TaggingLabels.*; +import static org.grobid.core.layout.VectorGraphicBoxCalculator.mergeBoxes; public class FundingAcknowledgementParser extends AbstractParser { @@ -153,11 +159,11 @@ protected static Element injectedAnnotationsInNode(List tokenizatio } /** - * For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p) + * For convenience, a processing method taking an TEI XML segment as input - only paragraphs (Element p) * will be processed in this segment and paragraph element will be replaced with the processed content. * Resulting entities are relative to the whole processed XML segment. - * - * Tokenization is done with the default Grobid analyzer triggered by the identified language. + * + * Tokenization is done with the default Grobid analyzer triggered by the identified language. **/ public MutablePair,List,List>> processingXmlFragment(String tei, GrobidAnalysisConfig config) { @@ -188,8 +194,8 @@ public MutablePair,List,List> annotations = localResult.left; FundingAcknowledgmentParse localEntities = localResult.right; - List list = annotations.stream().map(Pair::getLeft).toList(); - List annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, list); + List annotationsPositionTokens = annotations.stream().map(Pair::getLeft).toList(); + List annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, annotationsPositionTokens); List> annotationsWithPosRefToText = new ArrayList<>(); for (int i = 0; i < annotationsPositionText.size(); i++) { annotationsWithPosRefToText.add(Pair.of(annotationsPositionText.get(i), annotations.get(i).getRight())); @@ -205,7 +211,7 @@ public MutablePair,List,List,List,ListThis method modify the sentences in input + */ + private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List> annotations, GrobidAnalysisConfig config) { + // We merge the sentences (including their coordinates) for which the annotations + // are falling in between two of them or they will be lost later. + + List sentencePositions = getOffsetPositionsFromNodes(sentences); + + // We obtain the corrected coordinates that don't fall over the annotations + List correctedOffsetPositions = SentenceUtilities.correctSentencePositions(sentencePositions, annotations + .stream() + .map(Pair::getLeft).toList()); + + List toRemove = new ArrayList<>(); + for (OffsetPosition correctedOffsetPosition : correctedOffsetPositions) { + List originalSentences = sentencePositions.stream() + .filter(a -> a.start >= correctedOffsetPosition.start && a.end <= correctedOffsetPosition.end) + .toList(); + + // if for each "corrected sentences offset" there are more than one original sentence that + // falls into it, it means we need to merge + if (originalSentences.size() > 1) { + List toMerge = originalSentences.stream() + .map(sentencePositions::indexOf) + .toList(); + + Element destination = (Element) sentences.get(toMerge.get(0)); + boolean needToMergeCoordinates = config.isGenerateTeiCoordinates("s"); + List boundingBoxes = new ArrayList<>(); + Attribute destCoordinates = null; + + if (needToMergeCoordinates) { + destCoordinates = destination.getAttribute("coords"); + String coordinates = destCoordinates.getValue(); + boundingBoxes = Arrays.stream(coordinates.split(";")) + .map(BoundingBox::fromString) + .collect(Collectors.toList()); + } + + for (int i = 1; i < toMerge.size(); i++) { + Integer sentenceToMergeIndex = toMerge.get(i); + Node sentenceToMerge = sentences.get(sentenceToMergeIndex); + + // Merge coordinates + if (needToMergeCoordinates) { + Attribute coords = destination.getAttribute("coords"); + String coordinates = coords.getValue(); + boundingBoxes.addAll(Arrays.stream(coordinates.split(";")) + .map(BoundingBox::fromString) + .toList()); + + List mergedBoundingBoxes = mergeBoxes(boundingBoxes); + String coordsAsString = String.join(";", mergedBoundingBoxes.stream().map(BoundingBox::toString).toList()); + Attribute newCoords = new Attribute("coords", coordsAsString); + destination.removeAttribute(coords); + destination.addAttribute(newCoords); + } + + // Merge content + boolean first = true; + Node previous = null; + for (int c = 0; c < sentenceToMerge.getChildCount(); c++) { + Node child = sentenceToMerge.getChild(c); + + if (first) { + first = false; + Node lastNodeDestination = destination.getChild(destination.getChildCount() - 1); + previous = lastNodeDestination; +// if (lastNodeDestination instanceof Text) { +// ((Text) lastNodeDestination).setValue(((Text) lastNodeDestination).getValue() + " "); +// previous = lastNodeDestination; +// } else { +// Text newSpace = new Text(" "); +// destination.appendChild(newSpace); +// previous = newSpace; +// } + } + + if (previous instanceof Text && child instanceof Text) { + ((Text) previous).setValue(previous.getValue() + child.getValue()); + } else { + ((Element) sentenceToMerge).replaceChild(child, new Text("placeholder")); + child.detach(); + destination.appendChild(child); + previous = child; + } + } + sentenceToMerge.detach(); + toRemove.add(sentenceToMergeIndex); + } + } + } + toRemove.stream() + .sorted(Comparator.reverseOrder()) + .forEach(sentences::remove); + + return sentences; + } + + private static @NotNull List getOffsetPositionsFromNodes(Nodes sentences) { + List sentencePositions = new ArrayList<>(); + int start = 0; + for (Node sentence : sentences) { + int end = start + sentence.getValue().length(); + sentencePositions.add(new OffsetPosition(start, end)); + start = end; + } + return sentencePositions; + } + private static void updateParagraphNodeWithAnnotations(Node paragraph, List> annotations) { int pos = 0; List newChildren = new ArrayList<>(); @@ -400,18 +520,18 @@ protected static Pair, List> extractSentencesAndPos * The processing here is called from the header and/or full text parser in cascade * when one of these higher-level model detect a "funding" section, or in case * no funding section is found, when a acknolwedgements section is detected. - * - * Independently from the place this parser is called, it process the input sequence - * of layout tokens in a context free manner. - * + * + * Independently from the place this parser is called, it process the input sequence + * of layout tokens in a context free manner. + * * The expected input here is a paragraph. * * // This returns a Element of the annotation and the position where should be injected, relative to the paragraph. * // TODO: make new data objects for the annotations - * - * Return an XML fragment with inline annotations of the input text, together with - * extracted normalized entities. These entities are referenced by the inline - * annotations with the usual @target attribute pointing to xml:id. + * + * Return an XML fragment with inline annotations of the input text, together with + * extracted normalized entities. These entities are referenced by the inline + * annotations with the usual @target attribute pointing to xml:id. */ protected MutablePair>, FundingAcknowledgmentParse> getExtractionResult(List tokensParagraph, String labellingResult) { List fundings = new ArrayList<>(); diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java index c0b4498835..7446f26bc5 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java @@ -141,27 +141,7 @@ public List runSentenceDetection(String text, List finalSentencePositions = new ArrayList<>(); - int forbiddenIndex = 0; - for(int j=0; j < sentencePositions.size(); j++) { - OffsetPosition position = sentencePositions.get(j); - for(int i=forbiddenIndex; i < forbidden.size(); i++) { - OffsetPosition forbiddenPos = forbidden.get(i); - if (forbiddenPos.end < position.end) - continue; - if (forbiddenPos.start > position.end) - break; - while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) { - if (j+1 < sentencePositions.size()) { - position.end = sentencePositions.get(j+1).end; - j++; - forbiddenIndex = i; - } else - break; - } - } - finalSentencePositions.add(position); - } + List finalSentencePositions = correctSentencePositions(sentencePositions, forbidden); // as a heuristics for all implementations, because they clearly all fail for this case, we // attached to the right sentence the numerical bibliographical references markers expressed @@ -286,6 +266,31 @@ public List runSentenceDetection(String text, List correctSentencePositions(List sentencePositions, List forbiddenPositions) { + List finalSentencePositions = new ArrayList<>(); + int forbiddenIndex = 0; + for(int j = 0; j < sentencePositions.size(); j++) { + OffsetPosition position = new OffsetPosition(sentencePositions.get(j).start, sentencePositions.get(j).end); + for(int i = forbiddenIndex; i < forbiddenPositions.size(); i++) { + OffsetPosition forbiddenPos = forbiddenPositions.get(i); + if (forbiddenPos.end < position.end) + continue; + if (forbiddenPos.start > position.end) + break; + while ( (forbiddenPos.start < position.end && position.end < forbiddenPos.end) ) { + if (j+1 < sentencePositions.size()) { + position.end = sentencePositions.get(j+1).end; + j++; + forbiddenIndex = i; + } else + break; + } + } + finalSentencePositions.add(position); + } + return finalSentencePositions; + } + /** * Return true if the token should be skipped when considering sentence content. */ diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index 04fefa973b..17bf78d85b 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -182,6 +182,26 @@ class FundingAcknowledgementParserIntegrationTest { assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) } + @Test + fun testXmlFragmentProcessing_mergingSentences_shouldMergeCorrectly() { + val input ="\n" + + "\t\t\t
\n" + + "
Acknowledgements

Our warmest thanks to PatriceLopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro BaptistadeCastro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
\n\n" + + val output = "
\n" + + "
Acknowledgements

Our warmest thanks to PatriceLopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro BaptistadeCastro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + + "\t\t\t
" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + companion object { @JvmStatic @BeforeClass From 91991706c556c15f57d2a99c687a1f696cb0c628 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 6 May 2024 07:56:02 +0900 Subject: [PATCH 30/31] fix coordinates merge --- .../grobid/core/engines/FundingAcknowledgementParser.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index d1199fbbb8..0a5994b02f 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -278,8 +278,10 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List mergedBoundingBoxes = mergeBoxes(boundingBoxes); String coordsAsString = String.join(";", mergedBoundingBoxes.stream().map(BoundingBox::toString).toList()); Attribute newCoords = new Attribute("coords", coordsAsString); - destination.removeAttribute(coords); destination.addAttribute(newCoords); } From c70d6d3a04e9604c32ad5c0fe51c8cdbc8d98308 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 8 May 2024 09:39:59 +0900 Subject: [PATCH 31/31] Fix merging of coordinates to avoid merge when on different pages, add object for annotations with xml nodes --- .../grobid/core/data/AnnotatedXMLElement.java | 35 +++++++++ .../engines/FundingAcknowledgementParser.java | 73 ++++++++--------- ...ingAcknowledgementParserIntegrationTest.kt | 78 ++++++++++++++++--- .../FundingAcknowledgementParserTest.kt | 68 ++++++++-------- 4 files changed, 176 insertions(+), 78 deletions(-) create mode 100644 grobid-core/src/main/java/org/grobid/core/data/AnnotatedXMLElement.java diff --git a/grobid-core/src/main/java/org/grobid/core/data/AnnotatedXMLElement.java b/grobid-core/src/main/java/org/grobid/core/data/AnnotatedXMLElement.java new file mode 100644 index 0000000000..8a9b28e02a --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/data/AnnotatedXMLElement.java @@ -0,0 +1,35 @@ +package org.grobid.core.data; + +import nu.xom.Element; +import org.grobid.core.utilities.OffsetPosition; + +/** + * This class represent an annotation in an XML node. + * The annotation is composed by two information: the XML Element node and the offset position + */ +public class AnnotatedXMLElement { + + private OffsetPosition offsetPosition; + private Element annotationNode; + + public AnnotatedXMLElement(Element annotationNode, OffsetPosition offsetPosition) { + this.annotationNode = annotationNode; + this.offsetPosition = offsetPosition; + } + + public OffsetPosition getOffsetPosition() { + return offsetPosition; + } + + public void setOffsetPosition(OffsetPosition offsetPosition) { + this.offsetPosition = offsetPosition; + } + + public Element getAnnotationNode() { + return annotationNode; + } + + public void setAnnotationNode(Element annotationNode) { + this.annotationNode = annotationNode; + } +} diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 0a5994b02f..cfeef3637f 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -9,6 +9,7 @@ import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.GrobidModel; import org.grobid.core.GrobidModels; +import org.grobid.core.data.AnnotatedXMLElement; import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.*; import org.grobid.core.document.xml.XmlBuilderUtils; @@ -23,15 +24,11 @@ import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.UnicodeUtil; import org.grobid.core.utilities.*; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; +import java.util.*; import java.util.stream.Collectors; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; @@ -50,8 +47,7 @@ protected FundingAcknowledgementParser() { super(model); } - private MutablePair>, FundingAcknowledgmentParse> - processing(List tokenizationFunding, GrobidAnalysisConfig config) { + private MutablePair, FundingAcknowledgmentParse> processing(List tokenizationFunding, GrobidAnalysisConfig config) { if (CollectionUtils.isEmpty(tokenizationFunding)) { return null; } @@ -78,10 +74,10 @@ protected FundingAcknowledgementParser() { public MutablePair, List, List>> processing(String text, GrobidAnalysisConfig config) { text = UnicodeUtil.normaliseText(text); - List tokenizationFunding = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); - MutablePair>, FundingAcknowledgmentParse> results = processing(tokenizationFunding, config); - MutableTriple, List, List> entities = MutableTriple.of(results.getRight().getFundings(), results.getRight().getPersons(), results.getRight().getAffiliations()); - List> annotations = results.getLeft(); +// List tokenizationFunding = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); +// MutablePair, FundingAcknowledgmentParse> results = processing(tokenizationFunding, config); +// MutableTriple, List, List> entities = MutableTriple.of(results.getRight().getFundings(), results.getRight().getPersons(), results.getRight().getAffiliations()); +// List annotations = results.getLeft(); Element outputParagraph = teiElement("p"); outputParagraph.appendChild(text); @@ -186,19 +182,19 @@ public MutablePair,List,List tokenizationFunding = analyzer.tokenizeWithLayoutToken(paragraphText); - MutablePair>, FundingAcknowledgmentParse> localResult = processing(tokenizationFunding, config); + MutablePair, FundingAcknowledgmentParse> localResult = processing(tokenizationFunding, config); if (localResult == null || CollectionUtils.isEmpty(localResult.left)) { continue; } - List> annotations = localResult.left; + List annotations = localResult.left; FundingAcknowledgmentParse localEntities = localResult.right; - List annotationsPositionTokens = annotations.stream().map(Pair::getLeft).toList(); + List annotationsPositionTokens = annotations.stream().map(AnnotatedXMLElement::getOffsetPosition).toList(); List annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, annotationsPositionTokens); - List> annotationsWithPosRefToText = new ArrayList<>(); + List annotationsWithPosRefToText = new ArrayList<>(); for (int i = 0; i < annotationsPositionText.size(); i++) { - annotationsWithPosRefToText.add(Pair.of(annotationsPositionText.get(i), annotations.get(i).getRight())); + annotationsWithPosRefToText.add(new AnnotatedXMLElement(annotations.get(i).getAnnotationNode(), annotationsPositionText.get(i))); } annotations = annotationsWithPosRefToText; @@ -245,7 +241,7 @@ public MutablePair,List,ListThis method modify the sentences in input */ - private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List> annotations, GrobidAnalysisConfig config) { + private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List annotations, GrobidAnalysisConfig config) { // We merge the sentences (including their coordinates) for which the annotations // are falling in between two of them or they will be lost later. @@ -254,7 +250,7 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List correctedOffsetPositions = SentenceUtilities.correctSentencePositions(sentencePositions, annotations .stream() - .map(Pair::getLeft).toList()); + .map(AnnotatedXMLElement::getOffsetPosition).toList()); List toRemove = new ArrayList<>(); for (OffsetPosition correctedOffsetPosition : correctedOffsetPositions) { @@ -297,8 +293,15 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List mergedBoundingBoxes = mergeBoxes(boundingBoxes); - String coordsAsString = String.join(";", mergedBoundingBoxes.stream().map(BoundingBox::toString).toList()); + // Group by page, then merge + List postMergeBoxes = new ArrayList<>(); + Map> boundingBoxesByPage = boundingBoxes.stream().collect(Collectors.groupingBy(BoundingBox::getPage)); + for(Map.Entry> boxesByPages : boundingBoxesByPage.entrySet()) { + List mergedBoundingBoxes = mergeBoxes(boxesByPages.getValue()); + postMergeBoxes.addAll(mergedBoundingBoxes); + } + + String coordsAsString = String.join(";", postMergeBoxes.stream().map(BoundingBox::toString).toList()); Attribute newCoords = new Attribute("coords", coordsAsString); destination.addAttribute(newCoords); } @@ -344,7 +347,7 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List getOffsetPositionsFromNodes(Nodes sentences) { + private static List getOffsetPositionsFromNodes(Nodes sentences) { List sentencePositions = new ArrayList<>(); int start = 0; for (Node sentence : sentences) { @@ -355,7 +358,7 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List> annotations) { + private static void updateParagraphNodeWithAnnotations(Node paragraph, List annotations) { int pos = 0; List newChildren = new ArrayList<>(); for (int i = 0; i < paragraph.getChildCount(); i++) { @@ -364,8 +367,8 @@ private static void updateParagraphNodeWithAnnotations(Node paragraph, List> annotationsInThisChunk = annotations.stream() - .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end <= finalPos + text.length()) + List annotationsInThisChunk = annotations.stream() + .filter(a -> a.getOffsetPosition().start >= finalPos && a.getOffsetPosition().end <= finalPos + text.length()) .toList(); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { @@ -390,7 +393,7 @@ private static void updateParagraphNodeWithAnnotations(Node paragraph, List> annotations) { + private static void updateSentencesNodesWithAnnotations(Nodes sentences, List annotations) { int pos = 0; int sentenceStartOffset = 0; for (Node sentence : sentences) { @@ -402,8 +405,8 @@ private static void updateSentencesNodesWithAnnotations(Nodes sentences, List> annotationsInThisChunk = annotations.stream() - .filter(a -> a.getLeft().start >= finalPos && a.getLeft().end <= finalPos + text.length()) + List annotationsInThisChunk = annotations.stream() + .filter(a -> a.getOffsetPosition().start >= finalPos && a.getOffsetPosition().end <= finalPos + text.length()) .toList(); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { @@ -438,7 +441,7 @@ private static void updateSentencesNodesWithAnnotations(Nodes sentences, List getNodesAnnotationsInTextNode(Node targetNode, List> annotations) { + protected static List getNodesAnnotationsInTextNode(Node targetNode, List annotations) { return getNodesAnnotationsInTextNode(targetNode, annotations, 0); } @@ -446,15 +449,15 @@ protected static List getNodesAnnotationsInTextNode(Node targetNode, List< * The sentence offset allow to calculate the position relative to the sentence of annotations that * have been calculated in relation with the paragraph. */ - protected static List getNodesAnnotationsInTextNode(Node targetNode, List> annotations, int sentenceOffset) { + protected static List getNodesAnnotationsInTextNode(Node targetNode, List annotations, int sentenceOffset) { String text = targetNode.getValue(); List outputNodes = new ArrayList<>(); int pos = 0; - for (Pair annotation : annotations) { - OffsetPosition annotationPosition = annotation.getLeft(); - Element annotationContentElement = annotation.getRight(); + for (AnnotatedXMLElement annotation : annotations) { + OffsetPosition annotationPosition = annotation.getOffsetPosition(); + Element annotationContentElement = annotation.getAnnotationNode(); String before = text.substring(pos, annotationPosition.start - sentenceOffset); @@ -535,7 +538,7 @@ protected static Pair, List> extractSentencesAndPos * extracted normalized entities. These entities are referenced by the inline * annotations with the usual @target attribute pointing to xml:id. */ - protected MutablePair>, FundingAcknowledgmentParse> getExtractionResult(List tokensParagraph, String labellingResult) { + protected MutablePair, FundingAcknowledgmentParse> getExtractionResult(List tokensParagraph, String labellingResult) { List fundings = new ArrayList<>(); List persons = new ArrayList<>(); List affiliations = new ArrayList<>(); @@ -827,10 +830,10 @@ protected MutablePair>, FundingAcknowledgment localFunding.inferAcronyms(); } - List> annotations = new ArrayList<>(); + List annotations = new ArrayList<>(); for (int i = 0; i < elements.size(); i++) { - annotations.add(Pair.of(positions.get(i), elements.get(i))); + annotations.add(new AnnotatedXMLElement(elements.get(i), positions.get(i))); } return MutablePair.of(annotations, parsedStatement); diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt index 17bf78d85b..4ae2422567 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserIntegrationTest.kt @@ -4,13 +4,13 @@ import org.grobid.core.engines.config.GrobidAnalysisConfig import org.grobid.core.factory.AbstractEngineFactory import org.grobid.core.utilities.GrobidConfig import org.grobid.core.utilities.GrobidProperties -import org.hamcrest.CoreMatchers.`is` import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.hasSize import org.junit.Before import org.junit.BeforeClass import org.junit.Test import org.xmlunit.matchers.CompareMatcher +import java.util.* class FundingAcknowledgementParserIntegrationTest { @@ -63,7 +63,7 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing2_withoutSentenceSegmentation_shouldReturnSameXML() { - val input ="\n" + + val input = "\n" + "\t\t\t
\n" + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + "\t\t\t
\n\n" @@ -89,7 +89,7 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing2_withSentenceSegmentation_shouldWork() { - val input ="\n" + + val input = "\n" + "\t\t\t
\n" + "
Acknowledgements

Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro Baptista de Castro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + "\t\t\t
\n\n" @@ -109,7 +109,7 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing_ErrorCase_withSentenceSegmentation_shouldWork() { - val input =""" + val input = """

Florentina Münzner, Lucy Schlicht, Adrian Tanara, Sany Tchanra and Marie-Jeanne Pesant for the manual curation of logsheets and archiving data at PANGAEA.We also acknowledge the work of Andree Behnken who developed the dds-fdp web service.All authors approved the final manuscript.This article is contribution number 26 of the Tara Oceans Consortium.The collection of Tara Oceans data was made possible by those who contributed to sampling and to logistics during the Tara Oceans Expedition: Alain Giese, Alan Deidun, Alban Lazar, Aldine Amiel, Ali Chase, Aline Tribollet, Ameer Abdullah, Amélie Betus, André Abreu, Andres Peyrot, Andrew Baker, Anna Deniaud, Anne Doye, Anne Ghuysen Watrin, Anne Royer, Anne Thompson, Annie McGrother, Antoine Sciandra, Antoine Triller, Aurélie Chambouvet, Baptiste Bernard, Baptiste Regnier, Beatriz Fernandez, Benedetto Barone, Bertrand Manzano, Bianca Silva, Brett Grant, Brigitte Sabard, Bruno Dunckel, Camille Clérissi, Catarina Marcolin, Cédric Guigand, Céline Bachelier, Céline Blanchard, Céline Dimier-Hugueney, Céline Rottier, Chris Bowler, Christian Rouvière, Christian Sardet, Christophe Boutte, Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder, Colomban De Vargas, Cornelia Maier, Cyril Tricot, Dana Sardet, Daniel Bayley, Daniel Cron, Daniele Iudicone, David Mountain, David Obura, David Sauveur, Defne Arslan, Denis Dausse, Denis de La Broise, Diana Ruiz Pino, Didier Zoccola, Édouard Leymarie, Éloïse Fontaine, Émilie Sauvage, Emilie Villar, Emmanuel Boss, Emmanuel G. Reynaud, Éric Béraud, Eric Karsenti, Eric Pelletier, Éric Roettinger, Erica Goetz, Fabien Perault, Fabiola Canard, Fabrice Not, Fabrizio D'Ortenzio, Fabrizio Limena, Floriane Desprez, Franck Prejger, François Aurat, François Noël, Franscisco Cornejo, Gabriel Gorsky, Gabriele Procaccini, Gabriella Gilkes, Gipsi Lima-Mendez, Grigor Obolensky, Guillaume Bracq, Guillem Salazar, Halldor Stefansson, Hélène Santener, Hervé Bourmaud, Hervé Le Goff, Hiroyuki Ogata, Hubert Gautier, Hugo Sarmento, Ian Probert, Isabel Ferrera, Isabelle Taupier-Letage, Jan Wengers, Jarred Swalwell, Javier del Campo, Jean-Baptiste Romagnan, Jean-Claude Gascard, Jean-Jacques Kerdraon, Jean-Louis Jamet, Jean-Michel Grisoni, Jennifer Gillette, Jérémie Capoulade, Jérôme Bastion, Jérôme Teigné, Joannie Ferland, Johan Decelle, Judith Prihoda, Julie Poulain, Julien Daniel, Julien Girardot, Juliette Chatelin, Lars Stemmann, Laurence Garczarek, Laurent Beguery, Lee Karp-Boss, Leila Tirichine, Linda Mollestan, Lionel Bigot, Loïc Vallette, Lucie Bittner, Lucie Subirana, Luis Gutiérrez, Lydiane Mattio, Magali Puiseux, Marc Domingos, Marc Picheral, Marc Wessner, Marcela Cornejo, Margaux Carmichael, Marion Lauters, Martin Hertau, Martina Sailerova, Mathilde Ménard, Matthieu Labaste, Matthieu Oriot, Matthieu Bretaud, Mattias Ormestad, Maya Dolan, Melissa Duhaime, Michael Pitiot, Mike Lunn, Mike Sieracki, Montse Coll, Myriam Thomas, Nadine Lebois, Nicole Poulton, Nigel Grimsley, Noan Le Bescot, Oleg Simakov, Olivier Broutin, Olivier Desprez, Olivier Jaillon, Olivier Marien, Olivier Poirot, Olivier Quesnel, Pamela Labbe-Ibanez, Pascal Hingamp, Pascal Morin, Pascale Joannot, Patrick Chang, Patrick Wincker, Paul Muir, Philippe Clais, Philippe Koubbi, Pierre Testor, Rachel Moreau, Raphaël Morard, Roland Heilig, Romain Troublé, Roxana Di Mauro, Roxanne Boonstra, Ruby Pillay, Sabrina Speich, Sacha Bollet, Samuel Audrain, Sandra Da Costa, Sarah Searson, Sasha Tozzi, Sébastien Colin, Sergey Pisarev, Shirley Falcone, Sibylle Le Barrois d'Orgeval, Silvia G. Acinas, Simon Morisset, Sophie Marinesque, Sophie Nicaud, Stefanie Kandels-Lewis, Stéphane Audic, Stephane Pesant, Stéphanie Reynaud, Thierry Mansir, Thomas Lefort, Uros Krzic, Valérian Morzadec, Vincent Hilaire, Vincent Le Pennec, Vincent Taillandier, Xavier Bailly, Xavier Bougeard, Xavier Durrieu de Madron, Yann Chavance, Yann Depays, Yohann Mucherie.

@@ -134,7 +134,7 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing_ErrorCase2_withSentenceSegmentation_shouldWork() { - val input =""" + val input = """
Acknowledgements

The authors would like to acknowledge Lucy Popplewell in the preparation of EMR notes for this study.

The authors would like to acknowledge Keele University's Prognosis and Consultation Epidemiology

Research Group who have given us permission to utilise the morbidity definitions (©2014).The copyright of the morbidity definitions/categorization lists (©2014) used in this publication is owned by Keele University, the development of which was supported by the Primary Care Research Consortium; For access/details relating to the morbidity definitions/categorisation lists (©2014) please go to www.keele.ac.uk/mrr.

@@ -142,7 +142,7 @@ class FundingAcknowledgementParserIntegrationTest { """ - val output =""" + val output = """
Acknowledgements

The authors would like to acknowledge Lucy Popplewell in the preparation of EMR notes for this study.

The authors would like to acknowledge Keele University's Prognosis and Consultation Epidemiology

Research Group who have given us permission to utilise the morbidity definitions (©2014).The copyright of the morbidity definitions/categorization lists (©2014) used in this publication is owned by Keele University, the development of which was supported by the Primary Care Research Consortium; For access/details relating to the morbidity definitions/categorisation lists (©2014) please go to www.keele.ac.uk/mrr.

@@ -160,14 +160,14 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing_ErrorCase3_withSentenceSegmentation_shouldWork() { - val input =""" + val input = """
Funding

This work was supported by European Molecular Biology Laboratory, the NSF award "BIGDATA: Mid-Scale: DA: ESCE: Collaborative Research: Scalable Statistical Computing for Emerging Omics Data Streams" and Genentech Inc.

""" - val output =""" + val output = """
Funding

This work was supported by European Molecular Biology Laboratory, the NSF award "BIGDATA: Mid-Scale: DA: ESCE: Collaborative Research: Scalable Statistical Computing for Emerging Omics Data Streams" and Genentech Inc.

@@ -184,7 +184,7 @@ class FundingAcknowledgementParserIntegrationTest { @Test fun testXmlFragmentProcessing_mergingSentences_shouldMergeCorrectly() { - val input ="\n" + + val input = "\n" + "\t\t\t
\n" + "
Acknowledgements

Our warmest thanks to PatriceLopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions.We thank Pedro BaptistadeCastro for his support during this work.Special thanks to Erina Fujita for useful tips on the manuscript.

\n" + "\t\t\t
\n\n" @@ -202,6 +202,66 @@ class FundingAcknowledgementParserIntegrationTest { assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) } + @Test + fun testXmlFragmentProcessing_mergingSentencesAndCoordinatesInTheSamePage_shouldMergeCoordinates() { + val input = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val output = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .generateTeiCoordinates(listOf("s")) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + + @Test + fun testXmlFragmentProcessing_mergingSentencesAndCoordinatesInTheSamePage2_shouldMergeCoordinates() { + val input = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val output = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .generateTeiCoordinates(listOf("s")) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + + @Test + fun testXmlFragmentProcessing_mergingSentencesAndCoordinatesInDifferentPages_shouldNotMergeCoordinates() { + val input = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val output = """
" + + "
Acknowledgements

This is sentence 1 in page 1 where we thanks PatriceLopez, who is also overlapping in sentence 2, page 2, with annotations [22], DeLFT [20], and more text.

\n" + + "
""" + + val config = GrobidAnalysisConfig.GrobidAnalysisConfigBuilder() + .withSentenceSegmentation(true) + .generateTeiCoordinates(listOf("s")) + .build() + + val (element, mutableTriple) = target.processingXmlFragment(input, config) + + assertThat(element.toXML(), CompareMatcher.isIdenticalTo(output)) + } + companion object { @JvmStatic @BeforeClass diff --git a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt index b3aa7227ec..006a1e3ebf 100644 --- a/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt +++ b/grobid-core/src/test/kotlin/org/grobid/core/engines/FundingAcknowledgementParserTest.kt @@ -34,7 +34,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResult() { - val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; + val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript." val results: String = "Our\tour\tO\tOu\tOur\tOur\tr\tur\tOur\tOur\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "warmest\twarmest\tw\twa\twar\twarm\tt\tst\test\tmest\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -101,9 +101,9 @@ class FundingAcknowledgementParserTest { "on\ton\to\ton\ton\ton\tn\ton\ton\ton\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + "manuscript\tmanuscript\tm\tma\tman\tmanu\tt\tpt\tipt\tript\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t" - val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input) val (element, fundingAcknowledgmentParse) = target.getExtractionResult(tokens, results) @@ -118,7 +118,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResult2() { - val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]."; + val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]." val results: String = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -154,9 +154,9 @@ class FundingAcknowledgementParserTest { "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + "JPMXP1122715503\tjpmxp1122715503\tJ\tJP\tJPM\tJPMX\t3\t03\t503\t5503\tLINEIN\tALLCAP\tCONTAINSDIGITS\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\tI-\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t" - val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input) val (element, fundingAcknowledgmentParse) = target.getExtractionResult(tokens, results) @@ -202,7 +202,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResultNew1_ShouldReturnCorrectElementsAndPositions() { - val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript."; + val input = "Our warmest thanks to Patrice Lopez, the author of Grobid [22], DeLFT [20], and other open-source projects for his continuous support and inspiration with ideas, suggestions, and fruitful discussions. We thank Pedro Baptista de Castro for his support during this work. Special thanks to Erina Fujita for useful tips on the manuscript." val results: String = "Our\tour\tO\tOu\tOur\tOur\tr\tur\tOur\tOur\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "warmest\twarmest\tw\twa\twar\twarm\tt\tst\test\tmest\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -269,9 +269,9 @@ class FundingAcknowledgementParserTest { "on\ton\to\ton\ton\ton\tn\ton\ton\ton\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + "the\tthe\tt\tth\tthe\tthe\te\the\tthe\tthe\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + "manuscript\tmanuscript\tm\tma\tman\tmanu\tt\tpt\tipt\tript\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t" - val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input) val (spans, statement) = target.getExtractionResult(tokens, results) @@ -284,22 +284,22 @@ class FundingAcknowledgementParserTest { assertThat(spans, hasSize(3)) val span0 = spans[0] - val offsetPosition0 = span0.left - val element0 = span0.right + val offsetPosition0 = span0.offsetPosition + val element0 = span0.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("Patrice Lopez")) assertThat(element0.toXML(), `is`("Patrice Lopez")) val span1 = spans[1] - val offsetPosition1 = span1.left - val element1 = span1.right + val offsetPosition1 = span1.offsetPosition + val element1 = span1.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Pedro Baptista de Castro")) assertThat(element1.toXML(), `is`("Pedro Baptista de Castro")) val span2 = spans[2] - val offsetPosition2 = span2.left - val element2 = span2.right + val offsetPosition2 = span2.offsetPosition + val element2 = span2.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("Erina Fujita")) assertThat(element2.toXML(), `is`("Erina Fujita")) @@ -307,7 +307,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResultNew2_ShouldReturnCorrectElementsAndPositions() { - val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]."; + val input = "This work was partly supported by MEXT Program: Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials) Grant Number [JPMXP1122715503]." val results: String = "This\tthis\tT\tTh\tThi\tThis\ts\tis\this\tThis\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "work\twork\tw\two\twor\twork\tk\trk\tork\twork\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -343,9 +343,9 @@ class FundingAcknowledgementParserTest { "[\t[\t[\t[\t[\t[\t[\t[\t[\t[\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tOPENBRACKET\t0\t\n" + "JPMXP1122715503\tjpmxp1122715503\tJ\tJP\tJPM\tJPMX\t3\t03\t503\t5503\tLINEIN\tALLCAP\tCONTAINSDIGITS\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "]\t]\t]\t]\t]\t]\t]\t]\t]\t]\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tENDBRACKET\t0\tI-\n" + - ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t"; + ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\tLINEEND\tALLCAP\tNODIGIT\t1\t0\t0\tDOT\t0\t" - val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input) val (spans, statement) = target.getExtractionResult(tokens, results) @@ -355,22 +355,22 @@ class FundingAcknowledgementParserTest { assertThat(spans, hasSize(3)) val span0 = spans[0] - val offsetPosition0 = span0.left - val element0 = span0.right + val offsetPosition0 = span0.offsetPosition + val element0 = span0.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("MEXT")) assertThat(element0.toXML(), `is`("MEXT")) val span1 = spans[1] - val offsetPosition1 = span1.left - val element1 = span1.right + val offsetPosition1 = span1.offsetPosition + val element1 = span1.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) assertThat(element1.toXML(), `is`("Data Creation and Utilization-Type Material Research and Development Project (Digital Transformation Initiative Center for Magnetic Materials)")) val span2 = spans[2] - val offsetPosition2 = span2.left - val element2 = span2.right + val offsetPosition2 = span2.offsetPosition + val element2 = span2.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("JPMXP1122715503")) assertThat(element2.toXML(), `is`("JPMXP1122715503")) @@ -378,7 +378,7 @@ class FundingAcknowledgementParserTest { @Test fun testGetExtractionResult_ErrorCase_ShouldReturnCorrectElementsAndPositions() { - val input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder,"; + val input = "Christophe Castagne, Claudie Marec, Claudie Marec, Claudio Stalder," val results: String = "Christophe\tchristophe\tC\tCh\tChr\tChri\te\the\tphe\tophe\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\tI-\n" + "Castagne\tcastagne\tC\tCa\tCas\tCast\te\tne\tgne\tagne\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + @@ -393,7 +393,7 @@ class FundingAcknowledgementParserTest { "Stalder\tstalder\tS\tSt\tSta\tStal\tr\ter\tder\tlder\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\tNOPUNCT\t0\t\n" + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAP\tNODIGIT\t1\t0\t0\tCOMMA\t0\tI-\n" - val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + val tokens: List = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input) val (spans, statement) = target.getExtractionResult(tokens, results) @@ -403,22 +403,22 @@ class FundingAcknowledgementParserTest { assertThat(spans, hasSize(4)) val span0 = spans[0] - val offsetPosition0 = span0.left - val element0 = span0.right + val offsetPosition0 = span0.offsetPosition + val element0 = span0.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition0.start, offsetPosition0.end)), `is`("Christophe Castagne")) assertThat(element0.toXML(), `is`("Christophe Castagne")) val span1 = spans[1] - val offsetPosition1 = span1.left - val element1 = span1.right + val offsetPosition1 = span1.offsetPosition + val element1 = span1.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition1.start, offsetPosition1.end)), `is`("Claudie Marec")) assertThat(element1.toXML(), `is`("Claudie Marec")) val span2 = spans[2] - val offsetPosition2 = span2.left - val element2 = span2.right + val offsetPosition2 = span2.offsetPosition + val element2 = span2.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition2.start, offsetPosition2.end)), `is`("Claudie Marec")) assertThat(element2.toXML(), `is`("Claudie Marec")) @@ -428,8 +428,8 @@ class FundingAcknowledgementParserTest { assertThat(offsetPosition2.end, `is`(not(offsetPosition1.end))) val span3 = spans[3] - val offsetPosition3 = span3.left - val element3 = span3.right + val offsetPosition3 = span3.offsetPosition + val element3 = span3.annotationNode assertThat(LayoutTokensUtil.toText(tokens.subList(offsetPosition3.start, offsetPosition3.end)), `is`("Claudio Stalder")) assertThat(element3.toXML(), `is`("Claudio Stalder"))