From 9c6a5c0e0eb4503a8f67a43dcb3d2cfcae94e369 Mon Sep 17 00:00:00 2001 From: jose miguel mut Date: Fri, 13 Sep 2019 17:01:57 +0100 Subject: [PATCH 1/3] use system sort for the list of contigs Of course this is platform dependent, but given that we have some species with lots of contigs (e.g. almost 0.5M for rat or turkey), we can either: - keep the hacky implementation of this commit. - evaluate memory performance of: loading the file into a list, sort the list in memory, write the list. For at least a couple million of 20 bytes strings. If we go for the second option, the changes have to be done in the same place as the current system sort. --- .../eva/accession/release/io/ContigWriter.java | 15 ++++++++++++++- .../release/io/VariantContextWriter.java | 6 +++--- .../ListActiveContigsStepConfigurationTest.java | 2 +- .../ListMergedContigsStepConfigurationTest.java | 2 +- .../accession/release/io/ContigWriterTest.java | 4 ++-- .../release/io/VariantContextWriterTest.java | 2 +- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java index 45420f17b..e0de7953d 100644 --- a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java +++ b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java @@ -69,6 +69,19 @@ public void update(ExecutionContext executionContext) throws ItemStreamException @Override public void close() throws ItemStreamException { printWriter.close(); + sortContigFile(); + } + + private void sortContigFile() { + try { + int exitCode = Runtime.getRuntime().exec(new String[]{"sort", this.output.getAbsolutePath()}).waitFor(); + if (exitCode != 0) { + throw new ItemStreamException( + "Trying to sort the contig list, the 'sort' command returned exit code'" + exitCode + "'."); + } + } catch (IOException | InterruptedException e) { + throw new ItemStreamException("Failed sorting contig list. ", e); + } } @Override @@ -85,7 +98,7 @@ public void write(List contigs) { throw new IllegalArgumentException("Could not find the corresponding sequence name for contig " + contig); } - printWriter.println(contig + "," + sequenceName); + printWriter.println(sequenceName + "," + contig); } } diff --git a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriter.java b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriter.java index 437287d80..c4bdd4b5f 100644 --- a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriter.java +++ b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriter.java @@ -137,9 +137,9 @@ private void addContigs(Set metaData) { BufferedReader bufferedReader = new BufferedReader(new FileReader(contigsFilePath)); String contigLine; while ((contigLine = bufferedReader.readLine()) != null) { - String[] contigAndName = contigLine.split(","); - String contig = contigAndName[0]; - String name = contigAndName[1]; + String[] nameAndContig = contigLine.split(","); + String name = nameAndContig[0]; + String contig = nameAndContig[1]; metaData.add(new VCFHeaderLine("contig", "")); } } catch (IOException e) { diff --git a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListActiveContigsStepConfigurationTest.java b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListActiveContigsStepConfigurationTest.java index 8c8d78f2d..174841fd7 100644 --- a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListActiveContigsStepConfigurationTest.java +++ b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListActiveContigsStepConfigurationTest.java @@ -100,7 +100,7 @@ public void assertStepExecutesAndCompletes() { public void contigsWritten() throws Exception { assertStepExecutesAndCompletes(); - assertEquals(new HashSet<>(Arrays.asList("CM001954.1,CAE13", "CM001941.2,CAE1")), + assertEquals(new HashSet<>(Arrays.asList("CAE13,CM001954.1", "CAE1,CM001941.2")), setOfLines(getActiveContigsFilePath(inputParameters.getOutputFolder(), inputParameters.getAssemblyAccession()))); } diff --git a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListMergedContigsStepConfigurationTest.java b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListMergedContigsStepConfigurationTest.java index 18cd449bd..6d9c1d988 100644 --- a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListMergedContigsStepConfigurationTest.java +++ b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/configuration/steps/ListMergedContigsStepConfigurationTest.java @@ -101,7 +101,7 @@ public void assertStepExecutesAndCompletes() { public void contigsWritten() throws Exception { assertStepExecutesAndCompletes(); - assertEquals(new HashSet<>(Arrays.asList("CM001954.1,CAE13", "CM001941.2,CAE1")), + assertEquals(new HashSet<>(Arrays.asList("CAE13,CM001954.1", "CAE1,CM001941.2")), setOfLines(ContigWriter.getMergedContigsFilePath(inputParameters.getOutputFolder(), inputParameters.getAssemblyAccession()))); } diff --git a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java index d319330ad..c50d11453 100644 --- a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java +++ b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java @@ -85,7 +85,7 @@ public void write() throws Exception { assertEquals(contigs.size(), numberOfLines(output)); - List expectedLines = Arrays.asList("CM0001.1,Chr1", "CM0002.1,Chr2", "CM0003.1,Chr3"); + List expectedLines = Arrays.asList("Chr1,CM0001.1", "Chr2,CM0002.1", "Chr3,CM0003.1"); assertContigFileContent(output, expectedLines); } @@ -123,7 +123,7 @@ public void useSequenceNameIfContigIsRefSeqAccession() throws IOException { assertEquals(contigs.size(), numberOfLines(output)); - List expectedLines = Arrays.asList("NC0001.1,Chr1", "NC0002.1,Chr2", "CM0003.1,Chr3"); + List expectedLines = Arrays.asList("Chr1,NC0001.1", "Chr2,NC0002.1", "Chr3,CM0003.1"); assertContigFileContent(output, expectedLines); } diff --git a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriterTest.java b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriterTest.java index f4ec73b03..d0d14e0f4 100644 --- a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriterTest.java +++ b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/VariantContextWriterTest.java @@ -187,7 +187,7 @@ private List grepFile(File file, String regex) throws IOException { public void checkMetadataSection() throws Exception { File outputFolder = temporaryFolder.newFolder(); FileWriter fileWriter = new FileWriter(ContigWriter.getActiveContigsFilePath(outputFolder, REFERENCE_ASSEMBLY)); - String contig = "CM0001.1,Chr1"; + String contig = "Chr1,CM0001.1"; fileWriter.write(contig); fileWriter.close(); From 787a5de8e3ee234088b1da26ef992f6937b0fcb3 Mon Sep 17 00:00:00 2001 From: Andres Silva Date: Mon, 16 Sep 2019 16:23:16 +0100 Subject: [PATCH 2/3] add test for contig order --- .../accession/release/io/ContigWriterTest.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java index c50d11453..33b0a096c 100644 --- a/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java +++ b/eva-accession-release/src/test/java/uk/ac/ebi/eva/accession/release/io/ContigWriterTest.java @@ -85,8 +85,8 @@ public void write() throws Exception { assertEquals(contigs.size(), numberOfLines(output)); - List expectedLines = Arrays.asList("Chr1,CM0001.1", "Chr2,CM0002.1", "Chr3,CM0003.1"); - assertContigFileContent(output, expectedLines); + List expectedLinesSorted = Arrays.asList("Chr1,CM0001.1", "Chr2,CM0002.1", "Chr3,CM0003.1"); + assertContigFileContentAndOrder(output, expectedLinesSorted); } private long numberOfLines(File file) throws IOException { @@ -94,11 +94,13 @@ private long numberOfLines(File file) throws IOException { return br.lines().count(); } - private void assertContigFileContent(File file, List expectedLines) throws IOException { + private void assertContigFileContentAndOrder(File file, List expectedLines) throws IOException { BufferedReader br = new BufferedReader(new FileReader(file)); + int index = 0; String line; while ((line = br.readLine()) != null) { - assertTrue(expectedLines.contains(line)); + assertEquals(expectedLines.get(index), line); + index++; } } @@ -117,14 +119,14 @@ public void useSequenceNameIfContigIsRefSeqAccession() throws IOException { ContigWriter contigWriter = new ContigWriter(output, contigMapping); contigWriter.open(null); - List contigs = Arrays.asList(REFSEQ_ACCESSION_1, REFSEQ_ACCESSION_2, GENBANK_ACCESSION_3); + List contigs = Arrays.asList(REFSEQ_ACCESSION_2, GENBANK_ACCESSION_3, REFSEQ_ACCESSION_1); contigWriter.write(contigs); contigWriter.close(); assertEquals(contigs.size(), numberOfLines(output)); - List expectedLines = Arrays.asList("Chr1,NC0001.1", "Chr2,NC0002.1", "Chr3,CM0003.1"); - assertContigFileContent(output, expectedLines); + List expectedLinesSorted = Arrays.asList("Chr1,NC0001.1", "Chr2,NC0002.1", "Chr3,CM0003.1"); + assertContigFileContentAndOrder(output, expectedLinesSorted); } /** From 3460e524aa12ad60ae187348f2e14fd78fc94293 Mon Sep 17 00:00:00 2001 From: Andres Silva Date: Mon, 16 Sep 2019 23:09:45 +0100 Subject: [PATCH 3/3] write the contig file in order --- .../eva/accession/release/io/ContigWriter.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java index e0de7953d..916a85a5f 100644 --- a/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java +++ b/eva-accession-release/src/main/java/uk/ac/ebi/eva/accession/release/io/ContigWriter.java @@ -23,9 +23,11 @@ import uk.ac.ebi.eva.accession.core.contig.ContigNaming; import uk.ac.ebi.eva.accession.core.contig.ContigSynonyms; +import java.io.BufferedReader; import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStreamReader; import java.io.PrintWriter; import java.nio.file.Paths; import java.util.List; @@ -74,11 +76,22 @@ public void close() throws ItemStreamException { private void sortContigFile() { try { - int exitCode = Runtime.getRuntime().exec(new String[]{"sort", this.output.getAbsolutePath()}).waitFor(); + Process process = Runtime.getRuntime().exec(new String[]{"sort", this.output.getAbsolutePath()}); + int exitCode = process.waitFor(); + if (exitCode != 0) { throw new ItemStreamException( "Trying to sort the contig list, the 'sort' command returned exit code'" + exitCode + "'."); } + + BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line; + PrintWriter printWriter2 = new PrintWriter(new FileWriter(this.output)); + while ((line = br.readLine()) != null) { + printWriter2.println(line); + } + printWriter2.close(); + } catch (IOException | InterruptedException e) { throw new ItemStreamException("Failed sorting contig list. ", e); }