From 69ddeba6a90561039b097b78174b868b0cb6f035 Mon Sep 17 00:00:00 2001 From: Shifu Chen Date: Fri, 18 Feb 2022 22:55:42 +0800 Subject: [PATCH] improve performance and reduce false positive --- src/common.h | 4 ++++ src/indexer.cpp | 27 +++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/common.h b/src/common.h index 2402aca..293b26b 100644 --- a/src/common.h +++ b/src/common.h @@ -38,5 +38,9 @@ static const int PACK_SIZE = 1000; // if the number of in memory packs is full, the producer thread should sleep static const int PACK_IN_MEM_LIMIT = 100; +// the key dup in normal level will be kept, in high level will be skipped +static const int DUPE_NORMAL_LEVEL = -1; +static const int DUPE_HIGH_LEVEL = -2; + #endif /* COMMON_H */ diff --git a/src/indexer.cpp b/src/indexer.cpp index bcb3560..6642613 100644 --- a/src/indexer.cpp +++ b/src/indexer.cpp @@ -85,8 +85,16 @@ void Indexer::indexContig(int ctg, string seq, int start) { if(mKmerPos.count(kmer) >0 ){ GenePos gp = mKmerPos[kmer]; // already marked as a dupe - if(gp.contig < 0) { - mDupeList[gp.position].push_back(site); + if(gp.contig == DUPE_HIGH_LEVEL){ + // skip it since it's already a high level dupe + continue; + } else if(gp.contig == DUPE_NORMAL_LEVEL) { + if(mDupeList[gp.position].size() >= GlobalSettings::skipKeyDupThreshold) { + mKmerPos[kmer].contig = DUPE_HIGH_LEVEL; + mDupeList[gp.position]=vector(); + } + else + mDupeList[gp.position].push_back(site); } else { // else make a new dupe entry vector gps; @@ -94,7 +102,7 @@ void Indexer::indexContig(int ctg, string seq, int start) { gps.push_back(site); mDupeList.push_back(gps); // and mark it as a dupe - mKmerPos[kmer].contig = -1; + mKmerPos[kmer].contig = DUPE_NORMAL_LEVEL; mKmerPos[kmer].position = mDupeList.size() -1; mUniquePos--; mDupePos++; @@ -126,10 +134,10 @@ vector Indexer::mapRead(Read* r) { } GenePos gp = mKmerPos[kmer]; // is a dupe - if(gp.contig < 0) { + if(gp.contig == DUPE_HIGH_LEVEL) { // too much keys in this dupe, then skip it - if(mDupeList[gp.position].size() > GlobalSettings::skipKeyDupThreshold) - continue; + continue; + } else if(gp.contig == DUPE_NORMAL_LEVEL) { for(int g=0; g Indexer::mapRead(Read* r) { else kmerStat[gplong] += 1; } - } else { + } else { // not a dupe long gplong = gp2long(shift(gp, i)); if(kmerStat.count(gplong)==0) kmerStat[gplong] = 1; @@ -183,7 +191,10 @@ vector Indexer::mapRead(Read* r) { continue; GenePos gp = mKmerPos[kmer]; // is a dupe - if(gp.contig < 0) { + if(gp.contig == DUPE_HIGH_LEVEL) { + // too much keys in this dupe, then skip it + continue; + } else if(gp.contig == DUPE_NORMAL_LEVEL) { for(int g=0; g