From 32d32a63e809f63401ea499870687e987e444f3b Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Fri, 7 Jul 2023 00:36:12 +0900 Subject: [PATCH] Get rid of the query Sequence object and expose xdrop --- src/commons/BlockAligner.cpp | 40 ++++++++++++++++++----------------- src/commons/BlockAligner.h | 3 ++- src/commons/LocalParameters.h | 2 +- src/sra/blockalign.cpp | 11 +++++----- src/sra/playground.cpp | 6 +++--- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/commons/BlockAligner.cpp b/src/commons/BlockAligner.cpp index 58c9bf2..2e4531a 100644 --- a/src/commons/BlockAligner.cpp +++ b/src/commons/BlockAligner.cpp @@ -88,12 +88,12 @@ LocalAln align_local( size_t b_len, const char* b_str, const char* b_rev, PaddedBytes* b, const AAMatrix* matrix, Gaps gaps, size_t a_idx, size_t b_idx, - Cigar* cigar, SizeRange range + Cigar* cigar, SizeRange range, int32_t x_drop ) { LocalAln res_aln; AlignResult res; - int32_t x_drop = -(range.min * gaps.extend + gaps.open); + // int32_t x_drop = -(range.min * gaps.extend + gaps.open); // forwards alignment starting at (a_idx, b_idx) block_set_bytes_padded_aa(a, (uint8_t*)(a_str + a_idx), a_len - a_idx, range.max); @@ -128,15 +128,15 @@ LocalAln align_local( LocalAln align_local_profile( BlockHandle block_trace, BlockHandle block_no_trace, const size_t a_len, const char* a_str, const char* a_rev, PaddedBytes* a, - const size_t b_len, const int8_t* b_profile_matrix, AAProfile* b, + const size_t b_len, const char* b_str, AAProfile* b, Gaps gaps, BaseMatrix& subMat, size_t a_idx, size_t b_idx, - Cigar* cigar, SizeRange range + Cigar* cigar, SizeRange range, int32_t x_drop ) { LocalAln res_aln; AlignResult res; - int32_t x_drop = 10000; // -(range.min * gaps.extend + gaps.open); + // int32_t x_drop = 10000; // -(range.min * gaps.extend + gaps.open); // forwards alignment starting at (a_idx, b_idx) block_set_bytes_padded_aa(a, (uint8_t*)(a_str + a_idx), a_len - a_idx, range.max); @@ -146,7 +146,7 @@ LocalAln align_local_profile( for (size_t i = 0; i < (b_len - b_idx); ++i) { // printf("%d ", b_idx + i); for (uint8_t j = 0; j < Sequence::PROFILE_AA_SIZE; ++j) { - volatile int val = b_profile_matrix[(j * b_len) + (b_idx + i)]; + volatile int val = static_cast(static_cast(b_str[(b_idx + i) * Sequence::PROFILE_READIN_SIZE + j]) / 4); // if (val < curr_max) { // curr_max = val; // } @@ -195,7 +195,8 @@ LocalAln align_local_profile( for (size_t i = 0; i < (b_len - b_idx); ++i) { // printf("%d ", b_len - b_idx - i); for (uint8_t j = 0; j < Sequence::PROFILE_AA_SIZE; ++j) { - volatile int val = b_profile_matrix[(j * b_len) + (b_len - b_idx - i)]; + // volatile int val = b_profile_matrix[(j * b_len) + (b_len - b_idx - i)]; + volatile int val = static_cast(static_cast(b_str[(b_len - b_idx - i) * Sequence::PROFILE_READIN_SIZE + j]) / 4); // if (val < curr_max) { // curr_max = val; // } @@ -233,19 +234,20 @@ LocalAln align_local_profile( Matcher::result_t -BlockAligner::align(Sequence &query, - DistanceCalculator::LocalAlignment alignment, - EvalueComputation *evaluer, - int xdrop, - BaseMatrix* subMat) { - unsigned int qKey = query.getDbKey(); - size_t b_len = query.L; - const char* b_str = query.getSeqData(); +BlockAligner::align( + const char* querySeq, + unsigned int queryLength, + DistanceCalculator::LocalAlignment alignment, + EvalueComputation *evaluer, + int xdrop, + BaseMatrix* subMat +) { + size_t b_len = queryLength; + const char* b_str = querySeq; if (dbtype == Parameters::DBTYPE_AMINO_ACIDS) { SRAUtil::strrev(querySeqRev, b_str, b_len); } const char* b_rev = querySeqRev; - const int8_t *b_profile_matrix = query.getAlignmentProfile(); size_t a_len = targetLength; const char* a_str = targetSeq; @@ -275,9 +277,9 @@ BlockAligner::align(Sequence &query, LocalAln local_aln; if (dbtype == Parameters::DBTYPE_AMINO_ACIDS) { - local_aln = align_local(blockTrace, blockNoTrace, a_len, a_str, a_rev, a, b_len, b_str, b_rev, b, &BLOSUM62, gaps, qUngappedEndPos, dbUngappedEndPos, cigar, range); + local_aln = align_local(blockTrace, blockNoTrace, a_len, a_str, a_rev, a, b_len, b_str, b_rev, b, &BLOSUM62, gaps, qUngappedEndPos, dbUngappedEndPos, cigar, range, xdrop); } else { - local_aln = align_local_profile(blockTrace, blockNoTrace, a_len, a_str, a_rev, a, b_len, b_profile_matrix, bProfile, gaps, *subMat, qUngappedEndPos, dbUngappedEndPos, cigar, range); + local_aln = align_local_profile(blockTrace, blockNoTrace, a_len, a_str, a_rev, a, b_len, b_str, bProfile, gaps, *subMat, qUngappedEndPos, dbUngappedEndPos, cigar, range, xdrop); } // printf("a: %s\nb: %s\nscore: %d\nstart idx: (%lu, %lu)\nend idx: (%lu, %lu)\n", // a_str, @@ -346,7 +348,7 @@ BlockAligner::align(Sequence &query, // EXIT(EXIT_FAILURE); return Matcher::result_t( - qKey, bitScore, qcov, dbcov, seqId, evalue, alnLength, + 0, bitScore, qcov, dbcov, seqId, evalue, alnLength, local_aln.a_start, local_aln.a_end, a_len, local_aln.b_start, local_aln.b_end, b_len, backtrace ); diff --git a/src/commons/BlockAligner.h b/src/commons/BlockAligner.h index f18dc77..cc78a4c 100644 --- a/src/commons/BlockAligner.h +++ b/src/commons/BlockAligner.h @@ -20,7 +20,8 @@ class BlockAligner { void initTarget(Sequence &target); Matcher::result_t align( - Sequence &query, + const char* querySeq, + unsigned int queryLength, DistanceCalculator::LocalAlignment alignment, EvalueComputation *evaluer, int xdrop, diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 366bb6e..3fa417e 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -30,7 +30,7 @@ class LocalParameters : public Parameters { unsigned int requiredKmerMatches; PARAMETER(PARAM_X_DROP) - int8_t xdrop; + unsigned int xdrop; PARAMETER(PARAM_RANGE_MIN) uintptr_t rangeMin; diff --git a/src/sra/blockalign.cpp b/src/sra/blockalign.cpp index 87c5839..a1c65fb 100644 --- a/src/sra/blockalign.cpp +++ b/src/sra/blockalign.cpp @@ -214,7 +214,6 @@ int blockalign(int argc, const char **argv, const Command &command) { #ifdef OPENMP thread_idx = static_cast(omp_get_thread_num()); #endif - Sequence querySeq(par.maxSeqLen, querySequenceReader.getDbtype(), subMat, useProfileSearch ? 0 : par.kmerSize, false, false, !useProfileSearch); Sequence targetSeq(par.maxSeqLen, seqType, subMat, par.kmerSize, par.spacedKmer, false, false, par.spacedKmerPattern); Indexer idx(subMat->alphabetSize - 1, par.kmerSize); @@ -223,8 +222,10 @@ int blockalign(int argc, const char **argv, const Command &command) { par.maxSeqLen, par.rangeMin, par.rangeMax, isNucDB ? -par.gapOpen.values.nucleotide() : -par.gapOpen.values.aminoacid(), isNucDB ? -par.gapExtend.values.nucleotide() : -par.gapExtend.values.aminoacid(), - querySeq.getSeqType() + querySequenceReader.getDbtype() ); + + // Sequence querySeq(par.maxSeqLen, querySequenceReader.getDbtype(), subMat, 0, false, false, false); // Matcher matcher(querySeq.getSeqType(), targetSeq.getSeqType(), par.maxSeqLen, subMat, &evaluer, false, 1.0, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid(), 1.0, 0); char buffer[1024]; @@ -304,7 +305,7 @@ int blockalign(int argc, const char **argv, const Command &command) { if (useProfileSearch) { realSeq.clear(); - querySeq.extractProfileSequence(querySeqData, queryEntryLen - 1, *subMat, realSeq); + Sequence::extractProfileSequence(querySeqData, queryEntryLen - 1, *subMat, realSeq); if (realSeq.length() != querySeqLen) { Debug(Debug::ERROR) << "Query sequence length is wrong!\n" << "Correct count: " << correct_count << "\n" @@ -360,10 +361,10 @@ int blockalign(int argc, const char **argv, const Command &command) { isBlockAlignerInit = true; } - querySeq.mapSequence(queryId, queryKey, querySeqData, querySeqLen); + // querySeq.mapSequence(queryId, queryKey, querySeqData, querySeqLen); // matcher.initQuery(&querySeq); // Matcher::result_t res = matcher.getSWResult(&targetSeq, INT_MAX, false, 0, 0.0, par.evalThr, Matcher::SCORE_COV_SEQID, 0, false); - Matcher::result_t res = blockAligner.align(querySeq, aln, &evaluer, xdrop, subMat); + Matcher::result_t res = blockAligner.align(querySeqData, querySeqLen, aln, &evaluer, xdrop, subMat); res.dbKey = targetKey; res.queryOrfStartPos = queryKey; alignmentsNum++; diff --git a/src/sra/playground.cpp b/src/sra/playground.cpp index e15798e..fea1e83 100644 --- a/src/sra/playground.cpp +++ b/src/sra/playground.cpp @@ -206,8 +206,8 @@ unsigned int qprof_1_len = 45026; unsigned int seq_len = (qprof_1_len - 1) / Sequence::PROFILE_READIN_SIZE; - Sequence s1(65000, Parameters::DBTYPE_HMM_PROFILE, &subMat, kmer_size, true, true); - s1.mapSequence(0, 0, (const char*)qprof_1, seq_len); + // Sequence s1(65000, Parameters::DBTYPE_HMM_PROFILE, &subMat, kmer_size, true, true); + // s1.mapSequence(0, 0, (const char*)qprof_1, seq_len); // s1.printProfile(); Sequence s2(65000, Parameters::DBTYPE_AMINO_ACIDS, &subMat, kmer_size, false, false); @@ -230,7 +230,7 @@ unsigned int qprof_1_len = 45026; aln.endPos = 0; aln.diagonal = 0; aln.distToDiagonal = 0; - Matcher::result_t res = blockAligner.align(s1, aln, &evaluer, 10000, &subMat); + Matcher::result_t res = blockAligner.align((const char*)qprof_1, seq_len, aln, &evaluer, 10000, &subMat); std::cout << "Score: " << res.score << std::endl; std::cout << "Eval: " << res.eval << std::endl;