Skip to content

Commit

Permalink
add -i/--incrmnt
Browse files Browse the repository at this point in the history
  • Loading branch information
Yan Gao committed Feb 10, 2021
1 parent 65f9f3b commit 59724c6
Show file tree
Hide file tree
Showing 22 changed files with 2,325 additions and 298 deletions.
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ LIB_DIR = $(OUT_PRE_DIR)/lib
INC_DIR = ./include
SRC_DIR = ./src

SOURCE = $(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa_align.c $(SRC_DIR)/abpoa_graph.c $(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/simd_check.c $(SRC_DIR)/utils.c $(SRC_DIR)/abpoa_plot.c
HEADER = $(SRC_DIR)/abpoa.h $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/seq.h $(SRC_DIR)/kdq.h $(SRC_DIR)/kseq.h $(SRC_DIR)/simd_instruction.h $(SRC_DIR)/simd_abpoa_align.h $(SRC_DIR)/utils.h
OBJS = $(SRC_DIR)/abpoa_align.o $(SRC_DIR)/abpoa_graph.o $(SRC_DIR)/simd_abpoa_align.o $(SRC_DIR)/simd_check.o $(SRC_DIR)/utils.o $(SRC_DIR)/abpoa_plot.o
SOURCE = $(SRC_DIR)/abpoa_align.c $(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa_graph.c $(SRC_DIR)/abpoa_plot.c $(SRC_DIR)/abpoa_seq.c $(SRC_DIR)/kalloc.c $(SRC_DIR)/kstring.c $(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/simd_check.c $(SRC_DIR)/utils.c
HEADER = $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa.h $(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/kalloc.h $(SRC_DIR)/kdq.h $(SRC_DIR)/khash.h $(SRC_DIR)/kseq.h $(SRC_DIR)/kstring.h $(SRC_DIR)/simd_instruction.h $(SRC_DIR)/simd_abpoa_align.h $(SRC_DIR)/utils.h
OBJS = $(SRC_DIR)/abpoa_align.o $(SRC_DIR)/abpoa_graph.o $(SRC_DIR)/abpoa_plot.o $(SRC_DIR)/abpoa_seq.o $(SRC_DIR)/kalloc.o $(SRC_DIR)/kstring.o $(SRC_DIR)/simd_abpoa_align.o $(SRC_DIR)/simd_check.o $(SRC_DIR)/utils.o

# SIMD label
SIMD_CHECK_D = -D __CHECK_SIMD_MAIN__
Expand Down Expand Up @@ -91,7 +91,7 @@ $(ABPOALIB):$(OBJS)
$(AR) -csr $@ $(OBJS)

$(SRC_DIR)/abpoa.o:$(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa_align.h \
$(SRC_DIR)/seq.h $(SRC_DIR)/utils.h $(SRC_DIR)/simd_instruction.h
$(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/utils.h $(SRC_DIR)/simd_instruction.h
$(CC) -c $(CFLAGS) $(SIMD_FLAG) $< -o $@

$(SRC_DIR)/simd_check.o:$(SRC_DIR)/simd_check.c $(SRC_DIR)/simd_instruction.h
Expand Down
41 changes: 28 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,15 @@
[![Build Status](https://img.shields.io/travis/yangao07/abPOA/master.svg?label=Master)](https://travis-ci.org/yangao07/abPOA)
[![License](https://img.shields.io/badge/License-MIT-black.svg)](https://github.com/yangao07/abPOA/blob/master/LICENSE)
<!-- [![PyPI](https://img.shields.io/pypi/v/pyabpoa.svg?style=flat)](https://pypi.python.org/pypi/pyabpoa) -->
## Updates (v1.0.6)
## Updates (v1.1.0)

- Fixed a bug in read id assignment
- Fixed a bug in DP of first row
- Archived evaluation datasets and scripts
- Added option `-i/--incrmnt` to align sequence to an existing graph in GFA/MSA format

## Getting started
Download the [latest release](https://github.com/yangao07/abPOA/releases):
```
wget https://github.com/yangao07/abPOA/releases/download/v1.0.6/abPOA-v1.0.6.tar.gz
tar -zxvf abPOA-v1.0.6.tar.gz && cd abPOA-v1.0.6
wget https://github.com/yangao07/abPOA/releases/download/v1.1.0/abPOA-v1.1.0.tar.gz
tar -zxvf abPOA-v1.1.0.tar.gz && cd abPOA-v1.1.0
```
Make from source and run with test data:
```
Expand All @@ -40,6 +38,7 @@ abpoa ./test_data/seq.fa > cons.fa
- [To generate consensus sequence](#gen_cons)
- [To generate row-column multiple sequence alignment](#gen_msa)
- [To generate graph information in GFA format](#gen_gfa)
- [To align sequence to an existing graph in GFA/MSA format](#aln_to_gfa)
- [To generate a plot of the alignment graph](#gen_plot)
- [Commands and options](#cmd)
- [Input](#input)
Expand Down Expand Up @@ -81,9 +80,9 @@ You can also build abPOA from source files.
Make sure you have gcc (>=6.4.0) and zlib installed before compiling.
It is recommended to download the [latest release](https://github.com/yangao07/abPOA/releases).
```
wget https://github.com/yangao07/abPOA/releases/download/v1.0.6/abPOA-v1.0.6.tar.gz
tar -zxvf abPOA-v1.0.6.tar.gz
cd abPOA-v1.0.6; make
wget https://github.com/yangao07/abPOA/releases/download/v1.1.0/abPOA-v1.1.0.tar.gz
tar -zxvf abPOA-v1.1.0.tar.gz
cd abPOA-v1.1.0; make
```
Or, you can use `git clone` command to download the source code.
This gives you the latest version of abPOA, which might be still under development.
Expand All @@ -95,8 +94,8 @@ cd abPOA; make
### <a name="binary"></a>Pre-built binary executable file for Linux/Unix
If you meet any compiling issue, please try the pre-built binary file:
```
wget https://github.com/yangao07/abPOA/releases/download/v1.0.6/abPOA-v1.0.6_x64-linux.tar.gz
tar -zxvf abPOA-v1.0.6_x64-linux.tar.gz
wget https://github.com/yangao07/abPOA/releases/download/v1.1.0/abPOA-v1.1.0_x64-linux.tar.gz
tar -zxvf abPOA-v1.1.0_x64-linux.tar.gz
```

## <a name="usage"></a>General usage
Expand All @@ -115,11 +114,25 @@ abpoa seq.fa -r2 > cons.out
### <a name="gen_gfa"></a>To generate graph information in [GFA](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) format

```
abpoa seq.fa -r3 > abpoa.gfa
abpoa seq.fa -r3 > out.gfa
```
To include the generated consensus sequence as a path in the GFA file:
```
abpoa seq.fa -r4 > abpoa.gfa
abpoa seq.fa -r4 > out.gfa
```

### <a name="aln_to_gfa"></a>To align sequence to an existing graph in GFA/MSA format
```
abpoa -i in.gfa seq.fa -r3 > out.gfa
abpoa -i in.msa seq.fa -Ar1 > out.msa
```
For a GFA input file, `S` and `P` lines are required and are used to reconstruct the alignment graph.
For an MSA input file, which is generally a FASTA format file, `-` in the sequence indicates an alignment gap.
If you want to use abPOA to generate an MSA output file and then perform incremental graph alignment, do not forget `-A` to include the FASTA header of each sequence:

```
abpoa seq1.fa -Ar1 > seq1.msa
abpoa -i seq1.msa seq2.fa > cons.fa
```

### <a name="gen_plot"></a>To generate a plot of the alignment graph
Expand Down Expand Up @@ -158,6 +171,8 @@ Options:
-l --in-list input file is a list of sequence file names [False]
each line is one sequence file containing a set of sequences
which will be aligned by abPOA to generate a consensus sequence
-i --incrmnt FILE incrementally align sequences to an existing graph/MSA [Null]
graph could be in GFA or MSA format generated by abPOA
-o --output FILE output to FILE [stdout]
-r --result INT output result mode [0]
- 0: consensus (FASTA format)
Expand Down
4 changes: 2 additions & 2 deletions example.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "abpoa.h"

// AaCcGgTtNn ==> 0,1,2,3,4
unsigned char nst_nt4_table[256] = {
unsigned char _nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
Expand Down Expand Up @@ -74,7 +74,7 @@ int main(void) {
seq_lens[i] = strlen(seqs[i]);
bseqs[i] = (uint8_t*)malloc(sizeof(uint8_t) * seq_lens[i]);
for (j = 0; j < seq_lens[i]; ++j)
bseqs[i][j] = nst_nt4_table[(int)seqs[i][j]];
bseqs[i][j] = _nt4_table[(int)seqs[i][j]];
}

// output to stdout
Expand Down
34 changes: 23 additions & 11 deletions include/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <stdint.h>
#include "simd_instruction.h"


#define ABPOA_GLOBAL_MODE 0
#define ABPOA_LOCAL_MODE 1
#define ABPOA_EXTEND_MODE 2
Expand All @@ -26,13 +25,13 @@
#define ABPOA_CSOFT_CLIP 4
#define ABPOA_CHARD_CLIP 5

#define ABPOA_SRC_NODE_ID 0
#define ABPOA_SRC_NODE_ID 0
#define ABPOA_SINK_NODE_ID 1

#define ABPOA_OUT_CONS 0
#define ABPOA_OUT_MSA 1
#define ABPOA_OUT_CONS 0
#define ABPOA_OUT_MSA 1
#define ABPOA_OUT_CONS_MSA 2
#define ABPOA_OUT_GFA 3
#define ABPOA_OUT_GFA 3
#define ABPOA_OUT_CONS_GFA 4

#define ABPOA_HB 0
Expand Down Expand Up @@ -67,7 +66,7 @@ typedef struct {
// alignment mode
uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_msa_header:1, out_cons:1, out_gfa:1, is_diploid:1, use_read_ids:1;
uint8_t amb_strand:1;
char *out_pog;
char *incr_fn, *out_pog;
int align_mode, gap_mode, cons_agrm;
double min_freq; // for multiploid data

Expand All @@ -92,16 +91,26 @@ typedef struct {
int *index_to_node_id;
int *node_id_to_index, *node_id_to_max_pos_left, *node_id_to_max_pos_right, *node_id_to_max_remain, *node_id_to_msa_rank;
uint8_t is_topological_sorted:1, is_called_cons:1, is_set_msa_rank:1;
double cal_R_time; // for evaluation
} abpoa_graph_t;

// String record: two ints plus a char pointer. It is the element type of the
// seq/name/comment/qual members of abpoa_seq_t.
// NOTE(review): `l` is presumably the used length and `m` the allocated size of `s`
// (kstring-style) -- confirm against the implementation in abpoa_seq.c.
typedef struct {
int l, m; char *s;
} abpoa_str_t;

// Sequence records attached to an abpoa_t handle (its `abs` member).
// seq, name, comment and qual are pointers to abpoa_str_t; is_rc is a pointer to bytes.
// NOTE(review): presumably each pointer is an array with one entry per sequence,
// n_seq is the number of entries in use, m_seq is the allocated capacity, and
// is_rc[i] marks a sequence stored as reverse complement -- confirm against abpoa_seq.c.
typedef struct {
int n_seq, m_seq;
abpoa_str_t *seq, *name, *comment, *qual;
uint8_t *is_rc;
} abpoa_seq_t;

// Storage used by the SIMD alignment code: a SIMDi buffer with its recorded size,
// plus four int arrays (dp_beg/dp_end and their *_sn variants) and the int rang_m.
// NOTE(review): per the inline notes, s_mem holds the query profile and DP rows, and the
// dp_* arrays matter only for banded alignment; rang_m is presumably their allocated
// length -- confirm against simd_abpoa_align.c.
typedef struct {
SIMDi *s_mem; uint64_t s_msize; // qp, DP_HE, dp_f OR qp, DP_H, dp_f : based on (qlen, num_of_value, m, node_n)
int *dp_beg, *dp_end, *dp_beg_sn, *dp_end_sn; int rang_m; // if band : based on (node_m)
} abpoa_simd_matrix_t;

// Top-level abPOA handle: bundles pointers to the alignment graph (abg),
// the sequence records (abs) and the SIMD matrix storage (abm).
// Handles are returned by abpoa_init() and passed to abpoa_free().
typedef struct {
abpoa_graph_t *abg;
abpoa_seq_t *abs;
abpoa_simd_matrix_t *abm;
} abpoa_t;

Expand All @@ -112,14 +121,17 @@ void abpoa_free_para(abpoa_para_t *abpt);

// init for alignment
abpoa_t *abpoa_init(void);
void abpoa_free(abpoa_t *ab, abpoa_para_t *abpt);
void abpoa_free(abpoa_t *ab);

// perform msa
int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l);

// clean alignment graph
void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen);

// restore graph from GFA/FASTA file
abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt);

// for development:
// align a sequence to a graph
int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res);
Expand Down Expand Up @@ -165,13 +177,13 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt);
// cons_l: store consensus sequences length
// cons_n: store number of consensus sequences
// Note: cons_seq and cons_l need to be freed by user.
int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, int seq_n, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n);
int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n);

// generate column multiple sequence alignment from graph
void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, char **read_names, uint8_t *is_rc, int seq_n, FILE *out_fp, uint8_t ***msa_seq, int *msa_l);
void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l);

// generate graph in GFA format to _out_fp_
void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, char **read_names, uint8_t *is_rc, int seq_n, FILE *out_fp);
void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);

// generate DOT graph plot and dump graph into PDF/PNG format file
int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt);
Expand Down
9 changes: 5 additions & 4 deletions python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pip install pyabpoa
```

### Install pyabpoa from source
Alternatively, you can install pyabpoa from source:
Alternatively, you can install pyabpoa from source (cython is required):
```
git clone https://github.com/yangao07/abPOA.git
cd abPOA
Expand All @@ -31,7 +31,7 @@ seqs=[
'CCCGGAAGA',
'CCGAAGA'
]
res=a.msa(seqs, out_cons=True, out_msa=True, out_pog='pog.png') # perform multiple sequence alignment
res=a.msa(seqs, out_cons=True, out_msa=True, out_pog='pog.png', incr_fn='') # perform multiple sequence alignment
# generate a figure of alignment graph to pog.png
for seq in res.cons_seq:
Expand Down Expand Up @@ -67,13 +67,14 @@ This constructs a multiple sequence alignment handler of pyabpoa, it accepts the

The `msa_aligner` handler provides one method which performs multiple sequence alignment and takes five arguments:
```
pyabpoa.msa_aligner.msa(seqs, out_cons, out_msa, out_pog=None)
pyabpoa.msa_aligner.msa(seqs, out_cons, out_msa, out_pog='', incr_fn='')
```

* **seqs**: a list variable containing a set of input sequences; **positional**
* **out_cons**: a bool variable to ask pyabpoa to generate consensus sequence; **positional**
* **out_msa**: a bool variable to ask pyabpoa to generate RC-MSA; **positional**
* **out_pog**: name of a file (`.png` or `.pdf`) to store the plot of the final alignment graph; **optional**, default: **None**
* **out_pog**: name of a file (`.png` or `.pdf`) to store the plot of the final alignment graph; **optional**, default: **''**
* **incr_fn**: name of an existing graph (GFA) or MSA (FASTA) file, incrementally align sequence to this graph/MSA; **optional**, default: **''**

### Class pyabpoa.msa_result
```
Expand Down
25 changes: 20 additions & 5 deletions python/cabpoa.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ cdef extern from "abpoa.h":
# alignment mode
uint8_t ret_cigar, rev_cigar, out_msa, out_msa_header, out_cons, out_gfa, is_diploid, use_read_ids # mode: 0: global, 1: local, 2: extend
uint8_t amb_strand
char *incr_fn
char *out_pog
int align_mode, gap_mode, cons_agrm
double min_freq # for diploid data
Expand Down Expand Up @@ -90,13 +91,25 @@ cdef extern from "abpoa.h":
int *node_id_to_max_remain
int *node_id_to_msa_rank
uint8_t is_topological_sorted, is_called_cons, is_set_msa_rank
double cal_R_time

ctypedef struct abpoa_str_t:
int l, m
char *s

ctypedef struct abpoa_seq_t:
int n_seq, m_seq
abpoa_str_t *seq
abpoa_str_t *name
abpoa_str_t *comment
abpoa_str_t *qual
uint8_t *is_rc

ctypedef struct abpoa_simd_matrix_t:
pass

ctypedef struct abpoa_t:
abpoa_graph_t *abg
abpoa_seq_t *abs
abpoa_simd_matrix_t *abm

# init for abpoa parameters
Expand All @@ -107,14 +120,16 @@ cdef extern from "abpoa.h":

# init for alignment
abpoa_t *abpoa_init()
void abpoa_free(abpoa_t *ab, abpoa_para_t *abpt)
void abpoa_free(abpoa_t *ab)

# do msa for a set of input sequences
# cons_cov is declared as `int ***` so that this extern declaration matches the
# prototype in abpoa.h (the coverage arrays hold ints, not bytes)
int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l)

# clean alignment graph
void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen)

# restore graph from GFA/MSA file
abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt)
# align a sequence to a graph
int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res)

Expand All @@ -132,12 +147,12 @@ cdef extern from "abpoa.h":
# cons_l: store consensus sequences length
# cons_n: store number of consensus sequences
# Note: cons_seq and cons_l need to be freed by user.
int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, int seq_n, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n)
int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n)
# generate column multiple sequence alignment from graph
void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, char **seq_names, uint8_t *is_rc, int seq_n, FILE *out_fp, uint8_t ***msa_seq, int *msa_l)
void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l)

# generate full graph in GFA format
void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, char **seq_names, uint8_t *is_rc, int seq_n, FILE *out_fp)
void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp)

# generate DOT graph plot
int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt)
Loading

0 comments on commit 59724c6

Please sign in to comment.