Skip to content

Commit

Permalink
v1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Yan Gao committed Apr 15, 2020
1 parent bd37903 commit 595ad56
Show file tree
Hide file tree
Showing 13 changed files with 298 additions and 120 deletions.
12 changes: 7 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,16 @@ tags
cscope.*

# python
dist/*
pyabpoa.egg-info/*
python/build/*
python/dist/*
python/example.png
python/pyabpoa.c
python/pyabPOA.egg-info/*
python/pyabpoa.egg-info/*
python/src/*

# eval file
msa_abPOA*
msa_abPOA.c
s2g_abPOA*
s2g_abPOA.c
evaluation/msa_abPOA
evaluation/msa_spoa

12 changes: 12 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
include src/abpoa.h
include src/abpoa_align.h
include src/abpoa_graph.h
include src/kdq.h
include src/kseq.h
include src/simd_abpoa_align.h
include src/simd_instruction.h
include src/utils.h
include python/cabpoa.pxd
include python/pyabpoa.c
include python/pyabpoa.pyx
include python/README.md
22 changes: 19 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,30 +41,35 @@ SIMD_FLAG = -march=native

ifneq ($(sse2),)
SIMD_FLAG=$(FLAG_SSE2)
py_SIMD_FLAG = SSE2=1
else ifneq ($(sse41),)
SIMD_FLAG=$(FLAG_SSE41)
py_SIMD_FLAG = SSE41=1
else ifneq ($(avx2),)
SIMD_FLAG=$(FLAG_AVX2)
py_SIMD_FLAG = AVX2=1
else ifneq ($(avx512f),)
SIMD_FLAG=$(FLAG_AVX512F)
py_SIMD_FLAG = AVX512f=1
else ifneq ($(avx512bw),)
SIMD_FLAG=$(FLAG_AVX512BW)
py_SIMD_FLAG = AVX512BW=1
endif

.c.o:
$(CC) -c $(CFLAGS) $< -o $@

BIN = $(BIN_DIR)/abPOA
BIN = $(BIN_DIR)/abpoa
ifneq ($(gdb),)
BIN = $(BIN_DIR)/gdb_abPOA
BIN = $(BIN_DIR)/gdb_abpoa
endif
ABPOALIB = $(LIB_DIR)/libabpoa.a
# TODO add example
EXAMPLE = example


all: $(BIN)
abPOA: $(BIN)
abpoa: $(BIN)
libabpoa: $(ABPOALIB)
example: $(EXAMPLE)

Expand All @@ -89,5 +94,16 @@ $(SRC_DIR)/simd_check.o:$(SRC_DIR)/simd_check.c $(SRC_DIR)/simd_instruction.h
$(SRC_DIR)/simd_abpoa_align.o:$(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/simd_instruction.h $(SRC_DIR)/utils.h
$(CC) -c $(CFLAGS) $(SIMD_FLAG) $< -o $@

install_py: python/cabpoa.pxd python/pyabpoa.pyx python/README.md
${py_SIMD_FLAG} python setup.py install

sdist: install_py
${py_SIMD_FLAG} python setup.py sdist #bdist_wheel

publish_pypi: clean_py sdist
twine upload dist/*

clean:
rm -f $(SRC_DIR)/*.[oa] $(LIB_DIR)/*.[oa] $(BIN)
clean_py:
rm -rf build/ dist/ pyabpoa.egg-info/ python/pyabpoa.c
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ tar -zxvf abPOA-v1.0.0.tar.gz && cd abPOA-v1.0.0
Install via conda and run with test data:
```
conda install -c bioconda abpoa
abPOA ./test_data/seq.fa > cons.fa
abpoa ./test_data/seq.fa > cons.fa
```
Or, make from source and run with test data:
```
make; ./bin/abPOA ./test_data/seq.fa > cons.fa
make; ./bin/abpoa ./test_data/seq.fa > cons.fa
```
## Table of Contents

- [Introduction](#introduction)
- [Installation](#install)
- [Installing abPOA and pyabPOA via conda or pip](#conda)
- [Installing abPOA and pyabpoa via conda or pip](#conda)
- [Building abPOA from source files](#build)
- [Pre-built binary executable file for Linux/Unix](#binary)
- [General usage](#usage)
Expand All @@ -53,12 +53,12 @@ It right now supports SSE2/SSE41/AVX2 vectorization and more advanced instructio

## <a name="install"></a>Installation

### <a name="conda"></a>Installing abPOA and [pyabPOA](python/README.md) via conda or pip
### <a name="conda"></a>Installing abPOA and [pyabpoa](python/README.md) via conda or pip
On Linux/Unix and Mac OS, abPOA can be installed via
```
conda install -c bioconda abpoa # install abPOA program
conda install -c bioconda pyabpoa # install pyabPOA module
pip install pyabpoa # install pyabPOA module
conda install -c bioconda pyabpoa # install pyabpoa module
pip install pyabpoa # install pyabpoa module
```

### <a name="build"></a>Building abPOA from source files
Expand Down Expand Up @@ -88,26 +88,26 @@ tar -zxvf abPOA-v1.0.0_x64-linux.tar.gz
### <a name="gen_cons"></a>Generate consensus sequence

```
abPOA seq.fa > cons.fa
abpoa seq.fa > cons.fa
```

### <a name="gen_cons"></a>Generate row-column multiple sequence alignment in PIR format

```
abPOA seq.fa -r2 > cons.out
abpoa seq.fa -r2 > cons.out
```

### <a name="gen_plot"></a>Generate plot of alignment graph

```
abPOA seq.fa -g poa.png > cons.fa
abpoa seq.fa -g poa.png > cons.fa
```

## <a name="cmd"></a>Commands and options
```
abPOA: adaptive banded Partial Order Alignment
abpoa: adaptive banded Partial Order Alignment
Usage: abPOA [option] <in.fa/fq> > cons.fa/msa.out
Usage: abpoa [option] <in.fa/fq> > cons.fa/msa.out
Options:
Alignment:
Expand Down Expand Up @@ -142,20 +142,20 @@ Options:
```

## <a name="input"></a>Input
abPOA works with FASTA, FASTQ, gzip'd FASTA(.fa.gz) and gzip'd FASTQ(.fq.gz) formats. The input file is expected to contains multiple reads which will be processed as a whole set.
abpoa works with FASTA, FASTQ, gzip'd FASTA(.fa.gz) and gzip'd FASTQ(.fq.gz) formats. The input file is expected to contains multiple reads which will be processed as a whole set.

abPOA also can take a list of file names as input with option `-l`, where each line is the path of one sequence file containing multiple sequences.
abpoa also can take a list of file names as input with option `-l`, where each line is the path of one sequence file containing multiple sequences.

## <a name="output"></a>Output
### <a name="cons"></a>Consensus sequence
abPOA outputs consensus sequence in FASTA format with the name field as "Consensus_sequence".
abpoa outputs consensus sequence in FASTA format with the name field as "Consensus_sequence".
For example:
```
>Consensus_sequence
ACGTGTACACGTTGAC
```
### <a name="msa"></a>Row-column multiple sequence alignment
abPOA outputs the row-column multiple sequence alignment of input sequences in PIR format with a FASTA header. For example:
abpoa outputs the row-column multiple sequence alignment of input sequences in PIR format with a FASTA header. For example:
```
>Multiple_sequence_alignment
ACGTGTACA-GTTGAC
Expand Down
164 changes: 164 additions & 0 deletions evaluation/msa_abPOA.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/* To compile:
* gcc -O3 msa_abPOA.c -I ./include -L ./lib -labpoa -lz -o msa_abPOA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#include <getopt.h>
#include "abpoa.h"

#define MAX_SEQ_LEN 10000
#define MAX_INDEL_LEN 1000
// AaCcGgTtNn ==> 0,1,2,3,4
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};

// read fasta format
char *get_seq(FILE *fp)
{
char c = fgetc(fp);
if (feof(fp)) return NULL; // end of file
else if (c != '>') {
fprintf(stderr, "Unexpected format.\n");
exit(1);
}
// starts with '>'
char *seq = (char *)malloc(sizeof(char) * MAX_SEQ_LEN);
fgets(seq, MAX_SEQ_LEN, fp); // header line
if(fgets(seq, MAX_SEQ_LEN, fp) == NULL) { // sequence line
fprintf(stderr, "No fasta sequence.\n");
exit(1);
}

int seq_len = strlen(seq) - 1;
seq[seq_len] = '\0';
return seq;
}

void help() {
printf(
"\n"
"usage: ./msa_abPOA [options ...]\n"
"\n"
" options:\n"
" -b <int>\n"
" default: 10\n"
" 1st part of extra-band-width, set as negative to disable adaptive banded DP\n"
" -m <int>\n"
" default: 2\n"
" score for matching bases\n"
" -x <int>\n"
" default: 4\n"
" penalty for mismatching bases\n"
" -o <int(,int)>\n"
" default: gap_open1 = 4, gap_open2 = 24\n"
" gap opening penalty (must be non-negative)\n"
" -e <int(,int)>\n"
" default: gap_ext1 = 2, gap_ext2 = 1\n"
" gap extension penalty (must be non-negative)\n"
" -s <file>\n"
" default: seq.fa\n"
" the input sequence set\n"
" -n <int>\n"
" default: 10\n"
" number of sequences in each set\n"
" -h \n"
" prints the usage\n"
);
}

int main (int argc, char * const argv[]) {
char *seq_file = "seq.fa";
//initial
abpoa_para_t *abpt = abpoa_init_para();

int n_seqs = 0, *seq_lens; uint8_t **bseqs;
char opt; char *s;;
while ((opt = getopt(argc, argv, "l:m:x:o:e:s:n:hb:")) != -1) {
switch (opt) {
case 'l': abpt->align_mode = atoi(optarg); break;
case 'm': abpt->match = atoi(optarg); break;
case 'x': abpt->mismatch = atoi(optarg); break;
case 'o': abpt->gap_open1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_open2 = strtol(s+1, &s, 10); break;
case 'e': abpt->gap_ext1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_ext2 = strtol(s+1, &s, 10); break;
case 's': seq_file = optarg; break;
case 'n': n_seqs = atoi(optarg); break;
case 'b': abpt->wb = atoi(optarg); break;
case 'h': help(); return 0;
default: help(); return 1;
}
}

seq_lens = (int*)malloc(sizeof(int) * n_seqs);
bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * n_seqs);

// output options
abpt->out_cons = 1; // generate consensus sequence, set 0 to disable

abpoa_post_set_para(abpt);

FILE *fp_seq;
if((fp_seq = fopen(seq_file, "r")) == NULL) {
printf("fail to read seq\n");
exit(1);
}

struct timeval start_time, end_time;
double runtime = 0;

int i;
abpoa_t *ab = abpoa_init();
while(1) {
int seq_i = 0;
while(1) {
char *seq = get_seq(fp_seq);
if(seq == NULL ) break;

int seq_len = strlen(seq);
seq_lens[seq_i] = seq_len;
bseqs[seq_i] = (uint8_t*)malloc(seq_len * sizeof(uint8_t));
for (i = 0; i < seq_len; ++i)
bseqs[seq_i][i] = nst_nt4_table[(int)seq[i]];
free(seq);
seq_i++;
if (seq_i == n_seqs) break; // read a set of sequences
}
if (seq_i == 0) break;

gettimeofday(&start_time, NULL);
abpoa_msa(ab, abpt, n_seqs, seq_lens, bseqs, stdout, NULL, NULL, NULL, NULL, NULL);
gettimeofday(&end_time, NULL);
runtime = runtime + (end_time.tv_sec - start_time.tv_sec) + (end_time.tv_usec - start_time.tv_usec) * 1e-6;

if (n_seqs > 0) {
for (i = 0; i < n_seqs; ++i) free(bseqs[i]);
}

if(feof(fp_seq)) break;
}
abpoa_free(ab, abpt);
fprintf(stderr, "%.2f\t", runtime);

free(seq_lens); free(bseqs);
abpoa_free_para(abpt); fclose(fp_seq);
return 0;
}
25 changes: 0 additions & 25 deletions python/Makefile

This file was deleted.

Loading

0 comments on commit 595ad56

Please sign in to comment.