-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_alleles_from_sam.py
136 lines (120 loc) · 4.21 KB
/
get_alleles_from_sam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import numpy
from Bio import SeqIO
import sys
import pysam
# import vcf
from pstats import Stats
from collections import OrderedDict
from cProfile import run
from pstats import Stats
USE_CHASTITY = False
### cat sd_0001_PAO1_5k.sam | python get_alleles_from_sam.py sample_name positions.txt vcf_file
def get_args():
if len(sys.argv) != 4:
print 'incorrect number of args, useage is get_alleles_from_sam.py <sample_name> <positions.txt> <output_file>'
else:
return sys.argv[1], sys.argv[2], sys.argv[3]
def read_positions(positions_txt):
# all positions are 0-based (Pythonic)
positions = OrderedDict()
for line in open(positions_txt, 'r'):
cols = line.strip().split(',')
if cols[0] not in positions.keys():
positions[cols[0]] = set()
positions[cols[0]].add(int(cols[1]) - 1)
else:
positions[cols[0]].add(int(cols[1]) - 1)
return positions
def init_array(refs, position_file):
# 1 2 3 4 5 ...
#A 0 0 0 0 0 ...
#C 0 0 0 0 0 ...
#G 0 0 0 0 0 ...
#T 0 0 0 0 0 ...
array = {}
positions = read_positions(position_file)
for ref in positions.keys():
array[ref] = numpy.zeros((4, max(positions[ref]) + 1), dtype=int)
return array, positions
def read_sam(position_file):
base_dict = {'A':0, 'C':1, 'G':2, 'T':3}
alignment = pysam.Samfile('-', 'r')
array, positions = init_array(alignment.references, position_file)
for line in alignment:
# ignore any unmapped reads
if line.is_unmapped:
continue
chrom = alignment.getrname(line.tid)
read_positions = set(xrange(line.pos, line.aend))
try:
isecs = positions[chrom].intersection(read_positions)
except KeyError:
continue
if isecs:
#overlap = [(pos, line.seq[pos-line.pos]) for pos in isec]
#quality = [(pos, ord(line.qual[pos-line.pos])-33) for pos in isec]
aligned_pairs = dict((ref, query) for (query, ref) in line.get_aligned_pairs())
for isec in isecs:
if aligned_pairs[isec]:
read_base = line.seq[aligned_pairs[isec]]
if read_base != 'N':
array[chrom][(base_dict[read_base], isec)] += 1
return array, positions
def write_alleles(array, positions, sample_name, output_file):
frag = ''
chastity_list = []
gaps = 0
#indel_positions = [(position.CHROM, position.POS - 1) for position
# in list(vcf.Reader(open(sys.argv[3], 'r'))) if position.is_indel]
indel_positions = []
for chrom in positions.keys():
for pos in sorted(positions[chrom]):
if (chrom, pos) in indel_positions:
gaps += 1
frag += '-'
else:
counts = tuple(array[chrom][:,pos])
counts_sort = sorted(counts, reverse=True)
if all(counts[0] > base for base in counts[1:4]):
# chastity is greatest / (greatest + second greatest)
if USE_CHASTITY:
chastity_list.append(float(counts_sort[0]) /
sum(counts_sort[:2]))
frag += 'A'
elif all(counts[1] > base for base in counts[0:1] + counts[2:4]):
if USE_CHASTITY:
chastity_list.append(float(counts_sort[0]) /
sum(counts_sort[:2]))
frag += 'C'
elif all(counts[2] > base for base in counts[0:2] + counts[3:4]):
if USE_CHASTITY:
chastity_list.append(float(counts_sort[0]) /
sum(counts_sort[:2]))
frag += 'G'
elif all(counts[3] > base for base in counts[0:3]):
if USE_CHASTITY:
chastity_list.append(float(counts_sort[0]) /
sum(counts_sort[:2]))
frag += 'T'
else:
gaps += 1
frag += '-'
total = sum([len(positions[chrom]) for chrom in positions.keys()])
print 'Sample: %s' %(sample_name)
print 'Total positions: %i' %(total)
print 'Gaps: %i' %(gaps)
print 'Positions covered: %.2f %%' %(100 - (float(gaps) / total * 100.0))
print 'Sample: %s' %(sample_name)
if USE_CHASTITY:
mean_chastity = (sum(chastity_list) / (len(frag) - gaps)) * 100.0
print 'Mean chastity: %.2f %%' %(mean_chastity)
if mean_chastity < 90:
print 'CHASTITY WARNING: Mixed samples can severely affect accuracy of placement'
with open(output_file, 'w') as file_out:
print >>file_out, '>%s_new\n%s\n' %(sample_name, frag),
if __name__ == '__main__':
sample_name, position_file, output_file = get_args()
array, positions = read_sam(position_file)
# run('write_alleles(array, positions)', 'stats')
#stats = Stats('stats')
write_alleles(array, positions, sample_name, output_file)