-
Notifications
You must be signed in to change notification settings - Fork 0
/
FASTA_Expasy_Prot_Param_Tool_v1_3.py
149 lines (119 loc) · 8.16 KB
/
FASTA_Expasy_Prot_Param_Tool_v1_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# v 1.3
# Updated to name the CSV file after the sequence name if only one sequence is used, or after the parent directory if multiple sequences are in a single FASTA file or multiple FASTA files are found. Added date and time to the CSV name.
# Added user input if no fasta file found.
import os
import time
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import csv
# Amino-acid sequence (one-letter codes) that is appended as the C-terminus
# when the caller passes c_terminus="Fc" to
# parse_fasta_and_export_with_protparam.
# NOTE(review): presumably an antibody Fc fragment (hinge + CH2 + CH3) --
# confirm the exact variant against the construct it is meant to model.
Fc_sequence = ("EPKSSDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLYITREPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSC"
"SVMHEALHNHYTQKSLSLSPGK")
def calculate_charge_at_pH7(sequence):
    """Return the charge Biopython computes for *sequence* at pH 7.0."""
    return ProteinAnalysis(sequence).charge_at_pH(7.0)
def calculate_percent_charged(sequence):
    """Return the percentage of residues in *sequence* that are charged.

    Charged residues are D, E, H, K and R. Matching is case-insensitive so
    the result does not depend on the letter case used in the FASTA file.

    Args:
        sequence: Protein sequence in one-letter amino-acid codes.

    Returns:
        A float between 0 and 100. An empty sequence yields 0.0 rather
        than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    charged_aa = sum(residues.count(aa) for aa in 'DEHKR')
    return (charged_aa / len(sequence)) * 100
def calculate_percent_hydrophobic(sequence):
    """Return the percentage of residues in *sequence* that are hydrophobic.

    Hydrophobic residues are A, I, L, M, F, W, Y and V. Matching is
    case-insensitive so the result does not depend on the letter case used
    in the FASTA file.

    Args:
        sequence: Protein sequence in one-letter amino-acid codes.

    Returns:
        A float between 0 and 100. An empty sequence yields 0.0 rather
        than raising ZeroDivisionError.
    """
    if not sequence:
        return 0.0
    residues = sequence.upper()
    hydrophobic_aa = sum(residues.count(aa) for aa in 'AILMFWYV')
    return (hydrophobic_aa / len(sequence)) * 100
def calculate_molar_extinction_coefficient_oxidized(sequence):
    """Return the second value of Biopython's molar extinction coefficient pair.

    ``ProteinAnalysis.molar_extinction_coefficient()`` returns a pair; this
    function returns the element at index 1, which the CSV export labels as
    the oxidized extinction coefficient (M-1 cm-1).
    """
    coefficients = ProteinAnalysis(sequence).molar_extinction_coefficient()
    return coefficients[1]
def calculate_abs_0_1_percent_oxidized(sequence):
    """Return the oxidized extinction coefficient divided by the molecular weight.

    The CSV export labels this value 'Abs 0.1% (1g/L) Oxidized'.
    """
    return (
        calculate_molar_extinction_coefficient_oxidized(sequence)
        / ProteinAnalysis(sequence).molecular_weight()
    )
# Characters that cannot be used in a file name on Windows (this includes
# both path separators) plus the ASCII control characters. FASTA headers
# routinely contain "|", "/" and ":", so each of these is mapped to "_"
# before a header is used as part of the CSV file name.
_UNSAFE_FILENAME_CHARS = {ord(ch): '_' for ch in '<>:"/\\|?*'}
_UNSAFE_FILENAME_CHARS.update({code: '_' for code in range(32)})


def _safe_filename_component(name):
    """Return *name* with characters that are invalid in file names replaced by '_'."""
    # Leading/trailing spaces and dots are also dropped (Windows rejects
    # names that end in either).
    return name.translate(_UNSAFE_FILENAME_CHARS).strip(' .')


def _analyse_record(sequence_id, sequence_lines, c_terminus, n_terminus, multimer_count):
    """Build the result row for one FASTA record (the tuple stored in `sequences`)."""
    sequence = ''.join(sequence_lines)
    if c_terminus == "Fc":
        # The keyword "Fc" selects the module-level Fc_sequence constant.
        c_terminus_seq = Fc_sequence
        comment = "Fc added"
    else:
        c_terminus_seq = c_terminus
        comment = ""
    sequence_with_modifications = n_terminus + (sequence * multimer_count) + c_terminus_seq

    charge_at_pH7 = calculate_charge_at_pH7(sequence_with_modifications)
    percent_charged = calculate_percent_charged(sequence_with_modifications)
    percent_hydrophobic = calculate_percent_hydrophobic(sequence_with_modifications)
    extinction_coefficient = calculate_molar_extinction_coefficient_oxidized(sequence_with_modifications)
    abs_0_1_percent = calculate_abs_0_1_percent_oxidized(sequence_with_modifications)
    analysis = ProteinAnalysis(sequence_with_modifications)
    isoelectric_point = analysis.isoelectric_point()
    molecular_weight = analysis.molecular_weight()

    return (sequence_id, percent_charged, percent_hydrophobic, charge_at_pH7,
            isoelectric_point, extinction_coefficient, molecular_weight,
            abs_0_1_percent, sequence, c_terminus_seq, n_terminus,
            multimer_count, sequence_with_modifications, comment)


def parse_fasta_and_export_with_protparam(input_files, c_terminus="", n_terminus="", multimer_count=1):
    """Analyse every sequence in the given FASTA files and write one CSV report.

    Each record's sequence is repeated `multimer_count` times, prefixed with
    `n_terminus` and suffixed with `c_terminus` (or with `Fc_sequence` when
    `c_terminus` is the literal string "Fc"). The resulting sequence is then
    analysed and one CSV row is written per record.

    The CSV is written next to the first input file. It is named after the
    sequence header when exactly one file containing exactly one header was
    given, otherwise after the parent directory; a timestamp is appended in
    both cases. After writing, the function prints the output path and waits
    for the user to press ENTER.

    Args:
        input_files: Non-empty list of FASTA file paths.
        c_terminus: Residues to append, "Fc" for the built-in Fc sequence,
            or "" for none.
        n_terminus: Residues to prepend, or "" for none.
        multimer_count: Number of times each sequence is repeated.

    Raises:
        IndexError: If `input_files` is empty.
    """
    sequences = []
    date_time = time.strftime("%Y%m%d_%H%M%S")
    single_sequence_name = None

    for input_file in input_files:
        sequence_id = None
        sequence_lines = []
        sequence_count = 0
        with open(input_file, 'r') as file:
            for line in file:
                line = line.strip()
                if line.startswith('>'):
                    sequence_count += 1
                    # A new header closes the previous record, if it had residues.
                    if sequence_id is not None and sequence_lines:
                        sequences.append(_analyse_record(
                            sequence_id, sequence_lines, c_terminus, n_terminus, multimer_count))
                    sequence_id = line[1:]
                    sequence_lines = []
                elif line:
                    # Blank lines carry no residues; skipping them means a
                    # header followed only by blank lines is treated as an
                    # empty record instead of reaching the analysis with "".
                    sequence_lines.append(line)
        # Flush the last record of the file.
        if sequence_id is not None and sequence_lines:
            sequences.append(_analyse_record(
                sequence_id, sequence_lines, c_terminus, n_terminus, multimer_count))
        if sequence_count == 1:
            single_sequence_name = sequence_id

    output_dir = os.path.dirname(input_files[0])
    base_name = ""
    if len(input_files) == 1 and single_sequence_name:
        # If only one sequence is processed, use the (sanitised) sequence name.
        base_name = _safe_filename_component(single_sequence_name)
    if not base_name:
        # If multiple sequences or files, use the parent directory name.
        # abspath() resolves "" and "." to a real directory name.
        base_name = os.path.basename(os.path.abspath(output_dir))
    output_csv = os.path.join(output_dir, f'{base_name}_{date_time}_ProtParam_Results.csv')

    with open(output_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Sequence Name', '% Charged Amino Acids', '% Hydrophobic Amino Acids', 'Charge at pH 7', 'Isoelectric Point', 'Extinction Coefficient M-1cm-1 Oxidized', 'MW in daltons', 'Abs 0.1% (1g/L) Oxidized', 'Sequence', 'C-Terminus', 'N-Terminus', 'Multimer Count', 'Analyzed Sequence', 'Comments'])
        for (seq_id, percent_charged, percent_hydrophobic, charge_pH7, pI,
             extinction_coefficient, mw, abs_0_1_percent, seq, c_term, n_term,
             multimer, seq_with_modifications, comment) in sequences:
            c_term_display = c_term if c_term else "No C-terminus added"
            n_term_display = n_term if n_term else "No N-terminus added"
            writer.writerow([seq_id, percent_charged, percent_hydrophobic, charge_pH7, pI, extinction_coefficient, mw, abs_0_1_percent, seq, c_term_display, n_term_display, multimer, seq_with_modifications, comment])

    print(f"Sequences exported to '{output_csv}'")
    input("Press the <ENTER> key to continue...")
def find_fasta_files(root_folder):
    """Return the paths of all ``.fasta`` files under *root_folder*.

    The folder is searched recursively. The extension is matched
    case-insensitively, so ``.FASTA`` files are found too. Directories and
    file names are visited in sorted order, parent before children, so the
    result (and therefore which file comes first) is the same on every run
    and every filesystem.

    Args:
        root_folder: Directory to search.

    Returns:
        A list of file paths, each built by joining the directory path
        reported by ``os.walk`` with the file name. The list is empty when
        nothing matches or the folder does not exist.
    """
    fasta_files = []
    for root, dirs, files in os.walk(root_folder):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith('.fasta'):
                fasta_files.append(os.path.join(root, file))
    return fasta_files
# Interactive entry point: ask for a folder, collect its FASTA files, ask for
# the optional sequence modifications and run the export.
print("The directory must contain .fasta files to be used.")
# Strip surrounding whitespace and quotes so that pasted paths (for example
# from a file manager's "copy as path") are accepted.
root_folder = input("Please enter the path to the root folder: ").strip().strip('"\'')
input_files = find_fasta_files(root_folder)
if input_files:
    print(f"Found FASTA files: {input_files}")
    # Keep asking until the multimer count is empty (meaning 1) or a whole
    # number >= 1; anything else would either crash int() or produce an
    # empty repeated sequence.
    while True:
        multimer_count_input = input("Enter the multimer count (Example: dimer = 2): ").strip()
        if not multimer_count_input:
            multimer_count = 1
            break
        try:
            multimer_count = int(multimer_count_input)
        except ValueError:
            multimer_count = 0
        if multimer_count >= 1:
            break
        print("The multimer count must be a whole number of 1 or more.")
    # Stray whitespace is removed so it does not end up in the analysed sequence.
    n_terminus = input("Enter the N-Terminus to add at the beginning (Press enter for no N-Terminus): ").strip()
    c_terminus = input("Enter the C-Terminus to add at the end (Press enter for no C-Terminus, or type 'Fc' for Fc sequence): ").strip()
    parse_fasta_and_export_with_protparam(input_files, c_terminus, n_terminus, multimer_count)
else:
    print("No FASTA files found in the specified root folder.")
    input("Press the <ENTER> key to continue...")