Skip to content

Commit

Permalink
fix: loadDatabases check for duplicate protIDs (#349)
Browse files Browse the repository at this point in the history
  • Loading branch information
edkerk committed Aug 19, 2023
1 parent f29852a commit 5af432a
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 126 deletions.
261 changes: 135 additions & 126 deletions doc/src/geckomat/get_enzyme_data/loadDatabases.html
Original file line number Diff line number Diff line change
Expand Up @@ -141,133 +141,142 @@ <h2><a name="_source"></a>SOURCE CODE <a href="#_top"><img alt="^" border="0" sr
0079 <span class="keyword">else</span>
0080 databases.uniprot = [];
0081 <span class="keyword">end</span>
0082 <span class="keyword">end</span>
0083
0084 <span class="comment">%% KEGG</span>
0085 <span class="keyword">if</span> any(strcmp(selectDatabase,{<span class="string">'kegg'</span>,<span class="string">'both'</span>}))
0086 keggPath = fullfile(filePath,<span class="string">'kegg.tsv'</span>);
0087 <span class="keyword">if</span> ~exist(keggPath,<span class="string">'file'</span>)
0088 <span class="keyword">if</span> isempty(kegg.ID)
0089 printOrange(<span class="string">'WARNING: No kegg.ID is specified, unable to download KEGG DB.\n'</span>)
0090 <span class="keyword">else</span>
0091 <a href="#_sub1" class="code" title="subfunction downloadKEGG(keggID, filePath, keggGeneID)">downloadKEGG</a>(kegg.ID,keggPath,kegg.geneID);
0092 <span class="keyword">end</span>
0093 <span class="keyword">end</span>
0094 <span class="keyword">if</span> exist(keggPath,<span class="string">'file'</span>)
0095 fid = fopen(keggPath,<span class="string">'r'</span>);
0096 fileContent = textscan(fid,<span class="string">'%q %q %q %q %q %q %q'</span>,<span class="string">'Delimiter'</span>,<span class="string">','</span>,<span class="string">'HeaderLines'</span>,0);
0097 fclose(fid);
0098 databases.kegg.uniprot = fileContent{1};
0099 databases.kegg.genes = fileContent{2};
0100 databases.kegg.keggGene = fileContent{3};
0101 databases.kegg.eccodes = fileContent{4};
0102 databases.kegg.MW = str2double(fileContent{5});
0103 databases.kegg.pathway = fileContent{6};
0104 databases.kegg.seq = fileContent{7};
0105 <span class="keyword">else</span>
0106 databases.kegg = [];
0107 <span class="keyword">end</span>
0108 <span class="keyword">end</span>
0109 <span class="keyword">end</span>
0110
0111 <a name="_sub1" href="#_subfunctions" class="code">function downloadKEGG(keggID, filePath, keggGeneID)</a>
0112 <span class="comment">%% Download gene information</span>
0113 progressbar([<span class="string">'Downloading KEGG data for organism code '</span> keggID])
0114 webOptions = weboptions(<span class="string">'Timeout'</span>,30);
0115 <span class="keyword">try</span>
0116 gene_list = webread([<span class="string">'http://rest.kegg.jp/list/'</span> keggID],webOptions);
0117 <span class="keyword">catch</span> ME
0118 <span class="keyword">switch</span> ME.identifier
0119 <span class="keyword">case</span> <span class="string">'MATLAB:webservices:HTTP400StatusCodeError'</span>
0120 error([<span class="string">'Unable to download data form KEGG with a potentially invalid ID: '</span> keggID ])
0121 <span class="keyword">end</span>
0122 <span class="keyword">end</span>
0123 gene_list = regexpi(gene_list, <span class="string">'[^\n]+'</span>,<span class="string">'match'</span>)';
0124 gene_id = regexpi(gene_list,[<span class="string">'(?&lt;='</span> keggID <span class="string">':)\S+'</span>],<span class="string">'match'</span>);
0125
0126 <span class="comment">% Retrieve information for every gene in the list, 10 genes per query</span>
0127 genesPerQuery = 10;
0128 queries = ceil(numel(gene_id)/genesPerQuery);
0129 keggData = cell(numel(gene_id),1);
0130 <span class="keyword">for</span> i = 1:queries
0131 <span class="comment">% Download batches of genes</span>
0132 firstIdx = i*genesPerQuery-(genesPerQuery-1);
0133 lastIdx = i*genesPerQuery;
0134 <span class="keyword">if</span> lastIdx &gt; numel(gene_id) <span class="comment">% Last query has probably less genes</span>
0135 lastIdx = numel(gene_id);
0136 <span class="keyword">end</span>
0137 url = [<span class="string">'http://rest.kegg.jp/get/'</span> keggID <span class="string">':'</span> strjoin([gene_id{firstIdx:lastIdx}],[<span class="string">'+'</span> keggID <span class="string">':'</span>])];
0138
0139 retry = true;
0140 <span class="keyword">while</span> retry
0141 <span class="keyword">try</span>
0142 retry = false;
0143 out = webread(url,webOptions);
0144 <span class="keyword">catch</span>
0145 retry = true;
0146 <span class="keyword">end</span>
0147 <span class="keyword">end</span>
0148 outSplit = strsplit(out,[<span class="string">'///'</span> 10]); <span class="comment">%10 is new line character</span>
0149 <span class="keyword">if</span> numel(outSplit) &lt; lastIdx-firstIdx+2
0150 error(<span class="string">'KEGG returns less genes per query'</span>) <span class="comment">%Reduce genesPerQuery</span>
0151 <span class="keyword">end</span>
0152 keggData(firstIdx:lastIdx) = outSplit(1:end-1);
0153 progressbar(i/queries)
0154 <span class="keyword">end</span>
0155
0156 <span class="comment">%% Parsing of info to keggDB format</span>
0157 sequence = regexprep(keggData,<span class="string">'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*'</span>,<span class="string">'$1'</span>);
0158 <span class="comment">%No AASEQ -&gt; no protein -&gt; not of interest</span>
0159 noProt = startsWith(sequence,<span class="string">'ENTRY '</span>);
0160 uni = regexprep(keggData,<span class="string">'.*UniProt: (\S+?)\s.*'</span>,<span class="string">'$1'</span>);
0161 noUni = startsWith(uni,<span class="string">'ENTRY '</span>);
0162 uni(noProt | noUni) = [];
0163 keggData(noProt | noUni) = [];
0164 sequence(noProt | noUni) = [];
0165 sequence = regexprep(sequence,<span class="string">'\s*'</span>,<span class="string">''</span>);
0166 keggGene = regexprep(keggData,<span class="string">'ENTRY\s+(\S+?)\s.+'</span>,<span class="string">'$1'</span>);
0167
0168 <span class="keyword">switch</span> keggGeneID
0169 <span class="keyword">case</span> {<span class="string">'kegg'</span>,<span class="string">''</span>}
0170 gene_name = keggGene;
0171 <span class="keyword">otherwise</span>
0172 <span class="comment">% In case there are special characters:</span>
0173 keggGeneIDT = regexptranslate(<span class="string">'escape'</span>,keggGeneID);
0174 gene_name = regexprep(keggData,[<span class="string">'.+'</span> keggGeneIDT <span class="string">': (\S+?)\n.+'</span>],<span class="string">'$1'</span>);
0175 noID = ~contains(keggData,keggGeneIDT);
0176 <span class="keyword">if</span> all(noID)
0177 error([<span class="string">'None of the KEGG entries are annotated with the gene identifier '</span> keggGeneID])
0178 <span class="keyword">else</span>
0179 gene_name(noID)= [];
0180 keggData(noID) = [];
0181 keggGene(noID) = [];
0182 sequence(noID) = [];
0183 uni(noID) = [];
0184 <span class="keyword">end</span>
0185 <span class="keyword">end</span>
0186
0187 EC_names = regexprep(keggData,<span class="string">'.*ORTHOLOGY.*\[EC:(.*?)\].*'</span>,<span class="string">'$1'</span>);
0188 EC_names(startsWith(EC_names,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
0189
0190 MW = cell(numel(sequence),1);
0191 <span class="keyword">for</span> i=1:numel(sequence)
0192 <span class="keyword">if</span> ~isempty(sequence{i})
0193 MW{i} = num2str(round(<a href="calculateMW.html" class="code" title="function MW = calculateMW(sequence)">calculateMW</a>(sequence{i})));
0194 <span class="keyword">end</span>
0195 <span class="keyword">end</span>
0196
0197 pathway = regexprep(keggData,<span class="string">'.*PATHWAY\s+(.*?)(BRITE|MODULE).*'</span>,<span class="string">'$1'</span>);
0198 pathway(startsWith(pathway,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
0199 pathway = strrep(pathway,[keggID <span class="string">'01100 Metabolic pathways'</span>],<span class="string">''</span>);
0200 pathway = regexprep(pathway,<span class="string">'\n'</span>,<span class="string">''</span>);
0201 pathway = regexprep(pathway,<span class="string">' '</span>,<span class="string">''</span>);
0202
0203 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
0204 out = cell2table(out);
0082 <span class="keyword">if</span> ~isempty(databases.uniprot)
0083 [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,<span class="string">'stable'</span>);
0084 <span class="keyword">if</span> numel(uniqueIDs) &lt; numel(databases.uniprot.ID)
0085 duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
0086 dispEM([<span class="string">'Duplicate entries are found for the following proteins. '</span><span class="keyword">...</span>
0087 <span class="string">'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '</span><span class="keyword">...</span>
0088 <span class="string">'in the model adapter:'</span>],true,databases.uniprot.ID(duplID));
0089 <span class="keyword">end</span>
0090 <span class="keyword">end</span>
0091 <span class="keyword">end</span>
0092
0093 <span class="comment">%% KEGG</span>
0094 <span class="keyword">if</span> any(strcmp(selectDatabase,{<span class="string">'kegg'</span>,<span class="string">'both'</span>}))
0095 keggPath = fullfile(filePath,<span class="string">'kegg.tsv'</span>);
0096 <span class="keyword">if</span> ~exist(keggPath,<span class="string">'file'</span>)
0097 <span class="keyword">if</span> isempty(kegg.ID)
0098 printOrange(<span class="string">'WARNING: No kegg.ID is specified, unable to download KEGG DB.\n'</span>)
0099 <span class="keyword">else</span>
0100 <a href="#_sub1" class="code" title="subfunction downloadKEGG(keggID, filePath, keggGeneID)">downloadKEGG</a>(kegg.ID,keggPath,kegg.geneID);
0101 <span class="keyword">end</span>
0102 <span class="keyword">end</span>
0103 <span class="keyword">if</span> exist(keggPath,<span class="string">'file'</span>)
0104 fid = fopen(keggPath,<span class="string">'r'</span>);
0105 fileContent = textscan(fid,<span class="string">'%q %q %q %q %q %q %q'</span>,<span class="string">'Delimiter'</span>,<span class="string">','</span>,<span class="string">'HeaderLines'</span>,0);
0106 fclose(fid);
0107 databases.kegg.uniprot = fileContent{1};
0108 databases.kegg.genes = fileContent{2};
0109 databases.kegg.keggGene = fileContent{3};
0110 databases.kegg.eccodes = fileContent{4};
0111 databases.kegg.MW = str2double(fileContent{5});
0112 databases.kegg.pathway = fileContent{6};
0113 databases.kegg.seq = fileContent{7};
0114 <span class="keyword">else</span>
0115 databases.kegg = [];
0116 <span class="keyword">end</span>
0117 <span class="keyword">end</span>
0118 <span class="keyword">end</span>
0119
0120 <a name="_sub1" href="#_subfunctions" class="code">function downloadKEGG(keggID, filePath, keggGeneID)</a>
0121 <span class="comment">%% Download gene information</span>
0122 progressbar([<span class="string">'Downloading KEGG data for organism code '</span> keggID])
0123 webOptions = weboptions(<span class="string">'Timeout'</span>,30);
0124 <span class="keyword">try</span>
0125 gene_list = webread([<span class="string">'http://rest.kegg.jp/list/'</span> keggID],webOptions);
0126 <span class="keyword">catch</span> ME
0127 <span class="keyword">switch</span> ME.identifier
0128 <span class="keyword">case</span> <span class="string">'MATLAB:webservices:HTTP400StatusCodeError'</span>
0129 error([<span class="string">'Unable to download data form KEGG with a potentially invalid ID: '</span> keggID ])
0130 <span class="keyword">end</span>
0131 <span class="keyword">end</span>
0132 gene_list = regexpi(gene_list, <span class="string">'[^\n]+'</span>,<span class="string">'match'</span>)';
0133 gene_id = regexpi(gene_list,[<span class="string">'(?&lt;='</span> keggID <span class="string">':)\S+'</span>],<span class="string">'match'</span>);
0134
0135 <span class="comment">% Retrieve information for every gene in the list, 10 genes per query</span>
0136 genesPerQuery = 10;
0137 queries = ceil(numel(gene_id)/genesPerQuery);
0138 keggData = cell(numel(gene_id),1);
0139 <span class="keyword">for</span> i = 1:queries
0140 <span class="comment">% Download batches of genes</span>
0141 firstIdx = i*genesPerQuery-(genesPerQuery-1);
0142 lastIdx = i*genesPerQuery;
0143 <span class="keyword">if</span> lastIdx &gt; numel(gene_id) <span class="comment">% Last query has probably less genes</span>
0144 lastIdx = numel(gene_id);
0145 <span class="keyword">end</span>
0146 url = [<span class="string">'http://rest.kegg.jp/get/'</span> keggID <span class="string">':'</span> strjoin([gene_id{firstIdx:lastIdx}],[<span class="string">'+'</span> keggID <span class="string">':'</span>])];
0147
0148 retry = true;
0149 <span class="keyword">while</span> retry
0150 <span class="keyword">try</span>
0151 retry = false;
0152 out = webread(url,webOptions);
0153 <span class="keyword">catch</span>
0154 retry = true;
0155 <span class="keyword">end</span>
0156 <span class="keyword">end</span>
0157 outSplit = strsplit(out,[<span class="string">'///'</span> 10]); <span class="comment">%10 is new line character</span>
0158 <span class="keyword">if</span> numel(outSplit) &lt; lastIdx-firstIdx+2
0159 error(<span class="string">'KEGG returns less genes per query'</span>) <span class="comment">%Reduce genesPerQuery</span>
0160 <span class="keyword">end</span>
0161 keggData(firstIdx:lastIdx) = outSplit(1:end-1);
0162 progressbar(i/queries)
0163 <span class="keyword">end</span>
0164
0165 <span class="comment">%% Parsing of info to keggDB format</span>
0166 sequence = regexprep(keggData,<span class="string">'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*'</span>,<span class="string">'$1'</span>);
0167 <span class="comment">%No AASEQ -&gt; no protein -&gt; not of interest</span>
0168 noProt = startsWith(sequence,<span class="string">'ENTRY '</span>);
0169 uni = regexprep(keggData,<span class="string">'.*UniProt: (\S+?)\s.*'</span>,<span class="string">'$1'</span>);
0170 noUni = startsWith(uni,<span class="string">'ENTRY '</span>);
0171 uni(noProt | noUni) = [];
0172 keggData(noProt | noUni) = [];
0173 sequence(noProt | noUni) = [];
0174 sequence = regexprep(sequence,<span class="string">'\s*'</span>,<span class="string">''</span>);
0175 keggGene = regexprep(keggData,<span class="string">'ENTRY\s+(\S+?)\s.+'</span>,<span class="string">'$1'</span>);
0176
0177 <span class="keyword">switch</span> keggGeneID
0178 <span class="keyword">case</span> {<span class="string">'kegg'</span>,<span class="string">''</span>}
0179 gene_name = keggGene;
0180 <span class="keyword">otherwise</span>
0181 <span class="comment">% In case there are special characters:</span>
0182 keggGeneIDT = regexptranslate(<span class="string">'escape'</span>,keggGeneID);
0183 gene_name = regexprep(keggData,[<span class="string">'.+'</span> keggGeneIDT <span class="string">': (\S+?)\n.+'</span>],<span class="string">'$1'</span>);
0184 noID = ~contains(keggData,keggGeneIDT);
0185 <span class="keyword">if</span> all(noID)
0186 error([<span class="string">'None of the KEGG entries are annotated with the gene identifier '</span> keggGeneID])
0187 <span class="keyword">else</span>
0188 gene_name(noID)= [];
0189 keggData(noID) = [];
0190 keggGene(noID) = [];
0191 sequence(noID) = [];
0192 uni(noID) = [];
0193 <span class="keyword">end</span>
0194 <span class="keyword">end</span>
0195
0196 EC_names = regexprep(keggData,<span class="string">'.*ORTHOLOGY.*\[EC:(.*?)\].*'</span>,<span class="string">'$1'</span>);
0197 EC_names(startsWith(EC_names,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
0198
0199 MW = cell(numel(sequence),1);
0200 <span class="keyword">for</span> i=1:numel(sequence)
0201 <span class="keyword">if</span> ~isempty(sequence{i})
0202 MW{i} = num2str(round(<a href="calculateMW.html" class="code" title="function MW = calculateMW(sequence)">calculateMW</a>(sequence{i})));
0203 <span class="keyword">end</span>
0204 <span class="keyword">end</span>
0205
0206 writetable(out, filePath, <span class="string">'FileType'</span>, <span class="string">'text'</span>, <span class="string">'WriteVariableNames'</span>,false);
0207 fprintf(<span class="string">'Model-specific KEGG database stored at %s\n'</span>,filePath);
0208 <span class="keyword">end</span></pre></div>
0206 pathway = regexprep(keggData,<span class="string">'.*PATHWAY\s+(.*?)(BRITE|MODULE).*'</span>,<span class="string">'$1'</span>);
0207 pathway(startsWith(pathway,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
0208 pathway = strrep(pathway,[keggID <span class="string">'01100 Metabolic pathways'</span>],<span class="string">''</span>);
0209 pathway = regexprep(pathway,<span class="string">'\n'</span>,<span class="string">''</span>);
0210 pathway = regexprep(pathway,<span class="string">' '</span>,<span class="string">''</span>);
0211
0212 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
0213 out = cell2table(out);
0214
0215 writetable(out, filePath, <span class="string">'FileType'</span>, <span class="string">'text'</span>, <span class="string">'WriteVariableNames'</span>,false);
0216 fprintf(<span class="string">'Model-specific KEGG database stored at %s\n'</span>,filePath);
0217 <span class="keyword">end</span></pre></div>
<hr><address>Generated by <strong><a href="http://www.artefact.tk/software/matlab/m2html/" title="Matlab Documentation in HTML">m2html</a></strong> &copy; 2005</address>
</body>
</html>
9 changes: 9 additions & 0 deletions src/geckomat/get_enzyme_data/loadDatabases.m
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@
else
databases.uniprot = [];
end
if ~isempty(databases.uniprot)
[uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable');
if numel(uniqueIDs) < numel(databases.uniprot.ID)
duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
dispEM(['Duplicate entries are found for the following proteins. '...
'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '...
'in the model adapter:'],true,databases.uniprot.ID(duplID));
end
end
end

%% KEGG
Expand Down

0 comments on commit 5af432a

Please sign in to comment.