fix: loadDatabases check for duplicate protIDs (#349)

SysBioChalmers · Aug 19, 2023 · 5af432a · 5af432a
1 parent f29852a
commit 5af432a
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 126 deletions.
diff --git a/doc/src/geckomat/get_enzyme_data/loadDatabases.html b/doc/src/geckomat/get_enzyme_data/loadDatabases.html
@@ -141,133 +141,142 @@ <h2><a name="_source"></a>SOURCE CODE <a href="#_top"><img alt="^" border="0" sr
 0079     <span class="keyword">else</span>
 0080         databases.uniprot = [];
 0081     <span class="keyword">end</span>
-0082 <span class="keyword">end</span>
-0083 
-0084 <span class="comment">%% KEGG</span>
-0085 <span class="keyword">if</span> any(strcmp(selectDatabase,{<span class="string">'kegg'</span>,<span class="string">'both'</span>}))
-0086     keggPath = fullfile(filePath,<span class="string">'kegg.tsv'</span>);
-0087     <span class="keyword">if</span> ~exist(keggPath,<span class="string">'file'</span>)
-0088         <span class="keyword">if</span> isempty(kegg.ID)
-0089             printOrange(<span class="string">'WARNING: No kegg.ID is specified, unable to download KEGG DB.\n'</span>)
-0090         <span class="keyword">else</span>
-0091             <a href="#_sub1" class="code" title="subfunction downloadKEGG(keggID, filePath, keggGeneID)">downloadKEGG</a>(kegg.ID,keggPath,kegg.geneID);
-0092         <span class="keyword">end</span>
-0093     <span class="keyword">end</span>
-0094     <span class="keyword">if</span> exist(keggPath,<span class="string">'file'</span>)
-0095         fid         = fopen(keggPath,<span class="string">'r'</span>);
-0096         fileContent = textscan(fid,<span class="string">'%q %q %q %q %q %q %q'</span>,<span class="string">'Delimiter'</span>,<span class="string">','</span>,<span class="string">'HeaderLines'</span>,0);
-0097         fclose(fid);
-0098         databases.kegg.uniprot    = fileContent{1};
-0099         databases.kegg.genes      = fileContent{2};
-0100         databases.kegg.keggGene   = fileContent{3};
-0101         databases.kegg.eccodes    = fileContent{4};
-0102         databases.kegg.MW         = str2double(fileContent{5});
-0103         databases.kegg.pathway    = fileContent{6};
-0104         databases.kegg.seq        = fileContent{7};
-0105     <span class="keyword">else</span>
-0106         databases.kegg = [];
-0107     <span class="keyword">end</span>
-0108 <span class="keyword">end</span>
-0109 <span class="keyword">end</span>
-0110 
-0111 <a name="_sub1" href="#_subfunctions" class="code">function downloadKEGG(keggID, filePath, keggGeneID)</a>
-0112 <span class="comment">%% Download gene information</span>
-0113 progressbar([<span class="string">'Downloading KEGG data for organism code '</span> keggID])
-0114 webOptions = weboptions(<span class="string">'Timeout'</span>,30);
-0115 <span class="keyword">try</span>
-0116     gene_list = webread([<span class="string">'http://rest.kegg.jp/list/'</span> keggID],webOptions);
-0117 <span class="keyword">catch</span> ME
-0118     <span class="keyword">switch</span> ME.identifier
-0119         <span class="keyword">case</span> <span class="string">'MATLAB:webservices:HTTP400StatusCodeError'</span>
-0120             error([<span class="string">'Unable to download data form KEGG with a potentially invalid ID: '</span> keggID ])
-0121     <span class="keyword">end</span>
-0122 <span class="keyword">end</span>
-0123 gene_list = regexpi(gene_list, <span class="string">'[^\n]+'</span>,<span class="string">'match'</span>)';
-0124 gene_id   = regexpi(gene_list,[<span class="string">'(?&lt;='</span> keggID <span class="string">':)\S+'</span>],<span class="string">'match'</span>);
-0125 
-0126 <span class="comment">% Retrieve information for every gene in the list, 10 genes per query</span>
-0127 genesPerQuery = 10;
-0128 queries = ceil(numel(gene_id)/genesPerQuery);
-0129 keggData  = cell(numel(gene_id),1);
-0130 <span class="keyword">for</span> i = 1:queries
-0131     <span class="comment">% Download batches of genes</span>
-0132     firstIdx = i*genesPerQuery-(genesPerQuery-1);
-0133     lastIdx  = i*genesPerQuery;
-0134     <span class="keyword">if</span> lastIdx &gt; numel(gene_id) <span class="comment">% Last query has probably less genes</span>
-0135         lastIdx = numel(gene_id);
-0136     <span class="keyword">end</span>
-0137     url      = [<span class="string">'http://rest.kegg.jp/get/'</span> keggID <span class="string">':'</span> strjoin([gene_id{firstIdx:lastIdx}],[<span class="string">'+'</span> keggID <span class="string">':'</span>])];
-0138 
-0139     retry = true;
-0140     <span class="keyword">while</span> retry
-0141         <span class="keyword">try</span>
-0142             retry = false;
-0143             out   = webread(url,webOptions);
-0144         <span class="keyword">catch</span>
-0145             retry = true;
-0146         <span class="keyword">end</span>
-0147     <span class="keyword">end</span>
-0148     outSplit = strsplit(out,[<span class="string">'///'</span> 10]); <span class="comment">%10 is new line character</span>
-0149     <span class="keyword">if</span> numel(outSplit) &lt; lastIdx-firstIdx+2
-0150         error(<span class="string">'KEGG returns less genes per query'</span>) <span class="comment">%Reduce genesPerQuery</span>
-0151     <span class="keyword">end</span>
-0152     keggData(firstIdx:lastIdx) = outSplit(1:end-1);
-0153     progressbar(i/queries)
-0154 <span class="keyword">end</span>
-0155 
-0156 <span class="comment">%% Parsing of info to keggDB format</span>
-0157 sequence  = regexprep(keggData,<span class="string">'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*'</span>,<span class="string">'$1'</span>);
-0158 <span class="comment">%No AASEQ -&gt; no protein -&gt; not of interest</span>
-0159 noProt    = startsWith(sequence,<span class="string">'ENTRY '</span>);
-0160 uni       = regexprep(keggData,<span class="string">'.*UniProt: (\S+?)\s.*'</span>,<span class="string">'$1'</span>);
-0161 noUni     = startsWith(uni,<span class="string">'ENTRY '</span>);
-0162 uni(noProt | noUni)       = [];
-0163 keggData(noProt | noUni) = [];
-0164 sequence(noProt | noUni)  = [];
-0165 sequence  = regexprep(sequence,<span class="string">'\s*'</span>,<span class="string">''</span>);
-0166 keggGene  = regexprep(keggData,<span class="string">'ENTRY\s+(\S+?)\s.+'</span>,<span class="string">'$1'</span>);
-0167 
-0168 <span class="keyword">switch</span> keggGeneID
-0169     <span class="keyword">case</span> {<span class="string">'kegg'</span>,<span class="string">''</span>}
-0170         gene_name = keggGene;
-0171     <span class="keyword">otherwise</span>
-0172         <span class="comment">% In case there are special characters:</span>
-0173         keggGeneIDT = regexptranslate(<span class="string">'escape'</span>,keggGeneID);
-0174         gene_name = regexprep(keggData,[<span class="string">'.+'</span> keggGeneIDT <span class="string">': (\S+?)\n.+'</span>],<span class="string">'$1'</span>);
-0175         noID = ~contains(keggData,keggGeneIDT);
-0176         <span class="keyword">if</span> all(noID)
-0177             error([<span class="string">'None of the KEGG entries are annotated with the gene identifier '</span> keggGeneID])
-0178         <span class="keyword">else</span>
-0179             gene_name(noID)= [];
-0180             keggData(noID) = [];
-0181             keggGene(noID) = [];
-0182             sequence(noID) = [];
-0183             uni(noID)      = [];
-0184         <span class="keyword">end</span>
-0185 <span class="keyword">end</span>
-0186 
-0187 EC_names  = regexprep(keggData,<span class="string">'.*ORTHOLOGY.*\[EC:(.*?)\].*'</span>,<span class="string">'$1'</span>);
-0188 EC_names(startsWith(EC_names,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
-0189 
-0190 MW = cell(numel(sequence),1);
-0191 <span class="keyword">for</span> i=1:numel(sequence)
-0192     <span class="keyword">if</span> ~isempty(sequence{i})
-0193         MW{i} = num2str(round(<a href="calculateMW.html" class="code" title="function MW = calculateMW(sequence)">calculateMW</a>(sequence{i})));
-0194     <span class="keyword">end</span>
-0195 <span class="keyword">end</span>
-0196 
-0197 pathway   = regexprep(keggData,<span class="string">'.*PATHWAY\s+(.*?)(BRITE|MODULE).*'</span>,<span class="string">'$1'</span>);
-0198 pathway(startsWith(pathway,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
-0199 pathway   = strrep(pathway,[keggID <span class="string">'01100  Metabolic pathways'</span>],<span class="string">''</span>);
-0200 pathway   = regexprep(pathway,<span class="string">'\n'</span>,<span class="string">''</span>);
-0201 pathway   = regexprep(pathway,<span class="string">'           '</span>,<span class="string">''</span>);
-0202 
-0203 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
-0204 out = cell2table(out);
+0082     <span class="keyword">if</span> ~isempty(databases.uniprot)
+0083         [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,<span class="string">'stable'</span>);
+0084         <span class="keyword">if</span> numel(uniqueIDs) &lt; numel(databases.uniprot.ID)
+0085             duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
+0086             dispEM([<span class="string">'Duplicate entries are found for the following proteins. '</span><span class="keyword">...</span>
+0087                     <span class="string">'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '</span><span class="keyword">...</span>
+0088                     <span class="string">'in the model adapter:'</span>],true,databases.uniprot.ID(duplID));
+0089         <span class="keyword">end</span>
+0090     <span class="keyword">end</span>
+0091 <span class="keyword">end</span>
+0092 
+0093 <span class="comment">%% KEGG</span>
+0094 <span class="keyword">if</span> any(strcmp(selectDatabase,{<span class="string">'kegg'</span>,<span class="string">'both'</span>}))
+0095     keggPath = fullfile(filePath,<span class="string">'kegg.tsv'</span>);
+0096     <span class="keyword">if</span> ~exist(keggPath,<span class="string">'file'</span>)
+0097         <span class="keyword">if</span> isempty(kegg.ID)
+0098             printOrange(<span class="string">'WARNING: No kegg.ID is specified, unable to download KEGG DB.\n'</span>)
+0099         <span class="keyword">else</span>
+0100             <a href="#_sub1" class="code" title="subfunction downloadKEGG(keggID, filePath, keggGeneID)">downloadKEGG</a>(kegg.ID,keggPath,kegg.geneID);
+0101         <span class="keyword">end</span>
+0102     <span class="keyword">end</span>
+0103     <span class="keyword">if</span> exist(keggPath,<span class="string">'file'</span>)
+0104         fid         = fopen(keggPath,<span class="string">'r'</span>);
+0105         fileContent = textscan(fid,<span class="string">'%q %q %q %q %q %q %q'</span>,<span class="string">'Delimiter'</span>,<span class="string">','</span>,<span class="string">'HeaderLines'</span>,0);
+0106         fclose(fid);
+0107         databases.kegg.uniprot    = fileContent{1};
+0108         databases.kegg.genes      = fileContent{2};
+0109         databases.kegg.keggGene   = fileContent{3};
+0110         databases.kegg.eccodes    = fileContent{4};
+0111         databases.kegg.MW         = str2double(fileContent{5});
+0112         databases.kegg.pathway    = fileContent{6};
+0113         databases.kegg.seq        = fileContent{7};
+0114     <span class="keyword">else</span>
+0115         databases.kegg = [];
+0116     <span class="keyword">end</span>
+0117 <span class="keyword">end</span>
+0118 <span class="keyword">end</span>
+0119 
+0120 <a name="_sub1" href="#_subfunctions" class="code">function downloadKEGG(keggID, filePath, keggGeneID)</a>
+0121 <span class="comment">%% Download gene information</span>
+0122 progressbar([<span class="string">'Downloading KEGG data for organism code '</span> keggID])
+0123 webOptions = weboptions(<span class="string">'Timeout'</span>,30);
+0124 <span class="keyword">try</span>
+0125     gene_list = webread([<span class="string">'http://rest.kegg.jp/list/'</span> keggID],webOptions);
+0126 <span class="keyword">catch</span> ME
+0127     <span class="keyword">switch</span> ME.identifier
+0128         <span class="keyword">case</span> <span class="string">'MATLAB:webservices:HTTP400StatusCodeError'</span>
+0129             error([<span class="string">'Unable to download data form KEGG with a potentially invalid ID: '</span> keggID ])
+0130     <span class="keyword">end</span>
+0131 <span class="keyword">end</span>
+0132 gene_list = regexpi(gene_list, <span class="string">'[^\n]+'</span>,<span class="string">'match'</span>)';
+0133 gene_id   = regexpi(gene_list,[<span class="string">'(?&lt;='</span> keggID <span class="string">':)\S+'</span>],<span class="string">'match'</span>);
+0134 
+0135 <span class="comment">% Retrieve information for every gene in the list, 10 genes per query</span>
+0136 genesPerQuery = 10;
+0137 queries = ceil(numel(gene_id)/genesPerQuery);
+0138 keggData  = cell(numel(gene_id),1);
+0139 <span class="keyword">for</span> i = 1:queries
+0140     <span class="comment">% Download batches of genes</span>
+0141     firstIdx = i*genesPerQuery-(genesPerQuery-1);
+0142     lastIdx  = i*genesPerQuery;
+0143     <span class="keyword">if</span> lastIdx &gt; numel(gene_id) <span class="comment">% Last query has probably less genes</span>
+0144         lastIdx = numel(gene_id);
+0145     <span class="keyword">end</span>
+0146     url      = [<span class="string">'http://rest.kegg.jp/get/'</span> keggID <span class="string">':'</span> strjoin([gene_id{firstIdx:lastIdx}],[<span class="string">'+'</span> keggID <span class="string">':'</span>])];
+0147 
+0148     retry = true;
+0149     <span class="keyword">while</span> retry
+0150         <span class="keyword">try</span>
+0151             retry = false;
+0152             out   = webread(url,webOptions);
+0153         <span class="keyword">catch</span>
+0154             retry = true;
+0155         <span class="keyword">end</span>
+0156     <span class="keyword">end</span>
+0157     outSplit = strsplit(out,[<span class="string">'///'</span> 10]); <span class="comment">%10 is new line character</span>
+0158     <span class="keyword">if</span> numel(outSplit) &lt; lastIdx-firstIdx+2
+0159         error(<span class="string">'KEGG returns less genes per query'</span>) <span class="comment">%Reduce genesPerQuery</span>
+0160     <span class="keyword">end</span>
+0161     keggData(firstIdx:lastIdx) = outSplit(1:end-1);
+0162     progressbar(i/queries)
+0163 <span class="keyword">end</span>
+0164 
+0165 <span class="comment">%% Parsing of info to keggDB format</span>
+0166 sequence  = regexprep(keggData,<span class="string">'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*'</span>,<span class="string">'$1'</span>);
+0167 <span class="comment">%No AASEQ -&gt; no protein -&gt; not of interest</span>
+0168 noProt    = startsWith(sequence,<span class="string">'ENTRY '</span>);
+0169 uni       = regexprep(keggData,<span class="string">'.*UniProt: (\S+?)\s.*'</span>,<span class="string">'$1'</span>);
+0170 noUni     = startsWith(uni,<span class="string">'ENTRY '</span>);
+0171 uni(noProt | noUni)       = [];
+0172 keggData(noProt | noUni) = [];
+0173 sequence(noProt | noUni)  = [];
+0174 sequence  = regexprep(sequence,<span class="string">'\s*'</span>,<span class="string">''</span>);
+0175 keggGene  = regexprep(keggData,<span class="string">'ENTRY\s+(\S+?)\s.+'</span>,<span class="string">'$1'</span>);
+0176 
+0177 <span class="keyword">switch</span> keggGeneID
+0178     <span class="keyword">case</span> {<span class="string">'kegg'</span>,<span class="string">''</span>}
+0179         gene_name = keggGene;
+0180     <span class="keyword">otherwise</span>
+0181         <span class="comment">% In case there are special characters:</span>
+0182         keggGeneIDT = regexptranslate(<span class="string">'escape'</span>,keggGeneID);
+0183         gene_name = regexprep(keggData,[<span class="string">'.+'</span> keggGeneIDT <span class="string">': (\S+?)\n.+'</span>],<span class="string">'$1'</span>);
+0184         noID = ~contains(keggData,keggGeneIDT);
+0185         <span class="keyword">if</span> all(noID)
+0186             error([<span class="string">'None of the KEGG entries are annotated with the gene identifier '</span> keggGeneID])
+0187         <span class="keyword">else</span>
+0188             gene_name(noID)= [];
+0189             keggData(noID) = [];
+0190             keggGene(noID) = [];
+0191             sequence(noID) = [];
+0192             uni(noID)      = [];
+0193         <span class="keyword">end</span>
+0194 <span class="keyword">end</span>
+0195 
+0196 EC_names  = regexprep(keggData,<span class="string">'.*ORTHOLOGY.*\[EC:(.*?)\].*'</span>,<span class="string">'$1'</span>);
+0197 EC_names(startsWith(EC_names,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
+0198 
+0199 MW = cell(numel(sequence),1);
+0200 <span class="keyword">for</span> i=1:numel(sequence)
+0201     <span class="keyword">if</span> ~isempty(sequence{i})
+0202         MW{i} = num2str(round(<a href="calculateMW.html" class="code" title="function MW = calculateMW(sequence)">calculateMW</a>(sequence{i})));
+0203     <span class="keyword">end</span>
+0204 <span class="keyword">end</span>
 0205 
-0206 writetable(out, filePath, <span class="string">'FileType'</span>, <span class="string">'text'</span>, <span class="string">'WriteVariableNames'</span>,false);
-0207 fprintf(<span class="string">'Model-specific KEGG database stored at %s\n'</span>,filePath);
-0208 <span class="keyword">end</span></pre></div>
+0206 pathway   = regexprep(keggData,<span class="string">'.*PATHWAY\s+(.*?)(BRITE|MODULE).*'</span>,<span class="string">'$1'</span>);
+0207 pathway(startsWith(pathway,<span class="string">'ENTRY '</span>)) = {<span class="string">''</span>};
+0208 pathway   = strrep(pathway,[keggID <span class="string">'01100  Metabolic pathways'</span>],<span class="string">''</span>);
+0209 pathway   = regexprep(pathway,<span class="string">'\n'</span>,<span class="string">''</span>);
+0210 pathway   = regexprep(pathway,<span class="string">'           '</span>,<span class="string">''</span>);
+0211 
+0212 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence];
+0213 out = cell2table(out);
+0214 
+0215 writetable(out, filePath, <span class="string">'FileType'</span>, <span class="string">'text'</span>, <span class="string">'WriteVariableNames'</span>,false);
+0216 fprintf(<span class="string">'Model-specific KEGG database stored at %s\n'</span>,filePath);
+0217 <span class="keyword">end</span></pre></div>
 <hr><address>Generated by <strong><a href="http://www.artefact.tk/software/matlab/m2html/" title="Matlab Documentation in HTML">m2html</a></strong> &copy; 2005</address>
 </body>
 </html>
diff --git a/src/geckomat/get_enzyme_data/loadDatabases.m b/src/geckomat/get_enzyme_data/loadDatabases.m
@@ -79,6 +79,15 @@
     else
         databases.uniprot = [];
     end
+    if ~isempty(databases.uniprot)
+        [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable');
+        if numel(uniqueIDs) < numel(databases.uniprot.ID)
+            duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx);
+            dispEM(['Duplicate entries are found for the following proteins. '...
+                    'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '...
+                    'in the model adapter:'],true,databases.uniprot.ID(duplID));
+        end
+    end
 end
 
 %% KEGG