diff --git a/doc/src/geckomat/get_enzyme_data/loadDatabases.html b/doc/src/geckomat/get_enzyme_data/loadDatabases.html index 69c4a25f4..80e12d51a 100644 --- a/doc/src/geckomat/get_enzyme_data/loadDatabases.html +++ b/doc/src/geckomat/get_enzyme_data/loadDatabases.html @@ -141,133 +141,142 @@

SOURCE CODE ^else 0080 databases.uniprot = []; 0081 end -0082 end -0083 -0084 %% KEGG -0085 if any(strcmp(selectDatabase,{'kegg','both'})) -0086 keggPath = fullfile(filePath,'kegg.tsv'); -0087 if ~exist(keggPath,'file') -0088 if isempty(kegg.ID) -0089 printOrange('WARNING: No kegg.ID is specified, unable to download KEGG DB.\n') -0090 else -0091 downloadKEGG(kegg.ID,keggPath,kegg.geneID); -0092 end -0093 end -0094 if exist(keggPath,'file') -0095 fid = fopen(keggPath,'r'); -0096 fileContent = textscan(fid,'%q %q %q %q %q %q %q','Delimiter',',','HeaderLines',0); -0097 fclose(fid); -0098 databases.kegg.uniprot = fileContent{1}; -0099 databases.kegg.genes = fileContent{2}; -0100 databases.kegg.keggGene = fileContent{3}; -0101 databases.kegg.eccodes = fileContent{4}; -0102 databases.kegg.MW = str2double(fileContent{5}); -0103 databases.kegg.pathway = fileContent{6}; -0104 databases.kegg.seq = fileContent{7}; -0105 else -0106 databases.kegg = []; -0107 end -0108 end -0109 end -0110 -0111 function downloadKEGG(keggID, filePath, keggGeneID) -0112 %% Download gene information -0113 progressbar(['Downloading KEGG data for organism code ' keggID]) -0114 webOptions = weboptions('Timeout',30); -0115 try -0116 gene_list = webread(['http://rest.kegg.jp/list/' keggID],webOptions); -0117 catch ME -0118 switch ME.identifier -0119 case 'MATLAB:webservices:HTTP400StatusCodeError' -0120 error(['Unable to download data form KEGG with a potentially invalid ID: ' keggID ]) -0121 end -0122 end -0123 gene_list = regexpi(gene_list, '[^\n]+','match')'; -0124 gene_id = regexpi(gene_list,['(?<=' keggID ':)\S+'],'match'); -0125 -0126 % Retrieve information for every gene in the list, 10 genes per query -0127 genesPerQuery = 10; -0128 queries = ceil(numel(gene_id)/genesPerQuery); -0129 keggData = cell(numel(gene_id),1); -0130 for i = 1:queries -0131 % Download batches of genes -0132 firstIdx = i*genesPerQuery-(genesPerQuery-1); -0133 lastIdx = i*genesPerQuery; -0134 if lastIdx > numel(gene_id) % Last query has probably less genes -0135 lastIdx = numel(gene_id); -0136 end -0137 url = ['http://rest.kegg.jp/get/' keggID ':' strjoin([gene_id{firstIdx:lastIdx}],['+' keggID ':'])]; -0138 -0139 retry = true; -0140 while retry -0141 try -0142 retry = false; -0143 out = webread(url,webOptions); -0144 catch -0145 retry = true; -0146 end -0147 end -0148 outSplit = strsplit(out,['///' 10]); %10 is new line character -0149 if numel(outSplit) < lastIdx-firstIdx+2 -0150 error('KEGG returns less genes per query') %Reduce genesPerQuery -0151 end -0152 keggData(firstIdx:lastIdx) = outSplit(1:end-1); -0153 progressbar(i/queries) -0154 end -0155 -0156 %% Parsing of info to keggDB format -0157 sequence = regexprep(keggData,'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*','$1'); -0158 %No AASEQ -> no protein -> not of interest -0159 noProt = startsWith(sequence,'ENTRY '); -0160 uni = regexprep(keggData,'.*UniProt: (\S+?)\s.*','$1'); -0161 noUni = startsWith(uni,'ENTRY '); -0162 uni(noProt | noUni) = []; -0163 keggData(noProt | noUni) = []; -0164 sequence(noProt | noUni) = []; -0165 sequence = regexprep(sequence,'\s*',''); -0166 keggGene = regexprep(keggData,'ENTRY\s+(\S+?)\s.+','$1'); -0167 -0168 switch keggGeneID -0169 case {'kegg',''} -0170 gene_name = keggGene; -0171 otherwise -0172 % In case there are special characters: -0173 keggGeneIDT = regexptranslate('escape',keggGeneID); -0174 gene_name = regexprep(keggData,['.+' keggGeneIDT ': (\S+?)\n.+'],'$1'); -0175 noID = ~contains(keggData,keggGeneIDT); -0176 if all(noID) -0177 error(['None of the KEGG entries are annotated with the gene identifier ' keggGeneID]) -0178 else -0179 gene_name(noID)= []; -0180 keggData(noID) = []; -0181 keggGene(noID) = []; -0182 sequence(noID) = []; -0183 uni(noID) = []; -0184 end -0185 end -0186 -0187 EC_names = regexprep(keggData,'.*ORTHOLOGY.*\[EC:(.*?)\].*','$1'); -0188 EC_names(startsWith(EC_names,'ENTRY ')) = {''}; -0189 -0190 MW = cell(numel(sequence),1); -0191 for i=1:numel(sequence) -0192 if ~isempty(sequence{i}) -0193 MW{i} = num2str(round(calculateMW(sequence{i}))); -0194 end -0195 end -0196 -0197 pathway = regexprep(keggData,'.*PATHWAY\s+(.*?)(BRITE|MODULE).*','$1'); -0198 pathway(startsWith(pathway,'ENTRY ')) = {''}; -0199 pathway = strrep(pathway,[keggID '01100 Metabolic pathways'],''); -0200 pathway = regexprep(pathway,'\n',''); -0201 pathway = regexprep(pathway,' ',''); -0202 -0203 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence]; -0204 out = cell2table(out); +0082 if ~isempty(databases.uniprot) +0083 [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable'); +0084 if numel(uniqueIDs) < numel(databases.uniprot.ID) +0085 duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx); +0086 dispEM(['Duplicate entries are found for the following proteins. '... +0087 'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '... +0088 'in the model adapter:'],true,databases.uniprot.ID(duplID)); +0089 end +0090 end +0091 end +0092 +0093 %% KEGG +0094 if any(strcmp(selectDatabase,{'kegg','both'})) +0095 keggPath = fullfile(filePath,'kegg.tsv'); +0096 if ~exist(keggPath,'file') +0097 if isempty(kegg.ID) +0098 printOrange('WARNING: No kegg.ID is specified, unable to download KEGG DB.\n') +0099 else +0100 downloadKEGG(kegg.ID,keggPath,kegg.geneID); +0101 end +0102 end +0103 if exist(keggPath,'file') +0104 fid = fopen(keggPath,'r'); +0105 fileContent = textscan(fid,'%q %q %q %q %q %q %q','Delimiter',',','HeaderLines',0); +0106 fclose(fid); +0107 databases.kegg.uniprot = fileContent{1}; +0108 databases.kegg.genes = fileContent{2}; +0109 databases.kegg.keggGene = fileContent{3}; +0110 databases.kegg.eccodes = fileContent{4}; +0111 databases.kegg.MW = str2double(fileContent{5}); +0112 databases.kegg.pathway = fileContent{6}; +0113 databases.kegg.seq = fileContent{7}; +0114 else +0115 databases.kegg = []; +0116 end +0117 end +0118 end +0119 +0120 function downloadKEGG(keggID, filePath, keggGeneID) +0121 %% Download gene information +0122 progressbar(['Downloading KEGG data for organism code ' keggID]) +0123 webOptions = weboptions('Timeout',30); +0124 try +0125 gene_list = webread(['http://rest.kegg.jp/list/' keggID],webOptions); +0126 catch ME +0127 switch ME.identifier +0128 case 'MATLAB:webservices:HTTP400StatusCodeError' +0129 error(['Unable to download data form KEGG with a potentially invalid ID: ' keggID ]) +0130 end +0131 end +0132 gene_list = regexpi(gene_list, '[^\n]+','match')'; +0133 gene_id = regexpi(gene_list,['(?<=' keggID ':)\S+'],'match'); +0134 +0135 % Retrieve information for every gene in the list, 10 genes per query +0136 genesPerQuery = 10; +0137 queries = ceil(numel(gene_id)/genesPerQuery); +0138 keggData = cell(numel(gene_id),1); +0139 for i = 1:queries +0140 % Download batches of genes +0141 firstIdx = i*genesPerQuery-(genesPerQuery-1); +0142 lastIdx = i*genesPerQuery; +0143 if lastIdx > numel(gene_id) % Last query has probably less genes +0144 lastIdx = numel(gene_id); +0145 end +0146 url = ['http://rest.kegg.jp/get/' keggID ':' strjoin([gene_id{firstIdx:lastIdx}],['+' keggID ':'])]; +0147 +0148 retry = true; +0149 while retry +0150 try +0151 retry = false; +0152 out = webread(url,webOptions); +0153 catch +0154 retry = true; +0155 end +0156 end +0157 outSplit = strsplit(out,['///' 10]); %10 is new line character +0158 if numel(outSplit) < lastIdx-firstIdx+2 +0159 error('KEGG returns less genes per query') %Reduce genesPerQuery +0160 end +0161 keggData(firstIdx:lastIdx) = outSplit(1:end-1); +0162 progressbar(i/queries) +0163 end +0164 +0165 %% Parsing of info to keggDB format +0166 sequence = regexprep(keggData,'.*AASEQ\s+\d+\s+([A-Z\s])+?\s+NTSEQ.*','$1'); +0167 %No AASEQ -> no protein -> not of interest +0168 noProt = startsWith(sequence,'ENTRY '); +0169 uni = regexprep(keggData,'.*UniProt: (\S+?)\s.*','$1'); +0170 noUni = startsWith(uni,'ENTRY '); +0171 uni(noProt | noUni) = []; +0172 keggData(noProt | noUni) = []; +0173 sequence(noProt | noUni) = []; +0174 sequence = regexprep(sequence,'\s*',''); +0175 keggGene = regexprep(keggData,'ENTRY\s+(\S+?)\s.+','$1'); +0176 +0177 switch keggGeneID +0178 case {'kegg',''} +0179 gene_name = keggGene; +0180 otherwise +0181 % In case there are special characters: +0182 keggGeneIDT = regexptranslate('escape',keggGeneID); +0183 gene_name = regexprep(keggData,['.+' keggGeneIDT ': (\S+?)\n.+'],'$1'); +0184 noID = ~contains(keggData,keggGeneIDT); +0185 if all(noID) +0186 error(['None of the KEGG entries are annotated with the gene identifier ' keggGeneID]) +0187 else +0188 gene_name(noID)= []; +0189 keggData(noID) = []; +0190 keggGene(noID) = []; +0191 sequence(noID) = []; +0192 uni(noID) = []; +0193 end +0194 end +0195 +0196 EC_names = regexprep(keggData,'.*ORTHOLOGY.*\[EC:(.*?)\].*','$1'); +0197 EC_names(startsWith(EC_names,'ENTRY ')) = {''}; +0198 +0199 MW = cell(numel(sequence),1); +0200 for i=1:numel(sequence) +0201 if ~isempty(sequence{i}) +0202 MW{i} = num2str(round(calculateMW(sequence{i}))); +0203 end +0204 end 0205 -0206 writetable(out, filePath, 'FileType', 'text', 'WriteVariableNames',false); -0207 fprintf('Model-specific KEGG database stored at %s\n',filePath); -0208 end +0206 pathway = regexprep(keggData,'.*PATHWAY\s+(.*?)(BRITE|MODULE).*','$1'); +0207 pathway(startsWith(pathway,'ENTRY ')) = {''}; +0208 pathway = strrep(pathway,[keggID '01100 Metabolic pathways'],''); +0209 pathway = regexprep(pathway,'\n',''); +0210 pathway = regexprep(pathway,' ',''); +0211 +0212 out = [uni, gene_name, keggGene, EC_names, MW, pathway, sequence]; +0213 out = cell2table(out); +0214 +0215 writetable(out, filePath, 'FileType', 'text', 'WriteVariableNames',false); +0216 fprintf('Model-specific KEGG database stored at %s\n',filePath); +0217 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/src/geckomat/get_enzyme_data/loadDatabases.m b/src/geckomat/get_enzyme_data/loadDatabases.m index c38e852bb..d882a2447 100644 --- a/src/geckomat/get_enzyme_data/loadDatabases.m +++ b/src/geckomat/get_enzyme_data/loadDatabases.m @@ -79,6 +79,15 @@ else databases.uniprot = []; end + if ~isempty(databases.uniprot) + [uniqueIDs,uniqueIdx] = unique(databases.uniprot.ID,'stable'); + if numel(uniqueIDs) < numel(databases.uniprot.ID) + duplID = setdiff(1:numel(databases.uniprot.ID),uniqueIdx); + dispEM(['Duplicate entries are found for the following proteins. '... + 'Manually curate the ''uniprot.tsv'' file, or adjust the uniprot parameters '... + 'in the model adapter:'],true,databases.uniprot.ID(duplID)); + end + end end %% KEGG