% Bibliography records, one field per line.
%
% Several works appear twice under different citation keys (two separate
% imports of the same paper). Both keys are kept so that existing citations
% keep resolving; the paired records carry identical metadata:
%   38352 / 49765  InterPro in 2011
%   49779 / 38506  SIMBAL
%   49781 / 38353  InterPro: the integrative protein signature database
% Cite only one key of each pair in a given document, otherwise the same
% reference is listed twice.
%
% Conventions used below: DOIs live in the doi field (bare, no resolver
% prefix), PubMed links in the url field, month uses the unquoted
% three-letter macro, page ranges use a double hyphen, and words in titles
% that must keep their capitalisation are brace-protected.

@article{38352,
  author   = {Hunter, Sarah and Jones, Philip and Mitchell, Alex and Apweiler, Rolf and Attwood, Teresa K. and Bateman, Alex and Bernard, Thomas and Binns, David and Bork, Peer and Burge, Sarah and de Castro, Edouard and Coggill, Penny and Corbett, Matthew and Das, Ujjwal and Daugherty, Louise and Duquenne, Lauranne and Finn, Robert D. and Fraser, Matthew and Gough, Julian and Haft, Daniel and Hulo, Nicolas and Kahn, Daniel and Kelly, Elizabeth and Letunic, Ivica and Lonsdale, David and Lopez, Rodrigo and Madera, Martin and Maslen, John and McAnulla, Craig and McDowall, Jennifer and McMenamin, Conor and Mi, Huaiyu and Mutowo-Muellenet, Prudence and Mulder, Nicola and Natale, Darren and Orengo, Christine and Pesseat, Sebastien and Punta, Marco and Quinn, Antony F. and Rivoire, Catherine and Sangrador-Vegas, Amaia and Selengut, Jeremy D. and Sigrist, Christian J. A. and Scheremetjew, Maxim and Tate, John and Thimmajanarthanan, Manjulapramila and Thomas, Paul D. and Wu, Cathy H. and Yeats, Corin and Yong, Siew-Yit},
  title    = {{InterPro} in 2011: new developments in the family and domain prediction database},
  journal  = {Nucleic Acids Research},
  volume   = {40},
  pages    = {D306--D312},
  year     = {2012},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkr948},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/22096229?dopt=Abstract},
  keywords = {Databases, Protein, Protein Structure, Tertiary, Proteins, Sequence Analysis, Protein, software, Terminology as Topic, User-Computer Interface},
  abstract = {InterPro (http://www.ebi.ac.uk/interpro/) is a database that integrates diverse information about protein families, domains and functional sites, and makes it freely available to the public via Web-based interfaces and services. Central to the database are diagnostic models, known as signatures, against which protein sequences can be searched to determine their potential function. InterPro has utility in the large-scale analysis of whole genomes and meta-genomes, as well as in characterizing individual protein sequences. Herein we give an overview of new developments in the database and its associated software since 2009, including updates to database content, curation processes and Web and programmatic interfaces.},
}

@article{49765,
  author   = {Hunter, Sarah and Jones, Philip and Mitchell, Alex and Apweiler, Rolf and Attwood, Teresa K. and Bateman, Alex and Bernard, Thomas and Binns, David and Bork, Peer and Burge, Sarah and de Castro, Edouard and Coggill, Penny and Corbett, Matthew and Das, Ujjwal and Daugherty, Louise and Duquenne, Lauranne and Finn, Robert D. and Fraser, Matthew and Gough, Julian and Haft, Daniel and Hulo, Nicolas and Kahn, Daniel and Kelly, Elizabeth and Letunic, Ivica and Lonsdale, David and Lopez, Rodrigo and Madera, Martin and Maslen, John and McAnulla, Craig and McDowall, Jennifer and McMenamin, Conor and Mi, Huaiyu and Mutowo-Muellenet, Prudence and Mulder, Nicola and Natale, Darren and Orengo, Christine and Pesseat, Sebastien and Punta, Marco and Quinn, Antony F. and Rivoire, Catherine and Sangrador-Vegas, Amaia and Selengut, Jeremy D. and Sigrist, Christian J. A. and Scheremetjew, Maxim and Tate, John and Thimmajanarthanan, Manjulapramila and Thomas, Paul D. and Wu, Cathy H. and Yeats, Corin and Yong, Siew-Yit},
  title    = {{InterPro} in 2011: new developments in the family and domain prediction database},
  journal  = {Nucleic Acids Research},
  volume   = {40},
  pages    = {D306--D312},
  year     = {2012},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkr948},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/22096229?dopt=Abstract},
  keywords = {Databases, Protein, Protein Structure, Tertiary, Proteins, Sequence Analysis, Protein, software, Terminology as Topic, User-Computer Interface},
  abstract = {InterPro (http://www.ebi.ac.uk/interpro/) is a database that integrates diverse information about protein families, domains and functional sites, and makes it freely available to the public via Web-based interfaces and services. Central to the database are diagnostic models, known as signatures, against which protein sequences can be searched to determine their potential function. InterPro has utility in the large-scale analysis of whole genomes and meta-genomes, as well as in characterizing individual protein sequences. Herein we give an overview of new developments in the database and its associated software since 2009, including updates to database content, curation processes and Web and programmatic interfaces.},
}

@article{49779,
  author   = {Selengut, Jeremy D. and Rusch, Douglas B. and Haft, Daniel H.},
  title    = {{Sites Inferred by Metabolic Background Assertion Labeling} ({SIMBAL}): adapting the {Partial Phylogenetic Profiling} algorithm to scan sequences for signatures that predict protein function},
  journal  = {BMC Bioinformatics},
  volume   = {11},
  pages    = {52},
  year     = {2010},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-11-52},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/20102603?dopt=Abstract},
  keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship},
  abstract = {BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets. RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL ``hot spots'' onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. Neither method for training set construction requires any prior experimental characterization. CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.},
}

@article{38506,
  author   = {Selengut, Jeremy D. and Rusch, Douglas B. and Haft, Daniel H.},
  title    = {{Sites Inferred by Metabolic Background Assertion Labeling} ({SIMBAL}): adapting the {Partial Phylogenetic Profiling} algorithm to scan sequences for signatures that predict protein function},
  journal  = {BMC Bioinformatics},
  volume   = {11},
  pages    = {52},
  year     = {2010},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-11-52},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/20102603?dopt=Abstract},
  keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship},
  abstract = {BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets. RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL ``hot spots'' onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. Neither method for training set construction requires any prior experimental characterization. CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.},
}

@article{49781,
  author   = {Hunter, Sarah and Apweiler, Rolf and Attwood, Teresa K. and Bairoch, Amos and Bateman, Alex and Binns, David and Bork, Peer and Das, Ujjwal and Daugherty, Louise and Duquenne, Lauranne and Finn, Robert D. and Gough, Julian and Haft, Daniel and Hulo, Nicolas and Kahn, Daniel and Kelly, Elizabeth and Laugraud, Aur{\'e}lie and Letunic, Ivica and Lonsdale, David and Lopez, Rodrigo and Madera, Martin and Maslen, John and McAnulla, Craig and McDowall, Jennifer and Mistry, Jaina and Mitchell, Alex and Mulder, Nicola and Natale, Darren and Orengo, Christine and Quinn, Antony F. and Selengut, Jeremy D. and Sigrist, Christian J. A. and Thimma, Manjula and Thomas, Paul D. and Valentin, Franck and Wilson, Derek and Wu, Cathy H. and Yeats, Corin},
  title    = {{InterPro}: the integrative protein signature database},
  journal  = {Nucleic Acids Research},
  volume   = {37},
  pages    = {D211--D215},
  year     = {2009},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkn785},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/18940856?dopt=Abstract},
  keywords = {Databases, Protein, Proteins, Sequence Analysis, Protein, Systems Integration},
  abstract = {The InterPro database (http://www.ebi.ac.uk/interpro/) integrates together predictive models or {\textquoteleft}signatures{\textquoteright} representing protein domains, families and functional sites from multiple, diverse source databases: Gene3D, PANTHER, Pfam, PIRSF, PRINTS, ProDom, PROSITE, SMART, SUPERFAMILY and TIGRFAMs. Integration is performed manually and approximately half of the total approximately 58,000 signatures available in the source databases belong to an InterPro entry. Recently, we have started to also display the remaining un-integrated signatures via our web interface. Other developments include the provision of non-signature data, such as structural data, in new XML files on our FTP site, as well as the inclusion of matchless UniProtKB proteins in the existing match XML files. The web interface has been extended and now links out to the ADAN predicted protein-protein interaction database and the SPICE and Dasty viewers. The latest public release (v18.0) covers 79.8\% of UniProtKB (v14.1) and consists of 16 549 entries. InterPro data may be accessed either via the web address above, via web services, by downloading files by anonymous FTP or by using the InterProScan search software (http://www.ebi.ac.uk/Tools/InterProScan/).},
}

@article{38353,
  author   = {Hunter, Sarah and Apweiler, Rolf and Attwood, Teresa K. and Bairoch, Amos and Bateman, Alex and Binns, David and Bork, Peer and Das, Ujjwal and Daugherty, Louise and Duquenne, Lauranne and Finn, Robert D. and Gough, Julian and Haft, Daniel and Hulo, Nicolas and Kahn, Daniel and Kelly, Elizabeth and Laugraud, Aur{\'e}lie and Letunic, Ivica and Lonsdale, David and Lopez, Rodrigo and Madera, Martin and Maslen, John and McAnulla, Craig and McDowall, Jennifer and Mistry, Jaina and Mitchell, Alex and Mulder, Nicola and Natale, Darren and Orengo, Christine and Quinn, Antony F. and Selengut, Jeremy D. and Sigrist, Christian J. A. and Thimma, Manjula and Thomas, Paul D. and Valentin, Franck and Wilson, Derek and Wu, Cathy H. and Yeats, Corin},
  title    = {{InterPro}: the integrative protein signature database},
  journal  = {Nucleic Acids Research},
  volume   = {37},
  pages    = {D211--D215},
  year     = {2009},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkn785},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/18940856?dopt=Abstract},
  keywords = {Databases, Protein, Proteins, Sequence Analysis, Protein, Systems Integration},
  abstract = {The InterPro database (http://www.ebi.ac.uk/interpro/) integrates together predictive models or {\textquoteleft}signatures{\textquoteright} representing protein domains, families and functional sites from multiple, diverse source databases: Gene3D, PANTHER, Pfam, PIRSF, PRINTS, ProDom, PROSITE, SMART, SUPERFAMILY and TIGRFAMs. Integration is performed manually and approximately half of the total approximately 58,000 signatures available in the source databases belong to an InterPro entry. Recently, we have started to also display the remaining un-integrated signatures via our web interface. Other developments include the provision of non-signature data, such as structural data, in new XML files on our FTP site, as well as the inclusion of matchless UniProtKB proteins in the existing match XML files. The web interface has been extended and now links out to the ADAN predicted protein-protein interaction database and the SPICE and Dasty viewers. The latest public release (v18.0) covers 79.8\% of UniProtKB (v14.1) and consists of 16 549 entries. InterPro data may be accessed either via the web address above, via web services, by downloading files by anonymous FTP or by using the InterProScan search software (http://www.ebi.ac.uk/Tools/InterProScan/).},
}

@article{49783,
  author   = {Mulder, Nicola J. and Apweiler, Rolf and Attwood, Teresa K. and Bairoch, Amos and Bateman, Alex and Binns, David and Bork, Peer and Buillard, Virginie and Cerutti, Lorenzo and Copley, Richard and Courcelle, Emmanuel and Das, Ujjwal and Daugherty, Louise and Dibley, Mark and Finn, Robert and Fleischmann, Wolfgang and Gough, Julian and Haft, Daniel and Hulo, Nicolas and Hunter, Sarah and Kahn, Daniel and Kanapin, Alexander and Kejariwal, Anish and Labarga, Alberto and Langendijk-Genevaux, Petra S. and Lonsdale, David and Lopez, Rodrigo and Letunic, Ivica and Madera, Martin and Maslen, John and McAnulla, Craig and McDowall, Jennifer and Mistry, Jaina and Mitchell, Alex and Nikolskaya, Anastasia N. and Orchard, Sandra and Orengo, Christine and Petryszak, Robert and Selengut, Jeremy D. and Sigrist, Christian J. A. and Thomas, Paul D. and Valentin, Franck and Wilson, Derek and Wu, Cathy H. and Yeats, Corin},
  title    = {New developments in the {InterPro} database},
  journal  = {Nucleic Acids Research},
  volume   = {35},
  pages    = {D224--D228},
  year     = {2007},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkl841},
  keywords = {Databases, Protein, Internet, Protein Structure, Tertiary, Proteins, Sequence Analysis, Protein, Systems Integration, User-Computer Interface},
  abstract = {InterPro is an integrated resource for protein families, domains and functional sites, which integrates the following protein signature databases: PROSITE, PRINTS, ProDom, Pfam, SMART, TIGRFAMs, PIRSF, SUPERFAMILY, Gene3D and PANTHER. The latter two new member databases have been integrated since the last publication in this journal. There have been several new developments in InterPro, including an additional reading field, new database links, extensions to the web interface and additional match XML files. InterPro has always provided matches to UniProtKB proteins on the website and in the match XML file on the FTP site. Additional matches to proteins in UniParc (UniProt archive) are now available for download in the new match XML files only. The latest InterPro release (13.0) contains more than 13 000 entries, covering over 78\% of all proteins in UniProtKB. The database is available for text- and sequence-based searches via a webserver (http://www.ebi.ac.uk/interpro), and for download by anonymous FTP (ftp://ftp.ebi.ac.uk/pub/databases/interpro). The InterProScan search tool is now also available via a web service at http://www.ebi.ac.uk/Tools/webservices/WSInterProScan.html.},
}

@article{38369,
  author   = {Selengut, Jeremy D. and Levine, R. L.},
  title    = {{MDP-1}: A novel eukaryotic magnesium-dependent phosphatase},
  journal  = {Biochemistry},
  volume   = {39},
  year     = {2000},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/10889041?dopt=Abstract},
  keywords = {Amino Acid Sequence, Animals, Catalysis, Cations, Chromatography, Affinity, Cloning, Molecular, Cysteine, Enzyme Inhibitors, Histidine, Hydrogen-Ion Concentration, Magnesium, Mice, Molecular Sequence Data, Phosphoprotein Phosphatases, Protein Phosphatase 1, Rabbits, Sequence Analysis, Protein, Sequence Homology, Amino Acid, Substrate Specificity},
  abstract = {We report here the purification, cloning, expression, and characterization of a novel phosphatase, MDP-1. In the course of investigating the reported acid phosphatase activity of carbonic anhydrase III preparations, several discrete phosphatases were discerned. One of these, a magnesium-dependent species of 18.6 kDa, was purified to homogeneity and yielded several peptide sequences from which the parent gene was identified by database searching. Although orthologous genes were identified in fungi and plants as well as mammalian species, there was no apparent homology to any known family of phosphatases. The enzyme was expressed in Escherichia coli with a fusion tag and purified by affinity methods. The recombinant enzyme showed magnesium-dependent acid phosphatase activity comparable to the originally isolated rabbit protein. The enzyme catalyzes the rapid hydrolysis of p-nitrophenyl phosphate, ribose-5-phosphate, and phosphotyrosine. The selectivity for phosphotyrosine over phosphoserine or phosphothreonine is considerable, but the enzyme did not show activity toward five phosphotyrosine-containing peptides. None of the various substrates assayed (including various nucleotide, sugar, amino acid and peptide phosphates, phosphoinositides, and phosphodiesters) exhibited K(M) values lower than 1 mM, and many showed negligible rates of hydrolysis. The enzyme is inhibited by vanadate and fluoride but not by azide, cyanide, calcium, lithium, or tartaric acid. Chemical labeling, refolding, dialysis, and mutagenesis experiments suggest that the enzymatic mechanism is not dependent on cysteine, histidine, or nonmagnesium metal ions. In recognition of these observations, the enzyme has been given the name magnesium-dependent phosphatase-1 (MDP-1).},
}