% Bibliography entries, one field per line. Citation keys are the numeric ids
% assigned by the exporting system and are kept unchanged.

% NOTE: entries 49777 and 38452 describe the same paper (identical DOI). Both keys
% are kept so that existing citations of either key keep resolving; prefer 49777
% for new citations.
@article{49777,
  author   = {Basu, Malay K and Selengut, Jeremy D and Haft, Daniel H},
  title    = {{ProPhylo}: partial phylogenetic profiling to guide protein family construction and assignment of biological process},
  journal  = {BMC Bioinformatics},
  volume   = {12},
  pages    = {434},
  year     = {2011},
  doi      = {10.1186/1471-2105-12-434},
  issn     = {1471-2105},
  keywords = {algorithms, Archaea, Archaeal Proteins, DNA, Methane, Phylogeny, software},
  abstract = {BACKGROUND: Phylogenetic profiling is a technique of scoring co-occurrence between a protein family and some other trait, usually another protein family, across a set of taxonomic groups. In spite of several refinements in recent years, the technique still invites significant improvement. To be its most effective, a phylogenetic profiling algorithm must be able to examine co-occurrences among protein families whose boundaries are uncertain within large homologous protein superfamilies.
    RESULTS: Partial Phylogenetic Profiling (PPP) is an iterative algorithm that scores a given taxonomic profile against the taxonomic distribution of families for all proteins in a genome. The method works through optimizing the boundary of each protein family, rather than by relying on prebuilt protein families or fixed sequence similarity thresholds. Double Partial Phylogenetic Profiling (DPPP) is a related procedure that begins with a single sequence and searches for optimal granularities for its surrounding protein family in order to generate the best query profiles for PPP. We present ProPhylo, a high-performance software package for phylogenetic profiling studies through creating individually optimized protein family boundaries. ProPhylo provides precomputed databases for immediate use and tools for manipulating the taxonomic profiles used as queries.
    CONCLUSION: ProPhylo results show universal markers of methanogenesis, a new DNA phosphorothioation-dependent restriction enzyme, and efficacy in guiding protein family construction. The software and the associated databases are freely available under the open source Perl Artistic License from ftp://ftp.jcvi.org/pub/data/ppp/.},
}

% Duplicate of 49777 (see note above). The DOI was exported in the "type" field and
% the PubMed link in "note"; they now live in "doi" and "url".
@article{38452,
  author   = {Basu, Malay K. and Selengut, Jeremy D. and Haft, Daniel H.},
  title    = {{ProPhylo}: partial phylogenetic profiling to guide protein family construction and assignment of biological process},
  journal  = {BMC Bioinformatics},
  volume   = {12},
  pages    = {434},
  year     = {2011},
  doi      = {10.1186/1471-2105-12-434},
  issn     = {1471-2105},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/22070167?dopt=Abstract},
  keywords = {algorithms, Archaea, Archaeal Proteins, DNA, Methane, Phylogeny, software},
  abstract = {BACKGROUND: Phylogenetic profiling is a technique of scoring co-occurrence between a protein family and some other trait, usually another protein family, across a set of taxonomic groups. In spite of several refinements in recent years, the technique still invites significant improvement. To be its most effective, a phylogenetic profiling algorithm must be able to examine co-occurrences among protein families whose boundaries are uncertain within large homologous protein superfamilies.
    RESULTS: Partial Phylogenetic Profiling (PPP) is an iterative algorithm that scores a given taxonomic profile against the taxonomic distribution of families for all proteins in a genome. The method works through optimizing the boundary of each protein family, rather than by relying on prebuilt protein families or fixed sequence similarity thresholds. Double Partial Phylogenetic Profiling (DPPP) is a related procedure that begins with a single sequence and searches for optimal granularities for its surrounding protein family in order to generate the best query profiles for PPP. We present ProPhylo, a high-performance software package for phylogenetic profiling studies through creating individually optimized protein family boundaries. ProPhylo provides precomputed databases for immediate use and tools for manipulating the taxonomic profiles used as queries.
    CONCLUSION: ProPhylo results show universal markers of methanogenesis, a new DNA phosphorothioation-dependent restriction enzyme, and efficacy in guiding protein family construction. The software and the associated databases are freely available under the open source Perl Artistic License from ftp://ftp.jcvi.org/pub/data/ppp/.},
}

% Conference paper (has authors and a paper title), so it is typed as inproceedings
% rather than proceedings. NOTE(review): the export carried no booktitle; the DOI
% prefix suggests the IEEE BIBM 2009 proceedings -- TODO confirm and add booktitle.
@inproceedings{38343,
  author    = {Ghodsi, M. and Pop, M.},
  title     = {Inexact Local Alignment Search over Suffix Arrays},
  publisher = {IEEE},
  year      = {2009},
  doi       = {10.1109/BIBM.2009.25},
  isbn      = {978-0-7695-3885-3},
  keywords  = {bacteria, Bioinformatics, biology computing, Computational Biology, Costs, DNA, DNA homology searches, DNA sequences, Educational institutions, generalized heuristic, genes, Genetics, genome alignment, Genomics, human, inexact local alignment search, inexact seeds, local alignment, local alignment tools, memory efficient suffix array, microorganisms, molecular biophysics, mouse, Organisms, Sensitivity and Specificity, sequences, suffix array, USA Councils},
  abstract  = {We describe an algorithm for finding approximate seeds for DNA homology searches. In contrast to previous algorithms that use exact or spaced seeds, our approximate seeds may contain insertions and deletions. We present a generalized heuristic for finding such seeds efficiently and prove that the heuristic does not affect sensitivity. We show how to adapt this algorithm to work over the memory efficient suffix array with provably minimal overhead in running time. We demonstrate the effectiveness of our algorithm on two tasks: whole genome alignment of bacteria and alignment of the DNA sequences of 177 genes that are orthologous in human and mouse. We show our algorithm achieves better sensitivity and uses less memory than other commonly used local alignment tools.},
}

@article{38379,
  author   = {Bowler, Chris and Karl, David M. and Colwell, Rita R.},
  title    = {Microbial oceanography in a sea of opportunity},
  journal  = {Nature},
  volume   = {459},
  year     = {2009},
  doi      = {10.1038/nature08056},
  issn     = {0028-0836},
  keywords = {Astronomy, astrophysics, Biochemistry, Bioinformatics, Biology, biotechnology, cancer, cell cycle, cell signalling, climate change, Computational Biology, development, developmental biology, DNA, drug discovery, earth science, ecology, environmental science, Evolution, evolutionary biology, functional genomics, Genetics, Genomics, geophysics, immunology, interdisciplinary science, life, marine biology, materials science, medical research, medicine, metabolomics, molecular biology, molecular interactions, nanotechnology, Nature, neurobiology, neuroscience, palaeobiology, pharmacology, Physics, proteomics, quantum physics, RNA, Science, science news, science policy, signal transduction, structural biology, systems biology, transcriptomics},
  abstract = {Plankton use solar energy to drive the nutrient cycles that make the planet habitable for larger organisms. We can now explore the diversity and functions of plankton using genomics, revealing the gene repertoires associated with survival in the oceans. Such studies will help us to appreciate the sensitivity of ocean systems and of the ocean{\textquoteright}s response to climate change, improving the predictive power of climate models.},
}

@article{49679,
  author   = {Dogan, Rezarta Islamaj and Getoor, Lise and Wilbur, W John and Mount, Stephen M},
  title    = {{SplicePort}---an interactive splice-site analysis tool},
  journal  = {Nucleic Acids Res},
  volume   = {35},
  pages    = {W285--W291},
  year     = {2007},
  month    = jul,
  doi      = {10.1093/nar/gkm407},
  issn     = {1362-4962},
  keywords = {Base Sequence, Chromosome mapping, Computational Biology, Computer simulation, DNA, Genome, HUMANS, Internet, Models, Genetic, Molecular Sequence Data, Pattern Recognition, Automated, RNA Splice Sites, sequence alignment, Sequence Analysis, DNA, User-Computer Interface},
  abstract = {SplicePort is a web-based tool for splice-site analysis that allows the user to make splice-site predictions for submitted sequences. In addition, the user can also browse the rich catalog of features that underlies these predictions, and which we have found capable of providing high classification accuracy on human splice sites. Feature selection is optimized for human splice sites, but the selected features are likely to be predictive for other mammals as well. With our interactive feature browsing and visualization tool, the user can view and explore subsets of features used in splice-site prediction (either the features that account for the classification of a specific input sequence or the complete collection of features). Selected feature sets can be searched, ranked or displayed easily. The user can group features into clusters and frequency plot WebLogos can be generated for each cluster. The user can browse the identified clusters and their contributing elements, looking for new interesting signals, or can validate previously observed signals. The SplicePort web server can be accessed at http://www.cs.umd.edu/projects/SplicePort and http://www.spliceport.org.},
}

% The exported month value was "1995 Oct 27"; the day of month is kept in "day".
@article{49697,
  author   = {Guo, M and Mount, S M},
  title    = {Localization of sequences required for size-specific splicing of a small {Drosophila} intron in vitro},
  journal  = {J Mol Biol},
  volume   = {253},
  pages    = {426--437},
  year     = {1995},
  month    = oct,
  day      = {27},
  doi      = {10.1006/jmbi.1995.0564},
  issn     = {0022-2836},
  keywords = {Animals, Base Sequence, Cell Line, DNA, Drosophila, Genes, Insect, HeLa Cells, HUMANS, Introns, Molecular Sequence Data, Myosin Heavy Chains, RNA Splicing, Species Specificity},
  abstract = {Many introns in Drosophila and other invertebrates are less than 80 nucleotides in length, too small to be recognized by the vertebrate splicing machinery. Comparison of nuclear splicing extracts from human HeLa and Drosophila Kc cells has revealed species-specificity, consistent with the observed size differences. Here we present additional results with the 68 nucleotide fifth intron of the Drosophila myosin heavy chain gene. As observed with the 74 nucleotide second intron of the Drosophila white gene, the wild-type myosin intron is accurately spliced in a homologous extract, and increasing the size by 16 nucleotides both eliminates splicing in the Drosophila extract and allows accurate splicing in the human extract. In contrast to previous results, however, an upstream cryptic 5{\textquoteright} splice site is activated when the wild-type myosin intron is tested in a human HeLa cell nuclear extract, resulting in the removal of a 98 nucleotide intron. The size dependence of splicing in Drosophila extracts is also intron-specific; we noted that a naturally larger (150 nucleotide) intron from the ftz gene is efficiently spliced in Kc cell extracts that do not splice enlarged introns (of 84, 90, 150 or 350 nucleotides) derived from the 74 nucleotide white intron. Here, we have exploited that observation, using a series of hybrid introns to show that a region of 46 nucleotides at the 3{\textquoteright} end of the white intron is sufficient to confer the species-specific size effect. At least two sequence elements within this region, yet distinct from previously described branchpoint and pyrimidine tract signals, are required for efficient splicing of small hybrid introns in vitro.},
}

@article{49699,
  author   = {Kurkulos, M and Weinberg, J M and Roy, D and Mount, S M},
  title    = {P element-mediated in vivo deletion analysis of white-apricot: deletions between direct repeats are strongly favored},
  journal  = {Genetics},
  volume   = {136},
  pages    = {1001--1011},
  year     = {1994},
  month    = mar,
  issn     = {0016-6731},
  keywords = {Alleles, Animals, Animals, Genetically Modified, Base Sequence, Crosses, Genetic, DNA, DNA Transposable Elements, Drosophila, Eye Color, Female, Genes, Insect, Male, Molecular Sequence Data, Nucleotidyltransferases, PHENOTYPE, Recombination, Genetic, Repetitive Sequences, Nucleic Acid, Sequence Deletion, Transformation, Genetic, Transposases},
  abstract = {We have isolated and characterized deletions arising within a P transposon, P[hswa], in the presence of P transposase. P[hswa] carries white-apricot (wa) sequences, including a complete copia element, under the control of an hsp70 promoter, and resembles the original wa allele in eye color phenotype. In the presence of P transposase, P[hswa] shows a high overall rate (approximately 3\%) of germline mutations that result in increased eye pigmentation. Of 234 derivatives of P[hswa] with greatly increased eye pigmentation, at least 205 carried deletions within copia. Of these, 201 were precise deletions between the directly repeated 276-nucleotide copia long terminal repeats (LTRs), and four were unique deletions. High rates of transposase-induced precise deletion were observed within another P transposon carrying unrelated 599 nucleotide repeats (yeast 2 mu FLP; recombinase target sites) separated by 5.7 kb. Our observation that P element-mediated deletion formation occurs preferentially between direct repeats suggests general methods for controlling deletion formation.},
}

@article{49701,
  author   = {Guo, M and Lo, P C and Mount, S M},
  title    = {Species-specific signals for the splicing of a short {Drosophila} intron in vitro},
  journal  = {Mol Cell Biol},
  volume   = {13},
  pages    = {1104--1118},
  year     = {1993},
  month    = feb,
  issn     = {0270-7306},
  keywords = {Animals, Base Sequence, Cell Nucleus, Consensus Sequence, DNA, DNA Transposable Elements, Drosophila, Drosophila Proteins, Electrophoresis, Polyacrylamide Gel, HeLa Cells, HUMANS, Introns, Molecular Sequence Data, Mutation, Peptide Hydrolases, Proteins, Regulatory Sequences, Nucleic Acid, Retroelements, RNA Splicing, Species Specificity},
  abstract = {The effects of branchpoint sequence, the pyrimidine stretch, and intron size on the splicing efficiency of the Drosophila white gene second intron were examined in nuclear extracts from Drosophila and human cells. This 74-nucleotide intron is typical of many Drosophila introns in that it lacks a significant pyrimidine stretch and is below the minimum size required for splicing in human nuclear extracts. Alteration of sequences of adjacent to the 3{\textquoteright} splice site to create a pyrimidine stretch was necessary for splicing in human, but not Drosophila, extracts. Increasing the size of this intron with insertions between the 5{\textquoteright} splice site and the branchpoint greatly reduced the efficiency of splicing of introns longer than 79 nucleotides in Drosophila extracts but had an opposite effect in human extracts, in which introns longer than 78 nucleotides were spliced with much greater efficiency. The white-apricot copia insertion is immediately adjacent to the branchpoint normally used in the splicing of this intron, and a copia long terminal repeat insertion prevents splicing in Drosophila, but not human, extracts. However, a consensus branchpoint does not restore the splicing of introns containing the copia long terminal repeat, and alteration of the wild-type branchpoint sequence alone does not eliminate splicing. These results demonstrate species specificity of splicing signals, particularly pyrimidine stretch and size requirements, and raise the possibility that variant mechanisms not found in mammals may operate in the splicing of small introns in Drosophila and possibly other species.},
}

% The exported month value was "1990 Mar 25"; the day of month is kept in "day".
@article{49707,
  author   = {Pepling, M and Mount, S M},
  title    = {Sequence of a {cDNA} from the {Drosophila} melanogaster white gene},
  journal  = {Nucleic Acids Res},
  volume   = {18},
  pages    = {1633},
  year     = {1990},
  month    = mar,
  day      = {25},
  issn     = {0305-1048},
  keywords = {Amino Acid Sequence, Animals, Base Sequence, DNA, Drosophila melanogaster, Eye Color, genes, Molecular Sequence Data},
}

@article{49706,
  author   = {Mancebo, R and Lo, P C and Mount, S M},
  title    = {Structure and expression of the {Drosophila} melanogaster gene for the {U1} small nuclear ribonucleoprotein particle {70K} protein},
  journal  = {Mol Cell Biol},
  volume   = {10},
  pages    = {2492--2502},
  year     = {1990},
  month    = jun,
  issn     = {0270-7306},
  keywords = {Amino Acid Sequence, Animals, Base Sequence, Blotting, Northern, Blotting, Southern, Cloning, Molecular, DNA, Drosophila melanogaster, Gene expression, Gene Library, genes, HUMANS, Molecular Sequence Data, Molecular Weight, Oligonucleotide Probes, Poly A, Ribonucleoproteins, Ribonucleoproteins, Small Nuclear, RNA, RNA, Messenger, Sequence Homology, Nucleic Acid, Xenopus},
  abstract = {A genomic clone encoding the Drosophila U1 small nuclear ribonucleoprotein particle 70K protein was isolated by hybridization with a human U1 small nuclear ribonucleoprotein particle 70K protein cDNA. Southern blot and in situ hybridizations showed that this U1 70K gene is unique in the Drosophila genome, residing at cytological position 27D1,2. Polyadenylated transcripts of 1.9 and 3.1 kilobases were observed. While the 1.9-kilobase mRNA is always more abundant, the ratio of these two transcripts is developmentally regulated. Analysis of cDNA and genomic sequences indicated that these two RNAs encode an identical protein with a predicted molecular weight of 52,879. Comparison of the U1 70K proteins predicted from Drosophila, human, and Xenopus cDNAs revealed 68\% amino acid identity in the most amino-terminal 214 amino acids, which include a sequence motif common to many proteins which bind RNA. The carboxy-terminal half is less well conserved but is highly charged and contains distinctive arginine-rich regions in all three species. These arginine-rich regions contain stretches of arginine-serine dipeptides like those found in transformer, transformer-2, and suppressor-of-white-apricot proteins, all of which have been identified as regulators of mRNA splicing in Drosophila melanogaster.},
}

@article{49715,
  author   = {Bernstein, L B and Mount, S M and Weiner, A M},
  title    = {Pseudogenes for human small nuclear {RNA} {U3} appear to arise by integration of self-primed reverse transcripts of the {RNA} into new chromosomal sites},
  journal  = {Cell},
  volume   = {32},
  pages    = {461--472},
  year     = {1983},
  month    = feb,
  issn     = {0092-8674},
  keywords = {Animals, Base Sequence, DNA, genes, HUMANS, Nucleic Acid Conformation, Rats, Recombination, Genetic, Repetitive Sequences, Nucleic Acid, RNA, RNA, Small Nuclear, RNA-Directed DNA Polymerase, Templates, Genetic, Transcription, Genetic},
  abstract = {We find that both human and rat U3 snRNA can function as self-priming templates for AMV reverse transcriptase in vitro. The 74 base cDNA is primed by the 3{\textquoteright} end of intact U3 snRNA, and spans the characteristically truncated 69 or 70 base U3 sequence found in four different human U3 pseudogenes. The ability of human and rat U3 snRNA to self-prime is consistent with a U3 secondary structure model derived by a comparison between rat U3 snRNA and the homologous D2 snRNA from Dictyostelium discoideum. We propose that U3 pseudogenes are generated in vivo by integration of a self-primed cDNA copy of U3 snRNA at new chromosomal sites. We also consider the possibility that the same cDNA mediates gene conversion at the 5{\textquoteright} end of bona fide U3 genes where, over the entire region spanned by the U3 cDNA, the two rat U3 sequence variants U3A and U3B are identical.},
}