@article{49606,
  title = {Orchestrating high-throughput genomic analysis with {Bioconductor}},
  journal = {Nature Methods},
  volume = {12},
  year = {2015},
  month = feb,
  pages = {115--121},
  abstract = {

Bioconductor is an open-source, open-development software project for the analysis and comprehension of high-throughput data in genomics and molecular biology. The project aims to enable interdisciplinary research, collaboration and rapid development of scientific software. Based on the statistical programming language R, Bioconductor comprises 934 interoperable packages contributed by a large, diverse community of scientists. Packages cover a range of bioinformatic and statistical applications. They undergo formal initial review and continuous automated testing. We present an overview for prospective users and contributors.

},
  keywords = {Computational Biology, Gene Expression Profiling, Genomics, High-Throughput Screening Assays, Programming Languages, software, User-Computer Interface},
  issn = {1548-7105},
  doi = {10.1038/nmeth.3252},
  author = {Huber, Wolfgang and Carey, Vincent J and Gentleman, Robert and Anders, Simon and Carlson, Marc and Carvalho, Benilton S and Corrada Bravo, H{\'e}ctor and Davis, Sean and Gatto, Laurent and Girke, Thomas and Gottardo, Raphael and Hahne, Florian and Hansen, Kasper D and Irizarry, Rafael A and Lawrence, Michael and Love, Michael I and MacDonald, James and Obenchain, Valerie and Ole{\'s}, Andrzej K and Pag{\`e}s, Herv{\'e} and Reyes, Alejandro and Shannon, Paul and Smyth, Gordon K and Tenenbaum, Dan and Waldron, Levi and Morgan, Martin}
}

@article{45867,
  title = {Automated ensemble assembly and validation of microbial genomes},
  journal = {BMC Bioinformatics},
  volume = {15},
  year = {2014},
  pages = {126},
  abstract = {

BACKGROUND: The continued democratization of DNA sequencing has sparked a new wave of development of genome assembly and assembly validation methods. As individual research labs, rather than centralized centers, begin to sequence the majority of new genomes, it is important to establish best practices for genome assembly. However, recent evaluations such as GAGE and the Assemblathon have concluded that there is no single best approach to genome assembly. Instead, it is preferable to generate multiple assemblies and validate them to determine which is most useful for the desired analysis; this is a labor-intensive process that is often impossible or unfeasible.

RESULTS: To encourage best practices supported by the community, we present iMetAMOS, an automated ensemble assembly pipeline; iMetAMOS encapsulates the process of running, validating, and selecting a single assembly from multiple assemblies. iMetAMOS packages several leading open-source tools into a single binary that automates parameter selection and execution of multiple assemblers, scores the resulting assemblies based on multiple validation metrics, and annotates the assemblies for genes and contaminants. We demonstrate the utility of the ensemble process on 225 previously unassembled Mycobacterium tuberculosis genomes as well as a Rhodobacter sphaeroides benchmark dataset. On these real data, iMetAMOS reliably produces validated assemblies and identifies potential contamination without user intervention. In addition, intelligent parameter selection produces assemblies of R. sphaeroides comparable to or exceeding the quality of those from the GAGE-B evaluation, affecting the relative ranking of some assemblers.

CONCLUSIONS: Ensemble assembly with iMetAMOS provides users with multiple, validated assemblies for each genome. Although computationally limited to small or mid-sized genomes, this approach is the most effective and reproducible means for generating high-quality assemblies and enables users to select an assembly best tailored to their specific needs.

},
  keywords = {Genome, Bacterial, Genome, Microbial, Genomics, Mycobacterium tuberculosis, Rhodobacter sphaeroides, Sequence Analysis, DNA, software},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-15-126},
  author = {Koren, Sergey and Treangen, Todd and Hill, Christopher M and Pop, Mihai and Phillippy, Adam M}
}

@article{49602,
  title = {{Epiviz}: interactive visual analytics for functional genomics data},
  journal = {Nature Methods},
  volume = {11},
  year = {2014},
  month = sep,
  pages = {938--940},
  abstract = {

Visualization is an integral aspect of genomics data analysis. Algorithmic-statistical analysis and interactive visualization are most effective when used iteratively. Epiviz (http://epiviz.cbcb.umd.edu/), a web-based genome browser, and the Epivizr Bioconductor package allow interactive, extensible and reproducible visualization within a state-of-the-art data-analysis platform.

},
  keywords = {algorithms, Chromosome mapping, Data Mining, database management systems, Databases, Genetic, Genomics, Internet, software, User-Computer Interface},
  issn = {1548-7105},
  doi = {10.1038/nmeth.3038},
  author = {Chelaru, Florin and Smith, Llewellyn and Goldstein, Naomi and Corrada Bravo, H{\'e}ctor}
}

@article{49535,
  title = {Genomic analysis of sequence-dependent {DNA} curvature in {Leishmania}},
  journal = {PLoS ONE},
  volume = {8},
  year = {2013},
  pages = {e63068},
  abstract = {

Leishmania major is a flagellated protozoan parasite of medical importance. Like other members of the Trypanosomatidae family, it possesses unique mechanisms of gene expression such as constitutive polycistronic transcription of directional gene clusters, gene amplification, mRNA trans-splicing, and extensive editing of mitochondrial transcripts. The molecular signals underlying most of these processes remain under investigation. In order to investigate the role of DNA secondary structure signals in gene expression, we carried out a genome-wide in silico analysis of the intrinsic DNA curvature. The L. major genome revealed a lower frequency of high intrinsic curvature regions as well as inter- and intra- chromosomal distribution heterogeneity, when compared to prokaryotic and eukaryotic organisms. Using a novel method aimed at detecting region-integrated intrinsic curvature (RIIC), high DNA curvature was found to be associated with regions implicated in transcription initiation. Those include divergent strand-switch regions between directional gene clusters and regions linked to markers of active transcription initiation such as acetylated H3 histone, TRF4 and SNAP50. These findings suggest a role for DNA curvature in transcription initiation in Leishmania supporting the relevance of DNA secondary structures signals.

},
  keywords = {Chromosome mapping, Comparative Genomic Hybridization, Computational Biology, DNA, Protozoan, Genome, Protozoan, Genomics, HUMANS, Leishmania, Nucleic Acid Conformation},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0063068},
  author = {Smircich, Pablo and Forteza, Diego and El-Sayed, Najib M and Garat, Beatriz}
}

@article{38529,
  title = {{TIGRFAMs} and {Genome Properties} in 2013},
  journal = {Nucleic Acids Research},
  volume = {41},
  year = {2013},
  month = jan,
  pages = {D387--D395},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/23197656?dopt=Abstract},
  doi = {10.1093/nar/gks1234},
  abstract = {TIGRFAMs, available online at http://www.jcvi.org/tigrfams is a database of protein family definitions. Each entry features a seed alignment of trusted representative sequences, a hidden Markov model (HMM) built from that alignment, cutoff scores that let automated annotation pipelines decide which proteins are members, and annotations for transfer onto member proteins. Most TIGRFAMs models are designated equivalog, meaning they assign a specific name to proteins conserved in function from a common ancestral sequence. Models describing more functionally heterogeneous families are designated subfamily or domain, and assign less specific but more widely applicable annotations. The Genome Properties database, available at http://www.jcvi.org/genome-properties, specifies how computed evidence, including TIGRFAMs HMM results, should be used to judge whether an enzymatic pathway, a protein complex or another type of molecular subsystem is encoded in a genome. TIGRFAMs and Genome Properties content are developed in concert because subsystems reconstruction for large numbers of genomes guides selection of seed alignment sequences and cutoff values during protein family construction. Both databases specialize heavily in bacterial and archaeal subsystems. At present, 4284 models appear in TIGRFAMs, while 628 systems are described by Genome Properties.
Content derives both from subsystem discovery work and from biocuration of the scientific literature.},
  keywords = {Databases, Protein, Genome, Archaeal, Genome, Bacterial, Genomics, Internet, Markov chains, Molecular Sequence Annotation, Proteins, sequence alignment},
  author = {Haft, Daniel H. and Selengut, Jeremy D. and Richter, Roland A. and Harkins, Derek and Basu, Malay K. and Beck, Erin}
}

@article{49764,
  title = {{TIGRFAMs} and {Genome Properties} in 2013},
  journal = {Nucleic Acids Research},
  volume = {41},
  year = {2013},
  month = jan,
  pages = {D387--D395},
  abstract = {

TIGRFAMs, available online at http://www.jcvi.org/tigrfams is a database of protein family definitions. Each entry features a seed alignment of trusted representative sequences, a hidden Markov model (HMM) built from that alignment, cutoff scores that let automated annotation pipelines decide which proteins are members, and annotations for transfer onto member proteins. Most TIGRFAMs models are designated equivalog, meaning they assign a specific name to proteins conserved in function from a common ancestral sequence. Models describing more functionally heterogeneous families are designated subfamily or domain, and assign less specific but more widely applicable annotations. The Genome Properties database, available at http://www.jcvi.org/genome-properties, specifies how computed evidence, including TIGRFAMs HMM results, should be used to judge whether an enzymatic pathway, a protein complex or another type of molecular subsystem is encoded in a genome. TIGRFAMs and Genome Properties content are developed in concert because subsystems reconstruction for large numbers of genomes guides selection of seed alignment sequences and cutoff values during protein family construction. Both databases specialize heavily in bacterial and archaeal subsystems. At present, 4284 models appear in TIGRFAMs, while 628 systems are described by Genome Properties. Content derives both from subsystem discovery work and from biocuration of the scientific literature.

},
  keywords = {Databases, Protein, Genome, Archaeal, Genome, Bacterial, Genomics, Internet, Markov chains, Molecular Sequence Annotation, Proteins, sequence alignment},
  issn = {1362-4962},
  doi = {10.1093/nar/gks1234},
  author = {Haft, Daniel H and Selengut, Jeremy D and Richter, Roland A and Harkins, Derek and Basu, Malay K and Beck, Erin}
}

@article{49653,
  title = {Functional genomics of trypanosomatids},
  journal = {Parasite Immunology},
  volume = {34},
  year = {2012},
  month = feb # "--" # mar,
  pages = {72--79},
  abstract = {

The decoding of the Tritryp reference genomes nearly 7 years ago provided a first peek into the biology of pathogenic trypanosomatids and a blueprint that has paved the way for genome-wide studies. Although 60-70\% of the predicted protein coding genes in Trypanosoma brucei, Trypanosoma cruzi and Leishmania major remain unannotated, the functional genomics landscape is rapidly changing. Facilitated by the advent of next-generation sequencing technologies, improved structural and functional annotation and genes and their products are emerging. Information is also growing for the interactions between cellular components as transcriptomes, regulatory networks and metabolomes are characterized, ushering in a new era of systems biology. Simultaneously, the launch of comparative sequencing of multiple strains of kinetoplastids will finally lead to the investigation of a vast, yet to be explored, evolutionary and pathogenomic space.

},
  keywords = {Animals, Genome, Protozoan, Genomics, HUMANS, Proteome, Protozoan Proteins, Transcriptome, Trypanosomatina},
  issn = {1365-3024},
  doi = {10.1111/j.1365-3024.2011.01347.x},
  author = {Choi, J and El-Sayed, N M}
}

@article{38276,
  title = {Gene expression anti-profiles as a basis for accurate universal cancer signatures},
  journal = {BMC Bioinformatics},
  volume = {13},
  year = {2012},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/23088656?dopt=Abstract},
  doi = {10.1186/1471-2105-13-272},
  abstract = {BACKGROUND: Early screening for cancer is arguably one of the greatest public health advances over the last fifty years. However, many cancer screening tests are invasive (digital rectal exams), expensive (mammograms, imaging) or both (colonoscopies). This has spurred growing interest in developing genomic signatures that can be used for cancer diagnosis and prognosis. However, progress has been slowed by heterogeneity in cancer profiles and the lack of effective computational prediction tools for this type of data. RESULTS: We developed anti-profiles as a first step towards translating experimental findings suggesting that stochastic across-sample hyper-variability in the expression of specific genes is a stable and general property of cancer into predictive and diagnostic signatures. Using single-chip microarray normalization and quality assessment methods, we developed an anti-profile for colon cancer in tissue biopsy samples. To demonstrate the translational potential of our findings, we applied the signature developed in the tissue samples, without any further retraining or normalization, to screen patients for colon cancer based on genomic measurements from peripheral blood in an independent study (AUC of 0.89). This method achieved higher accuracy than the signature underlying commercially available peripheral blood screening tests for colon cancer (AUC of 0.81).
We also confirmed the existence of hyper-variable genes across a range of cancer types and found that a significant proportion of tissue-specific genes are hyper-variable in cancer. Based on these observations, we developed a universal cancer anti-profile that accurately distinguishes cancer from normal regardless of tissue type (ten-fold cross-validation AUC > 0.92). CONCLUSIONS: We have introduced anti-profiles as a new approach for developing cancer genomic signatures that specifically takes advantage of gene expression heterogeneity. We have demonstrated that anti-profiles can be successfully applied to develop peripheral-blood based diagnostics for cancer and used anti-profiles to develop a highly accurate universal cancer signature. By using single-chip normalization and quality assessment methods, no further retraining of signatures developed by the anti-profile approach would be required before their application in clinical settings. Our results suggest that anti-profiles may be used to develop inexpensive and non-invasive universal cancer screening tests.},
  keywords = {Area Under Curve, Colonic Neoplasms, Gene Expression Profiling, Genetic Variation, Genomics, HUMANS, Oligonucleotide Array Sequence Analysis, Prognosis, Transcriptome, Tumor Markers, Biological},
  author = {Corrada Bravo, H{\'e}ctor and Pihur, Vasyl and McCall, Matthew and Irizarry, Rafael A. and Leek, Jeffrey T.}
}

@article{38421,
  title = {The partitioned {LASSO}-patternsearch algorithm with application to gene expression data},
  journal = {BMC Bioinformatics},
  volume = {13},
  year = {2012},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/22587526?dopt=Abstract},
  doi = {10.1186/1471-2105-13-98},
  abstract = {BACKGROUND: In systems biology, the task of reverse engineering gene pathways from data has been limited not just by the curse of dimensionality (the interaction space is huge) but also by systematic error in the data.
The gene expression barcode reduces spurious association driven by batch effects and probe effects. The binary nature of the resulting expression calls lends itself perfectly to modern regularization approaches that thrive in high-dimensional settings. RESULTS: The Partitioned LASSO-Patternsearch algorithm is proposed to identify patterns of multiple dichotomous risk factors for outcomes of interest in genomic studies. A partitioning scheme is used to identify promising patterns by solving many LASSO-Patternsearch subproblems in parallel. All variables that survive this stage proceed to an aggregation stage where the most significant patterns are identified by solving a reduced LASSO-Patternsearch problem in just these variables. This approach was applied to genetic data sets with expression levels dichotomized by gene expression bar code. Most of the genes and second-order interactions thus selected and are known to be related to the outcomes. CONCLUSIONS: We demonstrate with simulations and data analyses that the proposed method not only selects variables and patterns more accurately, but also provides smaller models with better prediction accuracy, in comparison to several alternative methodologies.},
  keywords = {algorithms, Breast Neoplasms, Computer simulation, Female, Gene expression, Gene Expression Profiling, Genomics, HUMANS, Models, Genetic},
  author = {Shi, Weiliang and Wahba, Grace and Irizarry, Rafael A. and Corrada Bravo, H{\'e}ctor and Wright, Stephen J.}
}

@inproceedings{38374,
  title = {{MetaPhyler}: Taxonomic profiling for metagenomic sequences},
  booktitle = {2010 {IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  year = {2010},
  publisher = {IEEE},
  doi = {10.1109/BIBM.2010.5706544},
  abstract = {A major goal of metagenomics is to characterize the microbial diversity of an environment.
The most popular approach relies on 16S rRNA sequencing, however this approach can generate biased estimates due to differences in the copy number of the 16S rRNA gene between even closely related organisms, and due to PCR artifacts. The taxonomic composition can also be determined from whole-metagenome sequencing data by matching individual sequences against a database of reference genes. One major limitation of prior methods used for this purpose is the use of a universal classification threshold for all genes at all taxonomic levels. We propose that better classification results can be obtained by tuning the taxonomic classifier to each matching length, reference gene, and taxonomic level. We present a novel taxonomic profiler MetaPhyler, which uses marker genes as a taxonomic reference. Results on simulated datasets demonstrate that MetaPhyler outperforms other tools commonly used in this context (CARMA, Megan and PhymmBL). We also present interesting results obtained by applying MetaPhyler to a real metagenomic dataset.},
  keywords = {Bioinformatics, CARMA comparison, Databases, Genomics, Linear regression, marker genes, matching length, Megan comparison, metagenomic sequences, metagenomics, MetaPhyler, microbial diversity, microorganisms, molecular biophysics, molecular configurations, Pattern classification, pattern matching, phylogenetic classification, Phylogeny, PhymmBL comparison, reference gene database, Sensitivity, sequence matching, taxonomic classifier, taxonomic level, taxonomic profiling, whole metagenome sequencing data},
  isbn = {978-1-4244-8306-8},
  author = {Liu, Bo and Gibbons, T. and Ghodsi, M. and Pop, M.}
}

@article{38522,
  title = {Tackling the widespread and critical impact of batch effects in high-throughput data},
  journal = {Nature Reviews Genetics},
  volume = {11},
  year = {2010},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/20838408?dopt=Abstract},
  doi = {10.1038/nrg2825},
  abstract = {High-throughput technologies are widely used, for example to assay genetic variants, gene and protein expression, and epigenetic modifications. One often overlooked complication with such studies is batch effects, which occur because measurements are affected by laboratory conditions, reagent lots and personnel differences. This becomes a major problem when batch effects are correlated with an outcome of interest and lead to incorrect conclusions. Using both published studies and our own analyses, we argue that batch effects (as well as other technical and biological artefacts) are widespread and critical to address. We review experimental and computational approaches for doing so.},
  keywords = {biotechnology, Computational Biology, Genomics, Oligonucleotide Array Sequence Analysis, Periodicals as Topic, Research Design, Sequence Analysis, DNA},
  author = {Leek, Jeffrey T. and Scharpf, Robert B. and Corrada Bravo, H{\'e}ctor and Simcha, David and Langmead, Benjamin and Johnson, W. Evan and Geman, Donald and Baggerly, Keith and Irizarry, Rafael A.}
}

@inproceedings{38343,
  title = {Inexact Local Alignment Search over Suffix Arrays},
  booktitle = {2009 {IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  year = {2009},
  publisher = {IEEE},
  doi = {10.1109/BIBM.2009.25},
  abstract = {We describe an algorithm for finding approximate seeds for DNA homology searches. In contrast to previous algorithms that use exact or spaced seeds, our approximate seeds may contain insertions and deletions. We present a generalized heuristic for finding such seeds efficiently and prove that the heuristic does not affect sensitivity. We show how to adapt this algorithm to work over the memory efficient suffix array with provably minimal overhead in running time.
We demonstrate the effectiveness of our algorithm on two tasks: whole genome alignment of bacteria and alignment of the DNA sequences of 177 genes that are orthologous in human and mouse. We show our algorithm achieves better sensitivity and uses less memory than other commonly used local alignment tools.},
  keywords = {bacteria, Bioinformatics, biology computing, Computational Biology, Costs, DNA, DNA homology searches, DNA sequences, Educational institutions, generalized heuristic, genes, Genetics, genome alignment, Genomics, human, inexact local alignment search, inexact seeds, local alignment, local alignment tools, memory efficient suffix array, microorganisms, molecular biophysics, mouse, Organisms, Sensitivity and Specificity, sequences, suffix array, USA Councils},
  isbn = {978-0-7695-3885-3},
  author = {Ghodsi, M. and Pop, M.}
}

@article{38379,
  title = {Microbial oceanography in a sea of opportunity},
  journal = {Nature},
  volume = {459},
  year = {2009},
  doi = {10.1038/nature08056},
  abstract = {Plankton use solar energy to drive the nutrient cycles that make the planet habitable for larger organisms. We can now explore the diversity and functions of plankton using genomics, revealing the gene repertoires associated with survival in the oceans.
Such studies will help us to appreciate the sensitivity of ocean systems and of the ocean{\textquoteright}s response to climate change, improving the predictive power of climate models.},
  keywords = {Astronomy, astrophysics, Biochemistry, Bioinformatics, Biology, biotechnology, cancer, cell cycle, cell signalling, climate change, Computational Biology, development, developmental biology, DNA, drug discovery, earth science, ecology, environmental science, Evolution, evolutionary biology, functional genomics, Genetics, Genomics, geophysics, immunology, interdisciplinary science, life, marine biology, materials science, medical research, medicine, metabolomics, molecular biology, molecular interactions, nanotechnology, Nature, neurobiology, neuroscience, palaeobiology, pharmacology, Physics, proteomics, quantum physics, RNA, Science, science news, science policy, signal transduction, structural biology, systems biology, transcriptomics},
  issn = {0028-0836},
  author = {Bowler, Chris and Karl, David M. and Colwell, Rita R.}
}

@article{49748,
  title = {A practical algorithm for finding maximal exact matches in large sequence datasets using sparse suffix arrays},
  journal = {Bioinformatics},
  volume = {25},
  year = {2009},
  month = jul,
  pages = {1609--1616},
  abstract = {

MOTIVATION: High-throughput sequencing technologies place ever increasing demands on existing algorithms for sequence analysis. Algorithms for computing maximal exact matches (MEMs) between sequences appear in two contexts where high-throughput sequencing will vastly increase the volume of sequence data: (i) seeding alignments of high-throughput reads for genome assembly and (ii) designating anchor points for genome-genome comparisons.

RESULTS: We introduce a new algorithm for finding MEMs. The algorithm leverages a sparse suffix array (SA), a text index that stores every K-th position of the text. In contrast to a full text index that stores every position of the text, a sparse SA occupies much less memory. Even though we use a sparse index, the output of our algorithm is the same as a full text index algorithm as long as the space between the indexed suffixes is not greater than a minimum length of a MEM. By relying on partial matches and additional text scanning between indexed positions, the algorithm trades memory for extra computation. The reduced memory usage makes it possible to determine MEMs between significantly longer sequences.

AVAILABILITY: Source code for the algorithm is available under a BSD open source license at http://compbio.cs.princeton.edu/mems. The implementation can serve as a drop-in replacement for the MEMs algorithm in MUMmer 3.

},
  keywords = {algorithms, Base Sequence, Genomics, sequence alignment, Sequence Analysis, DNA},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/btp275},
  author = {Khan, Zia and Bloom, Joshua S and Kruglyak, Leonid and Singh, Mona}
}

@inproceedings{38555,
  title = {Uncovering Genomic Reassortments among Influenza Strains by Enumerating Maximal Bicliques},
  booktitle = {2008 {IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  year = {2008},
  publisher = {IEEE},
  doi = {10.1109/BIBM.2008.78},
  abstract = {The evolutionary histories of viral genomes have received significant recent attention due to their importance in understanding virulence and the corresponding ramifications to public health. We present a novel framework to detect reassortment events in influenza based on the comparison of two distributions of phylogenetic trees, rather than a pair of, possibly unreliable, consensus trees. We show how to detect all high-probability inconsistencies between two distributions of trees by enumerating maximal bicliques within a defined incompatibility graph. In the process, we give the first quadratic delay algorithm for enumerating maximal bicliques within general bipartite graphs. We demonstrate the utility of our approach by applying it to several sets of influenza genomes (both human- and avian-hosted) and successfully identify all known reassortment events and a few novel candidate reassortments.
In addition, on simulated datasets, our approach correctly finds implanted reassortments and rarely detects reassortments where none were introduced.},
  keywords = {avian hosted influenza genome, Bioinformatics, Capacitive sensors, Delay, diseases, Event detection, general bipartite graphs, genomic reassortments, Genomics, graph theory, high probability inconsistencies, History, human hosted influenza genome, incompatibility graph, Influenza, influenza strain, maximal biclique, maximal biclique enumeration, microorganisms, phylogenetic trees, Phylogeny, Public healthcare, quadratic delay algorithm, reassortment, reassortment event detection, Tree graphs, viral genome evolutionary history, virulence},
  isbn = {978-0-7695-3452-7},
  author = {Nagarajan, N. and Kingsford, Carl}
}

@article {49677, title = {Evolution of genes and genomes on the Drosophila phylogeny.}, journal = {Nature}, volume = {450}, year = {2007}, month = {2007 Nov 8}, pages = {203-18}, abstract = {

Comparative analysis of multiple genomes in a phylogenetic framework dramatically improves the precision and sensitivity of evolutionary inference, producing more robust results than single-genome analyses can provide. The genomes of 12 Drosophila species, ten of which are presented here for the first time (sechellia, simulans, yakuba, erecta, ananassae, persimilis, willistoni, mojavensis, virilis and grimshawi), illustrate how rates and patterns of sequence divergence across taxa can illuminate evolutionary processes on a genomic scale. These genome sequences augment the formidable genetic tools that have made Drosophila melanogaster a pre-eminent model for animal genetics, and will further catalyse fundamental research on mechanisms of development, cell biology, genetics, disease, neurobiology, behaviour, physiology and evolution. Despite remarkable similarities among these Drosophila species, we identified many putatively non-neutral changes in protein-coding genes, non-coding RNA genes, and cis-regulatory regions. These may prove to underlie differences in the ecology and behaviour of these diverse species.

}, keywords = {Animals, Codon, DNA Transposable Elements, Drosophila, Drosophila Proteins, Evolution, Molecular, Gene Order, Genes, Insect, Genome, Insect, Genome, Mitochondrial, Genomics, Immunity, Multigene Family, Phylogeny, Reproduction, RNA, Untranslated, sequence alignment, Sequence Analysis, DNA, Synteny}, issn = {1476-4687}, doi = {10.1038/nature06341}, author = {Clark, Andrew G and Eisen, Michael B and Smith, Douglas R and Bergman, Casey M and Oliver, Brian and Markow, Therese A and Kaufman, Thomas C and Kellis, Manolis and Gelbart, William and Iyer, Venky N and Pollard, Daniel A and Sackton, Timothy B and Larracuente, Amanda M and Singh, Nadia D and Abad, Jose P and Abt, Dawn N and Adryan, Boris and Aguade, Montserrat and Akashi, Hiroshi and Anderson, Wyatt W and Aquadro, Charles F and Ardell, David H and Arguello, Roman and Artieri, Carlo G and Barbash, Daniel A and Barker, Daniel and Barsanti, Paolo and Batterham, Phil and Batzoglou, Serafim and Begun, Dave and Bhutkar, Arjun and Blanco, Enrico and Bosak, Stephanie A and Bradley, Robert K and Brand, Adrianne D and Brent, Michael R and Brooks, Angela N and Brown, Randall H and Butlin, Roger K and Caggese, Corrado and Calvi, Brian R and Bernardo de Carvalho, A and Caspi, Anat and Castrezana, Sergio and Celniker, Susan E and Chang, Jean L and Chapple, Charles and Chatterji, Sourav and Chinwalla, Asif and Civetta, Alberto and Clifton, Sandra W and Comeron, Josep M and Costello, James C and Coyne, Jerry A and Daub, Jennifer and David, Robert G and Delcher, Arthur L and Delehaunty, Kim and Do, Chuong B and Ebling, Heather and Edwards, Kevin and Eickbush, Thomas and Evans, Jay D and Filipski, Alan and Findeiss, Sven and Freyhult, Eva and Fulton, Lucinda and Fulton, Robert and Garcia, Ana C L and Gardiner, Anastasia and Garfield, David A and Garvin, Barry E and Gibson, Greg and Gilbert, Don and Gnerre, Sante and Godfrey, Jennifer and Good, Robert and Gotea, Valer and Gravely, Brenton and Greenberg, Anthony J 
and Griffiths-Jones, Sam and Gross, Samuel and Guigo, Roderic and Gustafson, Erik A and Haerty, Wilfried and Hahn, Matthew W and Halligan, Daniel L and Halpern, Aaron L and Halter, Gillian M and Han, Mira V and Heger, Andreas and Hillier, LaDeana and Hinrichs, Angie S and Holmes, Ian and Hoskins, Roger A and Hubisz, Melissa J and Hultmark, Dan and Huntley, Melanie A and Jaffe, David B and Jagadeeshan, Santosh and Jeck, William R and Johnson, Justin and Jones, Corbin D and Jordan, William C and Karpen, Gary H and Kataoka, Eiko and Keightley, Peter D and Kheradpour, Pouya and Kirkness, Ewen F and Koerich, Leonardo B and Kristiansen, Karsten and Kudrna, Dave and Kulathinal, Rob J and Kumar, Sudhir and Kwok, Roberta and Lander, Eric and Langley, Charles H and Lapoint, Richard and Lazzaro, Brian P and Lee, So-Jeong and Levesque, Lisa and Li, Ruiqiang and Lin, Chiao-Feng and Lin, Michael F and Lindblad-Toh, Kerstin and Llopart, Ana and Long, Manyuan and Low, Lloyd and Lozovsky, Elena and Lu, Jian and Luo, Meizhong and Machado, Carlos A and Makalowski, Wojciech and Marzo, Mar and Matsuda, Muneo and Matzkin, Luciano and McAllister, Bryant and McBride, Carolyn S and McKernan, Brendan and McKernan, Kevin and Mendez-Lago, Maria and Minx, Patrick and Mollenhauer, Michael U and Montooth, Kristi and Mount, Stephen M and Mu, Xu and Myers, Eugene and Negre, Barbara and Newfeld, Stuart and Nielsen, Rasmus and Noor, Mohamed A F and O{\textquoteright}Grady, Patrick and Pachter, Lior and Papaceit, Montserrat and Parisi, Matthew J and Parisi, Michael and Parts, Leopold and Pedersen, Jakob S and Pesole, Graziano and Phillippy, Adam M and Ponting, Chris P and Pop, Mihai and Porcelli, Damiano and Powell, Jeffrey R and Prohaska, Sonja and Pruitt, Kim and Puig, Marta and Quesneville, Hadi and Ram, Kristipati Ravi and Rand, David and Rasmussen, Matthew D and Reed, Laura K and Reenan, Robert and Reily, Amy and Remington, Karin A and Rieger, Tania T and Ritchie, Michael G and Robin, Charles 
and Rogers, Yu-Hui and Rohde, Claudia and Rozas, Julio and Rubenfield, Marc J and Ruiz, Alfredo and Russo, Susan and Salzberg, Steven L and Sanchez-Gracia, Alejandro and Saranga, David J and Sato, Hajime and Schaeffer, Stephen W and Schatz, Michael C and Schlenke, Todd and Schwartz, Russell and Segarra, Carmen and Singh, Rama S and Sirot, Laura and Sirota, Marina and Sisneros, Nicholas B and Smith, Chris D and Smith, Temple F and Spieth, John and Stage, Deborah E and Stark, Alexander and Stephan, Wolfgang and Strausberg, Robert L and Strempel, Sebastian and Sturgill, David and Sutton, Granger and Sutton, Granger G and Tao, Wei and Teichmann, Sarah and Tobari, Yoshiko N and Tomimura, Yoshihiko and Tsolas, Jason M and Valente, Vera L S and Venter, Eli and Venter, J Craig and Vicario, Saverio and Vieira, Filipe G and Vilella, Albert J and Villasante, Alfredo and Walenz, Brian and Wang, Jun and Wasserman, Marvin and Watts, Thomas and Wilson, Derek and Wilson, Richard K and Wing, Rod A and Wolfner, Mariana F and Wong, Alex and Wong, Gane Ka-Shu and Wu, Chung-I and Wu, Gabriel and Yamamoto, Daisuke and Yang, Hsiao-Pei and Yang, Shiaw-Pyng and Yorke, James A and Yoshida, Kiyohito and Zdobnov, Evgeny and Zhang, Peili and Zhang, Yu and Zimin, Aleksey V and Baldwin, Jennifer and Abdouelleil, Amr and Abdulkadir, Jamal and Abebe, Adal and Abera, Brikti and Abreu, Justin and Acer, St Christophe and Aftuck, Lynne and Alexander, Allen and An, Peter and Anderson, Erica and Anderson, Scott and Arachi, Harindra and Azer, Marc and Bachantsang, Pasang and Barry, Andrew and Bayul, Tashi and Berlin, Aaron and Bessette, Daniel and Bloom, Toby and Blye, Jason and Boguslavskiy, Leonid and Bonnet, Claude and Boukhgalter, Boris and Bourzgui, Imane and Brown, Adam and Cahill, Patrick and Channer, Sheridon and Cheshatsang, Yama and Chuda, Lisa and Citroen, Mieke and Collymore, Alville and Cooke, Patrick and Costello, Maura and D{\textquoteright}Aco, Katie and Daza, Riza and De Haan, Georgius 
and DeGray, Stuart and DeMaso, Christina and Dhargay, Norbu and Dooley, Kimberly and Dooley, Erin and Doricent, Missole and Dorje, Passang and Dorjee, Kunsang and Dupes, Alan and Elong, Richard and Falk, Jill and Farina, Abderrahim and Faro, Susan and Ferguson, Diallo and Fisher, Sheila and Foley, Chelsea D and Franke, Alicia and Friedrich, Dennis and Gadbois, Loryn and Gearin, Gary and Gearin, Christina R and Giannoukos, Georgia and Goode, Tina and Graham, Joseph and Grandbois, Edward and Grewal, Sharleen and Gyaltsen, Kunsang and Hafez, Nabil and Hagos, Birhane and Hall, Jennifer and Henson, Charlotte and Hollinger, Andrew and Honan, Tracey and Huard, Monika D and Hughes, Leanne and Hurhula, Brian and Husby, M Erii and Kamat, Asha and Kanga, Ben and Kashin, Seva and Khazanovich, Dmitry and Kisner, Peter and Lance, Krista and Lara, Marcia and Lee, William and Lennon, Niall and Letendre, Frances and LeVine, Rosie and Lipovsky, Alex and Liu, Xiaohong and Liu, Jinlei and Liu, Shangtao and Lokyitsang, Tashi and Lokyitsang, Yeshi and Lubonja, Rakela and Lui, Annie and MacDonald, Pen and Magnisalis, Vasilia and Maru, Kebede and Matthews, Charles and McCusker, William and McDonough, Susan and Mehta, Teena and Meldrim, James and Meneus, Louis and Mihai, Oana and Mihalev, Atanas and Mihova, Tanya and Mittelman, Rachel and Mlenga, Valentine and Montmayeur, Anna and Mulrain, Leonidas and Navidi, Adam and Naylor, Jerome and Negash, Tamrat and Nguyen, Thu and Nguyen, Nga and Nicol, Robert and Norbu, Choe and Norbu, Nyima and Novod, Nathaniel and O{\textquoteright}Neill, Barry and Osman, Sahal and Markiewicz, Eva and Oyono, Otero L and Patti, Christopher and Phunkhang, Pema and Pierre, Fritz and Priest, Margaret and Raghuraman, Sujaa and Rege, Filip and Reyes, Rebecca and Rise, Cecil and Rogov, Peter and Ross, Keenan and Ryan, Elizabeth and Settipalli, Sampath and Shea, Terry and Sherpa, Ngawang and Shi, Lu and Shih, Diana and Sparrow, Todd and Spaulding, Jessica and Stalker, 
John and Stange-Thomann, Nicole and Stavropoulos, Sharon and Stone, Catherine and Strader, Christopher and Tesfaye, Senait and Thomson, Talene and Thoulutsang, Yama and Thoulutsang, Dawa and Topham, Kerri and Topping, Ira and Tsamla, Tsamla and Vassiliev, Helen and Vo, Andy and Wangchuk, Tsering and Wangdi, Tsering and Weiand, Michael and Wilkinson, Jane and Wilson, Adam and Yadav, Shailendra and Young, Geneva and Yu, Qing and Zembek, Lisa and Zhong, Danni and Zimmer, Andrew and Zwirko, Zac and Jaffe, David B and Alvarez, Pablo and Brockman, Will and Butler, Jonathan and Chin, CheeWhye and Gnerre, Sante and Grabherr, Manfred and Kleber, Michael and Mauceli, Evan and MacCallum, Iain} }

% Entry 38530: the DOI is stored in "doi" (not "type") and the PubMed link in "url" (not "note").
@article{38530,
  author   = {Selengut, J. and Haft, Daniel H. and Davidsen, Tanja and Ganapathy, Anurhada and Gwinn-Giglio, Michelle and Nelson, William C. and Richter, R. Alexander and White, Owen},
  title    = {{TIGRFAMs} and {Genome Properties}: tools for the assignment of molecular function and biological process in prokaryotic genomes},
  journal  = {Nucleic Acids Research},
  volume   = {35},
  year     = {2007},
  doi      = {10.1093/nar/gkl1043},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/17151080?dopt=Abstract},
  abstract = {TIGRFAMs is a collection of protein family definitions built to aid in high-throughput annotation of specific protein functions. Each family is based on a hidden Markov model (HMM), where both cutoff scores and membership in the seed alignment are chosen so that the HMMs can classify numerous proteins according to their specific molecular functions. Most TIGRFAMs models describe {\textquoteleft}equivalog{\textquoteright} families, where both orthology and lateral gene transfer may be part of the evolutionary history, but where a single molecular function has been conserved. The Genome Properties system contains a queriable set of metabolic reconstructions, genome metrics and extractions of information from the scientific literature. Its genome-by-genome assertions of whether or not specific structures, pathways or systems are present provide high-level conceptual descriptions of genomic content. These assertions enable comparative genomics, provide a meaningful biological context to aid in manual annotation, support assignments of Gene Ontology (GO) biological process terms and help validate HMM-based predictions of protein function. The Genome Properties system is particularly useful as a generator of phylogenetic profiles, through which new protein family functions may be discovered. The TIGRFAMs and Genome Properties systems can be accessed at http://www.tigr.org/TIGRFAMs and http://www.tigr.org/Genome_Properties.},
  keywords = {Archaeal Proteins, Bacterial Proteins, Databases, Protein, Genome, Bacterial, Genomics, Internet, Phylogeny, software, User-Computer Interface}
}

% Entry 38161: a second, initial-less copy of the author "Eisen" was dropped as an
% apparent export duplicate -- TODO confirm against the publisher's author list.
@article{38161,
  author   = {Dunning Hotopp, Julie C. and Lin, Mingqun and Madupu, Ramana and Crabtree, Jonathan and Angiuoli, Samuel V. and Eisen, Jonathan A. and Seshadri, Rekha and Ren, Qinghu and Wu, Martin and Utterback, Teresa R. and Smith, Shannon and Lewis, Matthew and Khouri, Hoda and Zhang, Chunbin and Niu, Hua and Lin, Quan and Ohashi, Norio and Zhi, Ning and Nelson, William and Brinkac, Lauren M. and Dodson, Robert J. and Rosovitz, M. J. and Sundaram, Jaideep and Daugherty, Sean C. and Davidsen, Tanja and Durkin, Anthony S. and Gwinn, Michelle and Haft, Daniel H. and Selengut, J. and Sullivan, Steven A. and Zafar, Nikhat and Zhou, Liwei and Benahmed, Faiza and Forberger, Heather and Halpin, Rebecca and Mulligan, Stephanie and Robinson, Jeffrey and White, Owen and Rikihisa, Yasuko and Tettelin, Herv{\'e}},
  title    = {Comparative genomics of emerging human ehrlichiosis agents},
  journal  = {PLoS Genetics},
  volume   = {2},
  year     = {2006},
  doi      = {10.1371/journal.pgen.0020021},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/16482227?dopt=Abstract},
  abstract = {Anaplasma (formerly Ehrlichia) phagocytophilum, Ehrlichia chaffeensis, and Neorickettsia (formerly Ehrlichia) sennetsu are intracellular vector-borne pathogens that cause human ehrlichiosis, an emerging infectious disease. We present the complete genome sequences of these organisms along with comparisons to other organisms in the Rickettsiales order. Ehrlichia spp. and Anaplasma spp. display a unique large expansion of immunodominant outer membrane proteins facilitating antigenic variation. All Rickettsiales have a diminished ability to synthesize amino acids compared to their closest free-living relatives. Unlike members of the Rickettsiaceae family, these pathogenic Anaplasmataceae are capable of making all major vitamins, cofactors, and nucleotides, which could confer a beneficial role in the invertebrate vector or the vertebrate host. Further analysis identified proteins potentially involved in vacuole confinement of the Anaplasmataceae, a life cycle involving a hematophagous vector, vertebrate pathogenesis, human pathogenesis, and lack of transovarial transmission. These discoveries provide significant insights into the biology of these obligate intracellular pathogens.},
  keywords = {Animals, Biotin, DNA Repair, Ehrlichia, Ehrlichiosis, Genome, Genomics, HUMANS, Models, Biological, Phylogeny, Rickettsia, Ticks}
}

% Entry 38294: journal name de-duplicated; DOI and PubMed link moved to their own fields.
@article{38294,
  author   = {Haft, Daniel H. and Selengut, J. and Brinkac, Lauren M. and Zafar, Nikhat and White, Owen},
  title    = {{Genome Properties}: a system for the investigation of prokaryotic genetic content for microbiology, genome annotation and comparative genomics},
  journal  = {Bioinformatics (Oxford, England)},
  volume   = {21},
  year     = {2005},
  doi      = {10.1093/bioinformatics/bti015},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/15347579?dopt=Abstract},
  abstract = {MOTIVATION: The presence or absence of metabolic pathways and structures provide a context that makes protein annotation far more reliable. Compiling such information across microbial genomes improves the functional classification of proteins and provides a valuable resource for comparative genomics. RESULTS: We have created a Genome Properties system to present key aspects of prokaryotic biology using standardized computational methods and controlled vocabularies. Properties reflect gene content, phenotype, phylogeny and computational analyses. The results of searches using hidden Markov models allow many properties to be deduced automatically, especially for families of proteins (equivalogs) conserved in function since their last common ancestor. Additional properties are derived from curation, published reports and other forms of evidence. Genome Properties system was applied to 156 complete prokaryotic genomes, and is easily mined to find differences between species, correlations between metabolic features and families of uncharacterized proteins, or relationships among properties. AVAILABILITY: Genome Properties can be found at http://www.tigr.org/Genome_Properties SUPPLEMENTARY INFORMATION: http://www.tigr.org/tigr-scripts/CMR2/genome_properties_references.spl.},
  keywords = {Chromosome mapping, database management systems, Databases, Genetic, documentation, Gene Expression Profiling, Gene Expression Regulation, Genomics, Information Storage and Retrieval, Microbiological Techniques, natural language processing, Prokaryotic Cells, Proteome, signal transduction, software, User-Computer Interface, Vocabulary, Controlled}
}

% Entry 38138: 0018-067X is a serial number (ISSN), so it is stored in "issn" rather than "isbn".
@article{38138,
  author   = {Cummings, Michael P.},
  title    = {A book like its cover},
  journal  = {Heredity},
  volume   = {93},
  year     = {2004},
  issn     = {0018-067X},
  doi      = {10.1038/sj.hdy.6800475},
  abstract = {An official journal of the Genetics Society, Heredity publishes high-quality articles describing original research and theoretical insights in all areas of genetics. Research papers are complimented by News \& Commentary articles and reviews, keeping researchers and students abreast of hot topics in the field.},
  keywords = {animal and plant breeding, biometrical and statistical genetics, cytogenetics, ecological, eukaryotes, Genetics, Genomics, human population genetics, population and evolutionary genetics, post-genomics}
}

% Entry 49635: month uses the standard macro; the abbreviated page range 183-91 is written out in full.
@article{49635,
  author   = {Ghedin, Elodie and Bringaud, Frederic and Peterson, Jeremy and Myler, Peter and Berriman, Matthew and Ivens, Alasdair and Andersson, Bj{\"o}rn and Bontempi, Esteban and Eisen, Jonathan and Angiuoli, Sam and Wanless, David and Von Arx, Anna and Murphy, Lee and Lennard, Nicola and Salzberg, Steven and Adams, Mark D and White, Owen and Hall, Neil and Stuart, Kenneth and Fraser, Claire M and el-Sayed, Najib M A},
  title    = {Gene synteny and evolution of genome architecture in trypanosomatids},
  journal  = {Molecular and Biochemical Parasitology},
  volume   = {134},
  year     = {2004},
  month    = apr,
  pages    = {183--191},
  issn     = {0166-6851},
  doi      = {10.1016/j.molbiopara.2003.11.012},
  abstract = {The trypanosomatid protozoa Trypanosoma brucei, Trypanosoma cruzi and Leishmania major are related human pathogens that cause markedly distinct diseases. Using information from genome sequencing projects currently underway, we have compared the sequences of large chromosomal fragments from each species. Despite high levels of divergence at the sequence level, these three species exhibit a striking conservation of gene order, suggesting that selection has maintained gene order among the trypanosomatids over hundreds of millions of years of evolution. The few sites of genome rearrangement between these species are marked by the presence of retrotransposon-like elements, suggesting that retrotransposons may have played an important role in shaping trypanosomatid genome organization. A degenerate retroelement was identified in L. major by examining the regions near breakage points of the synteny. This is the first such element found in L. major suggesting that retroelements were found in the common ancestor of all three species.},
  keywords = {Animals, Computational Biology, Evolution, Molecular, Gene Order, Genome, Protozoan, Genomics, Leishmania major, Multigene Family, Recombination, Genetic, Retroelements, Selection, Genetic, Synteny, Trypanosoma brucei brucei, Trypanosoma cruzi, Trypanosomatina}
}

% Entry 38480: the exported identifier "16/j.parint.2004.01.009" had lost its "10.10" prefix;
% the full DOI is restored in "doi". 1383-5769 is an ISSN, so it is stored in "issn".
@article{38480,
  author   = {LoVerde, Philip T. and Hirai, Hirohisa and Merrick, Joseph M. and Lee, Norman H. and El-Sayed, Najib M.},
  title    = {{Schistosoma} mansoni genome project: an update},
  journal  = {Parasitology International},
  volume   = {53},
  year     = {2004},
  issn     = {1383-5769},
  doi      = {10.1016/j.parint.2004.01.009},
  abstract = {A schistosome genome project was initiated by the World Health Organization in 1994 with the notion that the best prospects for identifying new targets for drugs, vaccines, and diagnostic development lie in schistosome gene discovery, development of chromosome maps, whole genome sequencing and genome analysis. Schistosoma mansoni has a haploid genome of 270 Mb contained on 8 pairs of chromosomes. It is estimated that the S. mansoni genome contains between 15~000 and 25~000 genes. There are approximately 16~689 ESTs obtained from diverse libraries representing different developmental stages of S. mansoni, deposited in the NCBI EST database. More than half of the deposited sequences correspond to genes of unknown function. Approximately 40-50\% of the sequences form unique clusters, suggesting that approximately 20-25\% of the total schistosome genes have been discovered. Efforts to develop low resolution chromosome maps are in progress. There is a genome sequencing program underway that will provide 3X sequence coverage of the S. mansoni genome that will result in approximately 95\% gene discovery. The genomics era has provided the resources to usher in the era of functional genomics that will involve microarrays to focus on specific metabolic pathways, proteomics to identify relevant proteins and protein-protein interactions to understand critical parasite pathways. Functional genomics is expected to accelerate the development of control and treatment strategies for schistosomiasis.},
  keywords = {Chromosome mapping, Gene discovery, Genomics, Schistosoma mansoni}
}

% Entry 38574: journal name de-duplicated; DOI and PubMed link moved to their own fields.
@article{38574,
  author   = {Nelson, Karen E. and Fouts, Derrick E. and Mongodin, Emmanuel F. and Ravel, Jacques and DeBoy, Robert T. and Kolonay, James F. and Rasko, David A. and Angiuoli, Samuel V. and Gill, Steven R. and Paulsen, Ian T. and Peterson, Jeremy and White, Owen and Nelson, William C. and Nierman, William and Beanan, Maureen J. and Brinkac, Lauren M. and Daugherty, Sean C. and Dodson, Robert J. and Durkin, A. Scott and Madupu, Ramana and Haft, Daniel H. and Selengut, J. and Van Aken, Susan and Khouri, Hoda and Fedorova, Nadia and Forberger, Heather and Tran, Bao and Kathariou, Sophia and Wonderling, Laura D. and Uhlich, Gaylen A. and Bayles, Darrell O. and Luchansky, John B. and Fraser, Claire M.},
  title    = {Whole genome comparisons of serotype 4b and 1/2a strains of the food-borne pathogen {Listeria} monocytogenes reveal new insights into the core genome components of this species},
  journal  = {Nucleic Acids Research},
  volume   = {32},
  year     = {2004},
  doi      = {10.1093/nar/gkh562},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/15115801?dopt=Abstract},
  abstract = {The genomes of three strains of Listeria monocytogenes that have been associated with food-borne illness in the USA were subjected to whole genome comparative analysis. A total of 51, 97 and 69 strain-specific genes were identified in L.monocytogenes strains F2365 (serotype 4b, cheese isolate), F6854 (serotype 1/2a, frankfurter isolate) and H7858 (serotype 4b, meat isolate), respectively. Eighty-three genes were restricted to serotype 1/2a and 51 to serotype 4b strains. These strain- and serotype-specific genes probably contribute to observed differences in pathogenicity, and the ability of the organisms to survive and grow in their respective environmental niches. The serotype 1/2a-specific genes include an operon that encodes the rhamnose biosynthetic pathway that is associated with teichoic acid biosynthesis, as well as operons for five glycosyl transferases and an adenine-specific DNA methyltransferase. A total of 8603 and 105 050 high quality single nucleotide polymorphisms (SNPs) were found on the draft genome sequences of strain H7858 and strain F6854, respectively, when compared with strain F2365. Whole genome comparative analyses revealed that the L.monocytogenes genomes are essentially syntenic, with the majority of genomic differences consisting of phage insertions, transposable elements and SNPs.},
  keywords = {Base Composition, Chromosomes, Bacterial, DNA Transposable Elements, Food Microbiology, Genes, Bacterial, Genome, Bacterial, Genomics, Listeria monocytogenes, Meat, Open Reading Frames, Physical Chromosome Mapping, Polymorphism, Single Nucleotide, Prophages, Serotyping, Species Specificity, Synteny, virulence}
}

% Entry 38218: a single authored conference paper, so it is typed as "inproceedings"
% (standard styles ignore the author field on a "proceedings" entry).
% NOTE(review): booktitle is inferred from the DOI prefix (ICME.2003) -- confirm
% the exact proceedings title against the publisher record. The exported
% month value "2003" was not a month and has been dropped.
@inproceedings{38218,
  author    = {Hochheiser, H. and Baehrecke, E. H. and Mount, Stephen M. and Shneiderman, Ben},
  title     = {Dynamic querying for pattern identification in microarray and genomic data},
  booktitle = {2003 International Conference on Multimedia and Expo ({ICME} '03)},
  volume    = {3},
  year      = {2003},
  publisher = {IEEE},
  isbn      = {0-7803-7965-9},
  doi       = {10.1109/ICME.2003.1221346},
  abstract  = {Data sets involving linear ordered sequences are a recurring theme in bioinformatics. Dynamic query tools that support exploration of these data sets can be useful for identifying patterns of interest. This paper describes the use of one such tool - timesearcher - to interactively explore linear sequence data sets taken from two bioinformatics problems. Microarray time course data sets involve expression levels for large numbers of genes over multiple time points. Timesearcher can be used to interactively search these data sets for genes with expression profiles of interest. The occurrence frequencies of short sequences of DNA in aligned exons can be used to identify sequences that play a role in the pre-mRNA splicing. Timesearcher can be used to search these data sets for candidate splicing signals.},
  keywords  = {Bioinformatics, data sets, Displays, dynamic querying, expression profiles, Frequency, Gene expression, genes, Genetics, genomic data, Genomics, linear ordered sequences, macromolecules, medical signal processing, Mice, Microarray, pattern identification, pattern recognition, premRNA splicing, Query processing, sequences, Signal processing, splicing, TimeSearcher}
}

% Entry 49689: month uses the standard macro; the abbreviated page range 788-92 is written out in full.
@article{49689,
  author   = {Mount, S M},
  title    = {Genomic sequence, splicing, and gene annotation},
  journal  = {American Journal of Human Genetics},
  volume   = {67},
  year     = {2000},
  month    = oct,
  pages    = {788--792},
  issn     = {0002-9297},
  doi      = {10.1086/303098},
  keywords = {Animals, Consensus Sequence, Exons, genes, Genome, Genomics, HUMANS, Nucleotides, Regulatory Sequences, Nucleic Acid, RNA Splice Sites, RNA Splicing, Untranslated Regions}
}