@article {49599, title = {BlindCall: ultra-fast base-calling of high-throughput sequencing data by blind deconvolution.}, volume = {30}, year = {2014}, month = {2014 May 1}, pages = {1214-9}, abstract = {

MOTIVATION: Base-calling of sequencing data produced by high-throughput sequencing platforms is a fundamental process in current bioinformatics analysis. However, existing third-party probabilistic or machine-learning methods that significantly improve the accuracy of base-calls on these platforms are impractical for production use due to their computational inefficiency.

RESULTS: We directly formulate base-calling as a blind deconvolution problem and implemented BlindCall as an efficient solver to this inverse problem. BlindCall produced base-calls at accuracy comparable to state-of-the-art probabilistic methods while processing data at rates 10 times faster in most cases. The computational complexity of BlindCall scales linearly with read length making it better suited for new long-read sequencing technologies.

}, keywords = {algorithms, High-Throughput Nucleotide Sequencing, HUMANS, Probability, Reproducibility of Results, Sequence Analysis, DNA, software, Time factors}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btu010}, author = {Ye, Chengxi and Hsiao, Chiaowen and Corrada Bravo, Hector} } @article {49602, title = {Epiviz: interactive visual analytics for functional genomics data.}, volume = {11}, year = {2014}, month = {2014 Sep}, pages = {938-40}, abstract = {

Visualization is an integral aspect of genomics data analysis. Algorithmic-statistical analysis and interactive visualization are most effective when used iteratively. Epiviz (http://epiviz.cbcb.umd.edu/), a web-based genome browser, and the Epivizr Bioconductor package allow interactive, extensible and reproducible visualization within a state-of-the-art data-analysis platform.

}, keywords = {algorithms, Chromosome mapping, Data Mining, database management systems, Databases, Genetic, Genomics, Internet, software, User-Computer Interface}, issn = {1548-7105}, doi = {10.1038/nmeth.3038}, author = {Chelaru, Florin and Smith, Llewellyn and Goldstein, Naomi and Bravo, H{\'e}ctor Corrada} } @article {49605, title = {Minfi: a flexible and comprehensive Bioconductor package for the analysis of Infinium DNA methylation microarrays.}, volume = {30}, year = {2014}, month = {2014 May 15}, pages = {1363-9}, abstract = {

MOTIVATION: The recently released Infinium HumanMethylation450 array (the {\textquoteright}450k{\textquoteright} array) provides a high-throughput assay to quantify DNA methylation (DNAm) at \~{}450 000 loci across a range of genomic features. Although less comprehensive than high-throughput sequencing-based techniques, this product is more cost-effective and promises to be the most widely used DNAm high-throughput measurement technology over the next several years.

RESULTS: Here we describe a suite of computational tools that incorporate state-of-the-art statistical techniques for the analysis of DNAm data. The software is structured to easily adapt to future versions of the technology. We include methods for preprocessing, quality assessment and detection of differentially methylated regions from the kilobase to the megabase scale. We show how our software provides a powerful and flexible development platform for future methods. We also illustrate how our methods empower the technology to make discoveries previously thought to be possible only with sequencing-based methods.

AVAILABILITY AND IMPLEMENTATION: http://bioconductor.org/packages/release/bioc/html/minfi.html.

CONTACT: khansen@jhsph.edu; rafa@jimmy.harvard.edu

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.

}, keywords = {Aged, algorithms, Colonic Neoplasms, DNA Methylation, Genome, High-Throughput Nucleotide Sequencing, HUMANS, Oligonucleotide Array Sequence Analysis, Polymorphism, Single Nucleotide, software}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btu049}, author = {Aryee, Martin J and Jaffe, Andrew E and Corrada-Bravo, Hector and Ladd-Acosta, Christine and Feinberg, Andrew P and Hansen, Kasper D and Irizarry, Rafael A} } @article {49724, title = {Phenotype-based cell-specific metabolic modeling reveals metabolic liabilities of cancer.}, journal = {Elife}, volume = {3}, year = {2014}, month = {2014}, abstract = {

Utilizing molecular data to derive functional physiological models tailored for specific cancer cells can facilitate the use of individually tailored therapies. To this end we present an approach termed PRIME for generating cell-specific genome-scale metabolic models (GSMMs) based on molecular and phenotypic data. We build >280 models of normal and cancer cell-lines that successfully predict metabolic phenotypes in an individual manner. We utilize this set of cell-specific models to predict drug targets that selectively inhibit cancerous but not normal cell proliferation. The top predicted target, MLYCD, is experimentally validated and the metabolic effects of MLYCD depletion investigated. Furthermore, we tested cell-specific predicted responses to the inhibition of metabolic enzymes, and successfully inferred the prognosis of cancer patients based on their PRIME-derived individual GSMMs. These results lay a computational basis and a counterpart experimental proof of concept for future personalized metabolic modeling applications, enhancing the search for novel selective anticancer therapies.

}, keywords = {algorithms, Antineoplastic Agents, Biomarkers, Tumor, Carboxy-Lyases, Cell Line, Tumor, Cell Proliferation, Citric Acid Cycle, Fatty Acids, Gene Knockdown Techniques, Genome, Human, HUMANS, Lymphocytes, Models, Biological, Neoplasms, Oxidation-Reduction, PHENOTYPE, Precision Medicine}, issn = {2050-084X}, doi = {10.7554/eLife.03641}, author = {Yizhak, Keren and Gaude, Edoardo and Le D{\'e}v{\'e}dec, Sylvia and Waldman, Yedael Y and Stein, Gideon Y and van de Water, Bob and Frezza, Christian and Ruppin, Eytan} } @article {49572, title = {Sailfish enables alignment-free isoform quantification from RNA-seq reads using lightweight algorithms.}, volume = {32}, year = {2014}, month = {2014 May}, pages = {462-4}, abstract = {

We introduce Sailfish, a computational method for quantifying the abundance of previously annotated RNA isoforms from RNA-seq data. Because Sailfish entirely avoids mapping reads, a time-consuming step in all current methods, it provides quantification estimates much faster than do existing approaches (typically 20 times faster) without loss of accuracy. By facilitating frequent reanalysis of data and reducing the need to optimize parameters, Sailfish exemplifies the potential of lightweight algorithms for efficiently processing sequencing reads.

}, keywords = {algorithms, Brain Chemistry, Computational Biology, HUMANS, Models, Biological, RNA Isoforms, Sequence Analysis, RNA, software}, issn = {1546-1696}, doi = {10.1038/nbt.2862}, author = {Patro, Rob and Mount, Stephen M and Kingsford, Carl} } @article {38421, title = {The partitioned LASSO-patternsearch algorithm with application to gene expression data}, journal = {BMC bioinformaticsBMC Bioinformatics}, volume = {13}, year = {2012}, note = {http://www.ncbi.nlm.nih.gov/pubmed/22587526?dopt=Abstract}, type = {10.1186/1471-2105-13-98}, abstract = {BACKGROUND: In systems biology, the task of reverse engineering gene pathways from data has been limited not just by the curse of dimensionality (the interaction space is huge) but also by systematic error in the data. The gene expression barcode reduces spurious association driven by batch effects and probe effects. The binary nature of the resulting expression calls lends itself perfectly to modern regularization approaches that thrive in high-dimensional settings. RESULTS: The Partitioned LASSO-Patternsearch algorithm is proposed to identify patterns of multiple dichotomous risk factors for outcomes of interest in genomic studies. A partitioning scheme is used to identify promising patterns by solving many LASSO-Patternsearch subproblems in parallel. All variables that survive this stage proceed to an aggregation stage where the most significant patterns are identified by solving a reduced LASSO-Patternsearch problem in just these variables. This approach was applied to genetic data sets with expression levels dichotomized by gene expression bar code. Most of the genes and second-order interactions thus selected and are known to be related to the outcomes. CONCLUSIONS: We demonstrate with simulations and data analyses that the proposed method not only selects variables and patterns more accurately, but also provides smaller models with better prediction accuracy, in comparison to several alternative methodologies.}, keywords = {algorithms, Breast Neoplasms, Computer simulation, Female, Gene expression, Gene Expression Profiling, Genomics, HUMANS, Models, Genetic}, author = {Shi, Weiliang and Wahba, Grace and Irizarry, Rafael A. and H{\'e}ctor Corrada Bravo and Wright, Stephen J.} } @article {49744, title = {Accurate proteome-wide protein quantification from high-resolution 15N mass spectra.}, journal = {Genome Biol}, volume = {12}, year = {2011}, month = {2011}, pages = {R122}, abstract = {

In quantitative mass spectrometry-based proteomics, the metabolic incorporation of a single source of 15N-labeled nitrogen has many advantages over using stable isotope-labeled amino acids. However, the lack of a robust computational framework for analyzing the resulting spectra has impeded wide use of this approach. We have addressed this challenge by introducing a new computational methodology for analyzing 15N spectra in which quantification is integrated with identification. Application of this method to an Escherichia coli growth transition reveals significant improvement in quantification accuracy over previous methods.

}, keywords = {algorithms, Amino Acid Sequence, Bacterial Proteins, Escherichia coli, Isotope Labeling, Mass Spectrometry, Molecular Sequence Data, Nitrogen Isotopes, Proteome, proteomics, Sensitivity and Specificity, software}, issn = {1474-760X}, doi = {10.1186/gb-2011-12-12-r122}, author = {Khan, Zia and Amini, Sasan and Bloom, Joshua S and Ruse, Cristian and Caudy, Amy A and Kruglyak, Leonid and Singh, Mona and Perlman, David H and Tavazoie, Saeed} } @article {49777, title = {ProPhylo: partial phylogenetic profiling to guide protein family construction and assignment of biological process.}, journal = {BMC Bioinformatics}, volume = {12}, year = {2011}, month = {2011}, pages = {434}, abstract = {

BACKGROUND: Phylogenetic profiling is a technique of scoring co-occurrence between a protein family and some other trait, usually another protein family, across a set of taxonomic groups. In spite of several refinements in recent years, the technique still invites significant improvement. To be its most effective, a phylogenetic profiling algorithm must be able to examine co-occurrences among protein families whose boundaries are uncertain within large homologous protein superfamilies.

RESULTS: Partial Phylogenetic Profiling (PPP) is an iterative algorithm that scores a given taxonomic profile against the taxonomic distribution of families for all proteins in a genome. The method works through optimizing the boundary of each protein family, rather than by relying on prebuilt protein families or fixed sequence similarity thresholds. Double Partial Phylogenetic Profiling (DPPP) is a related procedure that begins with a single sequence and searches for optimal granularities for its surrounding protein family in order to generate the best query profiles for PPP. We present ProPhylo, a high-performance software package for phylogenetic profiling studies through creating individually optimized protein family boundaries. ProPhylo provides precomputed databases for immediate use and tools for manipulating the taxonomic profiles used as queries.

CONCLUSION: ProPhylo results show universal markers of methanogenesis, a new DNA phosphorothioation-dependent restriction enzyme, and efficacy in guiding protein family construction. The software and the associated databases are freely available under the open source Perl Artistic License from ftp://ftp.jcvi.org/pub/data/ppp/.

}, keywords = {algorithms, Archaea, Archaeal Proteins, DNA, Methane, Phylogeny, software}, issn = {1471-2105}, doi = {10.1186/1471-2105-12-434}, author = {Basu, Malay K and Selengut, Jeremy D and Haft, Daniel H} } @article {38452, title = {ProPhylo: partial phylogenetic profiling to guide protein family construction and assignment of biological process}, journal = {BMC bioinformaticsBMC Bioinformatics}, volume = {12}, year = {2011}, note = {http://www.ncbi.nlm.nih.gov/pubmed/22070167?dopt=Abstract}, type = {10.1186/1471-2105-12-434}, abstract = {BACKGROUND: Phylogenetic profiling is a technique of scoring co-occurrence between a protein family and some other trait, usually another protein family, across a set of taxonomic groups. In spite of several refinements in recent years, the technique still invites significant improvement. To be its most effective, a phylogenetic profiling algorithm must be able to examine co-occurrences among protein families whose boundaries are uncertain within large homologous protein superfamilies. RESULTS: Partial Phylogenetic Profiling (PPP) is an iterative algorithm that scores a given taxonomic profile against the taxonomic distribution of families for all proteins in a genome. The method works through optimizing the boundary of each protein family, rather than by relying on prebuilt protein families or fixed sequence similarity thresholds. Double Partial Phylogenetic Profiling (DPPP) is a related procedure that begins with a single sequence and searches for optimal granularities for its surrounding protein family in order to generate the best query profiles for PPP. We present ProPhylo, a high-performance software package for phylogenetic profiling studies through creating individually optimized protein family boundaries. ProPhylo provides precomputed databases for immediate use and tools for manipulating the taxonomic profiles used as queries. CONCLUSION: ProPhylo results show universal markers of methanogenesis, a new DNA phosphorothioation-dependent restriction enzyme, and efficacy in guiding protein family construction. The software and the associated databases are freely available under the open source Perl Artistic License from ftp://ftp.jcvi.org/pub/data/ppp/.}, keywords = {algorithms, Archaea, Archaeal Proteins, DNA, Methane, Phylogeny, software}, author = {Basu, Malay K. and J. Selengut and Haft, Daniel H.} } @article {49779, title = {Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL): adapting the Partial Phylogenetic Profiling algorithm to scan sequences for signatures that predict protein function.}, journal = {BMC Bioinformatics}, volume = {11}, year = {2010}, month = {2010}, pages = {52}, abstract = {

BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets.

RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL "hot spots" onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. Neither method for training set construction requires any prior experimental characterization.

CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.

}, keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship}, issn = {1471-2105}, doi = {10.1186/1471-2105-11-52}, author = {Selengut, Jeremy D and Rusch, Douglas B and Haft, Daniel H} } @article {38506, title = {Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL): adapting the Partial Phylogenetic Profiling algorithm to scan sequences for signatures that predict protein function}, journal = {BMC bioinformaticsBMC Bioinformatics}, volume = {11}, year = {2010}, note = {http://www.ncbi.nlm.nih.gov/pubmed/20102603?dopt=Abstract}, type = {10.1186/1471-2105-11-52}, abstract = {BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets. RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL "hot spots" onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. Neither method for training set construction requires any prior experimental characterization. CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.}, keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship}, author = {J. Selengut and Rusch, Douglas B. and Haft, Daniel H.} } @article {49749, title = {Measuring differential gene expression by short read sequencing: quantitative comparison to 2-channel gene expression microarrays.}, journal = {BMC Genomics}, volume = {10}, year = {2009}, month = {2009}, pages = {221}, abstract = {

BACKGROUND: High-throughput cDNA synthesis and sequencing of poly(A)-enriched RNA is rapidly emerging as a technology competing to replace microarrays as a quantitative platform for measuring gene expression.

RESULTS: Consequently, we compared full length cDNA sequencing to 2-channel gene expression microarrays in the context of measuring differential gene expression. Because of its comparable cost to a gene expression microarray, our study focused on the data obtainable from a single lane of an Illumina 1 G sequencer. We compared sequencing data to a highly replicated microarray experiment profiling two divergent strains of S. cerevisiae.

CONCLUSION: Using a large number of quantitative PCR (qPCR) assays, more than previous studies, we found that neither technology is decisively better at measuring differential gene expression. Further, we report sequencing results from a diploid hybrid of two strains of S. cerevisiae that indicate full length cDNA sequencing can discover heterozygosity and measure quantitative allele-specific expression simultaneously.

}, keywords = {algorithms, DNA, Complementary, DNA, Fungal, Gene Expression Profiling, Oligonucleotide Array Sequence Analysis, Saccharomyces cerevisiae, sequence alignment, Sequence Analysis, DNA}, issn = {1471-2164}, doi = {10.1186/1471-2164-10-221}, author = {Bloom, Joshua S and Khan, Zia and Kruglyak, Leonid and Singh, Mona and Caudy, Amy A} } @article {49748, title = {A practical algorithm for finding maximal exact matches in large sequence datasets using sparse suffix arrays.}, journal = {Bioinformatics}, volume = {25}, year = {2009}, month = {2009 Jul 1}, pages = {1609-16}, abstract = {

MOTIVATION: High-throughput sequencing technologies place ever increasing demands on existing algorithms for sequence analysis. Algorithms for computing maximal exact matches (MEMs) between sequences appear in two contexts where high-throughput sequencing will vastly increase the volume of sequence data: (i) seeding alignments of high-throughput reads for genome assembly and (ii) designating anchor points for genome-genome comparisons.

RESULTS: We introduce a new algorithm for finding MEMs. The algorithm leverages a sparse suffix array (SA), a text index that stores every K-th position of the text. In contrast to a full text index that stores every position of the text, a sparse SA occupies much less memory. Even though we use a sparse index, the output of our algorithm is the same as a full text index algorithm as long as the space between the indexed suffixes is not greater than a minimum length of a MEM. By relying on partial matches and additional text scanning between indexed positions, the algorithm trades memory for extra computation. The reduced memory usage makes it possible to determine MEMs between significantly longer sequences.

AVAILABILITY: Source code for the algorithm is available under a BSD open source license at http://compbio.cs.princeton.edu/mems. The implementation can serve as a drop-in replacement for the MEMs algorithm in MUMmer 3.

}, keywords = {algorithms, Base Sequence, Genomics, sequence alignment, Sequence Analysis, DNA}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btp275}, author = {Khan, Zia and Bloom, Joshua S and Kruglyak, Leonid and Singh, Mona} } @article {49747, title = {Protein quantification across hundreds of experimental conditions.}, journal = {Proc Natl Acad Sci U S A}, volume = {106}, year = {2009}, month = {2009 Sep 15}, pages = {15544-8}, abstract = {

Quantitative studies of protein abundance rarely span more than a small number of experimental conditions and replicates. In contrast, quantitative studies of transcript abundance often span hundreds of experimental conditions and replicates. This situation exists, in part, because extracting quantitative data from large proteomics datasets is significantly more difficult than reading quantitative data from a gene expression microarray. To address this problem, we introduce two algorithmic advances in the processing of quantitative proteomics data. First, we use space-partitioning data structures to handle the large size of these datasets. Second, we introduce techniques that combine graph-theoretic algorithms with space-partitioning data structures to collect relative protein abundance data across hundreds of experimental conditions and replicates. We validate these algorithmic techniques by analyzing several datasets and computing both internal and external measures of quantification accuracy. We demonstrate the scalability of these techniques by applying them to a large dataset that comprises a total of 472 experimental conditions and replicates.

}, keywords = {algorithms, Animals, Automatic Data Processing, Chromatography, Liquid, Databases, Factual, Fungal Proteins, HUMANS, Isotopes, Mice, Proteins, proteomics, Tandem Mass Spectrometry}, issn = {1091-6490}, doi = {10.1073/pnas.0904100106}, author = {Khan, Zia and Bloom, Joshua S and Garcia, Benjamin A and Singh, Mona and Kruglyak, Leonid} } @article {49560, title = {MCMC data association and sparse factorization updating for real time multitarget tracking with merged and multiple measurements.}, volume = {28}, year = {2006}, month = {2006 Dec}, pages = {1960-72}, abstract = {

In several multitarget tracking applications, a target may return more than one measurement per target and interacting targets may return multiple merged measurements between targets. Existing algorithms for tracking and data association, initially applied to radar tracking, do not adequately address these types of measurements. Here, we introduce a probabilistic model for interacting targets that addresses both types of measurements simultaneously. We provide an algorithm for approximate inference in this model using a Markov chain Monte Carlo (MCMC)-based auxiliary variable particle filter. We Rao-Blackwellize the Markov chain to eliminate sampling over the continuous state space of the targets. A major contribution of this work is the use of sparse least squares updating and downdating techniques, which significantly reduce the computational cost per iteration of the Markov chain. Also, when combined with a simple heuristic, they enable the algorithm to correctly focus computation on interacting targets. We include experimental results on a challenging simulation sequence. We test the accuracy of the algorithm using two sensor modalities, video, and laser range data. We also show the algorithm exhibits real time performance on a conventional PC.

}, keywords = {algorithms, Artificial Intelligence, Image Enhancement, Image Interpretation, Computer-Assisted, Information Storage and Retrieval, Movement, Pattern Recognition, Automated, Reproducibility of Results, Sensitivity and Specificity, Subtraction Technique}, issn = {0162-8828}, doi = {10.1109/TPAMI.2006.247}, author = {Khan, Zia and Balch, Tucker and Dellaert, Frank} } @article {49562, title = {MCMC-based particle filtering for tracking a variable number of interacting targets.}, volume = {27}, year = {2005}, month = {2005 Nov}, pages = {1805-19}, abstract = {

We describe a particle filter that effectively deals with interacting targets--targets that are influenced by the proximity and/or behavior of other targets. The particle filter includes a Markov random field (MRF) motion prior that helps maintain the identity of targets throughout an interaction, significantly reducing tracker failures. We show that this MRF prior can be easily implemented by including an additional interaction factor in the importance weights of the particle filter. However, the computational requirements of the resulting multitarget filter render it unusable for large numbers of targets. Consequently, we replace the traditional importance sampling step in the particle filter with a novel Markov chain Monte Carlo (MCMC) sampling step to obtain a more efficient MCMC-based multitarget filter. We also show how to extend this MCMC-based filter to address a variable number of interacting targets. Finally, we present both qualitative and quantitative experimental results, demonstrating that the resulting particle filters deal efficiently and effectively with complicated target interactions.

}, keywords = {algorithms, Animals, Artificial Intelligence, Computer simulation, HUMANS, Image Enhancement, Image Interpretation, Computer-Assisted, Information Storage and Retrieval, Markov chains, Models, Biological, Models, Statistical, Monte Carlo Method, Motion, Movement, Pattern Recognition, Automated, Subtraction Technique, Video Recording}, issn = {0162-8828}, doi = {10.1109/TPAMI.2005.223}, author = {Khan, Zia and Balch, Tucker and Dellaert, Frank} } @article {49683, title = {Reducing storage requirements for biological sequence comparison.}, journal = {Bioinformatics}, volume = {20}, year = {2004}, month = {2004 Dec 12}, pages = {3363-9}, abstract = {

MOTIVATION: Comparison of nucleic acid and protein sequences is a fundamental tool of modern bioinformatics. A dominant method of such string matching is the {\textquoteright}seed-and-extend{\textquoteright} approach, in which occurrences of short subsequences called {\textquoteright}seeds{\textquoteright} are used to search for potentially longer matches in a large database of sequences. Each such potential match is then checked to see if it extends beyond the seed. To be effective, the seed-and-extend approach needs to catalogue seeds from virtually every substring in the database of search strings. Projects such as mammalian genome assemblies and large-scale protein matching, however, have such large sequence databases that the resulting list of seeds cannot be stored in RAM on a single computer. This significantly slows the matching process.

RESULTS: We present a simple and elegant method in which only a small fraction of seeds, called {\textquoteright}minimizers{\textquoteright}, needs to be stored. Using minimizers can speed up string-matching computations by a large factor while missing only a small fraction of the matches found using all seeds.

}, keywords = {algorithms, Databases, Genetic, Information Storage and Retrieval, Numerical Analysis, Computer-Assisted, sequence alignment, Sequence Analysis}, issn = {1367-4803}, doi = {10.1093/bioinformatics/bth408}, author = {Roberts, Michael and Hayes, Wayne and Hunt, Brian R and Mount, Stephen M and Yorke, James A} } @article {49685, title = {Improving the Arabidopsis genome annotation using maximal transcript alignment assemblies.}, journal = {Nucleic Acids Res}, volume = {31}, year = {2003}, month = {2003 Oct 1}, pages = {5654-66}, abstract = {

The spliced alignment of expressed sequence data to genomic sequence has proven a key tool in the comprehensive annotation of genes in eukaryotic genomes. A novel algorithm was developed to assemble clusters of overlapping transcript alignments (ESTs and full-length cDNAs) into maximal alignment assemblies, thereby comprehensively incorporating all available transcript data and capturing subtle splicing variations. Complete and partial gene structures identified by this method were used to improve The Institute for Genomic Research Arabidopsis genome annotation (TIGR release v.4.0). The alignment assemblies permitted the automated modeling of several novel genes and >1000 alternative splicing variations as well as updates (including UTR annotations) to nearly half of the approximately 27 000 annotated protein coding genes. The algorithm of the Program to Assemble Spliced Alignments (PASA) tool is described, as well as the results of automated updates to Arabidopsis gene annotations.

}, keywords = {algorithms, Alternative Splicing, Arabidopsis, DNA, Complementary, Expressed Sequence Tags, Genome, Plant, Introns, Plant Proteins, RNA, Plant, sequence alignment, software, Transcription, Genetic, Untranslated Regions}, issn = {1362-4962}, author = {Haas, Brian J and Delcher, Arthur L and Mount, Stephen M and Wortman, Jennifer R and Smith, Roger K and Hannick, Linda I and Maiti, Rama and Ronning, Catherine M and Rusch, Douglas B and Town, Christopher D and Salzberg, Steven L and White, Owen} }