@article {49606,
	title = {Orchestrating high-throughput genomic analysis with {Bioconductor}},
	journal = {Nature Methods},
	volume = {12},
	year = {2015},
	month = feb,
	pages = {115--121},
	abstract = {Bioconductor is an open-source, open-development software project for the analysis and comprehension of high-throughput data in genomics and molecular biology. The project aims to enable interdisciplinary research, collaboration and rapid development of scientific software. Based on the statistical programming language R, Bioconductor comprises 934 interoperable packages contributed by a large, diverse community of scientists. Packages cover a range of bioinformatic and statistical applications. They undergo formal initial review and continuous automated testing. We present an overview for prospective users and contributors.},
	keywords = {Computational Biology, Gene Expression Profiling, Genomics, High-Throughput Screening Assays, Programming Languages, software, User-Computer Interface},
	issn = {1548-7105},
	doi = {10.1038/nmeth.3252},
	author = {Huber, Wolfgang and Carey, Vincent J and Gentleman, Robert and Anders, Simon and Carlson, Marc and Carvalho, Benilton S and Corrada Bravo, H{\'e}ctor and Davis, Sean and Gatto, Laurent and Girke, Thomas and Gottardo, Raphael and Hahne, Florian and Hansen, Kasper D and Irizarry, Rafael A and Lawrence, Michael and Love, Michael I and MacDonald, James and Obenchain, Valerie and Ole{\'s}, Andrzej K and Pag{\`e}s, Herv{\'e} and Reyes, Alejandro and Shannon, Paul and Smyth, Gordon K and Tenenbaum, Dan and Waldron, Levi and Morgan, Martin}
}
@article {38582,
	title = {Derepression of {Cancer/testis} antigens in cancer is associated with distinct patterns of {DNA} hypomethylation},
	journal = {BMC Cancer},
	volume = {13},
	year = {2013},
	month = mar,
	pages = {144},
	abstract = {BACKGROUND: The Cancer/Testis Antigens (CTAs) are a heterogeneous group of proteins whose expression is typically restricted to the testis. However, they are aberrantly expressed in most cancers that have been examined to date. Broadly speaking, the CTAs can be divided into two groups: the CTX antigens that are encoded by the X-linked genes and the non-X CT antigens that are encoded by the autosomes. Unlike the non-X CTAs, the CTX antigens form clusters of closely related gene families and their expression is frequently associated with advanced disease with poorer prognosis. Regardless however, the mechanism(s) underlying their selective derepression and stage-specific expression in cancer remain poorly understood, although promoter DNA demethylation is believed to be the major driver. METHODS: Here, we report a systematic analysis of DNA methylation profiling data from various tissue types to elucidate the mechanism underlying the derepression of the CTAs in cancer. We analyzed the methylation profiles of 501 samples including sperm, several cancer types, and their corresponding normal somatic tissue types. RESULTS: We found strong evidence for specific DNA hypomethylation of CTA promoters in the testis and cancer cells but not in their normal somatic counterparts. We also found that hypomethylation was clustered on the genome into domains that coincided with nuclear lamina-associated domains (LADs) and that these regions appeared to be insulated by CTCF sites. Interestingly, we did not observe any significant differences in the hypomethylation pattern between the CTAs without CpG islands and the CTAs with CpG islands in the proximal promoter. CONCLUSION: Our results corroborate that widespread DNA hypomethylation appears to be the driver in the derepression of CTA expression in cancer and furthermore, demonstrate that these hypomethylated domains are associated with the nuclear lamina-associated domains (LADS). 
Taken together, our results suggest that wide-spread methylation changes in cancer are linked to derepression of germ-line-specific genes that is orchestrated by the three dimensional organization of the cancer genome.},
	keywords = {*DNA Methylation, *Gene Expression Regulation, Neoplastic, *Genes, X-Linked, Antigens, Neoplasm/*genetics, Binding Sites, Cluster Analysis, CpG Islands, Gene Expression Profiling, HUMANS, Male, Neoplasms/*genetics/*metabolism, Promoter Regions, Genetic, Protein Binding, Protein Interaction Domains and Motifs, Testis/*metabolism},
	issn = {1471-2407},
	doi = {10.1186/1471-2407-13-144},
	annote = {Language: eng. Grants: R01 GM100335/GM/NIGMS NIH HHS; R01GM100335/GM/NIGMS NIH HHS. Publication types: Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov{\textquoteright}t. Country: England. Record date: 2013/03/26 06:00. Citation: BMC Cancer. 2013 Mar 22;13:144.},
	author = {Kim, Robert and Kulkarni, Prakash and Hannenhalli, Sridhar}
}
@article {38276,
	title = {Gene expression anti-profiles as a basis for accurate universal cancer signatures},
	journal = {BMC Bioinformatics},
	volume = {13},
	year = {2012},
	pages = {272},
	doi = {10.1186/1471-2105-13-272},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/23088656?dopt=Abstract},
	abstract = {BACKGROUND: Early screening for cancer is arguably one of the greatest public health advances over the last fifty years. However, many cancer screening tests are invasive (digital rectal exams), expensive (mammograms, imaging) or both (colonoscopies). This has spurred growing interest in developing genomic signatures that can be used for cancer diagnosis and prognosis. However, progress has been slowed by heterogeneity in cancer profiles and the lack of effective computational prediction tools for this type of data. RESULTS: We developed anti-profiles as a first step towards translating experimental findings suggesting that stochastic across-sample hyper-variability in the expression of specific genes is a stable and general property of cancer into predictive and diagnostic signatures. Using single-chip microarray normalization and quality assessment methods, we developed an anti-profile for colon cancer in tissue biopsy samples. To demonstrate the translational potential of our findings, we applied the signature developed in the tissue samples, without any further retraining or normalization, to screen patients for colon cancer based on genomic measurements from peripheral blood in an independent study (AUC of 0.89). This method achieved higher accuracy than the signature underlying commercially available peripheral blood screening tests for colon cancer (AUC of 0.81). We also confirmed the existence of hyper-variable genes across a range of cancer types and found that a significant proportion of tissue-specific genes are hyper-variable in cancer. Based on these observations, we developed a universal cancer anti-profile that accurately distinguishes cancer from normal regardless of tissue type (ten-fold cross-validation AUC > 0.92). CONCLUSIONS: We have introduced anti-profiles as a new approach for developing cancer genomic signatures that specifically takes advantage of gene expression heterogeneity. 
We have demonstrated that anti-profiles can be successfully applied to develop peripheral-blood based diagnostics for cancer and used anti-profiles to develop a highly accurate universal cancer signature. By using single-chip normalization and quality assessment methods, no further retraining of signatures developed by the anti-profile approach would be required before their application in clinical settings. Our results suggest that anti-profiles may be used to develop inexpensive and non-invasive universal cancer screening tests.},
	keywords = {Area Under Curve, Colonic Neoplasms, Gene Expression Profiling, Genetic Variation, Genomics, HUMANS, Oligonucleotide Array Sequence Analysis, Prognosis, Transcriptome, Tumor Markers, Biological},
	author = {Corrada Bravo, H{\'e}ctor and Pihur, Vasyl and McCall, Matthew and Irizarry, Rafael A. and Leek, Jeffrey T.}
}
@article {38421,
	title = {The partitioned {LASSO-patternsearch} algorithm with application to gene expression data},
	journal = {BMC Bioinformatics},
	volume = {13},
	year = {2012},
	pages = {98},
	doi = {10.1186/1471-2105-13-98},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/22587526?dopt=Abstract},
	abstract = {BACKGROUND: In systems biology, the task of reverse engineering gene pathways from data has been limited not just by the curse of dimensionality (the interaction space is huge) but also by systematic error in the data. The gene expression barcode reduces spurious association driven by batch effects and probe effects. The binary nature of the resulting expression calls lends itself perfectly to modern regularization approaches that thrive in high-dimensional settings. RESULTS: The Partitioned LASSO-Patternsearch algorithm is proposed to identify patterns of multiple dichotomous risk factors for outcomes of interest in genomic studies. A partitioning scheme is used to identify promising patterns by solving many LASSO-Patternsearch subproblems in parallel. All variables that survive this stage proceed to an aggregation stage where the most significant patterns are identified by solving a reduced LASSO-Patternsearch problem in just these variables. This approach was applied to genetic data sets with expression levels dichotomized by gene expression bar code. Most of the genes and second-order interactions thus selected and are known to be related to the outcomes. CONCLUSIONS: We demonstrate with simulations and data analyses that the proposed method not only selects variables and patterns more accurately, but also provides smaller models with better prediction accuracy, in comparison to several alternative methodologies.},
	keywords = {algorithms, Breast Neoplasms, Computer simulation, Female, Gene expression, Gene Expression Profiling, Genomics, HUMANS, Models, Genetic},
	author = {Shi, Weiliang and Wahba, Grace and Irizarry, Rafael A. and Corrada Bravo, H{\'e}ctor and Wright, Stephen J.}
}
@article {49548,
	title = {Quantitative measurement of allele-specific protein expression in a diploid yeast hybrid by {LC-MS}},
	journal = {Molecular Systems Biology},
	volume = {8},
	year = {2012},
	pages = {602},
	abstract = {Understanding the genetic basis of gene regulatory variation is a key goal of evolutionary and medical genetics. Regulatory variation can act in an allele-specific manner (cis-acting) or it can affect both alleles of a gene (trans-acting). Differential allele-specific expression (ASE), in which the expression of one allele differs from another in a diploid, implies the presence of cis-acting regulatory variation. While microarrays and high-throughput sequencing have enabled genome-wide measurements of transcriptional ASE, methods for measurement of protein ASE (pASE) have lagged far behind. We describe a flexible, accurate, and scalable strategy for measurement of pASE by liquid chromatography-coupled mass spectrometry (LC-MS). We apply this approach to a hybrid between the yeast species Saccharomyces cerevisiae and Saccharomyces bayanus. Our results provide the first analysis of the relative contribution of cis-acting and trans-acting regulatory differences to protein expression divergence between yeast species.},
	keywords = {Alleles, Chromatography, Liquid, Fungal Proteins, Gene Expression Profiling, Gene Expression Regulation, Fungal, HUMANS, Mass Spectrometry, proteomics, Regression Analysis, Saccharomyces, Saccharomyces cerevisiae, Saccharomyces cerevisiae Proteins, Species Specificity},
	issn = {1744-4292},
	doi = {10.1038/msb.2012.34},
	author = {Khan, Zia and Bloom, Joshua S and Amini, Sasan and Singh, Mona and Perlman, David H and Caudy, Amy A and Kruglyak, Leonid}
}
@article {49536,
	title = {Transcript expression analysis of putative {Trypanosoma} brucei {GPI-anchored} surface proteins during development in the tsetse and mammalian hosts},
	journal = {PLoS Neglected Tropical Diseases},
	volume = {6},
	year = {2012},
	pages = {e1708},
	abstract = {Human African Trypanosomiasis is a devastating disease caused by the parasite Trypanosoma brucei. Trypanosomes live extracellularly in both the tsetse fly and the mammal. Trypanosome surface proteins can directly interact with the host environment, allowing parasites to effectively establish and maintain infections. Glycosylphosphatidylinositol (GPI) anchoring is a common posttranslational modification associated with eukaryotic surface proteins. In T. brucei, three GPI-anchored major surface proteins have been identified: variant surface glycoproteins (VSGs), procyclic acidic repetitive protein (PARP or procyclins), and brucei alanine rich proteins (BARP). The objective of this study was to select genes encoding predicted GPI-anchored proteins with unknown function(s) from the T. brucei genome and characterize the expression profile of a subset during cyclical development in the tsetse and mammalian hosts. An initial in silico screen of putative T. brucei proteins by Big PI algorithm identified 163 predicted GPI-anchored proteins, 106 of which had no known functions. Application of a second GPI-anchor prediction algorithm (FragAnchor), signal peptide and trans-membrane domain prediction software resulted in the identification of 25 putative hypothetical proteins. Eighty-one gene products with hypothetical functions were analyzed for stage-regulated expression using semi-quantitative RT-PCR. The expression of most of these genes were found to be upregulated in trypanosomes infecting tsetse salivary gland and proventriculus tissues, and 38\% were specifically expressed only by parasites infecting salivary gland tissues. Transcripts for all of the genes specifically expressed in salivary glands were also detected in mammalian infective metacyclic trypomastigotes, suggesting a possible role for these putative proteins in invasion and/or establishment processes in the mammalian host. 
These results represent the first large-scale report of the differential expression of unknown genes encoding predicted T. brucei surface proteins during the complete developmental cycle. This knowledge may form the foundation for the development of future novel transmission blocking strategies against metacyclic parasites.},
	keywords = {Animals, Computational Biology, Gastrointestinal Tract, Gene Expression Profiling, GPI-Linked Proteins, HUMANS, Male, Membrane Proteins, Protozoan Proteins, Real-Time Polymerase Chain Reaction, Salivary Glands, Trypanosoma brucei brucei, Trypanosomiasis, African, Tsetse Flies},
	issn = {1935-2735},
	doi = {10.1371/journal.pntd.0001708},
	author = {Savage, Amy F and Cerqueira, Gustavo C and Regmi, Sandesh and Wu, Yineng and El-Sayed, Najib M and Aksoy, Serap}
}
@article {49746,
	title = {Direct targeting of {Sec23a} by {miR-200s} influences cancer cell secretome and promotes metastatic colonization},
	journal = {Nature Medicine},
	volume = {17},
	year = {2011},
	month = sep,
	pages = {1101--1108},
	abstract = {Although the role of miR-200s in regulating E-cadherin expression and epithelial-to-mesenchymal transition is well established, their influence on metastatic colonization remains controversial. Here we have used clinical and experimental models of breast cancer metastasis to discover a pro-metastatic role of miR-200s that goes beyond their regulation of E-cadherin and epithelial phenotype. Overexpression of miR-200s is associated with increased risk of metastasis in breast cancer and promotes metastatic colonization in mouse models, phenotypes that cannot be recapitulated by E-cadherin expression alone. Genomic and proteomic analyses revealed global shifts in gene expression upon miR-200 overexpression toward that of highly metastatic cells. miR-200s promote metastatic colonization partly through direct targeting of Sec23a, which mediates secretion of metastasis-suppressive proteins, including Igfbp4 and Tinagl1, as validated by functional and clinical correlation studies. Overall, these findings suggest a pleiotropic role of miR-200s in promoting metastatic colonization by influencing E-cadherin-dependent epithelial traits and Sec23a-mediated tumor cell secretome.},
	keywords = {Animals, Cadherins, Cell Line, Tumor, Female, Gene Expression Profiling, Gene Expression Regulation, Neoplastic, HUMANS, Mass Spectrometry, Mice, Mice, Inbred BALB C, Microarray Analysis, MicroRNAs, Neoplasm Metastasis, Statistics, Nonparametric, Vesicular Transport Proteins},
	issn = {1546-170X},
	doi = {10.1038/nm.2401},
	author = {Korpal, Manav and Ell, Brian J and Buffa, Francesca M and Ibrahim, Toni and Blanco, Mario A and Celi{\`a}-Terrassa, Toni and Mercatali, Laura and Khan, Zia and Goodarzi, Hani and Hua, Yuling and Wei, Yong and Hu, Guohong and Garcia, Benjamin A and Ragoussis, Jiannis and Amadori, Dino and Harris, Adrian L and Kang, Yibin}
}
@article {38347,
	title = {Influence of host gene transcription level and orientation on {HIV-1} latency in a primary-cell model},
	journal = {Journal of Virology},
	volume = {85},
	year = {2011},
	doi = {10.1128/JVI.02536-10},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/21430059?dopt=Abstract},
	abstract = {Human immunodeficiency virus type 1 (HIV-1) establishes a latent reservoir in resting memory CD4(+) T cells. This latent reservoir is a major barrier to the eradication of HIV-1 in infected individuals and is not affected by highly active antiretroviral therapy (HAART). Reactivation of latent HIV-1 is a possible strategy for elimination of this reservoir. The mechanisms with which latency is maintained are unclear. In the analysis of the regulation of HIV-1 gene expression, it is important to consider the nature of HIV-1 integration sites. In this study, we analyzed the integration and transcription of latent HIV-1 in a primary CD4(+) T cell model of latency. The majority of integration sites in latently infected cells were in introns of transcription units. Serial analysis of gene expression (SAGE) demonstrated that more than 90\% of those host genes harboring a latent integrated provirus were transcriptionally active, mostly at high levels. For latently infected cells, we observed a modest preference for integration in the same transcriptional orientation as the host gene (63.8\% versus 36.2\%). In contrast, this orientation preference was not observed in acutely infected or persistently infected cells. These results suggest that transcriptional interference may be one of the important factors in the establishment and maintenance of HIV-1 latency. Our findings suggest that disrupting the negative control of HIV-1 transcription by upstream host promoters could facilitate the reactivation of latent HIV-1 in some resting CD4(+) T cells.},
	keywords = {CD4-Positive T-Lymphocytes, Cells, Cultured, Gene Expression Profiling, Gene Expression Regulation, Viral, HIV-1, HUMANS, Transcription, Genetic, Virus Integration, Virus Latency},
	author = {Shan, Liang and Yang, Hung-Chih and Rabi, S. Alireza and Corrada Bravo, H{\'e}ctor and Shroff, Neeta S. and Irizarry, Rafael A. and Zhang, Hao and Margolick, Joseph B. and Siliciano, Janet D. and Siliciano, Robert F.}
}
@article {49649,
	title = {Genome-wide analysis reveals novel genes essential for heme homeostasis in {Caenorhabditis} elegans},
	journal = {PLoS Genetics},
	volume = {6},
	year = {2010},
	month = jul,
	pages = {e1001044},
	abstract = {Heme is a cofactor in proteins that function in almost all sub-cellular compartments and in many diverse biological processes. Heme is produced by a conserved biosynthetic pathway that is highly regulated to prevent the accumulation of heme--a cytotoxic, hydrophobic tetrapyrrole. Caenorhabditis elegans and related parasitic nematodes do not synthesize heme, but instead require environmental heme to grow and develop. Heme homeostasis in these auxotrophs is, therefore, regulated in accordance with available dietary heme. We have capitalized on this auxotrophy in C. elegans to study gene expression changes associated with precisely controlled dietary heme concentrations. RNA was isolated from cultures containing 4, 20, or 500 microM heme; derived cDNA probes were hybridized to Affymetrix C. elegans expression arrays. We identified 288 heme-responsive genes (hrgs) that were differentially expressed under these conditions. Of these genes, 42\% had putative homologs in humans, while genomes of medically relevant heme auxotrophs revealed homologs for 12\% in both Trypanosoma and Leishmania and 24\% in parasitic nematodes. Depletion of each of the 288 hrgs by RNA-mediated interference (RNAi) in a transgenic heme-sensor worm strain identified six genes that regulated heme homeostasis. In addition, seven membrane-spanning transporters involved in heme uptake were identified by RNAi knockdown studies using a toxic heme analog. Comparison of genes that were positive in both of the RNAi screens resulted in the identification of three genes in common that were vital for organismal heme homeostasis in C. elegans. Collectively, our results provide a catalog of genes that are essential for metazoan heme homeostasis and demonstrate the power of C. elegans as a genetic animal model to dissect the regulatory circuits which mediate heme trafficking in both vertebrate hosts and their parasites, which depend on environmental heme for survival.},
	keywords = {Animals, Caenorhabditis elegans, Dose-Response Relationship, Drug, Gene Expression Profiling, Gene Expression Regulation, genes, Genome-Wide Association Study, Heme, Homeostasis, HUMANS, Leishmania, Nematoda, Trypanosoma},
	issn = {1553-7404},
	doi = {10.1371/journal.pgen.1001044},
	author = {Severance, Scott and Rajagopal, Abbhirami and Rao, Anita U and Cerqueira, Gustavo C and Mitreva, Makedonka and El-Sayed, Najib M and Krause, Michael and Hamza, Iqbal}
}
@article {38506,
	title = {{Sites Inferred by Metabolic Background Assertion Labeling} ({SIMBAL}): adapting the {Partial Phylogenetic Profiling} algorithm to scan sequences for signatures that predict protein function},
	journal = {BMC Bioinformatics},
	volume = {11},
	year = {2010},
	pages = {52},
	doi = {10.1186/1471-2105-11-52},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/20102603?dopt=Abstract},
	abstract = {BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets. RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL "hot spots" onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. 
Neither method for training set construction requires any prior experimental characterization. CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.},
	keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship},
	author = {Selengut, Jeremy D. and Rusch, Douglas B. and Haft, Daniel H.}
}
@article {49779,
	title = {{Sites Inferred by Metabolic Background Assertion Labeling} ({SIMBAL}): adapting the {Partial Phylogenetic Profiling} algorithm to scan sequences for signatures that predict protein function},
	journal = {BMC Bioinformatics},
	volume = {11},
	year = {2010},
	pages = {52},
	abstract = {BACKGROUND: Comparative genomics methods such as phylogenetic profiling can mine powerful inferences from inherently noisy biological data sets. We introduce Sites Inferred by Metabolic Background Assertion Labeling (SIMBAL), a method that applies the Partial Phylogenetic Profiling (PPP) approach locally within a protein sequence to discover short sequence signatures associated with functional sites. The approach is based on the basic scoring mechanism employed by PPP, namely the use of binomial distribution statistics to optimize sequence similarity cutoffs during searches of partitioned training sets. RESULTS: Here we illustrate and validate the ability of the SIMBAL method to find functionally relevant short sequence signatures by application to two well-characterized protein families. In the first example, we partitioned a family of ABC permeases using a metabolic background property (urea utilization). Thus, the TRUE set for this family comprised members whose genome of origin encoded a urea utilization system. By moving a sliding window across the sequence of a permease, and searching each subsequence in turn against the full set of partitioned proteins, the method found which local sequence signatures best correlated with the urea utilization trait. Mapping of SIMBAL "hot spots" onto crystal structures of homologous permeases reveals that the significant sites are gating determinants on the cytosolic face rather than, say, docking sites for the substrate-binding protein on the extracellular face. In the second example, we partitioned a protein methyltransferase family using gene proximity as a criterion. In this case, the TRUE set comprised those methyltransferases encoded near the gene for the substrate RF-1. SIMBAL identifies sequence regions that map onto the substrate-binding interface while ignoring regions involved in the methyltransferase reaction mechanism in general. 
Neither method for training set construction requires any prior experimental characterization. CONCLUSIONS: SIMBAL shows that, in functionally divergent protein families, selected short sequences often significantly outperform their full-length parent sequence for making functional predictions by sequence similarity, suggesting avenues for improved functional classifiers. When combined with structural data, SIMBAL affords the ability to localize and model functional sites.},
	keywords = {algorithms, Amino Acid Sequence, Gene Expression Profiling, Molecular Sequence Data, Phylogeny, Proteins, Sequence Analysis, Protein, Structure-Activity Relationship},
	issn = {1471-2105},
	doi = {10.1186/1471-2105-11-52},
	author = {Selengut, Jeremy D and Rusch, Douglas B and Haft, Daniel H}
}
@article {49778,
	title = {Unexpected abundance of coenzyme {F(420)-dependent} enzymes in {Mycobacterium} tuberculosis and other actinobacteria},
	journal = {Journal of Bacteriology},
	volume = {192},
	year = {2010},
	month = nov,
	pages = {5788--5798},
	abstract = {Regimens targeting Mycobacterium tuberculosis, the causative agent of tuberculosis (TB), require long courses of treatment and a combination of three or more drugs. An increase in drug-resistant strains of M. tuberculosis demonstrates the need for additional TB-specific drugs. A notable feature of M. tuberculosis is coenzyme F(420), which is distributed sporadically and sparsely among prokaryotes. This distribution allows for comparative genomics-based investigations. Phylogenetic profiling (comparison of differential gene content) based on F(420) biosynthesis nominated many actinobacterial proteins as candidate F(420)-dependent enzymes. Three such families dominated the results: the luciferase-like monooxygenase (LLM), pyridoxamine 5{\textquoteright}-phosphate oxidase (PPOX), and deazaflavin-dependent nitroreductase (DDN) families. The DDN family was determined to be limited to F(420)-producing species. The LLM and PPOX families were observed in F(420)-producing species as well as species lacking F(420) but were particularly numerous in many actinobacterial species, including M. tuberculosis. Partitioning the LLM and PPOX families based on an organism{\textquoteright}s ability to make F(420) allowed the application of the SIMBAL (sites inferred by metabolic background assertion labeling) profiling method to identify F(420)-correlated subsequences. These regions were found to correspond to flavonoid cofactor binding sites. Significantly, these results showed that M. tuberculosis carries at least 28 separate F(420)-dependent enzymes, most of unknown function, and a paucity of flavin mononucleotide (FMN)-dependent proteins in these families. While prevalent in mycobacteria, markers of F(420) biosynthesis appeared to be absent from the normal human gut flora. These findings suggest that M. tuberculosis relies heavily on coenzyme F(420) for its redox reactions. 
This dependence and the cofactor{\textquoteright}s rarity may make F(420)-related proteins promising drug targets.},
	keywords = {Actinobacteria, Amino Acid Sequence, Binding Sites, Coenzymes, Flavonoids, Gene Expression Profiling, Gene Expression Regulation, Bacterial, Genome, Bacterial, molecular biology, Molecular Sequence Data, Molecular Structure, Mycobacterium tuberculosis, Phylogeny, Protein Conformation, Riboflavin},
	issn = {1098-5530},
	doi = {10.1128/JB.00425-10},
	author = {Selengut, Jeremy D and Haft, Daniel H}
}
@article {38556,
	title = {Unexpected abundance of coenzyme {F(420)-dependent} enzymes in {Mycobacterium} tuberculosis and other actinobacteria},
	journal = {Journal of Bacteriology},
	volume = {192},
	year = {2010},
	month = nov,
	pages = {5788--5798},
	doi = {10.1128/JB.00425-10},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/20675471?dopt=Abstract},
	abstract = {Regimens targeting Mycobacterium tuberculosis, the causative agent of tuberculosis (TB), require long courses of treatment and a combination of three or more drugs. An increase in drug-resistant strains of M. tuberculosis demonstrates the need for additional TB-specific drugs. A notable feature of M. tuberculosis is coenzyme F(420), which is distributed sporadically and sparsely among prokaryotes. This distribution allows for comparative genomics-based investigations. Phylogenetic profiling (comparison of differential gene content) based on F(420) biosynthesis nominated many actinobacterial proteins as candidate F(420)-dependent enzymes. Three such families dominated the results: the luciferase-like monooxygenase (LLM), pyridoxamine 5{\textquoteright}-phosphate oxidase (PPOX), and deazaflavin-dependent nitroreductase (DDN) families. The DDN family was determined to be limited to F(420)-producing species. The LLM and PPOX families were observed in F(420)-producing species as well as species lacking F(420) but were particularly numerous in many actinobacterial species, including M. tuberculosis. Partitioning the LLM and PPOX families based on an organism{\textquoteright}s ability to make F(420) allowed the application of the SIMBAL (sites inferred by metabolic background assertion labeling) profiling method to identify F(420)-correlated subsequences. These regions were found to correspond to flavonoid cofactor binding sites. Significantly, these results showed that M. tuberculosis carries at least 28 separate F(420)-dependent enzymes, most of unknown function, and a paucity of flavin mononucleotide (FMN)-dependent proteins in these families. While prevalent in mycobacteria, markers of F(420) biosynthesis appeared to be absent from the normal human gut flora. These findings suggest that M. tuberculosis relies heavily on coenzyme F(420) for its redox reactions. 
This dependence and the cofactor{\textquoteright}s rarity may make F(420)-related proteins promising drug targets.},
	keywords = {Actinobacteria, Amino Acid Sequence, Binding Sites, Coenzymes, Flavonoids, Gene Expression Profiling, Gene Expression Regulation, Bacterial, Genome, Bacterial, molecular biology, Molecular Sequence Data, Molecular Structure, Mycobacterium tuberculosis, Phylogeny, Protein Conformation, Riboflavin},
	author = {Selengut, Jeremy D. and Haft, Daniel H.}
}
@article {49644,
	title = {Genomic organization and expression profile of the mucin-associated surface protein (masp) family of the human pathogen {Trypanosoma} cruzi},
	journal = {Nucleic Acids Res},
	volume = {37},
	year = {2009},
	month = jun,
	pages = {3407--3417},
	abstract = {A novel large multigene family was recently identified in the human pathogen Trypanosoma cruzi, causative agent of Chagas disease, and corresponds to approximately 6\% of the parasite diploid genome. The predicted gene products, mucin-associated surface proteins (MASPs), are characterized by highly conserved N- and C-terminal domains and a strikingly variable and repetitive central region. We report here an analysis of the genomic organization and expression profile of masp genes. Masps are not randomly distributed throughout the genome but instead are clustered with genes encoding mucin and other surface protein families. Masp transcripts vary in size, are preferentially expressed during the trypomastigote stage and contain highly conserved 5{\textquoteright} and 3{\textquoteright} untranslated regions. A sequence analysis of a trypomastigote cDNA library reveals the expression of multiple masp variants with a bias towards a particular masp subgroup. Immunofluorescence assays using antibodies generated against a MASP peptide reveals that the expression of particular MASPs at the cell membrane is limited to subsets of the parasite population. Western blots of phosphatidylinositol-specific phospholipase C (PI-PLC)-treated parasites suggest that MASP may be GPI-anchored and shed into the medium culture, thus contributing to the large repertoire of parasite polypeptides that are exposed to the host immune system.},
	keywords = {3{\textquoteright} Flanking Region, 5{\textquoteright} Flanking Region, Amino Acid Sequence, Animals, Base Sequence, Conserved Sequence, Gene Expression Profiling, Genes, Protozoan, Genome, Protozoan, Membrane Proteins, Molecular Sequence Data, Mucins, Multigene Family, Protozoan Proteins, RNA, Messenger, Trypanosoma cruzi},
	issn = {1362-4962},
	doi = {10.1093/nar/gkp172},
	author = {Bartholomeu, Daniella C and Cerqueira, Gustavo C and Le{\~a}o, Ana Carolina A and daRocha, Wanderson D and Pais, Fabiano S and Macedo, Camila and Djikeng, Appolinaire and Teixeira, Santuza M R and El-Sayed, Najib M}
}
@comment{Entry 49749 has no month field: the source record supplied only the year there.}
@article {49749,
	title = {Measuring differential gene expression by short read sequencing: quantitative comparison to 2-channel gene expression microarrays},
	journal = {BMC Genomics},
	volume = {10},
	year = {2009},
	pages = {221},
	abstract = {BACKGROUND: High-throughput cDNA synthesis and sequencing of poly(A)-enriched RNA is rapidly emerging as a technology competing to replace microarrays as a quantitative platform for measuring gene expression. RESULTS: Consequently, we compared full length cDNA sequencing to 2-channel gene expression microarrays in the context of measuring differential gene expression. Because of its comparable cost to a gene expression microarray, our study focused on the data obtainable from a single lane of an Illumina 1 G sequencer. We compared sequencing data to a highly replicated microarray experiment profiling two divergent strains of S. cerevisiae. CONCLUSION: Using a large number of quantitative PCR (qPCR) assays, more than previous studies, we found that neither technology is decisively better at measuring differential gene expression. Further, we report sequencing results from a diploid hybrid of two strains of S. cerevisiae that indicate full length cDNA sequencing can discover heterozygosity and measure quantitative allele-specific expression simultaneously.},
	keywords = {algorithms, DNA, Complementary, DNA, Fungal, Gene Expression Profiling, Oligonucleotide Array Sequence Analysis, Saccharomyces cerevisiae, sequence alignment, Sequence Analysis, DNA},
	issn = {1471-2164},
	doi = {10.1186/1471-2164-10-221},
	author = {Bloom, Joshua S and Khan, Zia and Kruglyak, Leonid and Singh, Mona and Caudy, Amy A}
}
@article {49641,
	title = {Analysis of fat body transcriptome from the adult tsetse fly, {Glossina} morsitans morsitans},
	journal = {Insect Mol Biol},
	volume = {15},
	year = {2006},
	month = aug,
	pages = {411--424},
	abstract = {Tsetse flies (Diptera: Glossinidia) are vectors of pathogenic African trypanosomes. To develop a foundation for tsetse physiology, a normalized expressed sequence tag (EST) library was constructed from fat body tissue of immune-stimulated Glossina morsitans morsitans. Analysis of 20,257 high-quality ESTs yielded 6372 unique genes comprised of 3059 tentative consensus (TC) sequences and 3313 singletons (available at http://aksoylab.yale.edu). We analysed the putative fat body transcriptome based on homology to other gene products with known functions available in the public domain. In particular, we describe the immune-related products, reproductive function related yolk proteins and milk-gland protein, iron metabolism regulating ferritins and transferrin, and tsetse{\textquoteright}s major energy source proline biosynthesis. Expression analysis of the three yolk proteins indicates that all are detected in females, while only the yolk protein with similarity to lipases, is expressed in males. Milk gland protein, apparently important for larval nutrition, however, is primarily synthesized by accessory milk gland tissue.},
	keywords = {Adipose Tissue, Animals, Base Sequence, Computational Biology, DNA Primers, Egg Proteins, Expressed Sequence Tags, Female, Gene Expression Profiling, Insect Vectors, Male, Molecular Sequence Data, Reverse Transcriptase Polymerase Chain Reaction, Sequence Analysis, DNA, Sex Factors, Tsetse Flies},
	issn = {0962-1075},
	doi = {10.1111/j.1365-2583.2006.00649.x},
	author = {Attardo, G M and Strickler-Dinglasan, P and Perkin, S A H and Caler, E and Bonaldo, M F and Soares, M B and El-Sayeed, N and Aksoy, S}
}
@article {38294,
	title = {{Genome Properties}: a system for the investigation of prokaryotic genetic content for microbiology, genome annotation and comparative genomics},
	journal = {Bioinformatics (Oxford, England)},
	volume = {21},
	year = {2005},
	url = {http://www.ncbi.nlm.nih.gov/pubmed/15347579?dopt=Abstract},
	doi = {10.1093/bioinformatics/bti015},
	abstract = {MOTIVATION: The presence or absence of metabolic pathways and structures provide a context that makes protein annotation far more reliable. Compiling such information across microbial genomes improves the functional classification of proteins and provides a valuable resource for comparative genomics. RESULTS: We have created a Genome Properties system to present key aspects of prokaryotic biology using standardized computational methods and controlled vocabularies. Properties reflect gene content, phenotype, phylogeny and computational analyses. The results of searches using hidden Markov models allow many properties to be deduced automatically, especially for families of proteins (equivalogs) conserved in function since their last common ancestor. Additional properties are derived from curation, published reports and other forms of evidence. Genome Properties system was applied to 156 complete prokaryotic genomes, and is easily mined to find differences between species, correlations between metabolic features and families of uncharacterized proteins, or relationships among properties. AVAILABILITY: Genome Properties can be found at http://www.tigr.org/Genome_Properties SUPPLEMENTARY INFORMATION: http://www.tigr.org/tigr-scripts/CMR2/genome_properties_references.spl.},
	keywords = {Chromosome mapping, database management systems, Databases, Genetic, documentation, Gene Expression Profiling, Gene Expression Regulation, Genomics, Information Storage and Retrieval, Microbiological Techniques, natural language processing, Prokaryotic Cells, Proteome, signal transduction, software, User-Computer Interface, Vocabulary, Controlled},
	author = {Haft, Daniel H. and Selengut, J. and Brinkac, Lauren M. and Zafar, Nikhat and White, Owen}
}
@article {49637,
	title = {Transcriptional profiling of the hyperthermophilic methanarchaeon {Methanococcus} jannaschii in response to lethal heat and non-lethal cold shock},
	journal = {Environ Microbiol},
	volume = {7},
	year = {2005},
	month = jun,
	pages = {789--797},
	abstract = {Temperature shock of the hyperthermophilic methanarchaeon Methanococcus jannaschii from its optimal growth temperature of 85 degrees C to 65 degrees C and 95 degrees C resulted in different transcriptional responses characteristic of both the direction of shock (heat or cold shock) and whether the shock was lethal. Specific outcomes of lethal heat shock to 95 degrees C included upregulation of genes encoding chaperones, and downregulation of genes encoding subunits of the H+ transporting ATP synthase. A gene encoding an alpha subunit of a putative prefoldin was also upregulated, which may comprise a novel element in the protein processing pathway in M. jannaschii. Very different responses were observed upon cold shock to 65 degrees C. These included upregulation of a gene encoding an RNA helicase and other genes involved in transcription and translation, and upregulation of genes coding for proteases and transport proteins. Also upregulated was a gene that codes for an 18 kDa FKBP-type PPIase, which may facilitate protein folding at low temperatures. Transcriptional profiling also revealed several hypothetical proteins that respond to temperature stress conditions.},
	keywords = {Adaptation, Physiological, Archaeal Proteins, Cold Temperature, Gene Expression Profiling, Gene Expression Regulation, Archaeal, Heat-Shock Proteins, Hot Temperature, Methanococcus, Temperature, Transcription, Genetic},
	issn = {1462-2912},
	doi = {10.1111/j.1462-2920.2005.00751.x},
	author = {Boonyaratanakornkit, Boonchai B and Simpson, Anjana J and Whitehead, Timothy A and Fraser, Claire M and El-Sayed, Najib M A and Clark, Douglas S}
}
@article {49629,
	title = {Analysis of stage-specific gene expression in the bloodstream and the procyclic form of {Trypanosoma} brucei using a genomic {DNA}-microarray},
	journal = {Mol Biochem Parasitol},
	volume = {123},
	year = {2002},
	month = aug,
	pages = {115--123},
	abstract = {A microarray comprising 21,024 different PCR products spotted on glass slides was constructed for gene expression studies on Trypanosoma brucei. The arrayed fragments were generated from a T. brucei shotgun clone library, which had been prepared from randomly sheared and size-fractionated genomic DNA. For the identification of stage-specific gene activity, total RNA from in vitro cultures of the human, long slender form and the insect, procyclic form of the parasite was labelled and hybridised to the microarray. Approximately 75\% of the genomic fragments produced a signal and about 2\% exhibited significant differences between the transcript levels in the bloodstream and procyclic forms. A few results were confirmed by Northern blot analysis or reverse-transcription and PCR. Three hundred differentially regulated clones have been selected for sequencing. So far, of 33 clones that showed about 2-fold or more over-expression in bloodstream forms, 15 contained sequences similar to those of VSG expression sites and at least six others appeared non-protein-coding. Of 29 procyclic-specific clones, at least eight appeared not to be protein-coding. A surprisingly large proportion of known regulated genes was already identified in this small sample, and some new ones were found, illustrating the utility of genomic arrays.},
	keywords = {Animals, Blotting, Northern, Escherichia coli, Gene expression, Gene Expression Profiling, Genes, Protozoan, HUMANS, Life Cycle Stages, Molecular Sequence Data, Oligonucleotide Array Sequence Analysis, Polymerase Chain Reaction, Transcription, Genetic, Trypanosoma brucei brucei},
	issn = {0166-6851},
	author = {Diehl, Susanne and Diehl, Frank and El-Sayed, Najib M and Clayton, Christine and Hoheisel, J{\"o}rg D}
}