% Journal article: Nucleic Acids Research 48(10), 2020.
% The `funding` and `copyright` fields below are non-standard field names;
% they hold the funding / conflict-of-interest / copyright boilerplate that
% the export had placed in `note`, so that it stays in the database without
% being printed with the reference. The funding text was split into two
% chunks in reverse order (breaking grant number N00014-18-1-2047 in half)
% and has been rejoined here.
@article{96545d66ae6545bb83ab9b32b0fbdfc3,
  author    = {Elworth, R. A. Leo and Wang, Qi and Kota, Pavan K. and Barberan, C. J. and Coleman, Benjamin and Balaji, Advait and Gupta, Gaurav and Baraniuk, Richard G. and Shrivastava, Anshumali and Treangen, Todd J.},
  title     = {To Petabytes and beyond: Recent advances in probabilistic and signal processing algorithms and their application to metagenomics},
  journal   = {Nucleic Acids Research},
  year      = {2020},
  volume    = {48},
  number    = {10},
  pages     = {5217--5234},
  publisher = {Oxford University Press},
  issn      = {0305-1048},
  doi       = {10.1093/nar/gkaa265},
  language  = {English (US)},
  abstract  = {As computational biologists continue to be inundated by ever increasing amounts of metagenomic data, the need for data analysis approaches that keep up with the pace of sequence archives has remained a challenge. In recent years, the accelerated pace of genomic data availability has been accompanied by the application of a wide array of highly efficient approaches from other fields to the field of metagenomics. For instance, sketching algorithms such as MinHash have seen a rapid and widespread adoption. These techniques handle increasingly large datasets with minimal sacrifices in quality for tasks such as sequence similarity calculations. Here, we briefly review the fundamentals of the most impactful probabilistic and signal processing algorithms. We also highlight more recent advances to augment previous reviews in these areas that have taken a broader approach. We then explore the application of these techniques to metagenomics, discuss their pros and cons, and speculate on their future directions.},
  funding   = {FunGCAT program from the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via the Army Research Office (ARO) under Federal Award [W911NF-17-2-0089 to R.A.L.E., A.B. and T.J.T.]; Q.W. and A.B. were supported by funds from Rice University; Q.W. was also supported by funds from the National Institute for Neurological Disorders and Stroke (NINDS) of the National Institutes of Health [R21NS106640]; P.K.K., C.B. and R.G.B. were supported by NSF [CCF-1911094, IIS-1838177, IIS-1730574]; ONR grants [N00014-18-12571, N00014-17-1-2551]; AFOSR grant [FA9550-18-1-0478]; DARPA grant [G001534-7500]; NLM Training Program [T15LM007093]; Vannevar Bush Faculty Fellowship, ONR grant [N00014-18-1-2047]; B.C., G.G. and A.S. were supported by NSF-1652131, Nsf-BigData 1838177, AFOSR-YIPFA9550-18-1-0152, Amazon Research Award, and ONR BRC grant for Randomized Numerical Linear Algebra. The open access publication charge for this paper has been waived by Oxford University Press - NAR Editorial Board members are entitled to one free paper per year in recognition of their work on behalf of the journal. Conflict of interest statement. None declared.},
  copyright = {{\textcopyright} The Author(s) 2020. Published by Oxford University Press on behalf of Nucleic Acids Research.},
}