-
Notifications
You must be signed in to change notification settings - Fork 5
/
analysis2_manuscript.tex
750 lines (660 loc) · 45.8 KB
/
analysis2_manuscript.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
\documentclass[hidelinks]{article}
\usepackage[letterpaper,margin=1.0in]{geometry}
\usepackage[utf8]{inputenc}
\pagenumbering{arabic}
\usepackage{authblk}
\usepackage{graphicx}
\usepackage[singlelinecheck=false]{caption} % singlelinecheck makes single line caption left aligned instead of centered
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{fancyhdr}
\usepackage{longtable}
\usepackage{booktabs}
% hyperlinks
\usepackage{hyperref}
\usepackage{wrapfig}
\usepackage{xspace}
\usepackage{mathrsfs}
\usepackage{graphicx}
\usepackage{lipsum}
\usepackage{makecell}
\pagestyle{fancy}
\fancyhead[R]{\textbf{\stdpopsim with selection}}
% for highlighting text
\usepackage{xcolor}
\usepackage{soul}
% bibliography
\usepackage[
backend=biber,
natbib=true,
style=authoryear,
]{biblatex}
\addbibresource{references.bib}
%\usepackage[round]{natbib} % omit 'round' option if you prefer square brackets
%\bibliographystyle{plainnat}
\newcommand{\Stdpopsim}{\texttt{Stdpopsim}\xspace}
\newcommand{\stdpopsim}{\texttt{stdpopsim}\xspace}
%commands to format figure and table references in the supplement
\newcommand{\beginsupplement}{%
\fancyhead[L]{Supplemental Material}
\setcounter{table}{0}
\renewcommand{\thetable}{S\arabic{table}}%
\setcounter{figure}{0}
\renewcommand{\thefigure}{S\arabic{figure}}%
}
\newcommand{\stopsupplement}{%
\setcounter{table}{0}
\renewcommand{\thetable}{\arabic{table}}%
\setcounter{figure}{0}
\renewcommand{\thefigure}{\arabic{figure}}%
}
\makeatletter
\newcommand{\labelname}[1]{\def\@currentlabelname{#1}}
\makeatother
% Avoid pandoc bug when there are lists in the body.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\title{Adding selection to \stdpopsim: oh how the tables have turned}
% \author[1,+]{M. Elise Lauterbur}
% \author[3,*]{Ariella L. Gladstein}
\author[4,*]{Graham Gower}
\author[5*]{Nathaniel S. Pope}
\author[5*]{Murillo F. Rodrigues}
\author[6]{Georgia Tsambos}
\author[34]{Linh N. Tran}
\author[2]{Maria Izabel A. Cavassim}
% \author[5,7]{Jeff Adrion}
% \author[5]{Saurabh Belsare}
% \author[8]{Arjun Biddanda}
% \author[5]{Victoria Caudill}
% \author[9]{Jean Cury}
% \author[10]{Ignacio Echevarria}
% \author[11]{Benjamin C. Haller}
% \author[12,13]{Ahmed R. Hasan}
\author[14,15]{Xin Huang}
% \author[16]{Leonardo Nicola Martin Iasi}
% \author[17]{Ekaterina Noskova}
% \author[18]{Jana Obšteter}
% \author[19]{Vitor Antonio Corrêa Pavinato}
% \author[20,21]{Alice Pearson}
% \author[22,23]{David Peede}
% \author[24]{Manolo F. Perez}
\author[5]{Chris C. R. Smith}
% \author[25]{Jeffrey P. Spence}
% \author[5]{Anastasia Teterina}
\author[5]{Silas Tittes}
\author[5]{Scott T. Small}
% \author[26]{Per Unneberg}
% \author[27]{Juan Manuel Vazquez}
% \author[28]{Ryan K. Waples}
% \author[29]{Anthony Wilder Wohns}
% \author[30]{Yan Wong}
% \author[31]{Franz Baumdicker}
% \author[32]{Reed A. Cartwright}
% \author[33]{Gregor Gorjanc}
\author[4]{Kirk E. Lohmueller}
\author[34]{Ryan N. Gutenkunst}
\author[30]{Jerome Kelleher}
\author[35]{Aaron P. Ragsdale}
\author[37]{Daniel R. Schrider}
\author[38]{Ilan Gronau}
\author[5,36]{Peter L. Ralph}
\author[5]{Andrew D. Kern}
\affil[*]{\small{These authors contributed equally to the paper.}}
% \affil[+]{\small{Corresponding authors: [email protected] ; [email protected].}}
% \affil[1]{\small{Department of Ecology and Evolutionary Biology, University of Arizona, Tucson AZ 85719, USA}}
% \affil[2]{\small{Department of Ecology and Evolutionary Biology, University of California, Los Angeles, Los Angeles CA, USA}}
% \affil[3]{\small{Embark Veterinary, Inc., Boston MA 02111, USA}}
\affil[4]{\small{Section for Molecular Ecology and Evolution, Globe Institute, University of Copenhagen, Denmark}}
\affil[5]{\small{Institute of Ecology and Evolution, University of Oregon, Eugene OR 97402, USA}}
% \affil[6]{\small{School of Mathematics and Statistics, University of Melbourne, Australia}}
% \affil[7]{\small{AncestryDNA, San Francisco CA 94107, USA}}
% \affil[8]{\small{54Gene, Inc., Washington DC 20005, USA}}
% \affil[9]{\small{Université Paris-Saclay, CNRS, INRIA, Laboratoire Interdisciplinaire des Sciences du Numérique, UMR 9015 Orsay, France}}
% \affil[10]{\small{School of Life Sciences, University of Glasgow, Glasgow, UK}}
% \affil[11]{\small{Department of Computational Biology, Cornell University, Ithaca NY, USA}}
% \affil[12]{\small{Department of Cell and Systems Biology, University of Toronto, Toronto ON, Canada}}
% \affil[13]{\small{Department of Biology, University of Toronto Mississauga, Mississauga ON, Canada}}
% \affil[14]{\small{Department of Evolutionary Anthropology, University of Vienna, Vienna, Austria}}
% \affil[15]{\small{Human Evolution and Archaeological Sciences (HEAS), University of Vienna, Vienna, Austria}}
% \affil[16]{\small{Department of Evolutionary Genetics, Max Planck Institute for Evolutionary Anthropology, Leipzig, Germany}}
% \affil[17]{\small{Computer Technologies Laboratory, ITMO University, St Petersburg, Russia}}
% \affil[18]{\small{Agricultural Institute of Slovenia, Department of Animal Science, Ljubljana, Slovenia}}
% \affil[19]{\small{Entomology Department, The Ohio State University, Wooster OH, USA}}
% \affil[20]{\small{Department of Genetics, University of Cambridge, Cambridge, UK}}
% \affil[21]{\small{Department of Zoology, University of Cambridge, Cambridge, UK}}
% \affil[22]{\small{Department of Ecology, Evolution, and Organismal Biology, Brown University, Providence RI, USA}}
% \affil[23]{\small{Center for Computational Molecular Biology, Brown University, Providence RI, USA}}
% \affil[24]{\small{Department of Genetics and Evolution, Federal University of Sao Carlos, Sao Carlos 13565905, Brazil}}
% \affil[25]{\small{Department of Genetics, Stanford University School of Medicine, Stanford CA 94305, USA}}
% \affil[26]{\small{Department of Cell and Molecular Biology, National Bioinformatics Infrastructure Sweden, Science for Life Laboratory, Uppsala University, Husargatan 3, SE-752 37 Uppsala, Sweden}}
% \affil[27]{\small{Department of Integrative Biology, University of California, Berkeley, Berkeley CA, USA}}
% \affil[28]{\small{Department of Biostatistics, University of Washington, Seattle WA, USA}}
% \affil[29]{\small{Broad Institute of MIT and Harvard, Cambridge MA 02142, USA}}
\affil[30]{\small{Big Data Institute, Li Ka Shing Centre for Health Information and Discovery, University of Oxford, Oxford OX3 7LF, UK}}
% \affil[31]{\small{Cluster of Excellence - Controlling Microbes to Fight Infections, Eberhard Karls Universität Tübingen, Tübingen, Baden-Württemberg, Germany}}
% \affil[32]{\small{School of Life Sciences and The Biodesign Institute, Arizona State University, Tempe AZ, USA}}
% \affil[33]{\small{The Roslin Institute and Royal (Dick) School of Veterinary Studies, University of Edinburgh, Edinburgh EH25 9RG, UK}}
% \affil[34]{\small{Department of Molecular and Cellular Biology, University of Arizona, Tucson AZ 85721, USA}}
\affil[35]{\small{Department of Integrative Biology, University of Wisconsin-Madison, Madison WI, USA}}
\affil[36]{\small{Department of Mathematics, University of Oregon, Eugene OR 97402, USA}}
\affil[37]{\small{Department of Genetics, University of North Carolina at Chapel Hill, Chapel Hill NC 27599, USA}}
\affil[38]{\small{Efi Arazi School of Computer Science, Reichman University, Herzliya, Israel}}
\date{\small{\today{}}}
\begin{document}
\maketitle
\section*{Abstract}
\section*{Introduction}
\label{introduction}
% natural selection
Natural selection is a fundamental force in evolution, shaping the
genetic diversity of populations and driving the adaptation of
species to their environments. The effects of natural selection
on genetic variation are complex, and can be difficult to disentangle
from other evolutionary processes such as mutation, recombination,
and genetic drift \cite[e.g.,][]{gillespie1991causes}.
For instance changes in population size can lead to fluctuations
in genetic diversity across a recombining chromosome
that can mimic the effects of selection \citep{simonsen1995properties, barton1998effect},
% DRS -- I'm not sure Barton reference is right for this?
and lead to spurious inferences about the strength and targets of genetic adaptation
\cite{simonsen1995properties,akey2004population,nielsen2005genomic}.
In turn selection can confound our ability to infer demographic
history from allele frequencies \citep{ewing2016consequences,schrider2016effects} and
estimates of inverse coalescent rate \citep{schrider2016effects, johri2021impact, cousins2024accurate}.
Thus it is imperative to joingly account for the effects of selection
and demography when inferring evolutionary history from genetic data \citep{sheehan2016deep,johri2020toward}.
However this is a challenging task from a modeling perspective \citep{johri2022prospect}.
% simulation
% DRS: lots more refs will be needed in this paragraph. added a few and am happy to help add more later if needed
To meet the growing need for the interpretation,
analysis, and exploration of realistic and complex evolutionary models
the field of population genetics has increasingly turned to simulation.
Simulation in population genetics has a long history
including both backward-in-time coalescent simulations
\citep{kingman1982genealogy,hudson1983testing, hudson1990gene}
and forward-in-time simulations of complex demography and selection
\citep[e.g.,][]{gillespie1984molecular,thornton2014c++, haller2019slim}.
The development of simulation tools has been driven by the need to
understand the effects of complex evolutionary processes on genetic
variation \citep[e.g.,][]{galloway2020few}, to provide a null model for hypothesis testing
\citep[e.g.,][]{hudson1992statistical,hudson1994evidence,sabeti2002detecting}, to
to explore the power and limitations of statistical methods \cite[e.g.,][]{przeworski2002signature},
and increasingly to provide a basis for machine learning and other
simulation-based inference methods \citep[e.g.,][]{pavlidis2010searching,lin2011distinguishing,kern2018diplos,mughal2019localizing,sanchez2021deep,wang2021automatic}.
While this is so, joint simulation of complex demography and selection
is challenging, and requires a deep understanding of the underlying
evolutionary processes, as well as a number of parameter choices including
the strength of selection, the distribution of fitness effects, and the
recombination rate, as well as a parameterized model of demography.
This is a daunting task for many researchers, and can be a barrier to
the adoption of simulation-based methods in population genetics.
Furthermore, sharing and comparing simulation results among studies can
itself be challenging, as different simulation tools may use different structures
and parameter scalins for their models, and may not be directly comparable.
This is a major limitation for the field, as it can make it difficult
to assess the robustness of results, and to compare the results of
different studies.
% stdpopsim
% DRS: lots more refs will be needed in this paragraph. added a few and am happy to help add more later if needed
A lingering challenge with simulation in population genetics has been
reproducibility and the ability to share and compare results among
researchers \citep[e.g.,][]{ragsdale2020lessons}.
This challenge has been addressed in part by the development
of community resources for sharing and distributing simulation software
via the \stdpopsim project \citep{adrion2020community}. \stdpopsim
provides a standardized interface for accessing a wide range of
population genetic models, and has begun to be be widely adopted by the community. %maybe add citations here?
While that is so, the original version of \stdpopsim did not include
models of selection, which is a major limitation for more empirical
applications of population genetic simulation. In particular, modeling selection
through simulation is critical for understanding processes such
as adaptation, the effects of selective sweeps \citep[e.g.][]{braverman1995hitchhiking,fay2000hitchhiking,przeworski2002signature,przeworski2005signature,schrider2015soft}, and the impact of
background selection on genetic diversity
\citep[e.g.][]{charlesworth1993effect,charlesworth1995pattern,williamson2002genealogy,ewing2016consequences,torres2020temporal}.
Ideally, one would like
to have a single, unified framework for simulating both neutral and
non-neutral evolutionary processes, and to be able to compare the
results of these simulations to empirical data in a manner that is
both accessible to a wide range of researchers and highly reproducible.
Further the framework should include the complex realities of
genomes, including heterogenous recombination rates,
variation in the size and density of functional elements, and
non-equilibrium demographic histories. Ideally, one should be able
to model all of these features after estimates/annotations obtained
from a species of interest \citep[e.g.][]{schrider2020background}.
% goals
In this manuscript we provide an overview of a major new addition
to \stdpopsim---the inclusion of models of selection.
We begin by describing the models of selection that we have implemented,
as well as the parameter choices that are available to the user.
This includes models of background selection, selective sweeps, and
models of the distribution of fitness effects (DFE).
Further we describe how these models can be combined with genomic
annotations that are specific to a given species available
through the \stdpopsim API, and combined with a parameterized model of
demography and recombination to provide a realistic simulation of
genetic variation in a population.
We then provide a series of examples of how these models can be used
to benchmark and compare the performance of different methods for
inferring demographic history, the targets of recent positive selection,
and the distribution of fitness effects from genetic data.
\section*{Implementing Selection in \stdpopsim}
\label{selection}
% implementing selection
\stdpopsim previously provided a wide range of models of neutral
demographic history from roughly two dozen species \citep{lauterbur2023expanding}.
This was accomplished through the use of a standardized interface
and data structure that allows for curated addition of new
species and models, and can use either
msprime \citep{Baumdicker2022} or SLiM \citep{haller2019slim}
as the backend engine for efficient population genetic simulation.
To extend \stdpopsim's functionality to include models of selection,
we introduce an important new class representing a specfic distribution of fitness effects
to the \stdpopsim API: the \texttt{DFE} class.
This class provides an interface for specifying
a distribution of the fitness effects of new mutations, and
is added to a \texttt{Contig} object to provide a description
of the fitness effects of mutations along a chromosome.
A \texttt{DFE} object can apply to a specific interval set of a chromosome,
and can be combined with other \texttt{DFE} objects to provide,
a rich model of how selection may differ along a chromosome.
Once specified a model of selection is then run in the same way
but currently is limited to using SLiM simulation engine, which
is perhaps the most flexible simulation engine for modeling selection currently available.
As with other \stdpopsim models, when incorporating selection we aim to hew as closely as possible
to realistic genomes, and provide public annotations of specices-specific functional
elements that can be used along with a \texttt{DFE} object to
readily provide realistic model of selection for a specific genome and set of
populations. A schematic of the \stdpopsim catalog is shown in Figure \ref{fig:schematic}.
From a users perspective, one can specify a species, a portion of the genome to simulate,
a genetic map if available, a model of demography, and a model of selection that is
composed of a set of annotations and a \texttt{DFE} object.
The vast majority of published DFE estimates are based on a limited number of species,
so \stdpopsim currently has implemented published DFEs from
four species: \textit{Arabidopsis thaleana} \citep{huber2018gene}, \textit{Drosophila melanogaster} \citep{ragsdale2016triallelic,huber2017determining},
humans, and the vaquita porpoise \textit{Phocoena sinus} \citep{robinson2022critically}.
While that is so, these DFEs can be mixed and matched with the other species
in the catalog. For instance, one could simulate a butterfly species with a Gamma DFE originally
estimated from humans, or a human population with a DFE estimated from \textit{Drosophila}.
Furthermore the user can specify a custom DFE, and provide their own annotations
of functional elements to simulate selection in a species for which we do not yet have
a published DFE included in the catalog. This flexibility allows for a wide range of
models of selection to be simulated, and for the results of these simulations to be
compared to empirical data or used as a null model for hypothesis testing.
% sweep interface
In addition to the \texttt{DFE} class, we have also implemented a class (\texttt{stdpopsim.ext.selective\_sweep})
that enables selective sweeps to be simulated in \stdpopsim.
This class augments a demographic model with an ``extended event''
which conditions on the introduction of a selected mutation at a given time and position
and with a given selection coefficient. Further the user can specify the minimum frequency
of the selected allele at the time of sampling. As these extended events are implemented
on top of the existing \stdpopsim API, they can be combined with other models of selection
and demography to provide a rich model of the combined effects of multiple disparate evolutionary processes
on genetic variation in a population.
% DRS: This figure could really use some Python API and comand line examples like in Figs 1B,C
% form the original stdpopsim paper
\begin{wrapfigure}[37]{l}{0.4\textwidth}
\vspace{-0.0cm}
\includegraphics[width=\linewidth]{figures/schematics/catalog.pdf}
\caption{\label{fig:schematic}
A schematic of the \stdpopsim catalog for simulating genetic variation
in a population. The user specifies a species, a portion of the genome to simulate,
and optionally a genetic map, a model of demography, and a model of selection that
itself is composed of a distribution of fitness effects (DFE) and a set of functional
annotations.}
\end{wrapfigure}
\hfill
% current numbers of DFEs / species / etc
At this release the \stdpopsim catalog comprises 24 species (i.e. genome representations)
for which we have implemented XX demographic models, have XX genetic maps, and XX DFEs.
Additions to the catalog are ongoing -- we point the reader to our previously published
report detailing this effort \citep{lauterbur2023expanding} and we welcome contributions from the
community---we encourage those interested in contributing to visit https://popsim-consortium.github.io/stdpopsim-docs
or contact one of the PopSim Consortium members for guidance.
\section*{Example Applications with Selection}
\label{applications}
In this section we provide a series of examples of how the new models of selection
in \stdpopsim can be used to benchmark and compare the performance of different
methods for population genetic inference.
We focus on three main areas: demographic inference in the
single and multi-population settings, inference of the distribution of fitness effects,
and the detection of selective sweeps, particularly in the context of background selection.
\section*{Inference of $N_e(t)$ in the context of selection}
One of the most common applications of population genetic inference is to estimate
the effective population size over time, $N_e(t)$, from genetic data. This can be done
using a variety of methods, including the pairwise sequentially Markovian coalescent
(PSMC) \citep{li2011inference}, through use of the site frequency spectrum (SFS),
as well as through identity by decent information.
While that is so, it has been well described that selection, because of its effects on
local and more global patterns of genetic variation, can drive estimates of $N_e(t)$
further from true census population sizes simply because selection increases the
rate of coalescence in the genome.
Using \stdpopsim we can simulate genetic data under a model of selection and then
examine the performance of different methods for inferring $N_e(t)$ from these simulated
data. In Figure \ref{fig:1pop-human-demography} we show the results of such a benchmark
where we have simulated genomes under a model of human out of Africa (OOA) demography
with and without background selection on exons. For this simulation we have used
a genetic map from the HapMap Project \citep{international2007second} along with a
DFE for deleterious alleles (estimated by \cite{huber2017determining})
that acts on exonic positions defined from the HAVANA group release 104 for the human genome.
These simulations were then used to estimate $N_e(t)$ using four methods: 1) MSMC2 \citep{Schiffels2020},
2)the stairway plot method \citep{liu2020stairway}, 3) GONE \citep{santiago2020recent}, and 4) SMC++ \citep{terhorst2017robust}.
In this and what follows we have performed 3 replicate whole genome simulations, so uncertainty in estimates might
be roughly assessed, however we are mindful of wasted compute as thorough benchmarking
isn't the main purpose of this manuscript.
In Figure \ref{fig:1pop-human-demography} we show six panels, each of which shows the estimated $N_e(t)$
from each independent sample in a five-population model of human out-of-Africa demography
from Ragsdale \textit{et al.} (2019)
with samples taken from three modern geographic groups (CEU, CHB, and YRI from left to right).
The top row shows the results of simulations without selection, while the bottom row shows the results
of simulations with a Gamma distributed DFE for the deleterious effects of mutations occurring exons. In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$
in various colors for the four methods. Dotted lines on the bottom row are to visually reinforce that inference is done
from a model with selection. We see that these methods generally perform reasonably well in the absence of selection,
although GONE seems to overestimate $N_e(t)$ in the more recent past for the CEU and CHB populations.
Presumably this could be due to patterns of linkage disequilibrium that result from the combination
of bottleneck and growth in these populations.
In the presence of selection, different methods behave differently.
For instance stairwaiplot which uses the SFS to infer $N_e(t)$ seems to slightly underestimate $N_e(t)$
but is reasonably close to the true $N_e(t)$ in the absence of selection.
Likewise SMC++ seems to be more robust to the presence of selection,
but there is some added noise to the returned estimates.
MSMC2 seems to be the most affected by the presence of selection,
estimates of $N_e(t)$ varying quite a bit from the true $N_e(t)$.
% DRS: I don't agree with the interpretation in the previous two sentences (starting with "Likewise").
% MSMC is already barfing a little bit even without selection, and it is not clear to me from these
% plots that selection makes things any worse. Also, SMC++ is doing quite well without selection so maybe
% the minor drop in accuracy is meaningful in relative terms.
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/HomSap/OOA/estimated_Ne_t_final}
\caption{
\label{fig:1pop-human-demography}
Performance of methods to infer $N_e(t)$ from a human out-of-Africa model \citep{ragsdale2019models}
with and without background selection on exons. The top row shows estimates of $N_e(t)$ from simulations
without selection, while the bottom row shows estimates of $N_e(t)$ from simulations with a Gamma distributed
DFE acting on exons. In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$ from four methods:
MSMC2 \citep{Schiffels2020}, stairway plot \citep{liu2020stairway}, GONE \citep{santiago2020recent}, and SMC++ \citep{terhorst2017robust}.
}
\end{figure}
To highlight the ease of comparisons between different species and model enabled by \stdpopsim,
we have also performed a similar benchmark using the vaquita porpoise \textit{Phocoena sinus}.
In Figure \ref{fig:1pop-vaquita-demography} we show the results of $N_e(t)$ inference
where we have simulated genomes under a model of vaquita demography from \textcite{robinson2022critically}.
For these simulations we have used the genome structure and exon annotations from the vaquita genome assembly
and a DFE from \textcite{robinson2022critically} that acts on exonic positions, however no genetic map
is currently available, so recombination rates are assumed to be constant across the genome.
The demographic model is a simple two-epoch model where population size has declined in the recent past.
In this case we see that all methods perform well both in the absence and presence of selection,
with a close match between the true $N_e(t)$ and the estimated $N_e(t)$ in all cases.
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/PhoSin/Vaquita2Epoch_1R22/estimated_Ne_t_final}
\caption{
\label{fig:1pop-vaquita-demography}
Performance of methods to infer $N_e(t)$ from simulations of the vaquita porpoise genome under a single population
model of declining population size \citep{robinson2022critically} with and without background selection on exons.
The top panel shows estimates of $N_e(t)$ from simulations
without selection, while the bottom panel shows estimates of $N_e(t)$ from simulations with a Gamma distributed
DFE acting on exons. In each panel we show the true $N_e(t)$ in black, and the estimated $N_e(t)$ from four methods:
MSMC2 \citep{Schiffels2020}, stairway plot \citep{liu2020stairway}, GONE \citep{santiago2020recent}, and SMC++ \citep{terhorst2017robust}.
}
\end{figure}
\section*{Estimation of the Distribution of Fitness effects}
\label{dfe}
Another common application of population genetic inference is to estimate the distribution of fitness effects (DFE)
from genetic data. Here we show how the new models of selection in \stdpopsim can be used to benchmark and compare
the performance of different methods for inferring the DFE from genetic data. Here we have performed simulations using the
human and vaquita porpoise models described above, with complex demographic histories and models of selection acting on exons.
To remind the reader, the model of selection in each case is a Gamma distributed DFE for deleterious mutations.
\begin{figure}[htbp]
\centering
\includegraphics[width=\linewidth]{figures/HomSap/OOA/HomSap_OutOfAfricaArchaicAdmixture_5R19_Gamma_K17_ensembl_havana_104_exons_DFE_plot}
\caption{A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
Simulations were performed using a human out-of-Africa model \citep{ragsdale2019models} with a Gamma distributed DFE
acting on exons parameterized by a mean selection coefficient and a shape parameter. Estimates of the
parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
The human model has three extant populations (CEU, CHB, and YRI), and parameter estimates from each
sample are shown in different colors.
Estimates from three different methods are shown: 1) GRAPES \cite{galtier2016adaptive}, 2) polyDFE \citep{tataru2020polydfe},
and 3) dadi \citep{gutenkunst2009inferring}.}
\label{fig:homsap-dfe.ooa}
\end{figure}
In Figures \ref{fig:homsap-dfe.constant} and \ref{fig:homsap-dfe.ooa} we show a comparison of estimates
from three software packages for inferring the DFE from genetic data--
dadi \citep{gutenkunst2009inferring}, polyDFE \citep{tataru2020polydfe},
and GRAPES \citep{galtier2016adaptive}.
Figure \ref{fig:homsap-dfe.constant} shows estimates from a single, constant size population of humans,
while Figure \ref{fig:homsap-dfe.ooa} shows estimates from a model of human out-of-Africa demography.
In a constant size population, all methods perform reasonably well, although dadi seems to
perform slightly worse than the other methods examined, at least for the small number of replicates we simulated here.
Notably only polyDFE overlapped the true mean selection coefficient, whereas
the other methods slightly underestimated the strength of selection.
For the shape parameter in constant size simulations GRAPES and polyDFE are roughly
equivalent in their performance whereas dadi seems to yield overestimates.
In the out-of-Africa model (fig \ref{fig:homsap-dfe.ooa}) the performance of the methods is more mixed.
In particular variation in accuracy of the parameters estimated in each
sample population depends on the methods used, and each method seems to
have its own biases with respect to the way in which population history
affects parameter estimates. For instance for dadi we see that estimates of
the shape parameter are most accurate in the YRI sample, while for polyDFE
estimates are consistently least accurate in the YRI sample.
With respect to the strength of selection, as in constant size populations,
all methods are performing reasonably well, although dadi seems to
trail in terms of accuracy.
%add PhoSin DFE figure here
\begin{figure}
\centering
\includegraphics[width=\textwidth]{figures/PhoSin/Vaquita2Epoch_1R22/PhoSin_Vaquita2Epoch_1R22_Gamma_R22_Phocoena_sinus.mPhoSin1.pri.110_exons_DFE_plot.pdf}
\caption{
\label{fig:vaquita-dfe}
A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
Simulations were performed using a model of vaquita porpoise demography \citep{robinson2022critically} with a Gamma distributed DFE
acting on exons parameterized by a mean selection coefficient and a shape parameter. Estimates of the
parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
This vaquita model has a single extant population, labelled pop 0, and parameter estimates from each
of the three different methods are shown: 1) GRAPES \cite{galtier2016adaptive}, 2) polyDFE \citep{tataru2020polydfe},
and 3) dadi \citep{gutenkunst2009inferring}.}
\end{figure}
In Figures \ref{fig:vaquita-dfe.constant} and \ref{fig:vaquita-dfe} we show a comparison of estimates
from the same three software packages for inferring the DFE from genetic data, but for simulations of the
vaquita porpoise genome. In the constant size model (fig \ref{fig:vaquita-dfe.constant}) we see that all methods
perform uniformly worse that in the human genome simulations, which consistent underestimation of the mean selection coefficient
and overestimation of the shape parameter.
In context of our model of vaquita porpoise demography \citep{robinson2022critically}, which models a population
decline towards the present, we see that methods to infer the DFE perform no better-- again each method
signficiantly underestimates the mean selection coefficient and overestimates the shape parameter.
%ADK: add why we think this is the case-- is this a bug? or a feature of the genome?
\section*{Detection of Selective Sweeps}
\label{sweeps}
We next highlight how the new models of selection in \stdpopsim can be used to benchmark and compare
the performance of different methods for detecting selective sweeps from genetic data.
Here we have performed simulations of human chromosome 1 under the three population out-of-Africa model from \cite{gutenkunst2009inferring}
with purifying selection acting on exons under the Gamma-distributed DFE described above, the HapMap II genetic map \citep{international2007second},
but overlay on top of this a model of selective sweeps. In particular we have conditioned simulations on the introduction of a beneficial mutation
at a given location (to be varied) in the genome at a given time, and with a moderately strong selection coefficient ($s = 0.03$; $2Ns \sim 600$).
For each sweep location we performed 200 replicate simulations, sampling from them only if the selected allele reached a frequency of
0.95 or greater. Rather than simulate the entire chromosome arm, replicate simulations were performed on 5cM regions of the chromosome
at each of 100 different locations. As recombination rate varies across the genome, the size of the region in base pairs simulated varies,
however the recombination distance, which is critical for the linked effects of a sweep, is held constant.
%sweeep power figure
\begin{figure}
\centering
\includegraphics[width=0.8 \textwidth]{figures/sweeps/chr1_power.pdf}
\caption{
Power to detect selective sweeps at 100 locations along human chromosome 1 under a three population out-of-Africa model \citep{gutenkunst2009inferring}
with and without purifying selection acting on exons under the Gamma-distributed DFE described above.
The HapMap II genetic map \citep{international2007second} was used in each simulation.
Single beneficial mutations were introduced at each location with a selection coefficient of $s = 0.03$,
and only sampled if they reached a terminal frequency of 0.95 or greater.
Three methods for detecting sweeps were used: 1) sweepfinder2 \citep{degiorgio2016sweepfinder2} (top row),
2) diploshic \citep{kern2018diplos} (middle row), and 3) an empirical cutoff of $\pi$ to find regions of reduced diversity (bottom row).
True positive rate is shown for these methods for samples from the CEU and YRI samples (left and right respectively).
Under each method we show as a heat map exon density along the chromosome, along with the local recombination rate in cM/Mb.
Power is shown as the proportion of replicates in which a sweep was detected at each location.
Each panel shows the results of simulations with purifying selection acting on exons (red; labelled BGS)
and without purifying selection acting on exons (blue; labelled Neutral).
}
\label{fig:chr1_power}
\end{figure}
In Figure \ref{fig:chr1_power} we show the power to detect selective sweeps at 100 locations along human chromosome 1 from
the simulations described above. We show the results of three methods for detecting sweeps: 1) sweepfinder2 \citep{degiorgio2016sweepfinder2},
2) diploshic \citep{kern2018diplos}, and 3) an empirical cutoff of $\pi$ to find regions of reduced diversity.
A few things are immediately apparent from this figure. First, the power to detect sweeps is generally higher in the YRI sample
than in the CEU sample, which is consistent with the fact that the YRI sample has a larger effective population size and thus
more variation to detect a sweep. Further the population size of YRI has been relatively constant over the last 100,000 years,
thus the out-of-Africa bottleneck has does not mute the signal of a sweep as much as in the CEU sample. This is consistent with
what we know about the effects of demography on the power to detect sweeps \citep[e.g.,][]{simonsen1995properties}.
Second, the power to detect sweeps is slightly lower in the presence of purifying selection acting on exons but the effect is not
as strong as one might expect. This is likely because the strength of selection acting on exons is relatively weak, and the
sweep detection methods we have used are relatively robust to the effects of background selection.
Finally and most importantly, the power to detect sweeps varies considerably among regions of the genome.
Indeed variation in power along the chromosome is much greater than the variation in power between the CEU and YRI samples or between
models with or without purifying selection. This is likely due to the effects of recombination rate heterogeniety as well
as the density of functional elements along the chromosome.
%sweep power vs recombination rate
\begin{figure}
\centering
\includegraphics[width=0.8 \textwidth]{figures/sweeps/relationship_power_cM.pdf}
\caption{
Power to detect selective sweeps at 100 locations along human chromosome 1 under a three population out-of-Africa model \citep{gutenkunst2009inferring}
with and without purifying selection as a function of local recombination rate.
Simulations are as described in Figure \ref{fig:chr1_power}.
Each panel shows the results of simulations with purifying selection acting on exons (red; labelled BGS)
and without purifying selection acting on exons (blue; labelled Neutral).
True positive rate is shown for these methods for samples from the CEU and YRI samples (left and right respectively).
Points represent the power at each location and trend lines along with 95\% confidence intervals are shown fit to those points.
}
\label{fig:power-recomb}
\end{figure}
To further explore the effects of recombination rate and exon density on the power to detect sweeps, we show in Figure \ref{fig:power-recomb}
the relationship between power and local recombination rate for our sweep simulations. We see that power to detect sweeps is generally
a decreasing function of local recombination rate, which is consistent with sweeps having a larger genomic footprint in regions of low
recombination. We also see that the presence of purifying selection acting on exons has a relatively small effect on the power to detect sweeps,
but one that is present across the entire range of recombinations rates observed in human chromosome 1.
We also note that the different methods for detecting sweeps that we have explored here respond differently to the effects of recombination rate.
Unsurprisingly, the empirical cutoff of $\pi$ to detect sweeps is the less robust to the effects of recombination rate than the other two methods,
that are specifically designed to find selective sweeps. Whereas sweepfinder2 has a clear decline in power with increasing recombination rate,
diploshics behavior is more variable, with a slight increase in power at intermediate recombination rates in the CEU sample and a decline
in power at high recombination rates in the YRI sample.
In Figure \ref{fig:power-exon} we show the relationship between power and local exon density for our sweep simulations.
Here there is little relationship between power and exon density, which is somewhat surprising given that the power to detect sweeps
is generally lower in the presence of purifying selection acting on exons. However while this is the case we see that
generally all methods have more power to detect sweeps in regions of lower exon density, which is consistent with the idea that
background selection acting on nearby exons can reduce the signature of a sweep. However this is far from a simple pattern:
for instance sweepfinder2 seems to have minimum power in regions of intermediate exon density, while diploshic is roughly constant
or perhaps has a slight increase in power at intermediate exon densities. Noise is the estimates of power with respect to exon density
are prohibitively high for the empirical cutoff of $\pi$ to detect sweeps, so we it's not clear what conclusions can be drawn.
\lipsum[20-25]
% \section*{outline}
% \label{outline}
% \begin{enumerate}
% \item Single-population demographic inference methods
% \begin{enumerate}
% \item mscm2
% \item stairwaiplot
% \item GONE
% \item SMC++
% \end{enumerate}
% \item DFE inference methods
% \begin{enumerate}
% \item dadi
% \item polyDFE
% \item DFE-alpha
% \item GRAPES
% \end{enumerate}
% \item sweeps
% \begin{enumerate}
% \item compare effect of BGS on sweep detection
% \item compare power in different pops
% \item compare methods
% \end{enumerate}
% \end{enumerate}
\section*{Conclusion}
\label{conclusion}
\section*{Data availability}\label{data_availability}
\section*{Acknowledgments}\label{acknowledgements}
\section*{Funding}
\label{funding}
\printbibliography
%%%%%%%% supplementary material
\clearpage
\beginsupplement
\section*{Supplementary Material}
% constant size DFE figure
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{figures/HomSap/Constant/HomSap_Constant_Gamma_K17_ensembl_havana_104_exons_DFE_plot}
\caption{
Performance of methods to infer the distribution of fitness effects (DFE) from genetic data.
Simulations were performed using a human constant size model with a Gamma distributed DFE
acting on exons parameterized by a mean selection coefficient and a shape parameter. On each row we show the true parameter
as a red line, and the estimates as boxplots from replicate simulations. The human model has three extant populations (CEU, CHB, and YRI).
Each row of the figure shows estimates from a different method: 1) dadi \citep{gutenkunst2009inferring}, 2) polyDFE \citep{tataru2020polydfe},
3) DFE-alpha \citep{eyre2009estimating}, and 4) GRAPES \citep{galtier2016adaptive}.
}
\label{fig:homsap-dfe.constant}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=\textwidth]{figures/PhoSin/Constant/PhoSin_Constant_Gamma_R22_Phocoena_sinus.mPhoSin1.pri.110_exons_DFE_plot.pdf}
\caption{
\label{fig:vaquita-dfe.constant}
A comparison of methods for inferring the distribution of fitness effects (DFE) from genetic data.
Simulations were performed using a model of vaquita porpoise genome under constant population size with a Gamma distributed DFE
acting on exons parameterized by a mean selection coefficient and a shape parameter. Estimates of the
parameters of the DFE are shown in the left ($\lvert E(s) \rvert $) and right hand panels (shape) respectively.
This vaquita model has a single extant population, labelled pop 0, and parameter estimates from each
of the three different methods are shown: 1) GRAPES \cite{galtier2016adaptive}, 2) polyDFE \citep{tataru2020polydfe},
and 3) dadi \citep{gutenkunst2009inferring}.}
\end{figure}
% constant all site size DFE figure
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{figures/HomSap/Constant/HomSap_Constant_Gamma_K17_all_sites_DFE_plot}
\caption{
\label{fig:homsap-dfe.constant.all_sites}
Performance of methods to infer the distribution of fitness effects (DFE) from genetic data
where all sites along the simulated chromosome can experience selected and neutral mutation
as opposed to exonic regions only.
}
\end{figure}
\begin{table}[ht]
\centering
\small
\caption{\bf{Performance of DFE methods across the simulation scenarios}.
Bold rows show the lowest mean absolute error (MAE) for the two DFE parameters
for each species.}
\begin{tabular}{lllrrrrrr}
\toprule
species ID & demography & annotation & \makecell{MAE \\ $E|s|$ \\ dadi} & \makecell{MAE \\ $E|s|$ \\ grapes} & \makecell{MAE \\ $E|s|$ \\ polyDFE} & \makecell{MAE \\ shape \\ dadi} & \makecell{MAE \\ shape \\ grapes} & \makecell{MAE \\ shape \\ polyDFE} \\
\midrule
HomSap & Constant & all sites & 0.0027 & 0.0049 & \bf{0.0018} & \bf{0.00071} & 0.029 & 0.014 \\
HomSap & Constant & exons & 0.0012 & \bf{0.00081} & 0.0023 & 0.030 & 0.0086 & \bf{0.0068} \\
HomSap & OOA 5R19 & all sites & 0.012 & 0.0079 & \bf{0.0074} & \bf{0.027} & 0.055 & 0.035 \\
HomSap & OOA 5R19 & exons & 0.014 & \bf{0.0049} & 0.0072 & 0.031 & 0.051 & \bf{0.024} \\
PhoSin & Constant & all sites & 0.024 & \bf{0.024} & 0.024 & 0.21 & 0.24 & \bf{0.19} \\
PhoSin & Constant & exons & 0.024 & \bf{0.024} & 0.024 & \bf{0.23} & 0.25 & 0.23 \\
PhoSin & Vaquita2Epoch 1R22 & all sites & 0.024 & \bf{0.023} & 0.024 & 0.20 & 0.23 & \bf{0.18} \\
PhoSin & Vaquita2Epoch 1R22 & exons & 0.024 & \bf{0.023} & 0.024 & \bf{0.18} & 0.23 & 0.21 \\
\bottomrule
\end{tabular}
\label{tab:dfe_table}
\end{table}
%sweep power vs exon density
\begin{figure}
\centering
\includegraphics[width=0.8 \textwidth]{figures/sweeps/relationship_power_exon.pdf}
\caption{
Power to detect selective sweeps at 100 locations along human chromosome 1 under a three population out-of-Africa model \citep{gutenkunst2009inferring}
with and without purifying selection as a function of local exon density.
Simulations are as described in Figure \ref{fig:chr1_power}.
Each panel shows the results of simulations with purifying selection acting on exons (red; labelled BGS)
and without purifying selection acting on exons (blue; labelled Neutral).
True positive rate is shown for these methods for samples from the CEU and YRI samples (left and right respectively).
Points represent the power at each location and trend lines along with 95\% confidence intervals are shown fit to those points.
}
\label{fig:power-exon}
\end{figure}
\stopsupplement
\end{document}