packages <- c("tidyverse", "viridis", "ggpubr",
"vegan", "Biostrings", "phyloseq",
"microbiome", "RColorBrewer", "ggridges",
"genoPlotR", "gghalves", "ggtree")
suppressPackageStartupMessages(lapply(packages,
library,
character.only=T,
warn.conflicts=F,
verbose=F))
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "viridis" "viridisLite" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "ggpubr" "viridis" "viridisLite" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "vegan" "lattice" "permute" "ggpubr" "viridis"
## [6] "viridisLite" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[5]]
## [1] "Biostrings" "GenomeInfoDb" "XVector" "IRanges" "S4Vectors"
## [6] "stats4" "BiocGenerics" "vegan" "lattice" "permute"
## [11] "ggpubr" "viridis" "viridisLite" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "phyloseq" "Biostrings" "GenomeInfoDb" "XVector" "IRanges"
## [6] "S4Vectors" "stats4" "BiocGenerics" "vegan" "lattice"
## [11] "permute" "ggpubr" "viridis" "viridisLite" "lubridate"
## [16] "forcats" "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [26] "graphics" "grDevices" "utils" "datasets" "methods"
## [31] "base"
##
## [[7]]
## [1] "microbiome" "phyloseq" "Biostrings" "GenomeInfoDb" "XVector"
## [6] "IRanges" "S4Vectors" "stats4" "BiocGenerics" "vegan"
## [11] "lattice" "permute" "ggpubr" "viridis" "viridisLite"
## [16] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [21] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [26] "stats" "graphics" "grDevices" "utils" "datasets"
## [31] "methods" "base"
##
## [[8]]
## [1] "RColorBrewer" "microbiome" "phyloseq" "Biostrings" "GenomeInfoDb"
## [6] "XVector" "IRanges" "S4Vectors" "stats4" "BiocGenerics"
## [11] "vegan" "lattice" "permute" "ggpubr" "viridis"
## [16] "viridisLite" "lubridate" "forcats" "stringr" "dplyr"
## [21] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [26] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [31] "datasets" "methods" "base"
##
## [[9]]
## [1] "ggridges" "RColorBrewer" "microbiome" "phyloseq" "Biostrings"
## [6] "GenomeInfoDb" "XVector" "IRanges" "S4Vectors" "stats4"
## [11] "BiocGenerics" "vegan" "lattice" "permute" "ggpubr"
## [16] "viridis" "viridisLite" "lubridate" "forcats" "stringr"
## [21] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [26] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [31] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "genoPlotR" "grid" "ade4" "ggridges" "RColorBrewer"
## [6] "microbiome" "phyloseq" "Biostrings" "GenomeInfoDb" "XVector"
## [11] "IRanges" "S4Vectors" "stats4" "BiocGenerics" "vegan"
## [16] "lattice" "permute" "ggpubr" "viridis" "viridisLite"
## [21] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [26] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [31] "stats" "graphics" "grDevices" "utils" "datasets"
## [36] "methods" "base"
##
## [[11]]
## [1] "gghalves" "genoPlotR" "grid" "ade4" "ggridges"
## [6] "RColorBrewer" "microbiome" "phyloseq" "Biostrings" "GenomeInfoDb"
## [11] "XVector" "IRanges" "S4Vectors" "stats4" "BiocGenerics"
## [16] "vegan" "lattice" "permute" "ggpubr" "viridis"
## [21] "viridisLite" "lubridate" "forcats" "stringr" "dplyr"
## [26] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [31] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [36] "datasets" "methods" "base"
##
## [[12]]
## [1] "ggtree" "gghalves" "genoPlotR" "grid" "ade4"
## [6] "ggridges" "RColorBrewer" "microbiome" "phyloseq" "Biostrings"
## [11] "GenomeInfoDb" "XVector" "IRanges" "S4Vectors" "stats4"
## [16] "BiocGenerics" "vegan" "lattice" "permute" "ggpubr"
## [21] "viridis" "viridisLite" "lubridate" "forcats" "stringr"
## [26] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [31] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [36] "utils" "datasets" "methods" "base"
source("scripts/plot_theme.R")
theme_set(PAdJ_theme())
knitr::opts_chunk$set(cache = T)
genome <- "NatCom_15117"
annotations <-
rio::import("datatables/pharokka_cds_final_merged_output.tsv")
annotation.df <-
rio::import("datatables/phynteny.tsv")
DNA_segs <-
as.dna_seg(annotation.df %>%
mutate(fill = case_when(
category_combined == "moron, auxiliary metabolic gene and host takeover" ~ "#ed9b40",
category_combined == "transcription regulation" ~ "#8c5e5f",
category_combined == "DNA, RNA and nucleotide metabolism" ~ "#5c1a1b",
category_combined == "head and packaging" ~ "#335c67",
category_combined == "connector" ~ "#708c94",
category_combined == "other" ~ "grey60",
category_combined == "tail" ~ "#192e33",
category_combined == "lysis" ~ "#8d6b94"
)) %>%
transmute(name = ID,
start = start,
end = end,
strand = strand,
fill,
color = "black"))
plot_gene_map(dna_segs = list(DNA_segs),
arrow_head_len = 500,
plot_new = T)
An hmmsearch was done with an hmm-profile built from each of the nine marker genes from our earlier paper (https://doi.org/10.1038/s41467-022-31390-5) against contigs from seven phage databases.
Database | Contigs (all) | Contigs (25kbp) |
---|---|---|
IMG/VR | 15,722,824 | 1,128,039 |
GPD | 142,809 | 84,491 |
CHVD | 93,860 | 53,722 |
de Jonge (2022) | 63,481 | 19,157 |
MGV | 358,455 | 18,860 |
GVD | 33,242 | 6,769 |
RefSeq | 4,220 | 3,950 |
setwd("analysis_output/hmm_output/")
hmm_out_full <-
map_dfr(
list.files(),
function(hmm){
read_table(
hmm,
skip = 3,
col_names = F,
comment = '#',
show_col_types = F
) %>%
transmute(
orfName = X1,
contigName = str_remove(X1, '_CDS_\\d+$'),
source = str_remove(X1, '_.+$'),
target = str_remove(X3, '_ali'),
bitscore = X9
) %>%
filter(bitscore >= 50) %>%
group_by(contigName, target) %>%
slice_max(order_by = bitscore,
n = 1,
with_ties = F) %>%
ungroup()
}
)
checkv <- rio::import("datatables/checkv_quality_summary.tsv")
hist.df <-
hmm_out_full %>%
group_by(contigName) %>%
mutate(markers = n_distinct(target)) %>%
ungroup() %>%
filter(target == "00091",
contigName %in% subset(checkv,
completeness == 100 &
provirus == "No" &
warnings == "")$contig_id)
ggplot(data = hist.df,
aes(x = bitscore,
y = as.character(markers),
color = markers,
fill = markers)) +
geom_density_ridges(alpha = 0.1,
jittered_points = TRUE,
point_size = 0.1,
point_alpha = 0.1,
bandwidth = 20) +
geom_vline(xintercept = 700,
color = "tomato3",
linetype = "dashed") +
theme(
axis.line.x = element_blank(),
axis.line.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = ""
) +
xlab("TerL bitscore") +
ylab("marker genes in genome") +
scale_color_viridis() +
scale_fill_viridis() +
scale_y_discrete(expand = expansion(mult = c(0.01, .15))) +
scale_x_continuous(breaks = seq(0,1200, by = 200))
PCs <-
read.table("datatables/heliusvirales_cluster.tsv",
header = F, sep = '\t') %>%
transmute(
PC = V1,
ORFname = V2,
contigName = str_remove(V2, "_CDS_\\d+$")
) %>%
filter(ORFname != "")
recreate Supplementary Table 1
PC_prevalence <-
tibble(PCs) %>%
left_join(annotations[,c(1,22)],
by = c("ORFname"="gene")) %>%
group_by(PC) %>%
mutate(contigs = n_distinct(contigName)) %>%
group_by(PC, contigs, annot) %>%
reframe(annot_contig = n_distinct(contigName)) %>%
select(PC, contigs, annot, annot_contig) %>%
distinct() %>%
arrange(-annot_contig) %>%
group_by(PC) %>%
mutate(
best_annot = case_when(
n() == 1 ~ annot,
dplyr::first(annot) != "hypothetical protein" ~ dplyr::first(annot),
dplyr::first(annot) == "hypothetical protein" ~ dplyr::nth(annot,2),
TRUE ~ "none")
) %>%
ungroup() %>%
select(PC, contigs, best_annot) %>%
distinct() %>%
arrange(-contigs) %>%
mutate(
ref_gene = PC,
PC = paste("PC", row_number(), sep = "_")
) %>%
left_join(rio::import("datatables/deJonge_PCs.tsv"), by = "PC")
rbind(
PC_prevalence %>%
slice_max(contigs, n = 14)
,
PC_prevalence %>%
filter(!is.na(deJonge2022_marker_gene))
) %>%
distinct() %>%
mutate(group = ifelse(is.na(deJonge2022_marker_gene),
"others", "marker_gene")) %>%
ggdotchart(
x = "PC",
y = "contigs",
add = "segment",
add.params = list(color = "grey",
size = 2),
dot.size = 4,
rotate = T,
sorting = "desc",
color = "group",
palette = c("tomato3",
"grey40"),
ggtheme = PAdJ_theme()
) +
theme(#axis.text.y = element_blank(),
panel.grid.major.y = element_blank())
source("scripts/Fig_2B.R")
cowplot::plot_grid(plot_1, plot_2,
ncol = 1, align = "vh",
rel_heights = c(0.25,1))
suppressWarnings(source("scripts/TerL_parsing.R"))
origins %>%
inner_join(helius_groups, by = join_by(contigName)) %>%
group_by(Family, Subfamily, Origin_simple) %>%
reframe(contigs = n()) %>%
mutate(Lineage = paste(
str_replace(Family, "_", " "),
str_replace(Subfamily, "_", " "),
sep = ";"
)) %>%
filter(!Lineage %in% c("unclassified Heliusvirales;unclassified Heliusvirales",
"Utuviridae;unclassified Utuviridae")) %>%
ggbarplot(
x = "Lineage",
fill = "Origin_simple",
y = "contigs",
rotate = T,
palette = c(
"#4DAF4A",
"#E41A1C",
"#925E9F",
"#377EB8",
"grey40",
"#EFC000",
"#ff9cd6",
"#db67ab",
"grey"
),
ggtheme = PAdJ_theme(),
ylab = "number of contigs"
) +
scale_y_continuous(expand = c(0, 0.3)) +
theme(
panel.grid.major.y = element_blank(),
axis.text.y = element_text(face = "italic"),
legend.position = "right"
) +
guides(fill = guide_legend("Genome origin"))
rm(hmm_out_full, annotation.df, DNA_segs, hist.df, PC_prevalence)
iphop_hosts <-
rio::import("datatables/Host_prediction_to_genus_m90.csv",
setclass = "tibble") %>%
transmute(
Virus,
Host_tax = `Host genus`,
Confidence = `Confidence score`
) %>%
right_join(helius_groups[,c(1:4)],
by = c("Virus"="contigName")) %>%
separate_wider_delim(Host_tax,
";",
names = c("H_Domain",
"H_Phylum",
"H_Class",
"H_Order",
"H_Family",
"H_Genus")) %>%
mutate(across(starts_with("H_"),
~replace_na(.,
"unknown")),
across(starts_with("H_"),
~str_remove(.,
".__")),
H_Family = ifelse(H_Family == "",
paste("unknown",
H_Order,
sep = "_"),
H_Family),
H_Genus = ifelse(H_Genus == "",
paste("unknown",
H_Family,
sep = "_"),
H_Genus),
Lineage = paste(Family,
Subfamily,
sep = ";"))
most_common_families <-
iphop_hosts %>%
mutate(H_lineage = paste(H_Phylum, H_Family)) %>%
group_by(H_lineage) %>%
reframe(genomes = n_distinct(Virus)) %>%
arrange(-genomes) %>%
filter(H_lineage != "unknown unknown") %>%
slice_max(genomes, n = 10)
colors_hosts <-
c("#7CAF89",
"#d4b7b7",
"#9b5758",
"#a87eb3",
"#c9afcf",
"#e9dfec",
"#afa4bd",
"#ebe8ee",
"#ffffff",
"#ecb965",
"#277B3D",
"#710F11",
"#935FA0",
"#E59B24",
"#69C9CA",
"grey30",
"grey80"
)
names(colors_hosts) <-
c("Actinobacteriota Actinomycetaceae",
"Firmicutes Enterococcaceae",
"Firmicutes Streptococcaceae",
"Firmicutes_A Acutalibacteraceae",
"Firmicutes_A CAG-74",
"Firmicutes_A Clostridiaceae",
"Firmicutes_A Lachnospiraceae",
"Firmicutes_A Oscillospiraceae",
"Firmicutes_A Ruminococcaceae",
"Firmicutes_C Acidaminococcaceae",
"Actinobacteriota (other)",
"Firmicutes (other)",
"Firmicutes_A (other)",
"Firmicutes_C (other)",
"Proteobacteria (other)",
"x_others",
"x_unknown")
iphop_hosts %>%
mutate(H_lineage = paste(H_Phylum, H_Family),
H_lineage = case_when(
H_lineage %in% most_common_families$H_lineage ~ H_lineage,
H_lineage == "unknown unknown" ~ "x_unknown",
startsWith(H_lineage, "Firmicutes_A") ~ "Firmicutes_A (other)",
startsWith(H_lineage, "Firmicutes_C") ~ "Firmicutes_C (other)",
startsWith(H_lineage, "Firmicutes") ~ "Firmicutes (other)",
startsWith(H_lineage, "Actinobacteriota") ~ "Actinobacteriota (other)",
startsWith(H_lineage, "Proteobacteria") ~ "Proteobacteria (other)",
TRUE ~ "x_others")) %>%
group_by(Lineage, H_lineage) %>%
reframe(genomes = n_distinct(Virus)) %>%
filter(!str_remove(Lineage, ".+;") %in% c("unclassified_Heliusvirales",
"unclassified_Utuviridae",
"unclassified_Hathorviridae",
"unclassified_Aineviridae")) %>%
ggbarplot(x = "Lineage",
fill = "H_lineage",
palette = colors_hosts,
y = "genomes",
rotate = T,
ggtheme = PAdJ_theme(),
ylab = "number of contigs") +
theme(legend.position = "right",
panel.grid.major.y = element_blank())
AMG_pathways <-
rio::import("datatables/AMGs/VIBRANT_AMG_pathways_all_heliusviridae.tsv")
AMG_genome <-
rio::import("datatables/AMGs/VIBRANT_AMG_individuals_all_heliusviridae.tsv")
KO_pathway <-
AMG_pathways %>%
select(-`Total AMGs`) %>%
separate(
col = `Present AMG KOs`,
sep = ",",
into = paste("KO", c(1:19), sep = ""),
fill = "right"
) %>%
pivot_longer(cols = -c(1:3), values_to = "KO") %>%
select(-name) %>%
filter(!is.na(KO))
total <-
helius_groups %>%
mutate(Subfamily = paste(Family, Subfamily, sep = ";")) %>%
group_by(Subfamily) %>%
summarise(total = n())
world_map <-
map_data("world") %>%
filter(! long > 180)
countries <-
world_map %>%
distinct(region)
origins %>%
mutate(Origin = ifelse(startsWith(Origin, "Human"), "Human", "non-human"),
country_detected = case_when(
country_detected == "United Kingdom" ~ "UK",
country_detected == "United States" ~ "USA",
TRUE ~ country_detected
)) %>%
group_by(country_detected) %>%
summarise(category = case_when(
!"Human" %in% c(Origin) ~ "only non-human",
!"non-human" %in% c(Origin) ~ "only Human",
TRUE ~ "both Human and non-human"
)) %>%
full_join(countries,
by = c("country_detected" = "region")) %>%
mutate(category = replace_na(category, "none detected/no data"),
category = factor(category,
levels = c("only Human",
"only non-human",
"both Human and non-human",
"none detected/no data"))) %>%
ggplot(aes(fill = category,
map_id = country_detected)) +
geom_map(map = world_map,
color = "white",
linewidth = 0.1) +
expand_limits(x = world_map$long,
y = world_map$lat) +
coord_map("moll") +
ggthemes::theme_map() +
scale_fill_manual(values = c("#7f9fad", "#bc5090", "#003f5c", "grey40"))
source("scripts/pasolli_parsing.R")
rbind(
pasolli_TerL %>%
group_by(Study) %>%
summarise(prevalence =
round(n_distinct(Subject[diversity > 0])/n_distinct(Subject),
3),
total =
n_distinct(Subject[diversity > 0])) %>%
ungroup(),
pasolli_TerL %>%
summarise(prevalence =
round(n_distinct(Subject[diversity > 0])/n_distinct(Subject),
3),
total = n_distinct(Subject[diversity > 0])) %>%
mutate(Study = "Overall")) %>%
ggdotchart(
x = "Study",
y = "prevalence",
ggtheme = PAdJ_theme(),
rotate = T,
add = "segment",
ylab = "individuals with Heliusvirales",
xlab = "Study",
size = 2,
dot.size = 6.5,
label = "total",
font.label = list(color = "white", size = 8,
vjust = 0.4),
color = "#FC4E07"
) +
scale_y_continuous(labels = scales::label_percent(),
breaks = seq(0,1,by = 0.1)) +
theme(panel.grid.major.y = element_blank())
tree <-
treeio::read.beast("analysis_output/combined_ann.tree")
ggtree(tree,
mrsd = "2023-01-01",
size = 1) +
geom_range(
"CAheight_0.95_HPD",
color = 'steelblue3',
size = 2,
alpha = 0.5) +
geom_label2(aes(
label = round(as.numeric(CAheight_mean), 0),
subset = as.numeric(CAheight_mean) > 100,
x = branch),
vjust = -0.5,
label.padding = unit(0.1, "lines"),
size = 3,
label.size = 0,
label.r = unit(0, "lines")) +
geom_cladelabel(19,
label = "Mataharivirinae",
offset = 100000,
offset.text = 5000,
barsize = 2,
color = "#993333") +
geom_cladelabel(27,
label = "Ravirinae",
offset = 100000,
offset.text = 5000,
barsize = 2,
color = "#cc9999") +
geom_tiplab(size = 2) +
theme_tree2() +
theme(panel.grid.major.x = element_line(linetype = 'dashed',
color = "#D9D9D9")) +
scale_x_continuous(breaks = seq(0,-100000, by = -25000),
labels = c(0, 25,50,75,100),
expand = c(0.15, 0),
name = "time (thousands of years)")
ancient_hg.df <-
TerL.df %>%
mutate(
Study = case_when(
Study == "ancient" & startsWith(Sample, "SAMEA") ~ "Maixner_2021",
Sample == "Otzi" ~ "Maixner_2016",
Study == "ancient" & country != "AUT" ~ "Wibowo_2021",
TRUE~Study
)) %>%
filter(Study %in% c("RampelliS_2015",
"Obregon-TitoAJ_2015",
"Wibowo_2021",
"Maixner_2021",
"Maixner_2016")) %>%
mutate(lifestyle =
case_when(
Study == "Obregon-TitoAJ_2015" & startsWith(Sample, "HC") ~
"rural_agricultural",
Study == "Obregon-TitoAJ_2015" & startsWith(Sample, "SM") ~
"hunter_gatherer",
Study == "Obregon-TitoAJ_2015" & startsWith(Sample, "NO") ~
"urban",
Study == "RampelliS_2015" & startsWith(Sample, "I") ~
"urban",
Study == "RampelliS_2015" & startsWith(Sample, "H") ~
"hunter_gatherer",
TRUE~"ancient")) %>%
filter(lifestyle != "rural_agricultural")
ancient_lme <-
map_dfr(
list(
c("ancient", "hunter_gatherer"),
c("ancient", "urban"),
c("hunter_gatherer", "urban")
) ,
function(combo){
ancient_hg.df %>%
filter(lifestyle %in% combo) %>%
nlme::lme(
fixed = diversity~lifestyle,
random = ~1|Study
) %>%
rstatix::Anova() %>%
rstatix::anova_summary() %>%
mutate(group1 = combo[[1]],
group2 = combo[[2]])
}
) %>%
mutate(p.adj.lme = p.adjust(p, method = "BH")) %>%
select("group1", "group2", "p.adj.lme")
ancient_hg.df_sign <-
ancient_hg.df %>%
rstatix::wilcox_test(diversity ~ lifestyle,
p.adjust.method = "BH") %>%
rstatix::add_xy_position(fun = "max") %>%
select(group1, group2, y.position) %>%
left_join(ancient_lme, by = join_by(group1, group2)) %>%
mutate(p.adj.signif = case_when(
p.adj.lme > 0.05 ~ "ns",
between(p.adj.lme, 0.01, 0.05) ~ "*",
between(p.adj.lme, 0.01, 0.001) ~ "**",
between(p.adj.lme, 0.001, 0.0001) ~ "***",
p.adj.lme < 0.0001 ~ "****"
))
ancient_hg.df %>%
ggboxplot(
y = "diversity",
x = "lifestyle",
ggtheme = PAdJ_theme(),
add = "jitter",
add.params = list(width = 0.1),
ylab = "observed Heliusvirales TerL genes/person/1B bases",
xlab = "population",
color = "lifestyle",
palette = get_palette("jama", 5),
width = 0.5
) +
stat_pvalue_manual(ancient_hg.df_sign,
label = "p.adj.lme",
tip.length = 0.01,
size = 5,
hide.ns = F) +
theme(panel.grid.major.x = element_blank(),
legend.position = "") +
scale_x_discrete(limits = c("ancient",
"hunter_gatherer",
"urban"))
source("scripts/ancient_tree.R")
## Run 0 stress 0.2173385
## Run 1 stress 0.2188407
## Run 2 stress 0.2201978
## Run 3 stress 0.2201978
## Run 4 stress 0.2190294
## Run 5 stress 0.2599828
## Run 6 stress 0.2173246
## ... New best solution
## ... Procrustes: rmse 0.003352754 max resid 0.02641215
## Run 7 stress 0.2408295
## Run 8 stress 0.219092
## Run 9 stress 0.2192606
## Run 10 stress 0.2203787
## Run 11 stress 0.2190229
## Run 12 stress 0.2172797
## ... New best solution
## ... Procrustes: rmse 0.002994012 max resid 0.01719184
## Run 13 stress 0.2190095
## Run 14 stress 0.2192674
## Run 15 stress 0.2173247
## ... Procrustes: rmse 0.003011624 max resid 0.01733858
## Run 16 stress 0.220061
## Run 17 stress 0.2203886
## Run 18 stress 0.256001
## Run 19 stress 0.219092
## Run 20 stress 0.2188407
## *** Best solution was not repeated -- monoMDS stopping criteria:
## 18: stress ratio > sratmax
## 2: scale factor of the gradient < sfgrmin
plot_ordination(physeq = ancient.ps,
ordination = PCA,
justDF = T) %>%
ggscatter(
x = "NMDS1",
y = "NMDS2",
color = "lifestyle",
ggtheme = PAdJ_theme(),
size = 1.5,
mean.point = T,
mean.point.size = 4,
ellipse = T,
ellipse.alpha = 0.1,
ellipse.level = 0.5,
star.plot = F,
star.plot.lty = "dashed",
star.plot.lwd = 0.5,
palette = get_palette("nejm", 3),
) +
theme(
aspect.ratio = 1,
panel.border = element_rect(color = "#969696", linewidth = 1),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_text(size = 10)
) +
guides(color = guide_legend(override.aes = list(alpha = 1)))
ps <-
subset_samples(ancient.ps,
lifestyle != "ancient")
dist_mat <- phyloseq::distance(ps, "unifrac")
set.seed(20230711)
adonis2(formula = dist_mat~lifestyle,
data = data.frame(ps@sam_data),
permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist_mat ~ lifestyle, data = data.frame(ps@sam_data), permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## lifestyle 1 1.9186 0.08704 7.246 1e-04 ***
## Residual 76 20.1233 0.91296
## Total 77 22.0419 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rm(ps,dist_mat)
ps <-
subset_samples(ancient.ps,
lifestyle != "hunter_gatherer")
dist_mat <- phyloseq::distance(ps, "unifrac")
set.seed(20230711)
adonis2(formula = dist_mat~lifestyle,
data = data.frame(ps@sam_data),
permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist_mat ~ lifestyle, data = data.frame(ps@sam_data), permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## lifestyle 1 1.5819 0.12867 6.2021 1e-04 ***
## Residual 42 10.7122 0.87133
## Total 43 12.2941 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rm(ps,dist_mat)
ps <-
subset_samples(ancient.ps,
lifestyle != "urban")
dist_mat <- phyloseq::distance(ps, "unifrac")
set.seed(20230711)
adonis2(formula = dist_mat~lifestyle,
data = data.frame(ps@sam_data),
permutations = 9999)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist_mat ~ lifestyle, data = data.frame(ps@sam_data), permutations = 9999)
## Df SumOfSqs R2 F Pr(>F)
## lifestyle 1 0.994 0.06388 3.6852 3e-04 ***
## Residual 54 14.565 0.93612
## Total 55 15.559 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rm(ps,dist_mat)
ancient_ab.ps <-
phyloseq(
unifrac_data %>%
group_by(contigName, sample) %>%
reframe(mapped = sum(mapped)) %>%
left_join(read_count, by = join_by(sample)) %>%
mutate(mapped = mapped/reads_tot) %>%
select(-reads_tot) %>%
pivot_wider(
names_from = contigName,
values_from = mapped,
values_fill = 0
) %>%
column_to_rownames("sample") %>%
otu_table(taxa_are_rows = F)
,
distinct(unifrac_data[,c(4, 7)]) %>%
left_join(distinct(ancient_abundance.df[,c(1:3)]),
by = join_by(sample)) %>%
mutate(samplename = sample) %>%
column_to_rownames("sample") %>%
sample_data(),
ape::as.phylo(ancient_unifrac_tree),
ancient_tree_families_ORFname %>%
column_to_rownames("contigName") %>%
as.matrix() %>%
tax_table()
)
ancient_lineage.df <-
unifrac_data[,c(1,2,3,4)] %>%
left_join(read_count, by = join_by(sample)) %>%
left_join(ancient_tree_families_ORFname, by = join_by(contigName)) %>%
mutate(lineage = paste(Family, Subfamily, sep = ";")) %>%
select(contigName, contigLen, mapped, sample, reads_tot, lineage) %>%
tibble() %>%
group_by(lineage, sample) %>%
reframe(present = ifelse(mapped > 0, 1, 0)) %>%
distinct() %>%
rbind(tidyr::expand_grid(
sample =
subset(ancient_abundance.df[,c(1,3)],
!sample %in% psmelt2(ancient_ab.ps)$samplename)$sample,
lineage =
unique((helius_groups %>%
mutate(lineage = paste(Family,
Subfamily,
sep = ";")))$lineage),
present = 0)) %>%
pivot_wider(names_from = sample, values_from = present, values_fill = 0) %>%
pivot_longer(-1, names_to = "sample", values_to = "present") %>%
left_join(ancient_abundance.df[,c(1:3)], by = c("sample")) %>%
filter(!lineage %in% c("Hathorviridae;unclassified_Hathorviridae",
"unclassified_Heliusvirales;unclassified_Heliusvirales",
"Utuviridae;unclassified_Utuviridae",
"Aineviridae;unclassified_Aineviridae"))
ancient_lineage.df_sign <-
map(
unique(ancient_lineage.df$lineage),
function(tax){
ancient_lineage.df %>%
group_by(lineage, lifestyle) %>%
reframe(present = n_distinct(sample[present == 1]),
absent = n_distinct(sample)-present) %>%
filter(lineage == tax) %>%
select(-lineage) %>%
column_to_rownames("lifestyle") %>%
rstatix::pairwise_fisher_test(p.adjust.method = "BH") %>%
mutate(lineage = tax)
}
) %>%
list_rbind()
ancient_lineage.df %>%
mutate(Family = str_remove(lineage, ";.+")) %>%
ggplot(aes(x = sample,
y = lineage,
fill = as.character(present))) +
geom_tile() +
facet_grid(Family~lifestyle, scales = "free", space="free") +
scale_fill_manual(values = c("grey70","tomato3")) +
PAdJ_theme() +
theme(axis.text.x = element_blank(),
panel.grid.major = element_blank())
ancient_combinations <-
expand_grid(Sample = ancient_lineage.df$sample, lineage = ancient_lineage.df$lineage)
lineages <-
helius_groups %>%
mutate(lineage = paste(Family,
Subfamily,
sep = ";")) %>%
filter(!lineage %in% c(
"unclassified_Heliusvirales;unclassified_Heliusvirales",
"Utuviridae;unclassified_Utuviridae",
"Hathorviridae;unclassified_Hathorviridae")
)
map_dfr(
unique(lineages$lineage),
function(tax){
subset(ancient.df, !startsWith(Sample, "HCO")) %>%
mutate(Origin = str_remove(Origin, " .+$"),
lineage = paste(Family, Subfamily, sep = ";")) %>%
select(Sample, Origin, count, lineage) %>%
right_join(ancient_combinations, by = join_by(Sample, lineage)) %>%
distinct() %>%
mutate(
count = replace_na(count, 0),
Origin = case_when(
str_extract(Sample, "^...") %in% c("SRR", "ERR", "Otzi") ~ "Ancient",
str_remove(Sample, "\\d+$") %in% c("H", "SM") ~ "Hunter-gatherer",
TRUE ~ "Urban"
)
) %>%
filter(lineage == tax) %>%
group_by(Origin) %>%
summarise(present = n_distinct(Sample[count > 0]),
absent = n_distinct(Sample[count == 0])) %>%
ungroup() %>%
column_to_rownames("Origin") %>%
rstatix::pairwise_fisher_test() %>%
mutate(lineage = tax)
}
)
## # A tibble: 27 × 7
## group1 group2 n p p.adj p.adj.signif lineage
## <chr> <chr> <int> <dbl> <dbl> <chr> <chr>
## 1 Ancient Urban 97 1 1 ns Tonatiuhviridae;unclassified_T…
## 2 Ancient Urban 97 1 1 ns Hunahpuviridae;unclassified_Hu…
## 3 Ancient Urban 97 1 1 ns Aineviridae;Willkavirinae
## 4 Ancient Urban 97 0.194 0.194 ns Aineviridae;Haulvirinae
## 5 Ancient Urban 97 1 1 ns Aineviridae;Intivirinae
## 6 Ancient Urban 97 1 1 ns Aineviridae;Zonvirinae
## 7 Ancient Urban 97 1 1 ns Aineviridae;Mehrvirinae
## 8 Ancient Urban 97 1 1 ns Xiheviridae;unclassified_Xihev…
## 9 Ancient Urban 97 1 1 ns Auroraviridae;unclassified_Aur…
## 10 Ancient Urban 97 1 1 ns Hathorviridae;Sikinikvirinae
## # ℹ 17 more rows
pasolli_TerL <-
pasolli %>%
mutate(study_name = ifelse(startsWith(study_name, "CM"),
"PasolliE_2019",
study_name)) %>%
left_join(sampleMetadata[, c(1, 2, 18)],
by = join_by(sample_id, study_name)) %>%
group_by(sample_id, study_name) %>%
summarise(richness = n_distinct(ORFName) / max(number_bases) * 1e9) %>%
ungroup() %>%
right_join(sampleMetadata, by = join_by(sample_id, study_name)) %>%
mutate(richness = ifelse(is.na(richness),
0,
richness))
plots <-
map(c("LiJ_2014", "NielsenHB_2014", "QinJ_2012", "QinN_2014"),
function(study){
if(study == "LiJ_2014"){
conditions = c("control", "T1D")
} else if(study == "NielsenHB_2014"){
conditions = c("control", "IBD")
} else if(study == "QinJ_2012"){
conditions = c("control", "T2D")
} else if(study == "QinN_2014"){
conditions = c("control", "cirrhosis")
}
pasolli_TerL %>%
filter(body_site == "stool",
study_name == study,
study_condition %in% conditions) %>%
group_by(study_condition, study_name, subject_id) %>%
summarise(richness = sum(richness)) %>%
ungroup() %>%
mutate(study_condition = factor(study_condition,
levels = conditions)) %>%
ggboxplot(
x = "study_condition",
y = "richness",
add = "jitter",
add.params = list(size = 0.25),
ggtheme = PAdJ_theme(),
xlab = "",
ylab = "observed TerL/person/1B bases",
facet.by = "study_name",
color = "study_condition",
palette = c("#5D3A9B","#E66100")
) +
stat_compare_means(comparisons = list(conditions),
label = "p.format") +
theme(legend.position = "")})
cowplot::plot_grid(
plots[[1]],
plots[[2]],
plots[[3]],
plots[[4]]
)
map(
c("LiJ_2014", "NielsenHB_2014", "QinJ_2012", "QinN_2014") ,
function(project){
tree <-
ape::read.tree(paste("datatables/Trees/",
project,
"_unifrac.treefile",
sep = ""))
tree <-
ape::root(tree, outgroup = "IMGVR_9015416")
unifrac_data <-
rio::import(paste("datatables/", project, "_depth.tsv", sep = ""))[,-6] %>%
mutate(present = ifelse(coverage >= 0.5, "yes", "no")) %>%
filter(contigName != "*",
present == "yes")
ps <-
phyloseq(
unifrac_data %>%
select(contigName, sample, mapped) %>%
mutate(mapped = 1) %>%
distinct() %>%
pivot_wider(
names_from = contigName,
values_from = mapped,
values_fill = 0
) %>%
column_to_rownames("sample") %>%
otu_table(taxa_are_rows = F)
,
distinct(unifrac_data[,c(4, 6)]) %>%
left_join(distinct(sampleMetadata[,c(2,3,6,7)]),
by = c("sample"="sample_id")) %>%
mutate(samplename = sample) %>%
column_to_rownames("sample") %>%
sample_data(),
ape::as.phylo(tree),
data.frame(cophenetic(tree)) %>%
rownames_to_column("contigName1") %>%
pivot_longer(-1, names_to = "contigName2", values_to = "distance") %>%
inner_join(helius_groups,
by = c("contigName1" = "contigName")) %>%
group_by(contigName2) %>%
slice_min(distance, n = 1, with_ties = F) %>%
ungroup() %>%
transmute(
contigName = contigName2,
Family = Family,
Subfamily = Subfamily
) %>%
distinct() %>%
column_to_rownames("contigName") %>%
as.matrix() %>%
tax_table()
)
dist_mat <- phyloseq::distance(ps, "unifrac", weighted = F)
sign <-
adonis2(formula = dist_mat~disease,
data = data.frame(ps@sam_data),
permutations = 9999)
PCA <- ordinate(ps, method = "NMDS", distance = "unifrac", weighted = F)
plot <-
plot_ordination(physeq = ps,
ordination = PCA,
justDF = T) %>%
ggscatter(
x = "NMDS1",
y = "NMDS2",
color = "study_condition",
ggtheme = PAdJ_theme(),
size = 1.5,
#alpha = 0.5,
mean.point = T,
mean.point.size = 4,
ellipse = T,
ellipse.alpha = 0.1,
ellipse.level = 0.5,
star.plot = F,
star.plot.lty = "dashed",
star.plot.lwd = 0.5,
palette = get_palette("nejm", 3),
title = paste(project, sign$`Pr(>F)`[1])
) +
theme(
aspect.ratio = 1,
panel.border = element_rect(color = "#969696", size = 1),
axis.line.x = element_blank(),
axis.line.y = element_blank(),
axis.text.y = element_text(size = 10)
) +
guides(color = guide_legend(override.aes = list(alpha = 1)))
# print(plot)
}
)
## Run 0 stress 0.2208444
## Run 1 stress 0.2259328
## Run 2 stress 0.2263239
## Run 3 stress 0.2199322
## ... New best solution
## ... Procrustes: rmse 0.01675631 max resid 0.07466437
## Run 4 stress 0.2327742
## Run 5 stress 0.2302071
## Run 6 stress 0.2282882
## Run 7 stress 0.2199322
## ... New best solution
## ... Procrustes: rmse 3.624489e-05 max resid 0.0001340541
## ... Similar to previous best
## Run 8 stress 0.2223842
## Run 9 stress 0.2259725
## Run 10 stress 0.2235587
## Run 11 stress 0.239643
## Run 12 stress 0.2367693
## Run 13 stress 0.2236092
## Run 14 stress 0.22834
## Run 15 stress 0.2370833
## Run 16 stress 0.2199317
## ... New best solution
## ... Procrustes: rmse 0.0001904864 max resid 0.0008433804
## ... Similar to previous best
## Run 17 stress 0.2235523
## Run 18 stress 0.2199313
## ... New best solution
## ... Procrustes: rmse 0.0003086476 max resid 0.001335987
## ... Similar to previous best
## Run 19 stress 0.22303
## Run 20 stress 0.2389497
## *** Best solution repeated 1 times
## Run 0 stress 0.2917404
## Run 1 stress 0.2947176
## Run 2 stress 0.2927702
## Run 3 stress 0.3041422
## Run 4 stress 0.2966031
## Run 5 stress 0.2926648
## Run 6 stress 0.3099933
## Run 7 stress 0.2959088
## Run 8 stress 0.2930126
## Run 9 stress 0.2973213
## Run 10 stress 0.2960759
## Run 11 stress 0.2931434
## Run 12 stress 0.2951425
## Run 13 stress 0.2927765
## Run 14 stress 0.2919717
## ... Procrustes: rmse 0.01416362 max resid 0.1257702
## Run 15 stress 0.3035012
## Run 16 stress 0.2975479
## Run 17 stress 0.3104784
## Run 18 stress 0.2952524
## Run 19 stress 0.2941249
## Run 20 stress 0.3053422
## *** Best solution was not repeated -- monoMDS stopping criteria:
## 5: no. of iterations >= maxit
## 15: stress ratio > sratmax
## Run 0 stress 0.2705312
## Run 1 stress 0.2704648
## ... New best solution
## ... Procrustes: rmse 0.002333117 max resid 0.02816903
## Run 2 stress 0.2705169
## ... Procrustes: rmse 0.002248468 max resid 0.02832013
## Run 3 stress 0.2721169
## Run 4 stress 0.2704686
## ... Procrustes: rmse 0.002772926 max resid 0.04644884
## Run 5 stress 0.2704679
## ... Procrustes: rmse 0.002853748 max resid 0.04645487
## Run 6 stress 0.2704763
## ... Procrustes: rmse 0.003045414 max resid 0.04664253
## Run 7 stress 0.2721448
## Run 8 stress 0.2704567
## ... New best solution
## ... Procrustes: rmse 0.001288347 max resid 0.0165017
## Run 9 stress 0.2704608
## ... Procrustes: rmse 0.0007018205 max resid 0.008684252
## ... Similar to previous best
## Run 10 stress 0.2725849
## Run 11 stress 0.2704747
## ... Procrustes: rmse 0.001245306 max resid 0.01064706
## Run 12 stress 0.272278
## Run 13 stress 0.2704934
## ... Procrustes: rmse 0.003031239 max resid 0.04646435
## Run 14 stress 0.270482
## ... Procrustes: rmse 0.001703609 max resid 0.0167226
## Run 15 stress 0.2725603
## Run 16 stress 0.272099
## Run 17 stress 0.2927931
## Run 18 stress 0.2721576
## Run 19 stress 0.2704735
## ... Procrustes: rmse 0.001529223 max resid 0.01682137
## Run 20 stress 0.270466
## ... Procrustes: rmse 0.001151238 max resid 0.01001854
## *** Best solution repeated 1 times
## Run 0 stress 0.2200106
## Run 1 stress 0.2200102
## ... New best solution
## ... Procrustes: rmse 0.0002585076 max resid 0.001379132
## ... Similar to previous best
## Run 2 stress 0.2229769
## Run 3 stress 0.216962
## ... New best solution
## ... Procrustes: rmse 0.02466974 max resid 0.2188135
## Run 4 stress 0.2204637
## Run 5 stress 0.2204516
## Run 6 stress 0.2169751
## ... Procrustes: rmse 0.0007653849 max resid 0.005879535
## ... Similar to previous best
## Run 7 stress 0.2169712
## ... Procrustes: rmse 0.0005497715 max resid 0.005324188
## ... Similar to previous best
## Run 8 stress 0.217021
## ... Procrustes: rmse 0.005092033 max resid 0.03482518
## Run 9 stress 0.2199648
## Run 10 stress 0.2192879
## Run 11 stress 0.2229143
## Run 12 stress 0.2170171
## ... Procrustes: rmse 0.00495594 max resid 0.03415168
## Run 13 stress 0.2192831
## Run 14 stress 0.2237799
## Run 15 stress 0.2230721
## Run 16 stress 0.2170814
## ... Procrustes: rmse 0.004369518 max resid 0.03601177
## Run 17 stress 0.2171059
## ... Procrustes: rmse 0.007250819 max resid 0.06027874
## Run 18 stress 0.2170258
## ... Procrustes: rmse 0.005290742 max resid 0.0557636
## Run 19 stress 0.2209736
## Run 20 stress 0.2272606
## *** Best solution repeated 2 times
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
map(
c("LiJ_2014", "NielsenHB_2014", "QinJ_2012", "QinN_2014") ,
function(project){
tree <-
ape::read.tree(paste("datatables/Trees/",
project,
"_unifrac.treefile",
sep = ""))
tree <-
ape::root(tree, outgroup = "IMGVR_9015416")
unifrac_data <-
rio::import(paste("datatables/", project, "_depth.tsv", sep = ""))[,-6] %>%
mutate(present = ifelse(coverage >= 0.5, "yes", "no")) %>%
filter(contigName != "*",
present == "yes")
ps <-
phyloseq(
unifrac_data %>%
select(contigName, sample, mapped) %>%
mutate(mapped = 1) %>%
distinct() %>%
pivot_wider(
names_from = contigName,
values_from = mapped,
values_fill = 0
) %>%
column_to_rownames("sample") %>%
otu_table(taxa_are_rows = F)
,
distinct(unifrac_data[,c(4, 6)]) %>%
left_join(distinct(sampleMetadata[,c(2,3,6,7)]),
by = c("sample"="sample_id")) %>%
mutate(samplename = sample) %>%
column_to_rownames("sample") %>%
sample_data(),
ape::as.phylo(tree),
data.frame(cophenetic(tree)) %>%
rownames_to_column("contigName1") %>%
pivot_longer(-1, names_to = "contigName2", values_to = "distance") %>%
inner_join(helius_groups,
by = c("contigName1" = "contigName")) %>%
group_by(contigName2) %>%
slice_min(distance, n = 1, with_ties = F) %>%
ungroup() %>%
transmute(
contigName = contigName2,
Family = Family,
Subfamily = Subfamily
) %>%
distinct() %>%
column_to_rownames("contigName") %>%
as.matrix() %>%
tax_table()
)
data <-
rbind(plots[[1]]$data,
plots[[2]]$data,
plots[[3]]$data,
plots[[4]]$data)
present <-
psmelt(ps) %>%
group_by(Sample, study_condition, Subfamily) %>%
reframe(present = ifelse(sum(Abundance) > 0, 1, 0))
absent <-
data.frame(
Sample = subset(data,
study_name == project &
!subject_id %in% present$Sample)$subject_id,
study_condition = subset(data,
study_name == project &
!subject_id %in%
present$Sample)$study_condition,
present = 0,
Subfamily = "Ravirinae"
)
df <-
rbind(present,
absent) %>%
pivot_wider(names_from = Subfamily,
values_from = present,
values_fill = 0) %>%
pivot_longer(-c(1, 2),
names_to = "Subfamily",
values_to = "present") %>%
group_by(study_condition, Subfamily) %>%
reframe(present = n_distinct(Sample[present == 1]),
absent = n_distinct(Sample)-present,
fraction = present/(present+absent)) %>%
mutate(study_condition = ifelse(
study_condition == "control", "control", "disease"
))
plot <-
rbind(present,
absent) %>%
pivot_wider(names_from = Subfamily,
values_from = present,
values_fill = 0) %>%
pivot_longer(-c(1, 2),
names_to = "Subfamily",
values_to = "present") %>%
group_by(study_condition, Subfamily) %>%
reframe(present = n_distinct(Sample[present == 1]),
absent = n_distinct(Sample)-present,
fraction = present/(present+absent)) %>%
select(study_condition, Subfamily, fraction) %>%
mutate(study_condition = ifelse(
study_condition == "control", "control", "disease"
)) %>%
pivot_wider(names_from = study_condition, values_from = fraction) %>%
mutate(delta = (disease - control)/(disease + control),
proj = project) %>%
ggplot(aes(x = proj,
y = Subfamily,
fill = delta)) +
geom_tile() +
scale_fill_gradient2(limits = c(-1,1),
low = "#762a83",
mid = "#f7f7f7",
high = "#1b7837") +
PAdJ_theme() +
theme(panel.grid.major = element_blank(),
axis.line = element_blank(),
legend.position = "right") +
xlab("")
assign(paste(project, "plot", sep = "_"),
plot, envir = globalenv())
assign(paste(project, "df", sep = "_"),
df, envir = globalenv())
}
)
## [[1]]
## # A tibble: 38 × 5
## study_condition Subfamily present absent fraction
## <chr> <chr> <int> <int> <dbl>
## 1 disease Antuvirinae 18 13 0.581
## 2 disease Aurinkovirinae 28 3 0.903
## 3 disease Dzuwavirinae 6 25 0.194
## 4 disease Ghrianvirinae 13 18 0.419
## 5 disease Gunesvirinae 2 29 0.0645
## 6 disease Haulvirinae 25 6 0.806
## 7 disease Iavirinae 0 31 0
## 8 disease Intivirinae 1 30 0.0323
## 9 disease Mataharivirinae 24 7 0.774
## 10 disease Oorunvirinae 23 8 0.742
## # ℹ 28 more rows
##
## [[2]]
## # A tibble: 40 × 5
## study_condition Subfamily present absent fraction
## <chr> <chr> <int> <int> <dbl>
## 1 disease Antuvirinae 66 158 0.295
## 2 disease Aurinkovirinae 112 112 0.5
## 3 disease Dzuwavirinae 21 203 0.0938
## 4 disease Ghrianvirinae 6 218 0.0268
## 5 disease Haulvirinae 86 138 0.384
## 6 disease Iavirinae 9 215 0.0402
## 7 disease Intivirinae 0 224 0
## 8 disease Kuarahyvirinae 0 224 0
## 9 disease Mataharivirinae 82 142 0.366
## 10 disease Mehrvirinae 3 221 0.0134
## # ℹ 30 more rows
##
## [[3]]
## # A tibble: 42 × 5
## study_condition Subfamily present absent fraction
## <chr> <chr> <int> <int> <dbl>
## 1 disease Antuvirinae 66 104 0.388
## 2 disease Aurinkovirinae 111 59 0.653
## 3 disease Dzuwavirinae 8 162 0.0471
## 4 disease Ghrianvirinae 6 164 0.0353
## 5 disease Gunesvirinae 5 165 0.0294
## 6 disease Haulvirinae 60 110 0.353
## 7 disease Iavirinae 29 141 0.171
## 8 disease Intivirinae 1 169 0.00588
## 9 disease Kuarahyvirinae 2 168 0.0118
## 10 disease Mataharivirinae 71 99 0.418
## # ℹ 32 more rows
##
## [[4]]
## # A tibble: 40 × 5
## study_condition Subfamily present absent fraction
## <chr> <chr> <int> <int> <dbl>
## 1 disease Antuvirinae 35 88 0.285
## 2 disease Aurinkovirinae 60 63 0.488
## 3 disease Dzuwavirinae 2 121 0.0163
## 4 disease Ghrianvirinae 1 122 0.00813
## 5 disease Gunesvirinae 10 113 0.0813
## 6 disease Haulvirinae 30 93 0.244
## 7 disease Iavirinae 8 115 0.0650
## 8 disease Intivirinae 0 123 0
## 9 disease Mataharivirinae 40 83 0.325
## 10 disease Mehrvirinae 5 118 0.0407
## # ℹ 30 more rows
shared_family.df <-
shared_clusters %>%
filter(Family.x==Family.y,
contigName1 != contigName2) %>%
rowwise() %>%
transmute(pair = paste(sort(c(contigName1, contigName2))[1],
sort(c(contigName1, contigName2))[2]),
Family = Family.x,
percent_shared) %>%
distinct() %>%
filter(# Family %in% c("Utuviridae", "Hathorviridae", "Aineviridae"),
Family != "unclassified_Heliusvirales"
)
plot_1 <-
ggplot(
shared_family.df,
aes(
x = Family,
y = percent_shared,
# color = Family,
fill = Family
)
) +
geom_half_boxplot(
errorbar.draw = F,
errorbar.length = 0.25,
outlier.size = 0.5,
alpha = 0.5,
nudge = 0.025
) +
geom_half_violin(
side = "r",
alpha = 0.5,
nudge = 0.025
) +
PAdJ_theme() +
theme(panel.grid.major.y = element_blank(),
legend.position = "") +
scale_color_manual(values = kleuren) +
scale_fill_manual(values = kleuren) +
coord_flip()
shared_subfamily.df <-
shared_clusters %>%
filter(Subfamily.x==Subfamily.y,
contigName1 != contigName2,
!startsWith(Subfamily.x, "unclassified_Utuviridae"),
!startsWith(Family.x, "unclassified")
) %>%
rowwise() %>%
transmute(pair = paste(sort(c(contigName1, contigName2))[1],
sort(c(contigName1, contigName2))[2]),
Subfamily = paste(Family.x, Subfamily.x, sep = "_"),
percent_shared) %>%
distinct()
plot_2 <-
ggplot(
shared_subfamily.df,
aes(
x = Subfamily,
y = percent_shared,
# color = Subfamily,
fill = Subfamily
)
) +
geom_half_boxplot(
errorbar.draw = FALSE,
alpha = 0.5,
outlier.size = 0.5,
nudge = 0.05
) +
geom_half_violin(
side = "r",
alpha = 0.5,
nudge = 0.05
) +
PAdJ_theme() +
theme(panel.grid.major.y = element_blank(),
legend.position = "") +
scale_color_manual(values = kleuren) +
scale_fill_manual(values = kleuren) +
coord_flip()
cowplot::plot_grid(plot_1, plot_2,
ncol = 1, align = "vh",
rel_heights = c(0.25,1))
rio::import("datatables/pharokka_length_gc_cds_density.tsv") %>%
inner_join(helius_groups, by = c("contig"="contigName")) %>%
mutate(lineage = paste(Family, Subfamily, sep = ";")) %>%
ggboxplot(
x = "lineage",
y = "gc_perc",
ggtheme = PAdJ_theme(),
add = "jitter",
add.params = list(color = "tomato3"),
rotate = T
) +
theme(legend.position = "")
iphop_hosts %>%
mutate(Lineage = paste(Family, Subfamily, sep =";"),
H_Lineage = paste(H_Phylum, H_Genus, sep = ";")) %>%
group_by(Lineage, H_Lineage) %>%
reframe(genomes = n_distinct(Virus)) %>%
filter(!str_remove(Lineage, ".+;") %in% c("unclassified_Heliusvirales",
"unclassified_Utuviridae",
"unclassified_Hathorviridae",
"unclassified_Aineviridae"),
H_Lineage != "unknown;unknown") %>%
ggplot(
aes(x = H_Lineage,
y = Lineage,
fill = genomes)
) +
geom_tile() +
PAdJ_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.25)) +
scale_fill_viridis()
temperate <-
annotations %>%
inner_join(helius_groups, by = c("contig"="contigName")) %>%
group_by(contig, Family, Subfamily) %>%
reframe(temp = n_distinct(gene[category == "integration and excision"])) %>%
group_by(Family, Subfamily) %>%
reframe(
virulent = n_distinct(contig[temp == 0]),
temperate = n_distinct(contig[temp > 0]),
temp_frac = temperate/(temperate+virulent)
)
temperate %>%
filter(!Subfamily %in% c("unclassified_Aineviridae",
"unclassified_Utuviridae",
"unclassified_Hathorvirinae",
"unclassified_Heliusvirales")) %>%
mutate(lineage = paste(Family, Subfamily, sep = ";")) %>%
ggdotchart(
x = "lineage",
y = "temp_frac",
ggtheme = PAdJ_theme(),
add = "segment",
rotate = T
) +
geom_hline(yintercept = with(temperate, sum(temperate)/(sum(temperate) +sum(virulent))),
linetype = "dashed",
color = "tomato3") +
theme(legend.position = "")
pasolli %>%
mutate(study_name = ifelse(startsWith(study_name, "CM"),
"PasolliE_2019",
study_name)) %>%
left_join(sampleMetadata[, c(1, 2, 18)],
by = join_by(sample_id,
study_name)) %>%
group_by(sample_id, study_name) %>%
summarise(richness = n_distinct(ORFName) / max(number_bases) * 1e9) %>%
ungroup() %>%
right_join(sampleMetadata,
by = join_by(sample_id,
study_name)) %>%
mutate(richness = ifelse(is.na(richness),
0,
richness)) %>%
filter(study_name %in% c("Obregon-TitoAJ_2015",
"RampelliS_2015")) %>%
mutate(
group = case_when(
location %in% c("Bologna", "Norman") ~ "urban",
location == "Matses" | country == "TZA" ~ "hunter-gatherer",
TRUE ~ "rural"
)
) %>%
filter(group != "rural") %>%
ggboxplot(
x = "group",
y = "richness",
facet.by = "study_name",
add = "jitter",
color = "group",
palette = "aaas",
ggtheme = PAdJ_theme(),
xlab = "",
ylab = "Heliusvirales TerL/person/1B bases"
) +
stat_compare_means(comparisons = list(c("hunter-gatherer", "urban")),
label = "p.format") +
theme(panel.grid.major.x = element_blank())
ancient_ab_lme <-
map_dfr(
list(
c("ancient", "hunter_gatherer"),
c("ancient", "urban"),
c("hunter_gatherer", "urban")
) ,
function(combo){
ancient_abundance.df %>%
filter(lifestyle %in% combo) %>%
nlme::lme(
fixed = read_fraction~lifestyle,
random = ~1|study_name
) %>%
rstatix::Anova() %>%
rstatix::anova_summary() %>%
mutate(group1 = combo[[1]],
group2 = combo[[2]])
}
) %>%
mutate(p.adj.lme = p.adjust(p, method = "BH")) %>%
select("group1", "group2", "p.adj.lme")
ancient_abundance.df_sign <-
ancient_abundance.df %>%
rstatix::wilcox_test(read_fraction ~ lifestyle,
p.adjust.method = "BH") %>%
rstatix::add_xy_position(fun = "max") %>%
select(group1, group2, y.position) %>%
left_join(ancient_ab_lme, by = join_by(group1, group2)) %>%
mutate(p.adj.signif = case_when(
p.adj.lme > 0.05 ~ "ns",
between(p.adj.lme, 0.01, 0.05) ~ "*",
between(p.adj.lme, 0.01, 0.001) ~ "**",
between(p.adj.lme, 0.001, 0.0001) ~ "***",
p.adj.lme < 0.0001 ~ "****"
),
y.position = c(4.5e-4, 5e-4, 5.5e-4))
ancient_abundance.df %>%
ggboxplot(
y = "read_fraction",
x = "lifestyle",
ggtheme = PAdJ_theme(),
add = "jitter",
add.params = list(width = 0.1),
ylab = "Heliusvirales TerL relative abundance",
xlab = "population",
color = "lifestyle",
palette = get_palette("jama", 5),
width = 0.5
) +
stat_pvalue_manual(ancient_abundance.df_sign,
label = "p.adj.lme",
tip.length = 0.01,
size = 5,
hide.ns = F) +
theme(panel.grid.major.x = element_blank()) +
scale_x_discrete(limits = c("ancient",
"hunter_gatherer",
"urban")) +
scale_y_continuous(labels = scales::label_percent())
source("scripts/ancient_tax.R")
ancient_tax.df %>%
ggboxplot(
y = "richness",
x = "lifestyle",
ggtheme = PAdJ_theme(),
add = "jitter",
ylab = "taxonomic richness",
xlab = "population",
color = "lifestyle",
palette = get_palette("jama", 4),
width = 0.3
) +
stat_pvalue_manual(ancient_tax.df_sign,
label = "p.adj.lme",
tip.length = 0.01,
size = 5,
hide.ns = F) +
theme(panel.grid.major.x = element_blank()) +
scale_x_discrete(limits = c("ancient",
"hunter_gatherer",
"urban"))
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=nl_NL.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=nl_NL.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=nl_NL.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=nl_NL.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Amsterdam
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggtree_3.8.2 gghalves_0.1.4 genoPlotR_0.8.11
## [4] ade4_1.7-22 ggridges_0.5.4 RColorBrewer_1.1-3
## [7] microbiome_1.22.0 phyloseq_1.44.0 Biostrings_2.68.1
## [10] GenomeInfoDb_1.36.1 XVector_0.40.0 IRanges_2.34.1
## [13] S4Vectors_0.38.1 BiocGenerics_0.46.0 vegan_2.6-4
## [16] lattice_0.21-8 permute_0.9-7 ggpubr_0.6.0
## [19] viridis_0.6.4 viridisLite_0.4.2 lubridate_1.9.2
## [22] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2
## [25] purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
## [28] tibble_3.2.1 ggplot2_3.4.3 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 jsonlite_1.8.7 magrittr_2.0.3
## [4] farver_2.1.1 rmarkdown_2.23 zlibbioc_1.46.0
## [7] vctrs_0.6.3 multtest_2.56.0 RCurl_1.98-1.12
## [10] rstatix_0.7.2 htmltools_0.5.5 curl_5.0.1
## [13] haven_2.5.3 broom_1.0.5 cellranger_1.1.0
## [16] Rhdf5lib_1.22.0 rhdf5_2.44.0 gridGraphics_0.5-1
## [19] sass_0.4.7 bslib_0.5.0 plyr_1.8.8
## [22] cachem_1.0.8 rio_0.5.29 igraph_1.5.0.1
## [25] lifecycle_1.0.3 iterators_1.0.14 pkgconfig_2.0.3
## [28] Matrix_1.6-0 R6_2.5.1 fastmap_1.1.1
## [31] GenomeInfoDbData_1.2.10 digest_0.6.33 aplot_0.1.10
## [34] colorspace_2.1-0 patchwork_1.1.3 labeling_0.4.3
## [37] fansi_1.0.4 timechange_0.2.0 abind_1.4-5
## [40] mgcv_1.8-42 compiler_4.3.0 bit64_4.0.5
## [43] withr_2.5.0 backports_1.4.1 carData_3.0-5
## [46] highr_0.10 maps_3.4.1 ggsignif_0.6.4
## [49] MASS_7.3-60 biomformat_1.28.0 ggsci_3.0.0
## [52] tools_4.3.0 foreign_0.8-82 ape_5.7-1
## [55] zip_2.3.0 glue_1.6.2 nlme_3.1-162
## [58] rhdf5filters_1.12.1 Rtsne_0.16 cluster_2.1.4
## [61] reshape2_1.4.4 generics_0.1.3 gtable_0.3.4
## [64] tzdb_0.4.0 data.table_1.14.8 hms_1.1.3
## [67] car_3.1-2 utf8_1.2.3 foreach_1.5.2
## [70] pillar_1.9.0 yulab.utils_0.0.6 splines_4.3.0
## [73] treeio_1.24.3 survival_3.5-5 bit_4.0.5
## [76] tidyselect_1.2.0 knitr_1.43 gridExtra_2.3
## [79] xfun_0.39 Biobase_2.60.0 stringi_1.7.12
## [82] lazyeval_0.2.2 ggfun_0.1.1 yaml_2.3.7
## [85] evaluate_0.21 codetools_0.2-19 ggplotify_0.1.1
## [88] cli_3.6.1 munsell_0.5.0 jquerylib_0.1.4
## [91] Rcpp_1.0.11 readxl_1.4.3 mapproj_1.2.11
## [94] parallel_4.3.0 bitops_1.0-7 ggthemes_4.2.4
## [97] tidytree_0.4.4 scales_1.2.1 openxlsx_4.2.5.2
## [100] crayon_1.5.2 rlang_1.1.1 cowplot_1.1.1