<- read_tsv(here::here("metadata/DNA/vcf_samples_metadata.tsv") )
dna_meta <- dna_meta_full <- readxl::read_excel(here::here("metadata/raw_metadata_from_NYGC/ALS Consortium DNA Metadata 20201015 .xlsx")) %>% janitor::clean_names()
dna_meta_full
<- read_tsv(here::here("metadata/RNA/rna_metadata_annotated.tsv"))
rna_meta
<- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_splicing_peer20.gene.combined_covariates.txt")) %>%
covariates gather(key = "sample", value = "value", -ID) %>%
spread(key = ID, value = value)
# expansionHunter calls are using external_sample_id
# genotype and others are using VCF ID
<- read_tsv(here::here("data/QTLs/Nov_2020/results/COLOC/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))
coloc_res
#coloc_res <- read_tsv(here::here("data/COLOC/Nicolas_suggestive/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))
<- filter(coloc_res, QTL == "NYGC_Lumbar_sQTL", QTL_Gene %in% c("C9orf72", "ATXN3")) %>%
lsp filter(PP.H4.abf > 0.5)
lsp
## # A tibble: 5 × 24
## disease GWAS locus GWAS_SNP GWAS_P GWAS_chr GWAS_pos QTL type QTL_SNP
## <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr>
## 1 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL chr14:…
## 2 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs2003…
## 3 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs2003…
## 4 ALS Nicol… locus… rs3849943 3.77e-30 chr9 27543384 NYGC… sQTL rs1537…
## 5 ALS Nicol… locus… rs101433… 3.24e- 7 chr14 92074037 NYGC… sQTL rs5810…
## # … with 14 more variables: QTL_P <dbl>, QTL_MAF <dbl>, QTL_chr <chr>,
## # QTL_pos <dbl>, QTL_junction <chr>, QTL_Gene <chr>, QTL_Ensembl <chr>,
## # nsnps <dbl>, PP.H0.abf <dbl>, PP.H1.abf <dbl>, PP.H2.abf <dbl>,
## # PP.H3.abf <dbl>, PP.H4.abf <dbl>, LD <dbl>
<- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sQTL_junctions.bed") )
sqtl_junctions
<- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sample_key.txt") )
sample_key # 197 samples from 197 donors
# get expansionHunter calls for samples with RNA
<- read.table(here::here("ALS_SC/ATXN3/ExpansionHunter_C9orf72_results.txt"), header=FALSE, stringsAsFactors = FALSE)
c9_expansion_hunter_rna # 403 donors with expansionHunter calls
# feb 2021 - 2044 samples with WGS
<- read.table(here::here("ALS_SC/ATXN3/02_2021_C9orf72_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)
c9_expansion_hunter_wgs # 167 samples have maximum c9 repeat length > 30
# VCF for 434 samples
<- read.table(here::here("ALS_SC/ATXN3/C9_GWAS_SNP_rs3849943.vcf"), header=TRUE, stringsAsFactors = FALSE )
c9_vcf # R read T as TRUE
$ALT <- "T"
c9_vcf
# rs3849943 - reference allele should be T, alternate should be C. C has an allele frequency around 20% worldwide
<- process_vcf(c9_vcf) %>%
c9_genotypes select(-genotype_binary)
# fix allele flip here
<- tibble(
c9_code_df genotype = c("1/1", "0/1", "0/0"),
genotype_code = factor(c("T/T", "T/C", "C/C"), levels = c("T/T", "T/C", "C/C") ),
genotype_binary = c(2,1,0)
)
<- left_join(c9_genotypes, c9_code_df, by = "genotype")
c9_genotypes
<- process_junctions(sqtl_junctions, "ENSG00000147894.15")
c9_junctions # 11 junctions in 197 samples
#c9_eh_df_rna <- process_expansionhunter(c9_expansion_hunter_rna)
<- process_expansionhunter(c9_expansion_hunter_wgs)
c9_eh_df# 2041 calls, only 1 is NA
# merge together
<- left_join(c9_junctions, c9_genotypes, by = "sample") %>%
c9_all left_join(c9_eh_df, by = "sample") %>%
mutate(genotype_binary = factor(genotype_binary))
# 2167 rows for 197 samples
# extract top colocalised junction
# join on QTL covariates for those samples
<- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
c9_sqtl_snp select(sample, junction_ratio, genotype_binary) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample") %>%
mutate(genotype_binary = as.numeric(genotype_binary))
# do linear regression on junction ratio against genotype - a splicing QTL in R!
summary(lm(junction_ratio ~ genotype_binary, data = c9_sqtl_snp))
##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = c9_sqtl_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4998 -0.4983 -0.1195 0.3538 2.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.27972 0.18316 6.987 4.33e-11 ***
## genotype_binary -0.54971 0.07375 -7.454 2.89e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6954 on 195 degrees of freedom
## Multiple R-squared: 0.2217, Adjusted R-squared: 0.2177
## F-statistic: 55.56 on 1 and 195 DF, p-value: 2.892e-12
# P = 1e-12
# now add all the same covariates as tensorQTL
summary(lm(junction_ratio ~ . , data = c9_sqtl_snp))
##
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19512 -0.44398 -0.04007 0.37916 2.37281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.879480 1.312373 1.432 0.15398
## genotype_binary -0.518285 0.080651 -6.426 1.31e-09 ***
## dna_platformX 0.214807 0.143021 1.502 0.13500
## `dna_prepPCR-Free` -0.075160 0.149767 -0.502 0.61643
## InferredCov1 -1.775086 1.441707 -1.231 0.21996
## InferredCov10 0.031426 1.070358 0.029 0.97661
## InferredCov11 -0.026260 1.090898 -0.024 0.98082
## InferredCov12 -0.002791 1.132687 -0.002 0.99804
## InferredCov13 -0.628309 1.137231 -0.552 0.58135
## InferredCov14 -0.075767 1.209412 -0.063 0.95012
## InferredCov15 0.945244 1.377216 0.686 0.49345
## InferredCov16 0.863256 1.473463 0.586 0.55875
## InferredCov17 -0.829829 1.614621 -0.514 0.60797
## InferredCov18 0.361519 1.639576 0.220 0.82575
## InferredCov19 1.282635 1.632608 0.786 0.43319
## InferredCov2 -2.081830 1.149401 -1.811 0.07190 .
## InferredCov20 1.502637 1.790155 0.839 0.40245
## InferredCov3 -0.851180 1.069428 -0.796 0.42721
## InferredCov4 2.972459 1.033283 2.877 0.00454 **
## InferredCov5 -0.613545 1.045911 -0.587 0.55826
## InferredCov6 -0.995302 1.039625 -0.957 0.33977
## InferredCov7 0.422511 1.084553 0.390 0.69735
## InferredCov8 -0.263301 1.050102 -0.251 0.80232
## InferredCov9 1.699327 1.064456 1.596 0.11228
## PC1 134.294938 78.607734 1.708 0.08942 .
## PC2 -68.869449 47.635260 -1.446 0.15012
## PC3 30.729341 24.764210 1.241 0.21639
## PC4 -6.440466 23.312600 -0.276 0.78269
## PC5 -1.668193 7.312508 -0.228 0.81983
## sexmale 0.079182 0.107156 0.739 0.46098
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6906 on 167 degrees of freedom
## Multiple R-squared: 0.3427, Adjusted R-squared: 0.2285
## F-statistic: 3.002 on 29 and 167 DF, p-value: 5.251e-06
# P = 1e-9
# get residuals
<-
c9_sqtl_snp_resid %>% filter(junction_id == "chr9:27567164-27573709") %>%
c9_all select(sample, junction_ratio) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
$resid <- resid(lm(junction_ratio ~ . , data = c9_sqtl_snp_resid) )
c9_sqtl_snp_resid## then add back the SNP
$genotype_binary <- c9_genotypes$genotype_binary[match(row.names(c9_sqtl_snp_resid), c9_genotypes$sample)]
c9_sqtl_snp_resid
## MAIN FIGURE
<-
plot_c9_snp_splicing %>%
c9_sqtl_snp_resid left_join(c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = resid)) +
geom_jitter(aes(colour = genotype_code), width = 0.2, height = 0, size = 1) +
geom_boxplot(outlier.color = NA, fill = NA) +
labs(y = "Residual splicing ratio", x = "genotype") +
#ggpubr::stat_compare_means() +
theme_jh() +
labs(colour = "rs3849943") +
scale_colour_brewer(type = "qual", palette = 2)
plot_c9_snp_splicing
Now use C9orf72 expansion repeat
## Get Expansion Calls
<- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
c9_sqtl_repeat select(sample, junction_ratio, max_call) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample") %>%
filter( !is.na(max_call))
#mutate(max_call = factor(max_call > 30))
# 139 out of 197 samples have ExpansionHunter genotypes
summary(lm(junction_ratio ~ log(max_call), data = c9_sqtl_repeat))
##
## Call:
## lm(formula = junction_ratio ~ log(max_call), data = c9_sqtl_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4786 -0.4952 -0.1193 0.4479 2.4330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.46143 0.10919 -4.226 4.32e-05 ***
## log(max_call) 0.18748 0.03937 4.763 4.81e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7203 on 137 degrees of freedom
## Multiple R-squared: 0.142, Adjusted R-squared: 0.1358
## F-statistic: 22.68 on 1 and 137 DF, p-value: 4.806e-06
# P = 4.81e-06
summary(lm(junction_ratio ~ . , data = c9_sqtl_repeat))
##
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81084 -0.42999 -0.05215 0.42323 1.65273
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.848e+00 1.779e+00 1.038 0.30137
## max_call 1.669e-03 5.881e-04 2.838 0.00541 **
## dna_platformX 3.087e-01 1.653e-01 1.868 0.06442 .
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 2.507e+00 2.749e+00 0.912 0.36372
## InferredCov10 -4.387e-02 1.432e+00 -0.031 0.97561
## InferredCov11 9.855e-01 1.492e+00 0.660 0.51040
## InferredCov12 -8.995e-01 1.704e+00 -0.528 0.59866
## InferredCov13 2.194e+00 1.540e+00 1.424 0.15715
## InferredCov14 2.767e+00 1.534e+00 1.804 0.07397 .
## InferredCov15 -1.256e+00 1.765e+00 -0.712 0.47813
## InferredCov16 2.716e-01 1.767e+00 0.154 0.87812
## InferredCov17 -1.950e+00 2.089e+00 -0.934 0.35250
## InferredCov18 1.630e+00 2.028e+00 0.804 0.42310
## InferredCov19 3.177e+00 2.072e+00 1.533 0.12806
## InferredCov2 -6.201e-01 1.775e+00 -0.349 0.72756
## InferredCov20 -4.452e-01 2.251e+00 -0.198 0.84358
## InferredCov3 -9.271e-01 1.364e+00 -0.679 0.49827
## InferredCov4 3.398e+00 1.337e+00 2.542 0.01240 *
## InferredCov5 -2.182e+00 1.464e+00 -1.491 0.13895
## InferredCov6 -4.505e-01 1.367e+00 -0.330 0.74230
## InferredCov7 4.207e-01 1.353e+00 0.311 0.75634
## InferredCov8 1.034e+00 1.356e+00 0.762 0.44774
## InferredCov9 1.954e+00 1.358e+00 1.439 0.15308
## PC1 8.573e+01 9.984e+01 0.859 0.39241
## PC2 -8.938e+01 6.088e+01 -1.468 0.14492
## PC3 2.506e+01 3.206e+01 0.782 0.43601
## PC4 7.387e+00 2.791e+01 0.265 0.79175
## PC5 -9.666e+00 9.648e+00 -1.002 0.31862
## sexmale 2.612e-02 1.427e-01 0.183 0.85513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7216 on 110 degrees of freedom
## Multiple R-squared: 0.3087, Adjusted R-squared: 0.1328
## F-statistic: 1.755 on 28 and 110 DF, p-value: 0.02119
# P = 0.005
# plot association between repeat length and residualised junction ratio
<- c9_sqtl_snp_resid
c9_sqtl_resid_eh
$max_call <- c9_sqtl_repeat$max_call[match(row.names(c9_sqtl_resid_eh), row.names(c9_sqtl_repeat) )]
c9_sqtl_resid_eh
<- c9_sqtl_resid_eh[ !is.na(c9_sqtl_resid_eh$max_call), ]
c9_sqtl_resid_eh # 139 samples
# plot repeat length against residualised splicing - MAIN FIGURE
<-
plot_c9_repeat_splicing %>%
c9_sqtl_resid_eh rownames_to_column(var = "sample") %>%
left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = genotype_code), size = 1) +
geom_smooth(method = "lm", colour = "black") +
labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
::stat_cor() +
ggpubrscale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
labs(colour = "rs3849943") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
# colour by disease - for supplement
%>%
c9_sqtl_resid_eh rownames_to_column(var = "sample") %>%
left_join(sample_key, by = c("sample" = "participant_id")) %>%
left_join(rna_meta, by = c("sample_id" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = disease), size = 0.8) +
geom_smooth(method = "lm", colour = "black") +
labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
::stat_cor() +
ggpubrscale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
theme_jh()
%>%
c9_sqtl_resid_eh filter(!is.na(max_call)) %>%
ggplot(aes(x = max_call > 100, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means() +
geom_jitter(width = 0.25) +
labs(x = "C9 repeat > 30", y = "Splicing ratio") +
theme_jh()
%>%
c9_sqtl_resid_eh ggplot(aes(x = max_call, y = resid)) + geom_point() + geom_smooth(method = "lm") +
labs(x = "C9orf72 repeat length", y = "Residual splicing ratio")
%>%
c9_sqtl_resid_eh filter(!is.na(max_call)) %>%
ggplot(aes(x = max_call > 100, y = resid)) + geom_boxplot() + ggpubr::stat_compare_means() +
geom_jitter(width = 0.25) +
labs(x = "C9 repeat > 100", y = "Residual splicing ratio")
%>%
c9_sqtl_resid_eh ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()
chisq.test(table(expansion = c9_sqtl_resid_eh $max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary))
##
## Pearson's Chi-squared test
##
## data: table(expansion = c9_sqtl_resid_eh$max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary)
## X-squared = 17.499, df = 2, p-value = 0.0001586
<-
plot_c9_repeat_genotype %>%
c9_sqtl_resid_eh rownames_to_column(var = "sample") %>%
#left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
left_join( c9_code_df, by = "genotype_binary") %>%
ggplot(aes(y = max_call, x = genotype_code)) +
geom_jitter(aes(colour = genotype_code), size = 1, width = 0.2, height =0) +
geom_boxplot(outlier.color = NA, fill = NA) +
#geom_smooth(method = "lm", colour = "black") +
labs(y = "C9orf72 repeat length", x = "genotype") +
#ggpubr::stat_cor() +
scale_y_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
labs(colour = "rs3849943") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
<-
c9_multiplot + plot_c9_repeat_genotype ) /
(plot_c9_snp_splicing &
plot_c9_repeat_splicing # theme(axis.ticks.y = element_blank(),
# axis.text.y = element_blank(),
# axis.title.y = element_blank(),
# axis.line.y = element_blank() ) &
# ylim(-1.6,2.6)
# ) +
#plot_layout(nrow = 2, guides = "collect", widths =c(1,2,1)) &
guides(colour = FALSE) #&
#theme(legend.position = "bottom") &
#plot_annotation(title = "rs3849943 - C9orf72 splicing", subtitle = "chr9:27567164-27573709") &
#ylim(-1.6,2.6)
c9_multiplot
ggsave(plot = c9_multiplot, filename = here::here("ALS_SC/plots/C9orf72_genotype_repeat_plots.pdf"), width = 3, height = 4.5)
1 out of 60 homozygous ref have expansions 11 out of 50 are heterozygous 5 out of 14 homozygous alt have expansions
therefore there’s clearly a tagging of the repeat by the SNP and vice versa.
C9orf72 repeat length is weakly associated with splicing of the first intron - P = 0.059 (residual ratio)
# plot
%>%
c9_all filter(junction_id == "chr9:27567164-27573709") %>%
ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
facet_wrap(~junction_id)
%>%
c9_all filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709") %>%
ggplot(aes(x = genotype_binary, y = junction_ratio)) +
#geom_boxplot(aes( group = max_call > 80), outlier.colour = NA) +
geom_jitter(aes(colour = max_call > 80)) +
facet_wrap(~junction_id)
#chisq.test(table(c9_genotype_eh$genotype_binary, c9_genotype_eh$max_call))
%>%
c9_all filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709") %>%
mutate(genotype_binary = factor(genotype_binary)) %>%
ggplot(aes(x = max_call > 80, y = junction_ratio)) + geom_boxplot() +
facet_wrap(~junction_id)
%>%
c9_all select(sample, genotype_binary, max_call) %>%
filter(!is.na(max_call)) %>%
distinct() %>%
ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()
<- inner_join(c9_genotypes, c9_eh_df, by = "sample")
c9_genotype_eh
table(SNP_genotype = c9_genotype_eh$genotype_binary, expansion = c9_genotype_eh$max_call > 80)
## expansion
## SNP_genotype FALSE TRUE
## 0 18 9
## 1 87 32
## 2 119 1
%>%
c9_genotype_eh ggplot(aes(x = genotype_binary, y = max_call)) +
geom_jitter(height = 0, width = 0.25)
<-
c9_model %>%
c9_all filter(!is.na(max_call)) %>%
filter(junction_id == "chr9:27567164-27573709")
summary(lm(junction_ratio ~ as.numeric(genotype_binary), data = c9_model))
##
## Call:
## lm(formula = junction_ratio ~ as.numeric(genotype_binary), data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43377 -0.50194 -0.08515 0.41028 2.90805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.18285 0.20476 5.777 4.89e-08 ***
## as.numeric(genotype_binary) -0.51893 0.08395 -6.182 6.82e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6877 on 137 degrees of freedom
## Multiple R-squared: 0.2181, Adjusted R-squared: 0.2124
## F-statistic: 38.21 on 1 and 137 DF, p-value: 6.823e-09
summary(lm(junction_ratio ~ as.numeric(max_call), data = c9_model))
##
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call), data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19386 -0.54406 -0.09168 0.42007 2.68533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1091423 0.0702518 -1.554 0.12259
## as.numeric(max_call) 0.0014902 0.0005401 2.759 0.00659 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.757 on 137 degrees of freedom
## Multiple R-squared: 0.05264, Adjusted R-squared: 0.04573
## F-statistic: 7.612 on 1 and 137 DF, p-value: 0.006589
summary(lm(junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary), data = c9_model))
##
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary),
## data = c9_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4011 -0.4920 -0.1165 0.4278 2.8392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13505 0.22769 4.985 1.85e-06 ***
## as.numeric(max_call > 80) 0.08633 0.17770 0.486 0.628
## as.numeric(genotype_binary) -0.50379 0.08976 -5.613 1.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6896 on 136 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.208
## F-statistic: 19.12 on 2 and 136 DF, p-value: 4.823e-08
%>%
c9_model ggplot(aes( x = max_call > 80, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means()
Yes, the GWAS variant tags the expansion.
However, does the expansion cause the splicing change? Effect is weaker than that of the SNP.
Is there an enrichment in expanded repeat alleles in ALS patients compared to controls? Obviously yes.
nrow(c9_eh_df)
## [1] 2041
# 2041 samples have EH run on them
setdiff(c9_eh_df$sample, dna_meta_full$external_sample_id)
## [1] "HDA1368" "HDA1362" "HDA1360" "HDA1364" "HDA1359" "HDA1366" "HDA1367"
## [8] "HDA1363" "HDA1361"
<- c9_eh_df %>%
c9_eh_meta left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(!is.na(external_subject_id))
dim(c9_eh_meta)
## [1] 2032 63
# 2032
<- c9_eh_meta %>%
c9_eh_meta filter(!duplicated(external_subject_id)) %>%# throw out duplicate donors
filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.
dim(c9_eh_meta)
## [1] 1773 63
# 1773 donors
<- mutate(c9_eh_meta, disease = case_when(
c9_eh_meta == "Classical/Typical ALS" ~ "ALS",
subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
subject_group_subcategory grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
grepl("Non-Neurological Control", subject_group) ~ "Control",
TRUE ~ "Other")
)
<- group_by(c9_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )
disease_n
$disease <- factor(disease_n$disease, levels = c("Control", "ALS", "ALS-FTD", "FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels = disease_n$disease_n[as.numeric(disease_n$disease)])
disease_n
<- left_join(c9_eh_meta, disease_n, by = "disease")
c9_eh_meta
# expansion size
<- c9_eh_meta %>%
c9_dist_plot filter(disease != "Other") %>%
filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = max_call )) +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(height = 0, width = 0.25, size = 0.5) +
coord_flip() +
scale_y_continuous(trans = 'log10') +
geom_hline(yintercept = 30, linetype = 3, colour = "red") + theme_jh() +
labs(x = "", y = "C9orf72 expansion max size")
# proportion of group with expansion > 30
<- c9_eh_meta %>%
c9_prop_plot filter(!is.na(disease) ) %>%
filter(disease != "Other") %>%
group_by(disease_n, expand = !is.na(max_call) & max_call > 30) %>%
tally() %>%
pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "c9") %>%
mutate(prop = c9TRUE / (c9TRUE + c9FALSE) ) %>%
mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = prop)) +
geom_col() +
coord_flip() +
labs(y = "Proportion > 30 repeats", x = "") +
theme_jh() +
scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.4)) +
geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3)
+ c9_prop_plot + plot_layout(ncol = 2, widths = c(3,1)) c9_dist_plot
ALS-FTD has highest proportion of individuals with >30 repeats
%>%
c9_eh_meta filter(!is.na(disease) ) %>%
filter(disease != "Other") %>%
# filter(max_call <= 30) %>%
group_by(disease, call = max_call > 30) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_30_" ) %>%
mutate(prop = c9_30_TRUE / (c9_30_TRUE + c9_30_FALSE) )
## # A tibble: 4 × 4
## # Groups: disease [4]
## disease c9_30_FALSE c9_30_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 998 103 0.0936
## 2 ALS-FTD 57 16 0.219
## 3 Control 229 1 0.00435
## 4 FTD 89 17 0.160
# subthreshold effects?
%>%
c9_eh_meta filter(!is.na(disease) ) %>%
filter(max_call <= 30) %>%
mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
theme_jh() +
labs(x = "C9orf72 repeat length", title = "C8orf72")
%>%
c9_eh_meta filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
filter(max_call <= 30) %>%
mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = disease_n, y = max_call)) +
coord_flip() +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(width = 0.25, height = 0, size = 0.5) +#geom_histogram(aes(group = disease, fill = disease), binwidth = 1) +
::stat_compare_means(ref.group = unique(c9_eh_meta$disease_n[c9_eh_meta$disease == "Control"]), label = "p.format", label.y.npc = 0.9, label.x.npc = 0.1 ) +
ggpubr#facet_wrap(~disease_n, scales = "free_y") +
theme_jh() +
labs(title = "C9orf72", x = "Disease", y = "Expansion length")
%>%
c9_eh_meta filter(!is.na(disease) ) %>%
filter(max_call <= 30) %>%
group_by(disease, call = max_call > 10) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_10_" ) %>%
mutate(prop = c9_10_TRUE / (c9_10_TRUE + c9_10_FALSE) )
## # A tibble: 5 × 4
## # Groups: disease [5]
## disease c9_10_FALSE c9_10_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 870 128 0.128
## 2 ALS-FTD 48 9 0.158
## 3 Control 202 27 0.118
## 4 FTD 82 7 0.0787
## 5 Other 225 22 0.0891
Yes, ALS and FTD cases are enriched for C9 expansions > 30. No difference in proportion of sub-30 expansion lengths between diseases.
<- left_join(rna_meta, dna_meta, by = "external_subject_id", suffix = c(".rna", ".dna"))
all_meta <- all_meta %>% select(sample = vcf_id, disease)
disease_meta
# get expansionHunter calls
#a3_expansion_hunter <- read.table(here::here("ALS_SC/a3/ExpansionHunter_a3_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# Feb 2021 - 1877 samples
<- read.table(here::here("ALS_SC/ATXN3/02_2021_ATXN3_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)
a3_expansion_hunter
# VCF
#a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ATXN3_QTL_SNP_rs10150068.vcf"), header=TRUE, stringsAsFactors = FALSE )
<- read.table(here::here("ALS_SC/ATXN3/ALS_rs10143310.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ALS_rs200388434.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_qtl_vcf
<- process_vcf(a3_vcf, ID = "rs10143310")
a3_genotypes
<- process_vcf(a3_qtl_vcf , ID = "rs200388434")
a3_qtl_geno # 425 donors
# compare GWAS and QTL SNPs
<- left_join(a3_genotypes, a3_qtl_geno, by = c("sample") ) %>%
a3_compare left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(pct_european > 0.9) %>%
select(A = genotype_binary.x, B = genotype_binary.y) %>%
drop_na()
<- process_junctions(sqtl_junctions, "ENSG00000066427.22")
a3_junctions # 17 junctions
<- process_expansionhunter(a3_expansion_hunter) %>% left_join(disease_meta, by = "sample")
a3_eh_df # repeats for 2781 donors
# is there any enrichment in a3 length in ALS?
# merge together
<- left_join(a3_junctions, a3_genotypes, by = "sample") %>%
a3_all left_join(a3_eh_df, by = "sample") %>%
#mutate(genotype_binary) %>%
left_join(disease_meta) %>%
filter(!is.na(genotype_binary))
# a3_all %>%
# filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
# ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
# facet_wrap(~junction_id)
# a3_all %>%
# filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341", !is.na(max_call)) %>%
# ggplot(aes(x = max_call, y = junction_ratio)) + geom_point() + ggpubr::stat_cor()
#
<- inner_join(a3_genotypes, a3_eh_df, by = "sample") %>%
a3_genotype_eh distinct()
#
# a3_genotype_eh %>%
# filter(!is.na(genotype_binary)) %>%
# ggplot(aes(x = max_call) ) +
# geom_dotplot(aes(fill = as.factor(genotype_binary)))
<-
a3_snp %>%
a3_all filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
select(sample, junction_ratio, genotype_binary) %>%
distinct() %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
<-
a3_code_df tibble(
genotype_binary = c(0,1,2),
genotype_code = factor( c("G/G","G/A", "A/A"), levels = c("G/G","G/A", "A/A") )
)
<-
a3_sqtl_snp_resid %>%
a3_all filter(junction_id == "chr14:92049814-92050341") %>%
select(sample, junction_ratio) %>%
distinct() %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
$resid <- resid(lm(junction_ratio ~ . , data = a3_sqtl_snp_resid) )
a3_sqtl_snp_resid
## then add back the SNP
$genotype_binary <- a3_genotypes$genotype_binary[match(row.names(a3_sqtl_snp_resid), a3_genotypes$sample)]
a3_sqtl_snp_resid
%>%
a3_sqtl_snp_resid left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = junction_ratio)) +
geom_boxplot(aes(group = genotype_code), outlier.colour = NA) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0) +
labs(y = "Raw junction ratio", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2) +
::stat_compare_means() ggpubr
## plot residualised junctions against genotype
<-
plot_a3_sqtl %>%
a3_sqtl_snp_resid left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = resid)) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1) +
geom_boxplot(aes(group = genotype_code), outlier.colour = NA, fill = NA) +
labs(y = "Residual splicing ratio", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
plot_a3_sqtl
summary(lm(junction_ratio ~ genotype_binary, a3_sqtl_snp_resid ))
##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88322 -0.70638 -0.05576 0.49782 2.58823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.42893 0.08127 -5.278 3.48e-07 ***
## genotype_binary 0.83434 0.10703 7.795 3.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.2346
## F-statistic: 60.77 on 1 and 194 DF, p-value: 3.827e-13
# P = 3e-13
summary(lm(resid ~ genotype_binary, a3_sqtl_snp_resid ))
##
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97715 -0.45024 -0.08418 0.46492 2.35687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.31015 0.06600 -4.699 4.94e-06 ***
## genotype_binary 0.65364 0.08692 7.520 1.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared: 0.2257, Adjusted R-squared: 0.2217
## F-statistic: 56.55 on 1 and 194 DF, p-value: 1.984e-12
# P = 1.98e-12
summary(lm( junction_ratio ~ genotype_binary, data = a3_snp))
##
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88322 -0.70638 -0.05576 0.49782 2.58823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.42893 0.08127 -5.278 3.48e-07 ***
## genotype_binary 0.83434 0.10703 7.795 3.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.2346
## F-statistic: 60.77 on 1 and 194 DF, p-value: 3.827e-13
# genotype P = 1e-13, R2 = 0.238
summary(lm( junction_ratio ~ ., data = a3_snp))
##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8694 -0.4728 -0.0442 0.3901 2.1971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.52529 1.45500 -0.361 0.71854
## genotype_binary 0.78999 0.10011 7.891 3.84e-13 ***
## dna_platformX -0.11409 0.15641 -0.729 0.46677
## `dna_prepPCR-Free` 0.36955 0.16514 2.238 0.02656 *
## InferredCov1 2.34263 1.57920 1.483 0.13986
## InferredCov10 0.18446 1.18789 0.155 0.87678
## InferredCov11 -0.58516 1.20568 -0.485 0.62808
## InferredCov12 -1.54656 1.24150 -1.246 0.21462
## InferredCov13 1.88385 1.24587 1.512 0.13242
## InferredCov14 0.21401 1.32325 0.162 0.87172
## InferredCov15 -0.84822 1.49430 -0.568 0.57105
## InferredCov16 0.53320 1.63746 0.326 0.74512
## InferredCov17 0.77591 1.77928 0.436 0.66335
## InferredCov18 -0.04472 1.79702 -0.025 0.98018
## InferredCov19 -0.52204 1.79051 -0.292 0.77099
## InferredCov2 -8.49275 1.27234 -6.675 3.54e-10 ***
## InferredCov20 -0.87916 1.95161 -0.450 0.65295
## InferredCov3 -2.74220 1.17395 -2.336 0.02069 *
## InferredCov4 2.07475 1.13359 1.830 0.06901 .
## InferredCov5 -3.52671 1.14608 -3.077 0.00244 **
## InferredCov6 -3.12124 1.14236 -2.732 0.00697 **
## InferredCov7 -1.08050 1.21000 -0.893 0.37316
## InferredCov8 1.82286 1.14915 1.586 0.11458
## InferredCov9 -0.87924 1.16650 -0.754 0.45207
## PC1 39.10449 86.02630 0.455 0.65002
## PC2 -21.68610 52.15079 -0.416 0.67807
## PC3 21.00597 27.12694 0.774 0.43982
## PC4 -20.22852 25.49876 -0.793 0.42873
## PC5 11.96327 8.11010 1.475 0.14208
## sexmale -0.10658 0.11709 -0.910 0.36403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7557 on 166 degrees of freedom
## Multiple R-squared: 0.5283, Adjusted R-squared: 0.4459
## F-statistic: 6.412 on 29 and 166 DF, p-value: 1.852e-15
# genotype P = 1e-14, R2 = 0.444
summary(lm( resid ~ genotype_binary, data = a3_sqtl_snp_resid))
##
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97715 -0.45024 -0.08418 0.46492 2.35687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.31015 0.06600 -4.699 4.94e-06 ***
## genotype_binary 0.65364 0.08692 7.520 1.98e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared: 0.2257, Adjusted R-squared: 0.2217
## F-statistic: 56.55 on 1 and 194 DF, p-value: 1.984e-12
#
<-
a3_repeat %>%
a3_all filter(!is.na(max_call), junction_id == "chr14:92049814-92050341") %>%
distinct() %>%
select(sample, junction_ratio, max_call) %>%
left_join(covariates, by = "sample") %>%
column_to_rownames(var = "sample")
summary(lm( junction_ratio ~ max_call, data = a3_repeat))
##
## Call:
## lm(formula = junction_ratio ~ max_call, data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8628 -0.7984 -0.1966 0.6860 2.9399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.63611 0.33340 -1.908 0.0586 .
## max_call 0.03832 0.01735 2.209 0.0290 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.054 on 128 degrees of freedom
## Multiple R-squared: 0.03671, Adjusted R-squared: 0.02919
## F-statistic: 4.878 on 1 and 128 DF, p-value: 0.02897
# genotype P = 0.046, R2 = 0.026
summary(lm( junction_ratio ~ ., data = a3_repeat))
##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.64314 -0.50899 -0.05018 0.45904 2.54844
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.02694 2.41499 -1.667 0.0985 .
## max_call 0.03555 0.01613 2.204 0.0298 *
## dna_platformX -0.14620 0.21451 -0.682 0.4971
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 1.95960 3.59738 0.545 0.5871
## InferredCov10 3.33540 1.90467 1.751 0.0830 .
## InferredCov11 -1.53272 1.92088 -0.798 0.4268
## InferredCov12 -4.23302 2.19530 -1.928 0.0566 .
## InferredCov13 2.70406 2.00756 1.347 0.1810
## InferredCov14 -0.13077 2.08099 -0.063 0.9500
## InferredCov15 -2.79994 2.28486 -1.225 0.2233
## InferredCov16 4.26529 2.26107 1.886 0.0621 .
## InferredCov17 2.83138 2.69523 1.051 0.2960
## InferredCov18 -0.13412 2.67375 -0.050 0.9601
## InferredCov19 -0.65843 2.79904 -0.235 0.8145
## InferredCov2 -11.03794 2.30632 -4.786 5.83e-06 ***
## InferredCov20 -2.90635 2.92381 -0.994 0.3226
## InferredCov3 -1.71432 1.78025 -0.963 0.3379
## InferredCov4 0.92645 1.84165 0.503 0.6160
## InferredCov5 -1.40300 2.00862 -0.698 0.4865
## InferredCov6 -4.36149 1.81656 -2.401 0.0182 *
## InferredCov7 -2.20735 1.73941 -1.269 0.2073
## InferredCov8 1.68239 1.76044 0.956 0.3415
## InferredCov9 -1.17957 1.98588 -0.594 0.5539
## PC1 149.42627 131.44222 1.137 0.2583
## PC2 70.84372 79.28267 0.894 0.3737
## PC3 8.92917 41.82913 0.213 0.8314
## PC4 -25.45228 36.91321 -0.690 0.4921
## PC5 -24.77207 12.78642 -1.937 0.0555 .
## sexmale -0.16643 0.18510 -0.899 0.3707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9061 on 101 degrees of freedom
## Multiple R-squared: 0.4383, Adjusted R-squared: 0.2826
## F-statistic: 2.815 on 28 and 101 DF, p-value: 8.393e-05
# genotype P = 0.0273, R2 = 0.285
# geom_density(aes(group = genotype_binary))
$resid <- a3_sqtl_snp_resid$resid[match(row.names(a3_repeat), row.names(a3_sqtl_snp_resid) )]
a3_repeatsummary(lm( resid ~ max_call, data = a3_repeat))
##
## Call:
## lm(formula = resid ~ max_call, data = a3_repeat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35611 -0.59474 -0.08955 0.45327 2.68632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.47899 0.26657 -1.797 0.0747 .
## max_call 0.02739 0.01387 1.974 0.0505 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8427 on 128 degrees of freedom
## Multiple R-squared: 0.02955, Adjusted R-squared: 0.02197
## F-statistic: 3.898 on 1 and 128 DF, p-value: 0.05049
# P = 0.046
<-
a3_repeat_snp %>%
a3_all filter(!is.na(max_call), !is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
distinct() %>%
select(sample, junction_ratio, max_call, genotype_binary) %>%
left_join(covariates, by = "sample") %>%
mutate( genotype_binary = as.numeric(genotype_binary)) %>%
column_to_rownames(var = "sample")
summary(lm( junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp))
##
## Call:
## lm(formula = junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91109 -0.71888 -0.02104 0.57295 2.46989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.488248 0.285444 -1.710 0.0896 .
## max_call 0.003349 0.015641 0.214 0.8308
## genotype_binary 0.951719 0.136530 6.971 1.53e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8999 on 127 degrees of freedom
## Multiple R-squared: 0.3033, Adjusted R-squared: 0.2923
## F-statistic: 27.64 on 2 and 127 DF, p-value: 1.082e-10
summary(lm( junction_ratio ~ ., data = a3_repeat_snp))
##
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat_snp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85915 -0.41640 -0.03891 0.41260 2.06728
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.530786 2.064561 -0.741 0.4602
## max_call 0.005141 0.014323 0.359 0.7204
## genotype_binary 0.855057 0.130313 6.562 2.38e-09 ***
## dna_platformX -0.119243 0.180287 -0.661 0.5099
## `dna_prepPCR-Free` NA NA NA NA
## InferredCov1 2.402813 3.023469 0.795 0.4287
## InferredCov10 2.217079 1.609453 1.378 0.1714
## InferredCov11 -1.807002 1.614571 -1.119 0.2657
## InferredCov12 -2.944301 1.855034 -1.587 0.1156
## InferredCov13 2.432729 1.687371 1.442 0.1525
## InferredCov14 -0.002418 1.748669 -0.001 0.9989
## InferredCov15 -1.328956 1.932905 -0.688 0.4933
## InferredCov16 1.042261 1.962349 0.531 0.5965
## InferredCov17 1.093326 2.280116 0.480 0.6326
## InferredCov18 -0.758996 2.248648 -0.338 0.7364
## InferredCov19 -1.352027 2.354280 -0.574 0.5671
## InferredCov2 -9.742614 1.947926 -5.002 2.43e-06 ***
## InferredCov20 -1.206587 2.470361 -0.488 0.6263
## InferredCov3 -2.964736 1.507954 -1.966 0.0521 .
## InferredCov4 1.492236 1.549857 0.963 0.3380
## InferredCov5 -1.951706 1.689826 -1.155 0.2509
## InferredCov6 -3.819686 1.528603 -2.499 0.0141 *
## InferredCov7 -1.137398 1.470617 -0.773 0.4411
## InferredCov8 1.535721 1.479388 1.038 0.3017
## InferredCov9 -0.096022 1.676796 -0.057 0.9544
## PC1 88.829067 110.830350 0.801 0.4248
## PC2 8.646023 67.288632 0.128 0.8980
## PC3 13.721574 35.154703 0.390 0.6971
## PC4 -21.980206 31.020998 -0.709 0.4802
## PC5 -7.859754 11.048695 -0.711 0.4785
## sexmale -0.236334 0.155896 -1.516 0.1327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7614 on 100 degrees of freedom
## Multiple R-squared: 0.6074, Adjusted R-squared: 0.4935
## F-statistic: 5.334 on 29 and 100 DF, p-value: 1.62e-10
# genotype P = 1e-8
# repeat P = 0.5
<- a3_repeat_snp
a3_to_plot
$resid <- a3_sqtl_snp_resid$resid[ match(row.names(a3_to_plot), row.names(a3_sqtl_snp_resid) ) ]
a3_to_plot
<-
plot_repeat_splicing %>%
a3_to_plot left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = max_call, y = resid)) +
geom_point(aes(colour = genotype_code), size = 1) +
geom_smooth(method = "lm", colour = "black") +
theme_jh() +
::stat_cor() +
ggpubrscale_colour_brewer(type = "qual", palette = 2) +
labs(y = "Residual splicing ratio", x = "CAG repeat length", colour = "rs10143310") +
scale_x_continuous(breaks = c(0,10,20,30), limits = c(0,30))
plot_repeat_splicing
cor.test(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call)
##
## Pearson's product-moment correlation
##
## data: a3_repeat_snp$genotype_binary and a3_repeat_snp$max_call
## t = 3.8318, df = 128, p-value = 0.0001984
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1572889 0.4671691
## sample estimates:
## cor
## 0.3207872
chisq.test(table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call > 16))
##
## Pearson's Chi-squared test
##
## data: table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call > 16)
## X-squared = 31.994, df = 2, p-value = 1.129e-07
<-
plot_a3_genotype_repeat %>%
a3_repeat_snp left_join(a3_code_df, by = "genotype_binary") %>%
ggplot(aes(x = genotype_code, y = max_call) ) +
geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1) +
geom_boxplot(outlier.colour = NA, aes(group = genotype_binary), fill = NA) +
scale_y_continuous(breaks = c(0,10,20,30), limits = c(0,30)) +
labs(y = "CAG repeat length", x = "rs10143310", colour = "rs10143310") +
theme_jh() +
scale_colour_brewer(type = "qual", palette = 2)
<-
atxn3_multiplot +plot_a3_genotype_repeat) /
(plot_a3_sqtl &
plot_repeat_splicing theme(axis.ticks = element_line(colour = "black")) &
# theme(axis.ticks.y = element_blank(),
# axis.text.y = element_blank(),
# axis.title.y = element_blank(),
# axis.line.y = element_blank() ) &
guides(colour = FALSE)
atxn3_multiplot
ggsave(plot = atxn3_multiplot, filename = here::here("ALS_SC/plots/ATXN3_genotype_repeat_plots.pdf"), width = 3,
height = 4.5)
The lead QTL SNP for the ATX3 sQTL is tagging the repeat expansion!
Questions:
Does including all the covariates increase the signal?
If the SNP is tagging the expansion, and the expansion affects splicing, then the expansion should correlate better with the splicing change than the tagging SNP, no?
## DNA cohort as well
<- a3_eh_df %>%
a3_eh_meta left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
filter(!is.na(external_subject_id)) %>%
filter(!duplicated(external_subject_id)) %>%
filter(!is.na(max_call))
# throw out duplicate donors
# 1869 unique donors from 2774 samples
<- a3_eh_meta %>%
a3_eh_meta filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.
# 1634 Europeans
# I can only match metadata for 1559 non-duplicate donors. Time to update metadata? Emailed Nadia and Jennifer
<- mutate(a3_eh_meta, disease = case_when(
a3_eh_meta == "Classical/Typical ALS" ~ "ALS",
subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
subject_group_subcategory grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
grepl("Non-Neurological Control", subject_group) ~ "Control",
TRUE ~ "Other")
)# 1600 donors
<- group_by(a3_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )
disease_n
$disease <- factor(disease_n$disease, levels = c("Control", "ALS", "FTD", "ALS-FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels = disease_n$disease_n[as.numeric(disease_n$disease)])
disease_n
<- left_join(a3_eh_meta, disease_n, by = "disease")
a3_eh_meta
# a3_eh_meta %>%
# filter(!is.na(disease)) %>%
# ggplot(aes(x = disease_n, y = max_call)) + geom_violin(fill = NA) + geom_jitter(width = 0.25, height = 0, size = 0.5, colour = "gray") +
# #coord_flip() +
# ggpubr::stat_compare_means(ref.group = disease_n$disease_n[2]) +
# theme_jh() + labs(y ="Max repeat length", title = "ATXN3", x = "")
#
#
# a3_eh_meta %>%
# # filter(disease %in% c("ALS", "Control")) %>%
# ggplot(aes(x = call1 + call2 ) ) + geom_histogram(aes(fill = disease), binwidth = 1) +
# facet_wrap(~disease, scales = "free_y", nrow = 4)
<- 16
a3_threshold
<-
a3_dist_plot %>%
a3_eh_meta filter(disease %in% c("ALS", "Control")) %>%
filter(!is.na(disease) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = max_call )) +
geom_violin(fill = NA, colour = "darkorange") +
geom_jitter(height = 0, width = 0.25, size = 0.5) +
coord_flip() +
geom_hline(yintercept = a3_threshold, linetype = 3, colour = "red") + theme_jh() +
labs(x = "", y = "ATXN3 expansion max size")
# proportion of group with expansion > 16
<- a3_eh_meta %>% filter(!is.na(disease) ) %>%
a3_prop_plot filter(disease %in% c("ALS", "Control")) %>%
group_by(disease_n, expand = !is.na(max_call) & max_call > a3_threshold) %>%
tally() %>%
pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "a3") %>%
mutate(prop = a3TRUE / (a3TRUE + a3FALSE) ) %>%
mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
mutate(disease_n = forcats::fct_rev(disease_n)) %>%
ggplot(aes(x = disease_n, y = prop)) +
geom_col() +
coord_flip() +
labs(y = "Proportion > 16 repeats", x = "") +
theme_jh() +
scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.25)) +
geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3)
# proportion European in each sample by disease
# FTD slightly higher distribution - almost all from UK, less diverse than other diease groups
%>%
a3_eh_meta ggplot(aes(x = disease, y = as.numeric(pct_european) )) + geom_jitter() + geom_violin(fill = NA) + theme_jh() +
::stat_compare_means(ref.group = "Control") ggpubr
# table
%>%
a3_eh_meta filter(!is.na(disease) ) %>%
# filter(max_call <= 30) %>%
group_by(disease, call = max_call > 15) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_30_" ) %>%
mutate(prop = a3_30_TRUE / (a3_30_TRUE + a3_30_FALSE) )
## # A tibble: 5 × 4
## # Groups: disease [5]
## disease a3_30_FALSE a3_30_TRUE prop
## <chr> <int> <int> <dbl>
## 1 ALS 186 805 0.812
## 2 ALS-FTD 21 48 0.696
## 3 Control 46 156 0.772
## 4 FTD 24 71 0.747
## 5 Other 52 191 0.786
<- a3_eh_meta %>%
a3_bar_plot filter(disease %in% c("ALS", "Control")) %>%
filter(!is.na(disease) ) %>%
#filter(max_call <= 30) %>%
#mutate(call_sum = call1 + call2 ) %>%
ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
theme_jh() +
labs(x = "ATXN3 repeat length", title = "ATXN3")
+ a3_bar_plot + plot_layout(ncol = 2, widths = c(1,1)) a3_dist_plot
# tally
<- a3_eh_meta %>%
a3_tally filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, max_call) %>% tally() %>%
pivot_wider(names_from =disease, values_from = n, values_fill = 0 )
$ALS_sum <- sum(a3_tally$ALS)
a3_tally$Control_sum <- sum(a3_tally$Control)
a3_tally
$ALS_prop <- a3_tally$ALS / a3_tally$ALS_sum
a3_tally$Control_prop <- a3_tally$Control / a3_tally$Control_sum
a3_tally
$p.value <- map_dbl(1:nrow(a3_tally), ~{
a3_tallyprint(.x)
<- a3_tally[.x,]
.x prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
$padj <- p.adjust(a3_tally$p.value, method = "bonferroni")
a3_tally
<-
plot_atxn3_eh_tally %>%
a3_tally filter(max_call > 15 & max_call < 30) %>%
select(max_call, Control_prop, ALS_prop, p.value, padj) %>%
pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
mutate(disease = gsub("_prop", "", disease)) %>%
left_join(disease_n, by = "disease") %>%
ggplot(aes(x = max_call, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
theme_jh() +
labs(x = "Maximum ATXN3 repeat length", y = "Proportion of group", fill = "Group") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0,0.26), expand = c(0,0)) +
geom_hline(yintercept = 0) +
geom_text(aes(label = signif(p.value, 2), y = 0.25 ), size =2 )
ggsave(plot = plot_atxn3_eh_tally, filename = here::here("ALS_SC/plots/ATXN3_WGS_EH_tally.pdf"), width =6, height = 3 )
# threshold at 16
%>%
a3_eh_meta filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, call = max_call >= 21) %>% tally() %>%
pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_" ) %>%
mutate(n = a3_FALSE + a3_TRUE) %>%
mutate(prop = a3_TRUE / (a3_TRUE + a3_FALSE) ) %>%
mutate(p.value = prop.test(x = .$a3_TRUE, n = .$n)$p.value )
## # A tibble: 2 × 6
## # Groups: disease [2]
## disease a3_FALSE a3_TRUE n prop p.value
## <chr> <int> <int> <int> <dbl> <dbl>
## 1 ALS 677 314 991 0.317 0.148
## 2 Control 149 53 202 0.262 0.148
prop.test(x = c(314, 53), n = c(sum(a3_tally$ALS), sum(a3_tally$Control)) )
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(314, 53) out of c(sum(a3_tally$ALS), sum(a3_tally$Control))
## X-squared = 2.0891, df = 1, p-value = 0.1484
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.01573188 0.12468273
## sample estimates:
## prop 1 prop 2
## 0.3168517 0.2623762
# P = 0.36
Compare with ATXN2
<- a3_eh_meta
a2_eh_meta
<- function(x){
max_atxn2 <- str_split_fixed(x, "/", 2)
df <- as.numeric(apply(df, MARGIN = 1, max))
lengths return(lengths)
}$atxn2_max <- max_atxn2(a2_eh_meta$atxn2_repeat_size)
a2_eh_meta
<- a2_eh_meta %>%
a2_tally filter(disease %in% c("Control", "ALS") ) %>%
group_by(disease, atxn2_max) %>% tally() %>%
pivot_wider(names_from =disease, values_from = n, values_fill = 0 ) %>%
arrange(atxn2_max)
$ALS_sum <- sum(a2_tally$ALS)
a2_tally$Control_sum <- sum(a2_tally$Control)
a2_tally
$ALS_prop <- a2_tally$ALS / a2_tally$ALS_sum
a2_tally$Control_prop <- a2_tally$Control / a2_tally$Control_sum
a2_tally
$p.value <- map_dbl(1:nrow(a2_tally), ~{
a2_tallyprint(.x)
<- a2_tally[.x,]
.x prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
$padj <- p.adjust(a2_tally$p.value, method = "bonferroni")
a2_tally
%>%
a2_tally filter(atxn2_max > 24 & atxn2_max < 34 ) %>%
select(atxn2_max, Control_prop, ALS_prop, p.value, padj) %>%
pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
mutate(disease = gsub("_prop", "", disease)) %>%
left_join(disease_n, by = "disease") %>%
ggplot(aes(x = atxn2_max, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
theme_jh() +
labs(title = "ATXN2", x = "repeat length") +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = 0) +
geom_text(aes(label = signif(p.value, 2), y = 0.2 ) )