dna_meta <- read_tsv(here::here("metadata/DNA/vcf_samples_metadata.tsv") )
dna_meta_full <- dna_meta_full <- readxl::read_excel(here::here("metadata/raw_metadata_from_NYGC/ALS Consortium DNA Metadata 20201015 .xlsx")) %>% janitor::clean_names()

rna_meta <- read_tsv(here::here("metadata/RNA/rna_metadata_annotated.tsv"))


covariates <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_splicing_peer20.gene.combined_covariates.txt")) %>%
  gather(key = "sample", value = "value", -ID) %>% 
  spread(key = ID, value = value)

# expansionHunter calls are using external_sample_id
# genotype and others are using VCF ID

coloc_res <- read_tsv(here::here("data/QTLs/Nov_2020/results/COLOC/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))

#coloc_res <- read_tsv(here::here("data/COLOC/Nicolas_suggestive/all_COLOC_results_merged_H4_0.5_with_LD.tsv.gz"))

lsp <- filter(coloc_res, QTL == "NYGC_Lumbar_sQTL", QTL_Gene %in% c("C9orf72", "ATXN3")) %>%
  filter(PP.H4.abf > 0.5)
lsp
## # A tibble: 5 × 24
##   disease GWAS   locus  GWAS_SNP    GWAS_P GWAS_chr GWAS_pos QTL   type  QTL_SNP
##   <chr>   <chr>  <chr>  <chr>        <dbl> <chr>       <dbl> <chr> <chr> <chr>  
## 1 ALS     Nicol… locus… rs101433… 3.24e- 7 chr14    92074037 NYGC… sQTL  chr14:…
## 2 ALS     Nicol… locus… rs101433… 3.24e- 7 chr14    92074037 NYGC… sQTL  rs2003…
## 3 ALS     Nicol… locus… rs101433… 3.24e- 7 chr14    92074037 NYGC… sQTL  rs2003…
## 4 ALS     Nicol… locus… rs3849943 3.77e-30 chr9     27543384 NYGC… sQTL  rs1537…
## 5 ALS     Nicol… locus… rs101433… 3.24e- 7 chr14    92074037 NYGC… sQTL  rs5810…
## # … with 14 more variables: QTL_P <dbl>, QTL_MAF <dbl>, QTL_chr <chr>,
## #   QTL_pos <dbl>, QTL_junction <chr>, QTL_Gene <chr>, QTL_Ensembl <chr>,
## #   nsnps <dbl>, PP.H0.abf <dbl>, PP.H1.abf <dbl>, PP.H2.abf <dbl>,
## #   PP.H3.abf <dbl>, PP.H4.abf <dbl>, LD <dbl>

C9orf72 sQTL

sqtl_junctions <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sQTL_junctions.bed") )



sample_key <- read_tsv(here::here("ALS_SC/ATXN3/LumbarSpinalCord_sample_key.txt") )
# 197 samples from 197 donors

# get expansionHunter calls for samples with RNA
c9_expansion_hunter_rna <- read.table(here::here("ALS_SC/ATXN3/ExpansionHunter_C9orf72_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# 403 donors with expansionHunter calls

# feb 2021 - 2044 samples with WGS
c9_expansion_hunter_wgs <- read.table(here::here("ALS_SC/ATXN3/02_2021_C9orf72_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)
# 167 samples have maximum c9 repeat length > 30

# VCF for 434 samples
c9_vcf <- read.table(here::here("ALS_SC/ATXN3/C9_GWAS_SNP_rs3849943.vcf"), header=TRUE, stringsAsFactors = FALSE )
# R read T as TRUE
c9_vcf$ALT <- "T"


# rs3849943 - reference allele should be T, alternate should be C. C has an allele frequency around 20% worldwide

c9_genotypes <- process_vcf(c9_vcf) %>%
  select(-genotype_binary)

# fix allele flip here
c9_code_df <- tibble(
  genotype = c("1/1", "0/1", "0/0"),
  genotype_code = factor(c("T/T", "T/C", "C/C"), levels = c("T/T", "T/C", "C/C") ),
  genotype_binary = c(2,1,0)
)

c9_genotypes <- left_join(c9_genotypes, c9_code_df, by = "genotype")

c9_junctions <- process_junctions(sqtl_junctions, "ENSG00000147894.15")
# 11 junctions in 197 samples

#c9_eh_df_rna <- process_expansionhunter(c9_expansion_hunter_rna)
c9_eh_df<- process_expansionhunter(c9_expansion_hunter_wgs)
# 2041 calls, only 1 is NA


# merge together
c9_all <- left_join(c9_junctions, c9_genotypes, by = "sample") %>%
  left_join(c9_eh_df, by = "sample") %>%
  mutate(genotype_binary = factor(genotype_binary))
# 2167 rows for 197 samples

# extract top colocalised junction
# join on QTL covariates for those samples
c9_sqtl_snp <- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
  select(sample, junction_ratio, genotype_binary) %>%
  left_join(covariates, by = "sample") %>%
  column_to_rownames(var = "sample") %>%
  mutate(genotype_binary = as.numeric(genotype_binary))

# do linear regression on junction ratio against genotype - a splicing QTL in R!
summary(lm(junction_ratio ~ genotype_binary, data = c9_sqtl_snp))
## 
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = c9_sqtl_snp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4998 -0.4983 -0.1195  0.3538  2.8727 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.27972    0.18316   6.987 4.33e-11 ***
## genotype_binary -0.54971    0.07375  -7.454 2.89e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6954 on 195 degrees of freedom
## Multiple R-squared:  0.2217, Adjusted R-squared:  0.2177 
## F-statistic: 55.56 on 1 and 195 DF,  p-value: 2.892e-12
# P = 1e-12

# now add all the same covariates as tensorQTL
summary(lm(junction_ratio ~ . , data = c9_sqtl_snp))
## 
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_snp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19512 -0.44398 -0.04007  0.37916  2.37281 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.879480   1.312373   1.432  0.15398    
## genotype_binary     -0.518285   0.080651  -6.426 1.31e-09 ***
## dna_platformX        0.214807   0.143021   1.502  0.13500    
## `dna_prepPCR-Free`  -0.075160   0.149767  -0.502  0.61643    
## InferredCov1        -1.775086   1.441707  -1.231  0.21996    
## InferredCov10        0.031426   1.070358   0.029  0.97661    
## InferredCov11       -0.026260   1.090898  -0.024  0.98082    
## InferredCov12       -0.002791   1.132687  -0.002  0.99804    
## InferredCov13       -0.628309   1.137231  -0.552  0.58135    
## InferredCov14       -0.075767   1.209412  -0.063  0.95012    
## InferredCov15        0.945244   1.377216   0.686  0.49345    
## InferredCov16        0.863256   1.473463   0.586  0.55875    
## InferredCov17       -0.829829   1.614621  -0.514  0.60797    
## InferredCov18        0.361519   1.639576   0.220  0.82575    
## InferredCov19        1.282635   1.632608   0.786  0.43319    
## InferredCov2        -2.081830   1.149401  -1.811  0.07190 .  
## InferredCov20        1.502637   1.790155   0.839  0.40245    
## InferredCov3        -0.851180   1.069428  -0.796  0.42721    
## InferredCov4         2.972459   1.033283   2.877  0.00454 ** 
## InferredCov5        -0.613545   1.045911  -0.587  0.55826    
## InferredCov6        -0.995302   1.039625  -0.957  0.33977    
## InferredCov7         0.422511   1.084553   0.390  0.69735    
## InferredCov8        -0.263301   1.050102  -0.251  0.80232    
## InferredCov9         1.699327   1.064456   1.596  0.11228    
## PC1                134.294938  78.607734   1.708  0.08942 .  
## PC2                -68.869449  47.635260  -1.446  0.15012    
## PC3                 30.729341  24.764210   1.241  0.21639    
## PC4                 -6.440466  23.312600  -0.276  0.78269    
## PC5                 -1.668193   7.312508  -0.228  0.81983    
## sexmale              0.079182   0.107156   0.739  0.46098    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6906 on 167 degrees of freedom
## Multiple R-squared:  0.3427, Adjusted R-squared:  0.2285 
## F-statistic: 3.002 on 29 and 167 DF,  p-value: 5.251e-06
# P = 1e-9 

# get residuals
c9_sqtl_snp_resid <- 
  c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
  select(sample, junction_ratio) %>%
  left_join(covariates, by = "sample") %>%
  column_to_rownames(var = "sample")

c9_sqtl_snp_resid$resid <- resid(lm(junction_ratio ~ . , data = c9_sqtl_snp_resid) )
## then add back the SNP
c9_sqtl_snp_resid$genotype_binary <- c9_genotypes$genotype_binary[match(row.names(c9_sqtl_snp_resid), c9_genotypes$sample)]


## MAIN FIGURE
plot_c9_snp_splicing <- 
  c9_sqtl_snp_resid %>%
  left_join(c9_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = genotype_code, y = resid)) + 
  geom_jitter(aes(colour = genotype_code), width = 0.2, height = 0, size = 1) +
  geom_boxplot(outlier.color = NA, fill = NA) +
  labs(y = "Residual splicing ratio", x = "genotype") +
  #ggpubr::stat_compare_means() +
  theme_jh() +
  labs(colour = "rs3849943") +
  scale_colour_brewer(type = "qual", palette = 2)

plot_c9_snp_splicing

Now use C9orf72 expansion repeat

## Get Expansion Calls
c9_sqtl_repeat <- c9_all %>% filter(junction_id == "chr9:27567164-27573709") %>%
  select(sample, junction_ratio, max_call) %>%
  left_join(covariates, by = "sample") %>%
  column_to_rownames(var = "sample") %>%
  filter( !is.na(max_call))
  #mutate(max_call = factor(max_call > 30))

# 139 out of 197 samples have ExpansionHunter genotypes

summary(lm(junction_ratio ~ log(max_call), data = c9_sqtl_repeat))
## 
## Call:
## lm(formula = junction_ratio ~ log(max_call), data = c9_sqtl_repeat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4786 -0.4952 -0.1193  0.4479  2.4330 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.46143    0.10919  -4.226 4.32e-05 ***
## log(max_call)  0.18748    0.03937   4.763 4.81e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7203 on 137 degrees of freedom
## Multiple R-squared:  0.142,  Adjusted R-squared:  0.1358 
## F-statistic: 22.68 on 1 and 137 DF,  p-value: 4.806e-06
# P = 4.81e-06
summary(lm(junction_ratio ~ . , data = c9_sqtl_repeat))
## 
## Call:
## lm(formula = junction_ratio ~ ., data = c9_sqtl_repeat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81084 -0.42999 -0.05215  0.42323  1.65273 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         1.848e+00  1.779e+00   1.038  0.30137   
## max_call            1.669e-03  5.881e-04   2.838  0.00541 **
## dna_platformX       3.087e-01  1.653e-01   1.868  0.06442 . 
## `dna_prepPCR-Free`         NA         NA      NA       NA   
## InferredCov1        2.507e+00  2.749e+00   0.912  0.36372   
## InferredCov10      -4.387e-02  1.432e+00  -0.031  0.97561   
## InferredCov11       9.855e-01  1.492e+00   0.660  0.51040   
## InferredCov12      -8.995e-01  1.704e+00  -0.528  0.59866   
## InferredCov13       2.194e+00  1.540e+00   1.424  0.15715   
## InferredCov14       2.767e+00  1.534e+00   1.804  0.07397 . 
## InferredCov15      -1.256e+00  1.765e+00  -0.712  0.47813   
## InferredCov16       2.716e-01  1.767e+00   0.154  0.87812   
## InferredCov17      -1.950e+00  2.089e+00  -0.934  0.35250   
## InferredCov18       1.630e+00  2.028e+00   0.804  0.42310   
## InferredCov19       3.177e+00  2.072e+00   1.533  0.12806   
## InferredCov2       -6.201e-01  1.775e+00  -0.349  0.72756   
## InferredCov20      -4.452e-01  2.251e+00  -0.198  0.84358   
## InferredCov3       -9.271e-01  1.364e+00  -0.679  0.49827   
## InferredCov4        3.398e+00  1.337e+00   2.542  0.01240 * 
## InferredCov5       -2.182e+00  1.464e+00  -1.491  0.13895   
## InferredCov6       -4.505e-01  1.367e+00  -0.330  0.74230   
## InferredCov7        4.207e-01  1.353e+00   0.311  0.75634   
## InferredCov8        1.034e+00  1.356e+00   0.762  0.44774   
## InferredCov9        1.954e+00  1.358e+00   1.439  0.15308   
## PC1                 8.573e+01  9.984e+01   0.859  0.39241   
## PC2                -8.938e+01  6.088e+01  -1.468  0.14492   
## PC3                 2.506e+01  3.206e+01   0.782  0.43601   
## PC4                 7.387e+00  2.791e+01   0.265  0.79175   
## PC5                -9.666e+00  9.648e+00  -1.002  0.31862   
## sexmale             2.612e-02  1.427e-01   0.183  0.85513   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7216 on 110 degrees of freedom
## Multiple R-squared:  0.3087, Adjusted R-squared:  0.1328 
## F-statistic: 1.755 on 28 and 110 DF,  p-value: 0.02119
# P = 0.005


# plot association between repeat length and residualised junction ratio
c9_sqtl_resid_eh <- c9_sqtl_snp_resid

c9_sqtl_resid_eh$max_call <- c9_sqtl_repeat$max_call[match(row.names(c9_sqtl_resid_eh), row.names(c9_sqtl_repeat) )]

c9_sqtl_resid_eh <- c9_sqtl_resid_eh[ !is.na(c9_sqtl_resid_eh$max_call), ]
# 139 samples


# plot repeat length against residualised splicing - MAIN FIGURE
plot_c9_repeat_splicing <- 
  c9_sqtl_resid_eh  %>%
  rownames_to_column(var = "sample") %>%
  left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
  left_join( c9_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = max_call, y = resid)) + 
  geom_point(aes(colour = genotype_code), size = 1) + 
  geom_smooth(method = "lm", colour = "black") +
  labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
  ggpubr::stat_cor() +
  scale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
  labs(colour = "rs3849943") +
  theme_jh() +
  scale_colour_brewer(type = "qual", palette = 2)


# colour by disease - for supplement
c9_sqtl_resid_eh  %>%
  rownames_to_column(var = "sample") %>%
  left_join(sample_key, by = c("sample" = "participant_id")) %>%
  left_join(rna_meta, by = c("sample_id" = "external_sample_id") ) %>%
  left_join( c9_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = max_call, y = resid)) + 
  geom_point(aes(colour = disease), size = 0.8) + 
  geom_smooth(method = "lm", colour = "black") +
  labs(x = "C9orf72 repeat length", y = "Residualised Splicing ratio") +
  ggpubr::stat_cor() +
  scale_x_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
  theme_jh()

c9_sqtl_resid_eh %>%
  filter(!is.na(max_call)) %>%
  ggplot(aes(x = max_call > 100, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means() +
  geom_jitter(width = 0.25) +
  labs(x = "C9 repeat > 30", y = "Splicing ratio") +
  theme_jh()

c9_sqtl_resid_eh %>%
  ggplot(aes(x = max_call, y = resid)) + geom_point() + geom_smooth(method = "lm") +
  labs(x = "C9orf72 repeat length", y = "Residual splicing ratio")

c9_sqtl_resid_eh %>%
  filter(!is.na(max_call)) %>%
  ggplot(aes(x = max_call > 100, y = resid)) + geom_boxplot() + ggpubr::stat_compare_means() +
  geom_jitter(width = 0.25) +
  labs(x = "C9 repeat > 100", y = "Residual splicing ratio")

c9_sqtl_resid_eh  %>%
  ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()

chisq.test(table(expansion = c9_sqtl_resid_eh $max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary))
## 
##  Pearson's Chi-squared test
## 
## data:  table(expansion = c9_sqtl_resid_eh$max_call > 30, genotype = c9_sqtl_resid_eh$genotype_binary)
## X-squared = 17.499, df = 2, p-value = 0.0001586
plot_c9_repeat_genotype <- 
  c9_sqtl_resid_eh  %>%
  rownames_to_column(var = "sample") %>%
  #left_join(rna_meta, by = c("sample" = "external_sample_id") ) %>%
  left_join( c9_code_df, by = "genotype_binary") %>%
  ggplot(aes(y = max_call, x = genotype_code)) + 
  geom_jitter(aes(colour = genotype_code), size = 1, width = 0.2, height =0) + 
    geom_boxplot(outlier.color = NA, fill = NA) +
  #geom_smooth(method = "lm", colour = "black") +
  labs(y = "C9orf72 repeat length", x = "genotype") +
  #ggpubr::stat_cor() +
  scale_y_continuous(trans='log10', breaks = c(2,5,10,30,100, 500)) +
  labs(colour = "rs3849943") +
  theme_jh() +
  scale_colour_brewer(type = "qual", palette = 2)
c9_multiplot <- 
  (plot_c9_snp_splicing +   plot_c9_repeat_genotype ) /
  plot_c9_repeat_splicing &
  # theme(axis.ticks.y = element_blank(), 
  #       axis.text.y = element_blank(), 
  #       axis.title.y = element_blank(),
  #       axis.line.y = element_blank() ) &
   # ylim(-1.6,2.6) 
 # ) +
  #plot_layout(nrow = 2, guides = "collect", widths =c(1,2,1)) &
  guides(colour = FALSE) #&
  #theme(legend.position = "bottom") &
  #plot_annotation(title = "rs3849943 - C9orf72 splicing", subtitle = "chr9:27567164-27573709") &
  #ylim(-1.6,2.6) 

c9_multiplot 

ggsave(plot = c9_multiplot, filename = here::here("ALS_SC/plots/C9orf72_genotype_repeat_plots.pdf"), width = 3, height = 4.5) 

1 out of 60 homozygous ref have expansions 11 out of 50 are heterozygous 5 out of 14 homozygous alt have expansions

therefore there’s clearly a tagging of the repeat by the SNP and vice versa.

C9orf72 repeat length is weakly associated with splicing of the first intron - P = 0.059 (residual ratio)

# plot
c9_all %>%
  filter(junction_id == "chr9:27567164-27573709") %>%
  ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
  facet_wrap(~junction_id)

c9_all %>%
  filter(!is.na(max_call)) %>%
  filter(junction_id == "chr9:27567164-27573709") %>%
  ggplot(aes(x = genotype_binary, y = junction_ratio)) + 
  #geom_boxplot(aes( group = max_call > 80), outlier.colour = NA) +
  geom_jitter(aes(colour = max_call > 80)) +
  facet_wrap(~junction_id)

#chisq.test(table(c9_genotype_eh$genotype_binary, c9_genotype_eh$max_call))


c9_all %>%
  filter(!is.na(max_call)) %>%
    filter(junction_id == "chr9:27567164-27573709") %>%
  mutate(genotype_binary = factor(genotype_binary)) %>%
  ggplot(aes(x = max_call > 80, y = junction_ratio)) + geom_boxplot() +
  facet_wrap(~junction_id)

c9_all %>%
  select(sample, genotype_binary, max_call) %>%
  filter(!is.na(max_call)) %>%
  distinct() %>%
  ggplot(aes(x = genotype_binary, y = max_call)) + geom_point()

c9_genotype_eh <- inner_join(c9_genotypes, c9_eh_df, by = "sample")

table(SNP_genotype = c9_genotype_eh$genotype_binary, expansion = c9_genotype_eh$max_call > 80)
##             expansion
## SNP_genotype FALSE TRUE
##            0    18    9
##            1    87   32
##            2   119    1
c9_genotype_eh %>%
  ggplot(aes(x = genotype_binary, y = max_call)) +
  geom_jitter(height = 0, width = 0.25)

c9_model <- 
  c9_all %>%
  filter(!is.na(max_call)) %>%
    filter(junction_id == "chr9:27567164-27573709")

summary(lm(junction_ratio ~ as.numeric(genotype_binary), data = c9_model))
## 
## Call:
## lm(formula = junction_ratio ~ as.numeric(genotype_binary), data = c9_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43377 -0.50194 -0.08515  0.41028  2.90805 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.18285    0.20476   5.777 4.89e-08 ***
## as.numeric(genotype_binary) -0.51893    0.08395  -6.182 6.82e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6877 on 137 degrees of freedom
## Multiple R-squared:  0.2181, Adjusted R-squared:  0.2124 
## F-statistic: 38.21 on 1 and 137 DF,  p-value: 6.823e-09
summary(lm(junction_ratio ~ as.numeric(max_call), data = c9_model))
## 
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call), data = c9_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19386 -0.54406 -0.09168  0.42007  2.68533 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          -0.1091423  0.0702518  -1.554  0.12259   
## as.numeric(max_call)  0.0014902  0.0005401   2.759  0.00659 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.757 on 137 degrees of freedom
## Multiple R-squared:  0.05264,    Adjusted R-squared:  0.04573 
## F-statistic: 7.612 on 1 and 137 DF,  p-value: 0.006589
summary(lm(junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary), data = c9_model))
## 
## Call:
## lm(formula = junction_ratio ~ as.numeric(max_call > 80) + as.numeric(genotype_binary), 
##     data = c9_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4011 -0.4920 -0.1165  0.4278  2.8392 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.13505    0.22769   4.985 1.85e-06 ***
## as.numeric(max_call > 80)    0.08633    0.17770   0.486    0.628    
## as.numeric(genotype_binary) -0.50379    0.08976  -5.613 1.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6896 on 136 degrees of freedom
## Multiple R-squared:  0.2194, Adjusted R-squared:  0.208 
## F-statistic: 19.12 on 2 and 136 DF,  p-value: 4.823e-08
c9_model %>%
  ggplot(aes( x = max_call > 80, y = junction_ratio)) + geom_boxplot() + ggpubr::stat_compare_means()

Yes, the GWAS variant tags the expansion.

However, does the expansion cause the splicing change? Effect is weaker than that of the SNP.

Is there an enrichment in expanded repeat alleles in ALS patients compared to controls? Obviously yes.

nrow(c9_eh_df)
## [1] 2041
# 2041 samples have EH run on them

setdiff(c9_eh_df$sample, dna_meta_full$external_sample_id)
## [1] "HDA1368" "HDA1362" "HDA1360" "HDA1364" "HDA1359" "HDA1366" "HDA1367"
## [8] "HDA1363" "HDA1361"
c9_eh_meta <- c9_eh_df %>%
  left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
  filter(!is.na(external_subject_id))

dim(c9_eh_meta)
## [1] 2032   63
# 2032 

c9_eh_meta <- c9_eh_meta %>%
  filter(!duplicated(external_subject_id)) %>%# throw out duplicate donors
  filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.

dim(c9_eh_meta)
## [1] 1773   63
# 1773 donors


c9_eh_meta <- mutate(c9_eh_meta, disease = case_when( 
  subject_group_subcategory == "Classical/Typical ALS" ~ "ALS", 
  subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
  grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
  grepl("Non-Neurological Control", subject_group)  ~ "Control",
  TRUE ~ "Other")
)

disease_n <- group_by(c9_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )

disease_n$disease <- factor(disease_n$disease, levels =  c("Control", "ALS",  "ALS-FTD", "FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels =  disease_n$disease_n[as.numeric(disease_n$disease)])

c9_eh_meta <- left_join(c9_eh_meta, disease_n, by = "disease")



# expansion size
c9_dist_plot <- c9_eh_meta %>%
  filter(disease != "Other") %>%
  filter(!is.na(disease) )  %>%
  mutate(disease_n = forcats::fct_rev(disease_n)) %>%
  ggplot(aes(x = disease_n, y = max_call )) + 
  geom_violin(fill = NA, colour = "darkorange") + 
  geom_jitter(height = 0, width = 0.25, size = 0.5) +
  coord_flip() +
  scale_y_continuous(trans = 'log10') +
  geom_hline(yintercept = 30, linetype = 3, colour = "red") + theme_jh() +
  labs(x = "", y = "C9orf72 expansion max size")

# proportion of group with expansion > 30
c9_prop_plot <- c9_eh_meta %>% 
  filter(!is.na(disease) ) %>%  
  filter(disease != "Other") %>%
  group_by(disease_n, expand = !is.na(max_call) & max_call > 30) %>% 
  tally() %>% 
  pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "c9") %>% 
  mutate(prop = c9TRUE / (c9TRUE + c9FALSE) ) %>% 
  mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
  mutate(disease_n = forcats::fct_rev(disease_n)) %>%
  ggplot(aes(x = disease_n, y = prop)) + 
  geom_col() + 
  coord_flip() + 
  labs(y = "Proportion > 30 repeats", x = "") +
  theme_jh() +
  scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.4)) +
  geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3) 

c9_dist_plot + c9_prop_plot + plot_layout(ncol = 2, widths = c(3,1))

ALS-FTD has highest proportion of individuals with >30 repeats

c9_eh_meta %>%
  filter(!is.na(disease) ) %>%  
  filter(disease != "Other") %>%
 # filter(max_call <= 30) %>%
  group_by(disease, call = max_call > 30) %>% tally() %>%
  pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_30_" ) %>%
  mutate(prop = c9_30_TRUE / (c9_30_TRUE  + c9_30_FALSE) )
## # A tibble: 4 × 4
## # Groups:   disease [4]
##   disease c9_30_FALSE c9_30_TRUE    prop
##   <chr>         <int>      <int>   <dbl>
## 1 ALS             998        103 0.0936 
## 2 ALS-FTD          57         16 0.219  
## 3 Control         229          1 0.00435
## 4 FTD              89         17 0.160
# subthreshold effects?
c9_eh_meta %>%
  filter(!is.na(disease) ) %>%
  filter(max_call <= 30) %>%
  mutate(call_sum = call1 + call2 ) %>%
  ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
  facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
  theme_jh() + 
  labs(x = "C9orf72 repeat length", title = "C8orf72")

c9_eh_meta %>%
  filter(!is.na(disease) ) %>%
    mutate(disease_n = forcats::fct_rev(disease_n)) %>%
  filter(max_call <= 30) %>%
  mutate(call_sum = call1 + call2 ) %>%
  ggplot(aes(x = disease_n, y = max_call)) + 
  coord_flip() +
  geom_violin(fill = NA, colour = "darkorange") +
  geom_jitter(width = 0.25, height = 0, size = 0.5) +#geom_histogram(aes(group = disease, fill = disease), binwidth = 1) +
  ggpubr::stat_compare_means(ref.group = unique(c9_eh_meta$disease_n[c9_eh_meta$disease == "Control"]), label = "p.format", label.y.npc = 0.9, label.x.npc = 0.1 ) + 
  #facet_wrap(~disease_n, scales = "free_y") +
  theme_jh() +
  labs(title = "C9orf72", x = "Disease", y = "Expansion length")

c9_eh_meta %>%
  filter(!is.na(disease) ) %>%
  filter(max_call <= 30) %>%
  group_by(disease, call = max_call > 10) %>% tally() %>%
  pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "c9_10_" ) %>%
  mutate(prop = c9_10_TRUE / (c9_10_TRUE  + c9_10_FALSE) )
## # A tibble: 5 × 4
## # Groups:   disease [5]
##   disease c9_10_FALSE c9_10_TRUE   prop
##   <chr>         <int>      <int>  <dbl>
## 1 ALS             870        128 0.128 
## 2 ALS-FTD          48          9 0.158 
## 3 Control         202         27 0.118 
## 4 FTD              82          7 0.0787
## 5 Other           225         22 0.0891

Yes, ALS and FTD cases are enriched for C9 expansions > 30. No difference in proportion of sub-30 expansion lengths between diseases.

ATXN3 sQTL

all_meta <- left_join(rna_meta, dna_meta, by = "external_subject_id", suffix = c(".rna", ".dna"))
disease_meta <- all_meta %>% select(sample = vcf_id, disease)

# get expansionHunter calls
#a3_expansion_hunter <- read.table(here::here("ALS_SC/a3/ExpansionHunter_a3_results.txt"), header=FALSE, stringsAsFactors = FALSE)

# Feb 2021 - 1877 samples
a3_expansion_hunter <- read.table(here::here("ALS_SC/ATXN3/02_2021_ATXN3_ExpansionHunter_results.txt"), header=FALSE, stringsAsFactors = FALSE)


# VCF
#a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ATXN3_QTL_SNP_rs10150068.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_vcf <- read.table(here::here("ALS_SC/ATXN3/ALS_rs10143310.vcf"), header=TRUE, stringsAsFactors = FALSE )
a3_qtl_vcf <- read.table(here::here("ALS_SC/ATXN3/ALS_rs200388434.vcf"), header=TRUE, stringsAsFactors = FALSE )

a3_genotypes <- process_vcf(a3_vcf, ID = "rs10143310")

a3_qtl_geno <-  process_vcf(a3_qtl_vcf , ID = "rs200388434")
# 425 donors
# compare GWAS and QTL SNPs
a3_compare <- left_join(a3_genotypes, a3_qtl_geno, by = c("sample") ) %>%
  left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
  filter(pct_european > 0.9) %>%
  select(A = genotype_binary.x, B = genotype_binary.y) %>%
  drop_na()
a3_junctions <- process_junctions(sqtl_junctions, "ENSG00000066427.22")
# 17 junctions

a3_eh_df <- process_expansionhunter(a3_expansion_hunter) %>% left_join(disease_meta, by = "sample")
# repeats for 2781 donors

# is there any enrichment in a3 length in ALS?
# merge together
a3_all <- left_join(a3_junctions, a3_genotypes, by = "sample") %>%
  left_join(a3_eh_df, by = "sample") %>%
  #mutate(genotype_binary) %>%
  left_join(disease_meta) %>%
  filter(!is.na(genotype_binary))

# a3_all %>%
#   filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
#     ggplot(aes(x = genotype_binary, y = junction_ratio)) + geom_boxplot(aes( group = genotype_binary)) +
#   facet_wrap(~junction_id)

# a3_all %>%
#   filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341", !is.na(max_call)) %>%
#   ggplot(aes(x = max_call, y = junction_ratio)) + geom_point() + ggpubr::stat_cor()
# 

a3_genotype_eh <- inner_join(a3_genotypes, a3_eh_df, by = "sample") %>%
  distinct()

# 
# a3_genotype_eh %>%
#   filter(!is.na(genotype_binary)) %>%
#   ggplot(aes(x = max_call) ) +
#   geom_dotplot(aes(fill = as.factor(genotype_binary)))

a3_snp <- 
  a3_all %>%
  filter(!is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
  select(sample, junction_ratio, genotype_binary) %>%
  distinct() %>%  
  left_join(covariates,  by = "sample") %>%
  column_to_rownames(var = "sample")

a3_code_df <- 
  tibble(
    genotype_binary =  c(0,1,2),
    genotype_code = factor( c("G/G","G/A", "A/A"), levels = c("G/G","G/A", "A/A") )
  )

a3_sqtl_snp_resid <- 
  a3_all %>% 
  filter(junction_id == "chr14:92049814-92050341") %>%
  select(sample, junction_ratio) %>%
  distinct() %>%
  left_join(covariates, by = "sample") %>%
  column_to_rownames(var = "sample")

a3_sqtl_snp_resid$resid <- resid(lm(junction_ratio ~ . , data = a3_sqtl_snp_resid) )

## then add back the SNP
a3_sqtl_snp_resid$genotype_binary <- a3_genotypes$genotype_binary[match(row.names(a3_sqtl_snp_resid), a3_genotypes$sample)]




a3_sqtl_snp_resid %>%
  left_join(a3_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = genotype_code, y = junction_ratio)) +
  geom_boxplot(aes(group = genotype_code), outlier.colour = NA) + 
  geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0)   + 
  labs(y = "Raw junction ratio", x = "rs10143310", colour = "rs10143310") +
  theme_jh() +
  scale_colour_brewer(type = "qual", palette = 2) +
  ggpubr::stat_compare_means()

## plot residualised junctions against genotype
plot_a3_sqtl <-   
  a3_sqtl_snp_resid %>%
  left_join(a3_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = genotype_code, y = resid)) +
  geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1)   + 
  geom_boxplot(aes(group = genotype_code), outlier.colour = NA, fill = NA) + 
  labs(y = "Residual splicing ratio", x = "rs10143310", colour = "rs10143310") +
  theme_jh() +
  scale_colour_brewer(type = "qual", palette = 2)

plot_a3_sqtl

summary(lm(junction_ratio ~ genotype_binary, a3_sqtl_snp_resid ))
## 
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_sqtl_snp_resid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88322 -0.70638 -0.05576  0.49782  2.58823 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.42893    0.08127  -5.278 3.48e-07 ***
## genotype_binary  0.83434    0.10703   7.795 3.83e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared:  0.2385, Adjusted R-squared:  0.2346 
## F-statistic: 60.77 on 1 and 194 DF,  p-value: 3.827e-13
# P = 3e-13
summary(lm(resid ~ genotype_binary, a3_sqtl_snp_resid ))
## 
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97715 -0.45024 -0.08418  0.46492  2.35687 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.31015    0.06600  -4.699 4.94e-06 ***
## genotype_binary  0.65364    0.08692   7.520 1.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared:  0.2257, Adjusted R-squared:  0.2217 
## F-statistic: 56.55 on 1 and 194 DF,  p-value: 1.984e-12
# P  = 1.98e-12
summary(lm( junction_ratio ~ genotype_binary, data = a3_snp))
## 
## Call:
## lm(formula = junction_ratio ~ genotype_binary, data = a3_snp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88322 -0.70638 -0.05576  0.49782  2.58823 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.42893    0.08127  -5.278 3.48e-07 ***
## genotype_binary  0.83434    0.10703   7.795 3.83e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8882 on 194 degrees of freedom
## Multiple R-squared:  0.2385, Adjusted R-squared:  0.2346 
## F-statistic: 60.77 on 1 and 194 DF,  p-value: 3.827e-13
# genotype P = 1e-13, R2 = 0.238
summary(lm( junction_ratio ~ ., data = a3_snp))
## 
## Call:
## lm(formula = junction_ratio ~ ., data = a3_snp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8694 -0.4728 -0.0442  0.3901  2.1971 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.52529    1.45500  -0.361  0.71854    
## genotype_binary      0.78999    0.10011   7.891 3.84e-13 ***
## dna_platformX       -0.11409    0.15641  -0.729  0.46677    
## `dna_prepPCR-Free`   0.36955    0.16514   2.238  0.02656 *  
## InferredCov1         2.34263    1.57920   1.483  0.13986    
## InferredCov10        0.18446    1.18789   0.155  0.87678    
## InferredCov11       -0.58516    1.20568  -0.485  0.62808    
## InferredCov12       -1.54656    1.24150  -1.246  0.21462    
## InferredCov13        1.88385    1.24587   1.512  0.13242    
## InferredCov14        0.21401    1.32325   0.162  0.87172    
## InferredCov15       -0.84822    1.49430  -0.568  0.57105    
## InferredCov16        0.53320    1.63746   0.326  0.74512    
## InferredCov17        0.77591    1.77928   0.436  0.66335    
## InferredCov18       -0.04472    1.79702  -0.025  0.98018    
## InferredCov19       -0.52204    1.79051  -0.292  0.77099    
## InferredCov2        -8.49275    1.27234  -6.675 3.54e-10 ***
## InferredCov20       -0.87916    1.95161  -0.450  0.65295    
## InferredCov3        -2.74220    1.17395  -2.336  0.02069 *  
## InferredCov4         2.07475    1.13359   1.830  0.06901 .  
## InferredCov5        -3.52671    1.14608  -3.077  0.00244 ** 
## InferredCov6        -3.12124    1.14236  -2.732  0.00697 ** 
## InferredCov7        -1.08050    1.21000  -0.893  0.37316    
## InferredCov8         1.82286    1.14915   1.586  0.11458    
## InferredCov9        -0.87924    1.16650  -0.754  0.45207    
## PC1                 39.10449   86.02630   0.455  0.65002    
## PC2                -21.68610   52.15079  -0.416  0.67807    
## PC3                 21.00597   27.12694   0.774  0.43982    
## PC4                -20.22852   25.49876  -0.793  0.42873    
## PC5                 11.96327    8.11010   1.475  0.14208    
## sexmale             -0.10658    0.11709  -0.910  0.36403    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7557 on 166 degrees of freedom
## Multiple R-squared:  0.5283, Adjusted R-squared:  0.4459 
## F-statistic: 6.412 on 29 and 166 DF,  p-value: 1.852e-15
# genotype P = 1e-14, R2 = 0.444
summary(lm( resid ~ genotype_binary, data =  a3_sqtl_snp_resid))
## 
## Call:
## lm(formula = resid ~ genotype_binary, data = a3_sqtl_snp_resid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97715 -0.45024 -0.08418  0.46492  2.35687 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.31015    0.06600  -4.699 4.94e-06 ***
## genotype_binary  0.65364    0.08692   7.520 1.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7214 on 194 degrees of freedom
## Multiple R-squared:  0.2257, Adjusted R-squared:  0.2217 
## F-statistic: 56.55 on 1 and 194 DF,  p-value: 1.984e-12
# 

a3_repeat <- 
  a3_all %>%
  filter(!is.na(max_call), junction_id == "chr14:92049814-92050341") %>%
  distinct() %>%
  select(sample, junction_ratio, max_call) %>%
  left_join(covariates,  by = "sample") %>%
  column_to_rownames(var = "sample") 

summary(lm( junction_ratio ~ max_call, data = a3_repeat))
## 
## Call:
## lm(formula = junction_ratio ~ max_call, data = a3_repeat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8628 -0.7984 -0.1966  0.6860  2.9399 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.63611    0.33340  -1.908   0.0586 .
## max_call     0.03832    0.01735   2.209   0.0290 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.054 on 128 degrees of freedom
## Multiple R-squared:  0.03671,    Adjusted R-squared:  0.02919 
## F-statistic: 4.878 on 1 and 128 DF,  p-value: 0.02897
# genotype P = 0.046, R2 = 0.026
summary(lm( junction_ratio ~ ., data = a3_repeat))
## 
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.64314 -0.50899 -0.05018  0.45904  2.54844 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -4.02694    2.41499  -1.667   0.0985 .  
## max_call             0.03555    0.01613   2.204   0.0298 *  
## dna_platformX       -0.14620    0.21451  -0.682   0.4971    
## `dna_prepPCR-Free`        NA         NA      NA       NA    
## InferredCov1         1.95960    3.59738   0.545   0.5871    
## InferredCov10        3.33540    1.90467   1.751   0.0830 .  
## InferredCov11       -1.53272    1.92088  -0.798   0.4268    
## InferredCov12       -4.23302    2.19530  -1.928   0.0566 .  
## InferredCov13        2.70406    2.00756   1.347   0.1810    
## InferredCov14       -0.13077    2.08099  -0.063   0.9500    
## InferredCov15       -2.79994    2.28486  -1.225   0.2233    
## InferredCov16        4.26529    2.26107   1.886   0.0621 .  
## InferredCov17        2.83138    2.69523   1.051   0.2960    
## InferredCov18       -0.13412    2.67375  -0.050   0.9601    
## InferredCov19       -0.65843    2.79904  -0.235   0.8145    
## InferredCov2       -11.03794    2.30632  -4.786 5.83e-06 ***
## InferredCov20       -2.90635    2.92381  -0.994   0.3226    
## InferredCov3        -1.71432    1.78025  -0.963   0.3379    
## InferredCov4         0.92645    1.84165   0.503   0.6160    
## InferredCov5        -1.40300    2.00862  -0.698   0.4865    
## InferredCov6        -4.36149    1.81656  -2.401   0.0182 *  
## InferredCov7        -2.20735    1.73941  -1.269   0.2073    
## InferredCov8         1.68239    1.76044   0.956   0.3415    
## InferredCov9        -1.17957    1.98588  -0.594   0.5539    
## PC1                149.42627  131.44222   1.137   0.2583    
## PC2                 70.84372   79.28267   0.894   0.3737    
## PC3                  8.92917   41.82913   0.213   0.8314    
## PC4                -25.45228   36.91321  -0.690   0.4921    
## PC5                -24.77207   12.78642  -1.937   0.0555 .  
## sexmale             -0.16643    0.18510  -0.899   0.3707    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9061 on 101 degrees of freedom
## Multiple R-squared:  0.4383, Adjusted R-squared:  0.2826 
## F-statistic: 2.815 on 28 and 101 DF,  p-value: 8.393e-05
# genotype P = 0.0273, R2 = 0.285
#  geom_density(aes(group = genotype_binary))
a3_repeat$resid <- a3_sqtl_snp_resid$resid[match(row.names(a3_repeat), row.names(a3_sqtl_snp_resid) )]
summary(lm( resid ~ max_call, data = a3_repeat))
## 
## Call:
## lm(formula = resid ~ max_call, data = a3_repeat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35611 -0.59474 -0.08955  0.45327  2.68632 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.47899    0.26657  -1.797   0.0747 .
## max_call     0.02739    0.01387   1.974   0.0505 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8427 on 128 degrees of freedom
## Multiple R-squared:  0.02955,    Adjusted R-squared:  0.02197 
## F-statistic: 3.898 on 1 and 128 DF,  p-value: 0.05049
# P = 0.046

a3_repeat_snp <- 
  a3_all %>%
  filter(!is.na(max_call), !is.na(genotype_binary), junction_id == "chr14:92049814-92050341") %>%
  distinct() %>%
  select(sample, junction_ratio, max_call, genotype_binary) %>%
  left_join(covariates,  by = "sample") %>%
  mutate( genotype_binary = as.numeric(genotype_binary)) %>%
  column_to_rownames(var = "sample")

summary(lm( junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp))
## 
## Call:
## lm(formula = junction_ratio ~ max_call + genotype_binary, data = a3_repeat_snp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91109 -0.71888 -0.02104  0.57295  2.46989 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.488248   0.285444  -1.710   0.0896 .  
## max_call         0.003349   0.015641   0.214   0.8308    
## genotype_binary  0.951719   0.136530   6.971 1.53e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8999 on 127 degrees of freedom
## Multiple R-squared:  0.3033, Adjusted R-squared:  0.2923 
## F-statistic: 27.64 on 2 and 127 DF,  p-value: 1.082e-10
summary(lm( junction_ratio ~ ., data = a3_repeat_snp))
## 
## Call:
## lm(formula = junction_ratio ~ ., data = a3_repeat_snp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85915 -0.41640 -0.03891  0.41260  2.06728 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.530786   2.064561  -0.741   0.4602    
## max_call             0.005141   0.014323   0.359   0.7204    
## genotype_binary      0.855057   0.130313   6.562 2.38e-09 ***
## dna_platformX       -0.119243   0.180287  -0.661   0.5099    
## `dna_prepPCR-Free`         NA         NA      NA       NA    
## InferredCov1         2.402813   3.023469   0.795   0.4287    
## InferredCov10        2.217079   1.609453   1.378   0.1714    
## InferredCov11       -1.807002   1.614571  -1.119   0.2657    
## InferredCov12       -2.944301   1.855034  -1.587   0.1156    
## InferredCov13        2.432729   1.687371   1.442   0.1525    
## InferredCov14       -0.002418   1.748669  -0.001   0.9989    
## InferredCov15       -1.328956   1.932905  -0.688   0.4933    
## InferredCov16        1.042261   1.962349   0.531   0.5965    
## InferredCov17        1.093326   2.280116   0.480   0.6326    
## InferredCov18       -0.758996   2.248648  -0.338   0.7364    
## InferredCov19       -1.352027   2.354280  -0.574   0.5671    
## InferredCov2        -9.742614   1.947926  -5.002 2.43e-06 ***
## InferredCov20       -1.206587   2.470361  -0.488   0.6263    
## InferredCov3        -2.964736   1.507954  -1.966   0.0521 .  
## InferredCov4         1.492236   1.549857   0.963   0.3380    
## InferredCov5        -1.951706   1.689826  -1.155   0.2509    
## InferredCov6        -3.819686   1.528603  -2.499   0.0141 *  
## InferredCov7        -1.137398   1.470617  -0.773   0.4411    
## InferredCov8         1.535721   1.479388   1.038   0.3017    
## InferredCov9        -0.096022   1.676796  -0.057   0.9544    
## PC1                 88.829067 110.830350   0.801   0.4248    
## PC2                  8.646023  67.288632   0.128   0.8980    
## PC3                 13.721574  35.154703   0.390   0.6971    
## PC4                -21.980206  31.020998  -0.709   0.4802    
## PC5                 -7.859754  11.048695  -0.711   0.4785    
## sexmale             -0.236334   0.155896  -1.516   0.1327    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7614 on 100 degrees of freedom
## Multiple R-squared:  0.6074, Adjusted R-squared:  0.4935 
## F-statistic: 5.334 on 29 and 100 DF,  p-value: 1.62e-10
# genotype P = 1e-8
# repeat P = 0.5

a3_to_plot <- a3_repeat_snp

a3_to_plot$resid <- a3_sqtl_snp_resid$resid[ match(row.names(a3_to_plot), row.names(a3_sqtl_snp_resid) ) ]

plot_repeat_splicing <- 
  a3_to_plot %>%
  left_join(a3_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = max_call, y = resid)) + 
  geom_point(aes(colour = genotype_code), size = 1) +
    geom_smooth(method = "lm", colour = "black") +
  theme_jh() +
  ggpubr::stat_cor() +
  scale_colour_brewer(type = "qual", palette = 2)  +
  labs(y = "Residual splicing ratio", x = "CAG repeat length", colour = "rs10143310") +
  scale_x_continuous(breaks = c(0,10,20,30), limits = c(0,30)) 

plot_repeat_splicing 

cor.test(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call)
## 
##  Pearson's product-moment correlation
## 
## data:  a3_repeat_snp$genotype_binary and a3_repeat_snp$max_call
## t = 3.8318, df = 128, p-value = 0.0001984
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1572889 0.4671691
## sample estimates:
##       cor 
## 0.3207872
chisq.test(table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call > 16))
## 
##  Pearson's Chi-squared test
## 
## data:  table(a3_repeat_snp$genotype_binary, a3_repeat_snp$max_call >     16)
## X-squared = 31.994, df = 2, p-value = 1.129e-07
plot_a3_genotype_repeat <- 
  a3_repeat_snp %>%
  left_join(a3_code_df, by = "genotype_binary") %>%
  ggplot(aes(x = genotype_code, y = max_call) ) + 
  geom_jitter(aes(colour = genotype_code), width = 0.25, height = 0, size = 1) + 
  geom_boxplot(outlier.colour = NA, aes(group = genotype_binary), fill = NA) + 
  scale_y_continuous(breaks = c(0,10,20,30), limits = c(0,30)) +
  labs(y = "CAG repeat length", x = "rs10143310", colour = "rs10143310") +
  theme_jh() +
  scale_colour_brewer(type = "qual", palette = 2)


atxn3_multiplot <- 
  (plot_a3_sqtl +plot_a3_genotype_repeat) /
     plot_repeat_splicing &
  theme(axis.ticks = element_line(colour = "black")) &
       # theme(axis.ticks.y = element_blank(), 
       #  axis.text.y = element_blank(), 
       #  axis.title.y = element_blank(),
       #  axis.line.y = element_blank() ) &
  guides(colour = FALSE)

atxn3_multiplot

ggsave(plot = atxn3_multiplot, filename = here::here("ALS_SC/plots/ATXN3_genotype_repeat_plots.pdf"), width = 3, 
       height = 4.5) 

The lead QTL SNP for the ATX3 sQTL is tagging the repeat expansion!

Questions:

Does including all the covariates increase the signal?

If the SNP is tagging the expansion, and the expansion affects splicing, then the expansion should correlate better with the splicing change than the tagging SNP, no?

ATXN3 repeats in the WGS cohort

## DNA cohort as well
a3_eh_meta <- a3_eh_df %>%
  left_join(dna_meta_full, by = c("sample" = "external_sample_id") ) %>%
  filter(!is.na(external_subject_id)) %>%
  filter(!duplicated(external_subject_id)) %>%
  filter(!is.na(max_call))
  # throw out duplicate donors
# 1869 unique donors from 2774 samples

a3_eh_meta <- a3_eh_meta %>%
  filter(pct_european >= 0.9) # remove admixed samples - quick and dirty for now until I get full MDS values.
# 1634 Europeans

# I can only match metadata for 1559 non-duplicate donors. Time to update metadata? Emailed Nadia and Jennifer

a3_eh_meta <- mutate(a3_eh_meta, disease = case_when( 
  subject_group_subcategory == "Classical/Typical ALS" ~ "ALS", 
  subject_group_subcategory == "Classical/Typical ALS, Frontotemporal Dementia (FTD)" ~ "ALS-FTD",
  grepl("Frontotemporal Dementia", subject_group_subcategory) ~ "FTD",
  grepl("Non-Neurological Control", subject_group)  ~ "Control",
  TRUE ~ "Other")
)
# 1600 donors


disease_n <- group_by(a3_eh_meta, disease) %>% tally() %>% mutate(disease_n = paste0(disease,"\n(", n, ")") )

disease_n$disease <- factor(disease_n$disease, levels =  c("Control", "ALS", "FTD", "ALS-FTD", "Other") )
disease_n$disease_n <- factor(disease_n$disease_n, levels =  disease_n$disease_n[as.numeric(disease_n$disease)])

a3_eh_meta <- left_join(a3_eh_meta, disease_n, by = "disease")

# a3_eh_meta %>%
#   filter(!is.na(disease)) %>%
#   ggplot(aes(x = disease_n, y = max_call)) +  geom_violin(fill = NA) + geom_jitter(width = 0.25, height = 0, size = 0.5, colour = "gray") +
#   #coord_flip() +
#   ggpubr::stat_compare_means(ref.group = disease_n$disease_n[2]) +
#   theme_jh() + labs(y ="Max repeat length", title = "ATXN3", x = "")
# 
# 
# a3_eh_meta %>%
#  # filter(disease %in% c("ALS", "Control")) %>%
#   ggplot(aes(x = call1 + call2 ) ) + geom_histogram(aes(fill = disease), binwidth = 1) +
#   facet_wrap(~disease, scales = "free_y", nrow = 4)
a3_threshold <- 16

a3_dist_plot <- 
  a3_eh_meta %>%
  filter(disease %in% c("ALS", "Control")) %>%
  filter(!is.na(disease) )  %>%
  mutate(disease_n = forcats::fct_rev(disease_n)) %>%
  ggplot(aes(x = disease_n, y = max_call )) + 
  geom_violin(fill = NA, colour = "darkorange") + 
  geom_jitter(height = 0, width = 0.25, size = 0.5) +
  coord_flip() +
  geom_hline(yintercept = a3_threshold, linetype = 3, colour = "red") + theme_jh() +
  labs(x = "", y = "ATXN3 expansion max size")

# proportion of group with expansion > 16
a3_prop_plot <- a3_eh_meta %>% filter(!is.na(disease) ) %>% 
    filter(disease %in% c("ALS", "Control")) %>%
  group_by(disease_n, expand = !is.na(max_call) & max_call > a3_threshold) %>% 
  tally() %>% 
  pivot_wider(names_from = expand, values_from = n, values_fill = 0, names_prefix = "a3") %>% 
  mutate(prop = a3TRUE / (a3TRUE + a3FALSE) ) %>% 
  mutate(prop_label = paste0(signif(prop * 100, 2), "%" ) ) %>%
  mutate(disease_n = forcats::fct_rev(disease_n)) %>%
  ggplot(aes(x = disease_n, y = prop)) + 
  geom_col() + 
  coord_flip() + 
  labs(y = "Proportion > 16 repeats", x = "") +
  theme_jh() +
  scale_y_continuous(labels = scales::percent, minor_breaks = NULL, breaks = c(0,0.2), limits = c(0,0.25)) +
  geom_text(aes(label = prop_label ), colour = "black", nudge_y = 0.04, size = 3) 

# proportion European in each sample by disease
# FTD slightly higher distribution - almost all from UK, less diverse than other diease groups
a3_eh_meta %>%
  ggplot(aes(x = disease, y = as.numeric(pct_european) )) + geom_jitter() + geom_violin(fill = NA) + theme_jh() +
  ggpubr::stat_compare_means(ref.group = "Control")

# table
a3_eh_meta %>%
  filter(!is.na(disease) ) %>%
 # filter(max_call <= 30) %>%
  group_by(disease, call = max_call > 15) %>% tally() %>%
  pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_30_" ) %>%
  mutate(prop = a3_30_TRUE / (a3_30_TRUE  + a3_30_FALSE) )
## # A tibble: 5 × 4
## # Groups:   disease [5]
##   disease a3_30_FALSE a3_30_TRUE  prop
##   <chr>         <int>      <int> <dbl>
## 1 ALS             186        805 0.812
## 2 ALS-FTD          21         48 0.696
## 3 Control          46        156 0.772
## 4 FTD              24         71 0.747
## 5 Other            52        191 0.786
a3_bar_plot <- a3_eh_meta %>%
  filter(disease %in% c("ALS", "Control")) %>%
  filter(!is.na(disease) ) %>%
  #filter(max_call <= 30) %>%
  #mutate(call_sum = call1 + call2 ) %>%
  ggplot(aes(x = max_call)) + geom_histogram(aes(group = disease), binwidth = 1) +
  facet_wrap(~disease_n, scales = "free_y", ncol = 1) +
  theme_jh() + 
  labs(x = "ATXN3 repeat length", title = "ATXN3")

a3_dist_plot + a3_bar_plot + plot_layout(ncol = 2, widths = c(1,1))

# tally 
a3_tally <- a3_eh_meta %>%
  filter(disease %in% c("Control", "ALS") ) %>%
  group_by(disease, max_call) %>% tally() %>%
  pivot_wider(names_from =disease, values_from = n, values_fill = 0 ) 

a3_tally$ALS_sum <- sum(a3_tally$ALS)
a3_tally$Control_sum <- sum(a3_tally$Control)

a3_tally$ALS_prop <- a3_tally$ALS / a3_tally$ALS_sum
a3_tally$Control_prop <- a3_tally$Control / a3_tally$Control_sum

a3_tally$p.value <- map_dbl(1:nrow(a3_tally), ~{
  print(.x)
  .x <- a3_tally[.x,]
  prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
a3_tally$padj <- p.adjust(a3_tally$p.value, method = "bonferroni")

plot_atxn3_eh_tally <-
  a3_tally %>%
  filter(max_call > 15 & max_call < 30) %>%
  select(max_call, Control_prop, ALS_prop, p.value, padj) %>%
  pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
  mutate(disease = gsub("_prop", "", disease)) %>%
  left_join(disease_n, by = "disease") %>%
  ggplot(aes(x = max_call, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
  theme_jh() +
  labs(x = "Maximum ATXN3 repeat length", y = "Proportion of group", fill = "Group") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1), limits = c(0,0.26), expand = c(0,0)) +
  geom_hline(yintercept = 0) +
  geom_text(aes(label = signif(p.value, 2), y = 0.25 ), size =2 ) 


ggsave(plot = plot_atxn3_eh_tally,  filename = here::here("ALS_SC/plots/ATXN3_WGS_EH_tally.pdf"), width =6, height = 3 )

# threshold at 16
a3_eh_meta %>%
  filter(disease %in% c("Control", "ALS") ) %>%
  group_by(disease, call = max_call >= 21) %>% tally() %>%
  pivot_wider(names_from =call, values_from = n, values_fill = 0, names_prefix = "a3_" ) %>%
  mutate(n = a3_FALSE + a3_TRUE) %>%
  mutate(prop = a3_TRUE / (a3_TRUE  + a3_FALSE) ) %>%
  mutate(p.value = prop.test(x = .$a3_TRUE, n = .$n)$p.value  )
## # A tibble: 2 × 6
## # Groups:   disease [2]
##   disease a3_FALSE a3_TRUE     n  prop p.value
##   <chr>      <int>   <int> <int> <dbl>   <dbl>
## 1 ALS          677     314   991 0.317   0.148
## 2 Control      149      53   202 0.262   0.148
prop.test(x = c(314, 53), n = c(sum(a3_tally$ALS), sum(a3_tally$Control))  )
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(314, 53) out of c(sum(a3_tally$ALS), sum(a3_tally$Control))
## X-squared = 2.0891, df = 1, p-value = 0.1484
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.01573188  0.12468273
## sample estimates:
##    prop 1    prop 2 
## 0.3168517 0.2623762
# P = 0.36

Compare with ATXN2

a2_eh_meta <- a3_eh_meta

max_atxn2 <- function(x){
  df <- str_split_fixed(x, "/", 2)
  lengths <- as.numeric(apply(df, MARGIN = 1, max))
  return(lengths)
}
a2_eh_meta$atxn2_max <-  max_atxn2(a2_eh_meta$atxn2_repeat_size)

a2_tally <- a2_eh_meta %>%
  filter(disease %in% c("Control", "ALS") ) %>%
  group_by(disease, atxn2_max) %>% tally() %>%
  pivot_wider(names_from =disease, values_from = n, values_fill = 0 ) %>%
  arrange(atxn2_max)

a2_tally$ALS_sum <- sum(a2_tally$ALS)
a2_tally$Control_sum <- sum(a2_tally$Control)

a2_tally$ALS_prop <- a2_tally$ALS / a2_tally$ALS_sum
a2_tally$Control_prop <- a2_tally$Control / a2_tally$Control_sum

a2_tally$p.value <- map_dbl(1:nrow(a2_tally), ~{
  print(.x)
  .x <- a2_tally[.x,]
  prop.test(x = c(.x[,2, drop = TRUE], .x[,3, drop = TRUE]), n = c(.x[,4, drop = TRUE], .x[,5, drop = TRUE]))$p.value
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
a2_tally$padj <- p.adjust(a2_tally$p.value, method = "bonferroni")

a2_tally %>%
  filter(atxn2_max > 24 & atxn2_max < 34 ) %>%
  select(atxn2_max, Control_prop, ALS_prop, p.value, padj) %>%
  pivot_longer(cols = Control_prop:ALS_prop, values_to = "prop", names_to = "disease" ) %>%
  mutate(disease = gsub("_prop", "", disease)) %>%
  left_join(disease_n, by = "disease") %>%
  ggplot(aes(x = atxn2_max, y = prop)) + geom_col(aes(fill = disease_n), position = position_dodge(), width = 0.75 ) +
  theme_jh() +
  labs(title = "ATXN2", x = "repeat length") +
  scale_y_continuous(labels = scales::percent) +
  geom_hline(yintercept = 0) +
  geom_text(aes(label = signif(p.value, 2), y = 0.2 ) )