Read in metadata

Gene expression RNA metadata DNA metadata

How many tissues are available for comparing ALS versus Controls?

Remove any sample with RIN < 5

## RNA
# technical info - from Picard
rna_tech_df <- read_tsv(here::here("metadata/RNA/rna_technical_metrics.tsv")) %>% 
  dplyr::rename(sample = external_sample_id) %>%
  dplyr::select(sample, starts_with('pct'), starts_with('median'), mean_read_length, strand_balance, estimated_library_size)

write_tsv(rna_tech_df, path = here::here("data/differential_expression/ALS/ALS_all_differential_expression_tech_metrics.tsv") )

# metadata - all clincal metadata
rna_metadata <- read_tsv(here::here("data/nov_2020/201127_rna_metadata_qc_pass.tsv")) %>% 
  dplyr::rename(sample = "external_sample_id")
# 1917
table(rna_metadata$QC_PASS)
## 
## FALSE  TRUE 
##    56  1861
## remove QC failed samples from metadata
# rna_qc_fails <- read_tsv(here::here("metadata/RNA/2020_02_10_RNA_samples_failing_QC.txt"))


#all_rna_fails <- c(rna_qc_fails$external_sample_id, new_qc_fails)

rna_metadata <- rna_metadata %>%
  filter( QC_PASS == TRUE ) %>%
  mutate(disease_c9 = case_when(
    disease == "Control" ~ "Control",
    disease == "ALS" & mutations == "C9orf72" ~ "C9ALS",
    disease == "ALS" & mutations == "None" ~ "sALS",
    TRUE ~ "Other"
))
# 1879 RNA samples pass QC

# 3 samples have missing RIN - give them the median
# 7 have missing age
# 513 have missing PMI - useless
rna_metadata <- 
  rna_metadata %>%
  mutate( rin = replace_na(rin, median(rin, na.rm = TRUE)),
          age = replace_na(age, median(age, na.rm = TRUE)),
          pmi = replace_na(pmi, median(pmi, na.rm = TRUE))
          )


## DNA
# get genetic PCs for each sample
dna_metadata <- read_tsv(here::here("metadata/DNA/all_samples_DNA_key.tsv"))
# 1898 unique samples - some donors were sequenced twice -mostly Cerebellum
# 
dna_pcs <- read_tsv(here::here("metadata/DNA/all_cohort_genetic_PCs.txt")) %>%
  select(-external_sample_id, -external_subject_id) %>%
  rename(participant_id = vcf_id)
   #gather(key = "participant_id", value = "PC", -ID) %>%
   #spread( key = ID, value = PC)
names(dna_pcs)[2:6] <- paste0("g", names(dna_pcs[2:6]))
# 513 donors


dna_metadata <- inner_join(dna_metadata, dna_pcs, by = "participant_id") #%>%
  #select(-external_subject_id, -external_sample_id)
# 1894 unique samples with DNA PCs


# remove QC fails and any sample without WGS
# only keep ALS and Control diagnoses
rna_metadata <- dplyr::filter(rna_metadata, 
                       disease %in% c("ALS", "Control")
                       )
# 1294 samples are in categories

rna_metadata <- 
  filter(rna_metadata, 
         sample %in% dna_metadata$sample_id,
)
# 1278 have associated DNA 

# create compact support file including all relevant covariates and nothing more
support <- 
  left_join(rna_metadata, dna_metadata, by = c("sample" = "sample_id", "tissue") ) %>%
  dplyr::select(sample, donor = external_subject_id, tissue, site = site_specimen_collected, rin,prep, mutations, disease, disease_c9, motor_onset = site_of_motor_onset, duration = disease_duration_in_months, sex, age, starts_with("gPC")  )




# tissues <- c("Frontal_Cortex", "Lateral_Motor_Cortex", "Medial_Motor_Cortex", "Temporal_Cortex", "Cerebellum", "Lumbar_Spinal_Cord", "Cervical_Spinal_Cord", "Thoracic_Spinal_Cord", "Unspecified_Motor_Cortex")

tissues <- c("Lumbar_Spinal_Cord", "Cervical_Spinal_Cord", "Thoracic_Spinal_Cord")


# remove low RIN, only Control or ALS
support  <- 
  support %>% 
  mutate( disease = factor(disease,levels = c("Control", "ALS"))) %>%
  filter(rin >= 5, tissue %in% tissues) %>%
  filter( !(tissue == "Unspecified_Motor_Cortex" & site != "Academic Medical Center"))
# 380 meet all criteria

support$onset <- as.numeric(support$age) - ( as.numeric(support$duration) / 12 )

write_tsv(support, here::here("data/differential_expression/ALS/ALS_all_differential_expression_support.tsv"))

remove all samples with RIN < 5 for differential expression.

support %>%
  group_by(tissue, disease) %>% 
  filter(!duplicated(sample)) %>% # 1 duplicate entry in metadata, CGND-HRA-00431 - unsure how got in
  tally() %>% 
  spread(key = disease, value = n, fill = 0) %>% 
  arrange( desc(Control)) %>% 
  ungroup() %>% janitor::adorn_totals(where = "row") %>%
gt::gt()
tissue Control ALS
Cervical_Spinal_Cord 35 139
Lumbar_Spinal_Cord 32 122
Thoracic_Spinal_Cord 10 42
Total 77 303
support %>% 
  filter(rin >= 5, tissue %in% tissues) %>%
  mutate( disease_c9 = factor(disease_c9,levels = c("Control", "sALS", "C9ALS"))) %>%
  group_by(tissue, disease_c9) %>% 
  filter(!duplicated(sample)) %>% # 1 duplicate entry in metadata, CGND-HRA-00431 - unsure how got in
  tally() %>% 
  spread(key = disease_c9, value = n, fill = 0) %>% 
  arrange( desc(Control)) %>% 
  ungroup() %>% gt::gt()
tissue Control sALS C9ALS <NA>
Cervical_Spinal_Cord 35 103 28 8
Lumbar_Spinal_Cord 32 93 21 8
Thoracic_Spinal_Cord 10 34 5 3
support %>%
  filter( rin >= 5, tissue %in% tissues) %>%
  select(donor, disease) %>%
  distinct() %>%
  group_by(disease) %>% tally()
## # A tibble: 2 × 2
##   disease     n
##   <fct>   <int>
## 1 Control    49
## 2 ALS       154

NA refers to other ALS mutations (ANG, SOD1, OPTN)

For each tissue, get together all covariates , plus gene expression for those samples

load(here::here("data/feb_2020/gene_counts_no_tags.RData") )

dim(genes_counts)
## [1] 58884  1924
#support <- filter(rna_metadata, tissue %in% tissues)


for( t in tissues){
  support_loc <- filter(support, tissue == t ) %>% distinct() # any sneaky duplicate rows?
  counts_loc <- genes_counts[, support_loc$sample] 
  tpm_loc <- genes_counts[, support_loc$sample]
  tech_loc <- filter(rna_tech_df, sample %in% support_loc$sample)
  outFile <- here::here(paste0("data/differential_expression/ALS/", t, ".RData") )
  if( rerun){
  save(support_loc, counts_loc, tpm_loc, tech_loc, file = outFile  )
  }
}

Diagnostic Plots

# for testing
#t <- "Thoracic_Spinal_Cord"
diagnosticPlots <- function(t,
                            varpart_formula =  ~ (1|site) + rin + pct_mrna_bases + pct_chimeras + pct_ribosomal_bases + pct_r1_transcript_strand_reads + pct_intergenic_bases +   median_3prime_bias + median_5prime_bias + (1|disease) + (1|prep) + age + (1|sex) ){
  
  inFile <- here::here( paste0("data/differential_expression/ALS/", t, ".RData"))
  stopifnot(file.exists(inFile))
  load(inFile)
  
  row.names(support_loc) <- support_loc$sample
  row.names(tech_loc) <- tech_loc$sample
  
  libsize <- enframe(colSums(counts_loc)) %>% dplyr::rename(sample = name, total_counts = value) 
  
  support_loc <- dplyr::left_join(support_loc, libsize, by = 'sample')
  
  # Tables
  
  # disease by site
  site_table <- support_loc %>% dplyr::group_by(disease, site) %>% dplyr::tally() %>% tidyr::spread(key = disease, value = n) #%>% ungroup %>% gt() %>% tab_header("Disease group by submitting site")
  # disease by prep
  prep_table <- support_loc %>% dplyr::group_by(disease, prep) %>% dplyr::tally() %>% tidyr::spread(key = disease, value = n)# %>% ungroup %>% gt() %>% tab_header("Disease group by library prep type")

  print(site_table)
  
  print(prep_table)
  
  
  # genes_counts 
  isexpr <- rowSums(cpm(counts_loc)>1) >= 0.9 * ncol(counts_loc)
  # create data structure with only expressed genes
  gExpr <- DGEList(counts=counts_loc[isexpr,])
  # Perform TMM normalization
  gExpr <- calcNormFactors(gExpr)
  # Specify variables to be included in the voom() estimates of
  # uncertainty.
  # Recommend including variables with a small number of categories
  # that explain a substantial amount of variation
  #design <- model.matrix(~1)#model.matrix( ~ median_3prime_bias + pct_mrna_bases + disease, all_df)
  vobjGenes <- voom(gExpr)

  voom_pca <- prcomp(t(vobjGenes$E), center = TRUE, scale.=TRUE)
  
  pca_df <- voom_pca$x %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample") %>%
    select(sample, PC1, PC2) %>%
    left_join(support_loc, by = "sample") %>%
    left_join(tech_loc, by = "sample")
  
  plot_pca_df <- function(colourby){
    pca_df %>%
      ggplot(aes(x = PC1, y = PC2)) + 
      geom_point(aes_string(colour = colourby))
  }
    
  pca_plot <-
    plot_pca_df("disease") +
    plot_pca_df("prep") +
    plot_pca_df("pct_mrna_bases") +
    plot_pca_df("site") +
    plot_layout(ncol = 2, nrow = 3) +
    plot_annotation(title = t) & theme_bw()

  
  n_pc <- 10
  
  expression_pca_df <- voom_pca$x[,1:n_pc] %>%
    as.data.frame() %>%
    rownames_to_column(var = "sample")
  
  all_df <- 
    list(expression_pca_df, support_loc, tech_loc) %>%
    purrr::reduce(dplyr::left_join, by = "sample") %>%
    dplyr::select(-donor, -tissue) %>%
    column_to_rownames(var = "sample")
  
  all_df$prep <- as.factor(all_df$prep)
  all_df$disease <- as.factor(all_df$disease)
  all_df$site <- as.factor(all_df$site)
  all_df$sex <- as.factor(all_df$sex)
  all_df$pct_correct_strand_reads <- NULL
  
  if(length(unique(all_df$site)) == 1 ){ all_df$site <- NULL}
  if(length(unique(all_df$prep)) == 1 ){ all_df$prep <- NULL}
  
  all_df <- select(all_df,
                   starts_with("PC", ignore.case = FALSE),
                   "library prep" = prep,
                   "submitting site" = site,
                   "RIN" = rin,
                   "% mRNA bases" = pct_mrna_bases,
                   "median 5' bias" = median_5prime_bias,
                   "median 3' bias" = median_5prime_bias,
                   "% chimeric reads" = pct_chimeras,
                   "% intergenic bases" = pct_intergenic_bases,
                   "% stranding" = pct_r1_transcript_strand_reads,
                   disease,
                   sex, 
                  # starts_with("gPC")
                   )
  # reorder
  all_df <- all_df[ , c(paste0("PC", 1:10), names(all_df)[10:ncol(all_df)] ) ]
  
  # take first 10 expression PCs and correalte with covariates
  n_var <- ncol(all_df)
  matrix_rsquared <- matrix(NA, nrow = n_var, ncol = n_pc) #Number of factors
  matrix_pvalue <- matrix(NA, nrow = n_var, ncol = n_pc) 
  
  for (row_pos in 1:n_var ){
    for (col_pos in 1:n_pc){
      matrix_rsquared[row_pos,col_pos] <- summary( lm(all_df[,col_pos] ~ all_df[,row_pos] ) )$adj.r.squared
      f <- summary( lm(all_df[,col_pos] ~ all_df[,row_pos] ) )$fstatistic
      #print(f)
      if( !is.null(f) ){
        pvalue <- pf(f[1],f[2],f[3],lower.tail=FALSE) 
      } else{
        pvalue <- NA
      }
      matrix_pvalue[row_pos,col_pos] <- pvalue
    }
  }
  
  matrix_rsquared <- as.data.frame(matrix_rsquared)
  dimnames(matrix_rsquared) <- list( names(all_df), names(all_df)[1:n_pc])
  
  matrix_pvalue <- as.data.frame(matrix_pvalue)
  dimnames(matrix_pvalue) <- list( names(all_df), names(all_df)[1:n_pc])
  
  matrix_rsquared <- matrix_rsquared %>%
    rownames_to_column(var = "variable") %>%
    dplyr::filter(!grepl("^PC", variable ) ) %>%
    gather(key = "PC", value = "rsquared", -variable)
  
  

  tech_loc$pct_correct_strand_reads <- NULL
  
  ## add RIN and prep to tech loc for correlation matrix
  # set prep to a numeric variable as only two categories
  
  tech_loc$rin <- support_loc$rin[ match(tech_loc$sample, support_loc$sample)]
  #tech_loc$rin 
  tech_loc$prep <- support_loc$prep[ match(tech_loc$sample, support_loc$sample)]
  tech_loc$prep <- as.numeric(as.factor(tech_loc$prep))
  tech_loc <- tech_loc[complete.cases(tech_loc),]
  tech_loc <- tech_loc %>% dplyr::select(-sample)
  tech_loc <- tech_loc[, apply(tech_loc, MARGIN = 2, sd) != 0 ]
  
  tech_heatmap <- tech_loc %>% cor(method = "spearman") %>% abs()
  
  
  cor_heatmap <- spread(matrix_rsquared, key = "PC", value = "rsquared" ) %>% column_to_rownames(var = "variable")
  
  cor_heatmap <- cor_heatmap[ , paste0("PC", 1:10)]
  

  
  #all_df$big_batch <- all_df$total_counts > mean(all_df$total_counts) + 2 * sd(all_df$total_counts)
  
  # big batch explains very little per-gene variance - library size must be accounted for by limma and TMM.
  var_part_file <- here::here(paste0("data/differential_expression/ALS/", t, "_variancePartition.RData"))
    
  if( rerun | !file.exists(var_part_file) ){
    #require(variancePartition)
    l <- makeCluster(4)
    registerDoParallel(cl)
    # load simulated data:
    # geneExpr: matrix of gene expression values
    # info: information/metadata about each sample
    #form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch)
    form <- varpart_formula #+ gPC1 + gPC2 + gPC3 + gPC4
    

    varPart <- fitExtractVarPartModel( vobjGenes, form, all_df   )
    save(varPart, file = var_part_file)
  }else{
    load(var_part_file)
  }

  # sort variables (i.e. columns) by median fraction
  # of variance explained
  vp <- sortCols( varPart )
  
  vp_names <- c(
                   "library prep" = "prep",
                   "submitting site" = "site",
                   "RIN" = "rin",
                   "% mRNA bases" = "pct_mrna_bases",
                   "median 5' bias" = "median_5prime_bias",
                   "median 3' bias" = "median_3prime_bias",
                   "% chimeric reads" = "pct_chimeras",
                  "% ribosomal bases" = "pct_ribosomal_bases",
                   "% intergenic bases" = "pct_intergenic_bases",
                   "% stranding" = "pct_r1_transcript_strand_reads",
                   "disease" = "disease",
                   "sex" = "sex", 
                  "age at death" = "age",
                  "residuals" = "Residuals"
                   )

  names(vp) <- names(vp_names)[ match(names(vp), vp_names)]

  
  # Figure 1a
  # Bar plot of variance fractions for the first 10 genes
  #plotPercentBars( vp[1:10,] )
  
  return(list(all_df = all_df, vp = vp, cor_heatmap = cor_heatmap, tech_heatmap = tech_heatmap))
}

var_part_raster <- function(obj){
  col <-  c(ggColorHue(ncol(obj)-1), "grey85")
  
  obj %>% 
    as.data.frame() %>% 
    pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
    mutate(variable = factor(variable, levels = names(obj))) %>%
    ggplot(aes(x = variable, y = value, group = variable)) + 
    geom_violin( scale="width", aes(fill = factor(variable))) +
    theme_bw() +
    ggrastr::geom_boxplot_jitter(width=0.07, fill="grey", outlier.colour='black', outlier.size = 0.5) +
    scale_fill_manual(values=col) +
    theme(legend.position="none") +
    theme(plot.title=element_text(hjust=0.5)) +
    theme(axis.text.y = element_text(colour = "black"),
          axis.ticks = element_line(colour = "black")) +
    theme(axis.text.x = element_text(
      angle = 20,
      hjust = 1,
      vjust = 1, 
      colour = "black")) +
    labs(x = "", y = "") +
    scale_y_continuous(labels = scales::percent)
}

Lumbar Spinal Cord

lsc_plots <- diagnosticPlots("Lumbar_Spinal_Cord")
## # A tibble: 7 × 3
##   site                               Control   ALS
##   <chr>                                <int> <int>
## 1 Academic Medical Center                  7    33
## 2 Barrow Neurological Institute            1    11
## 3 Columbia University Medical Center       3    23
## 4 Georgetown University                    2     4
## 5 Johns Hopkins University                 2    18
## 6 University College London               16    19
## 7 University of California San Diego       1    14
## # A tibble: 2 × 3
##   prep                 Control   ALS
##   <chr>                  <int> <int>
## 1 Automated KAPA Total      24    88
## 2 Manual KAPA Total          8    34
p1 <- as.ggplot(pheatmap(lsc_plots$cor_heatmap, main = "Variance of expression PC explained by covariate (R^2)", cluster_cols = FALSE  ))

p2 <- as.ggplot(pheatmap(lsc_plots$tech_heatmap , main = "Absolute Spearman correlation between technical covariates"))

vp_plot <- var_part_raster(lsc_plots$vp)

lsc_pc1_strand_prep <- lsc_plots$all_df %>% ggplot(aes(x = PC1, y = `% stranding`, colour = `submitting site`, shape = `library prep`)) + geom_point() + scale_colour_viridis_d() + theme_bw()

lsc_pc2_disease_prep <- lsc_plots$all_df %>% ggplot(aes(x = PC2, colour = `disease`, y = `% mRNA bases`)) + geom_point() + theme_bw()

lsc_qc_multiplot <- 
  ( p1 + vp_plot + plot_layout(widths = c(1,1.25) ) )  /
  (lsc_pc1_strand_prep + lsc_pc2_disease_prep ) + 
  plot_layout(heights = c(1,0.6)) + 
  plot_annotation(tag_levels = "a") & theme(plot.tag = element_text(face = "bold"))

ggsave(plot = lsc_qc_multiplot, filename = "../../ALS_SC/plots/LSC_QC_multiplot.pdf", width = 12, height = 9)


lsc_qc_multiplot

Cervical Spinal Cord

csc_plots <- diagnosticPlots("Cervical_Spinal_Cord")
## # A tibble: 8 × 3
##   site                               Control   ALS
##   <chr>                                <int> <int>
## 1 Academic Medical Center                  6    36
## 2 Barrow Neurological Institute            1    10
## 3 Columbia University Medical Center       4    26
## 4 Georgetown University                    3     5
## 5 Johns Hopkins University                 3    22
## 6 Mount Sinai                              1    NA
## 7 University College London               16    18
## 8 University of California San Diego       1    22
## # A tibble: 2 × 3
##   prep                 Control   ALS
##   <chr>                  <int> <int>
## 1 Automated KAPA Total      24    92
## 2 Manual KAPA Total         11    47
p1 <- as.ggplot(pheatmap(csc_plots$cor_heatmap, main = "Variance of expression PC explained by covariate (R^2)", cluster_cols = FALSE  ))

p2 <- as.ggplot(pheatmap(csc_plots$tech_heatmap , main = "Absolute Spearman correlation between technical covariates"))

# vp_plot <- plotVarPart( csc_plots$vp ) + labs(title = "Cervical Spinal Cord variance partition")# + plot_spacer() + plot_layout(ncol = 1, nrow = 2)
# 
vp_plot <- var_part_raster(csc_plots$vp)

csc_pc1_strand_prep <- csc_plots$all_df %>% ggplot(aes(x = PC1, y = `% stranding`, colour = `submitting site`, shape = `library prep`)) + geom_point() + scale_colour_viridis_d() + theme_bw()

csc_pc2_disease_prep <- csc_plots$all_df %>% ggplot(aes(x = PC2, colour = `disease`, y = `% mRNA bases`)) + geom_point() + theme_bw()

csc_qc_multiplot <- 
  ( p1 + vp_plot + plot_layout(widths = c(1,1.25) ) )  /
  (csc_pc1_strand_prep + csc_pc2_disease_prep ) + 
  plot_layout(heights = c(1,0.6)) + 
  plot_annotation(tag_levels = "a") & theme(plot.tag = element_text(face = "bold"))

ggsave(plot = csc_qc_multiplot, filename = "../../ALS_SC/plots/csc_QC_multiplot.pdf", width = 12, height = 9)


var_part_raster(csc_plots$vp)

Thoracic Spinal Cord

tsc_plots <- diagnosticPlots("Thoracic_Spinal_Cord")
## # A tibble: 5 × 3
##   site                               Control   ALS
##   <chr>                                <int> <int>
## 1 Barrow Neurological Institute            1    11
## 2 Georgetown University                    1     1
## 3 Johns Hopkins University                 6    18
## 4 Mount Sinai                              1    NA
## 5 University of California San Diego       1    12
## # A tibble: 2 × 3
##   prep                 Control   ALS
##   <chr>                  <int> <int>
## 1 Automated KAPA Total       2     6
## 2 Manual KAPA Total          8    36
p1 <- as.ggplot(pheatmap(tsc_plots$cor_heatmap, main = "Variance of expression PC explained by covariate (R^2)", cluster_cols = FALSE  ))

p2 <- as.ggplot(pheatmap(tsc_plots$tech_heatmap , main = "Absolute Spearman correlation between technical covariates"))

vp_plot <- var_part_raster(tsc_plots$vp)

tsc_pc1_strand_prep <- tsc_plots$all_df %>% ggplot(aes(x = PC1, y = `% stranding`, colour = `submitting site`, shape = `library prep`)) + geom_point() + scale_colour_viridis_d() + theme_bw()

tsc_pc2_disease_prep <- tsc_plots$all_df %>% ggplot(aes(x = PC2, colour = `disease`, y = `% mRNA bases`)) + geom_point() + theme_bw()

tsc_qc_multiplot <- 
  ( p1 + vp_plot + plot_layout(widths = c(1,1.25) ) )  /
  (tsc_pc1_strand_prep + tsc_pc2_disease_prep ) + 
  plot_layout(heights = c(1,0.6)) + 
  plot_annotation(tag_levels = "a") & theme(plot.tag = element_text(face = "bold"))

tsc_qc_multiplot 

ggsave(plot = tsc_qc_multiplot, filename = "../../ALS_SC/plots/tsc_QC_multiplot.pdf", width = 12, height = 9)