Here we show how to replicate our differential expression (DE) results using the data uploaded to Zenodo.

library(zen4R) #remotes::install_github("eblondel/zen4R")
library(tidyverse)
library(limma)
library(edgeR)
library(gt)

# find record
zenodo <- ZenodoManager$new(logger = "INFO")
rec <- zenodo$getRecordById("6385747")
## [zen4R][INFO] ZenodoRequest - Fetching https://zenodo.org/api/records?q=recid:6385747&size=10&page=1&all_versions=1 
## [zen4R][INFO] ZenodoManager - Successfully fetched list of published records - page 1 
## [zen4R][INFO] ZenodoManager - Successfully fetched list of published records! 
## [zen4R][INFO] ZenodoManager - Successfully fetched record for id '6385747'!
files <- rec$listFiles(pretty = TRUE)

## download files
infolder <- "download_zenodo/"
dir.create(infolder, showWarnings = FALSE)
if(!all(files$filename %in% list.files(infolder))){
  rec$downloadFiles(path = infolder)
}
## [zen4R][INFO] ZenodoRecord - Download in sequential mode 
## [zen4R][INFO] ZenodoRecord - Will download 4 files from record '6385747' (doi: '10.5281/zenodo.6385747') - total size: 35.7 MiB 
## [zen4R][INFO] Downloading file 'Cervical_Spinal_Cord.tar.gz' - size: 15.8 MiB
## [zen4R][INFO] Downloading file 'gencode.v30.gene_meta.tsv.gz' - size: 441.9 KiB
## [zen4R][INFO] Downloading file 'Lumbar_Spinal_Cord.tar.gz' - size: 14.2 MiB
## [zen4R][INFO] Downloading file 'Thoracic_Spinal_Cord.tar.gz' - size: 5.4 MiB
## [zen4R][INFO] Files downloaded at '/Volumes/GoogleDrive/My Drive/Work/Mount_Sinai/NYGC_ALS/docs/download_zenodo'.
## [zen4R][INFO] ZenodoRecord - Verifying file integrity... 
## [zen4R][INFO] File 'Cervical_Spinal_Cord.tar.gz': integrity verified (md5sum: ef2509aa14b14a539f30a93e31a8626c)
## [zen4R][INFO] File 'gencode.v30.gene_meta.tsv.gz': integrity verified (md5sum: d9f5386bb6a20a8761b06619feb20cab)
## [zen4R][INFO] File 'Lumbar_Spinal_Cord.tar.gz': integrity verified (md5sum: 4eb7b9be356230c36e31ca987e10f912)
## [zen4R][INFO] File 'Thoracic_Spinal_Cord.tar.gz': integrity verified (md5sum: 8fa6a81bad47eff1de9d411be0c563ca)
## [zen4R][INFO] ZenodoRecord - End of download

Each spinal cord region has a separate tar.gz archive. Today we will work with the Cervical spinal cord region.

We have to first unzip the archive:

# unzip the Cervical Spinal Cord

tissue <- "Cervical_Spinal_Cord"

system( paste0("cd ", infolder,"; tar -xzf ", paste0(tissue, ".tar.gz" ) ))
list.files(file.path(infolder,tissue) )
## [1] "Cervical_Spinal_Cord_gene_counts.tsv.gz" "Cervical_Spinal_Cord_gene_tpm.tsv.gz"    "Cervical_Spinal_Cord_metadata.tsv.gz"

Now we can read in the data

counts_file <- file.path(infolder, tissue, paste0(tissue, "_gene_counts.tsv.gz") )
tpm_file <- file.path(infolder, tissue, paste0(tissue, "_gene_tpm.tsv.gz") )
metadata_file <- file.path(infolder, tissue, paste0(tissue, "_metadata.tsv.gz") )

gene_meta <- read_tsv("download_zenodo/gencode.v30.gene_meta.tsv.gz")

support_loc <- read_tsv(metadata_file) %>% as.data.frame()
tpm_loc <- read_tsv(tpm_file)
counts_loc <- read_tsv(counts_file)

row.names(support_loc) <- support_loc$rna_id

genes_loc <- counts_loc %>% select(-gene_name) %>% 
  column_to_rownames("ensembl_id")
tpm_loc <- tpm_loc %>% select(-gene_name) %>% 
  column_to_rownames("ensembl_id")
support_loc %>% group_by(disease) %>% tally() %>% gt::gt()
disease n
ALS 138
Control 36

We use a liberal cut-off for lowly expressed genes (median TPM > 0) in order to capture genes originating from lowly abundant cell-types, such as motor neurons.

keep.exp <- rowSums(tpm_loc > 0) >= ceiling(0.5 * ncol(genes_loc)) 

genes_loc <- genes_loc[keep.exp, row.names(support_loc)]

print(nrow(genes_loc))
## [1] 25365

We perform variable selection for our linear model.

support_loc$rin_squared <- support_loc$rin^2
support_loc$age_squared <- support_loc$age_rounded^2

cols <- c("sex", "site_id", "disease", "library_prep")
support_loc <- support_loc %>% mutate_at(cols, funs(factor(.)))
support_loc$disease <- factor(support_loc$disease, levels = c("Control", "ALS"))

support_loc$age_squared <- support_loc$age_rounded^2

design_formula <- as.formula("~ disease + sex + rin + rin_squared + age_rounded + age_squared + library_prep + site_id + gPC1 + gPC2 + gPC3 + gPC4 + gPC5 + pct_mrna_bases")

design_loc <- model.matrix(design_formula, data = support_loc)
  
stopifnot( nrow(support_loc) == nrow(design_loc))

We now perform differential expression using Limma Voom

norm_loc <- calcNormFactors(genes_loc, method = "TMM") 

dge <- DGEList(counts=genes_loc, samples=support_loc, norm.factors = norm_loc)  

v <- voom(dge, design_loc)
vfit <- lmFit(v, design_loc)
efit <- eBayes(vfit)

res <- 
    topTable(efit, coef=2, number = Inf, adjust.method = "BH") %>% 
      as.data.frame() %>%
      rownames_to_column(var = "geneid") %>%
      left_join(gene_meta, by = "geneid") %>%
      janitor::clean_names() %>%
      as_tibble() %>%
  select(genename, geneid, everything() ) %>%
  arrange(adj_p_val)

table(res$adj_p_val < 0.05)
## 
## FALSE  TRUE 
## 17787  7602
table(res$adj_p_val < 0.05 & abs(res$log_fc) > 1)
## 
## FALSE  TRUE 
## 25036   353

We have identified thousands of differentially expressed genes. What are the top associations by P-value?

head(res, 10) %>% gt::gt()
genename geneid log_fc ave_expr t p_value adj_p_val b
ASAH1 ENSG00000104763 0.9625602 8.144089 10.938567 4.576936e-21 1.160940e-16 37.28058
GPNMB ENSG00000136235 2.6689097 7.791121 10.416971 1.187241e-19 1.505719e-15 34.08229
CTSS ENSG00000163131 1.6698520 5.915641 10.103238 8.310189e-19 7.026265e-15 32.17182
PLEKHA2 ENSG00000169499 0.6180703 6.131446 9.661946 1.257360e-17 6.641558e-14 29.50611
RAP2B ENSG00000181467 0.7505910 5.472909 9.655350 1.309197e-17 6.641558e-14 29.46948
APOC1 ENSG00000130208 2.0522311 6.133638 9.536245 2.712333e-17 1.146639e-13 28.75712
APOE ENSG00000130203 1.1095442 10.322321 9.436343 4.988064e-17 1.807461e-13 28.15658
HGF ENSG00000019991 1.4004274 3.648112 9.379773 7.037858e-17 2.231441e-13 27.77269
SOAT1 ENSG00000057252 1.1127349 5.613011 9.294720 1.179733e-16 3.324882e-13 27.31602
TYROBP ENSG00000011600 1.2309111 4.673832 9.273163 1.344489e-16 3.410296e-13 27.19099

We can plot the results as a volcano plot.

highlight <- c("CCL18", "CHIT1", "GPNMB", "MNX1", "MOBP")

res %>%
  mutate(case = case_when(
  adj_p_val < 0.05 & log_fc >= 1 ~ "A",
  adj_p_val < 0.05 & log_fc <1 & log_fc >0 ~ "B",
  adj_p_val < 0.05 & log_fc < -1 ~ "C",
  adj_p_val < 0.05 & log_fc > -1 & log_fc < 0 ~ "D",
  TRUE ~ "E"
  )) %>%
  mutate(label = ifelse(genename %in% highlight, genename, "" )) %>%
  ggplot(aes(x = log_fc, y = -log10(p_value))) + 
  geom_point(aes(colour = case), size = 0.8) +
  ggrepel::geom_text_repel(aes(label = label), max.overlaps = Inf ) +
  scale_colour_manual(values = c( "red", "salmon", "navy", "dodgerblue3", "gray" )) +
  theme_bw() + 
  theme(axis.text = element_text(colour = "black")) +
  guides(colour = "none") +
  labs(y = expression(-log[10]~P), x = expression(log[2]~fold~change), title = "Cervical Spinal Cord", subtitle = "ALS vs Control" )