Here we show how to replicate our differential expression (DE) results using the data uploaded to Zenodo.
library(zen4R) #remotes::install_github("eblondel/zen4R")
library(tidyverse)
library(limma)
library(edgeR)
library(gt)
# find record
zenodo <- ZenodoManager$new(logger = "INFO")
rec <- zenodo$getRecordById("6385747")
## [zen4R][INFO] ZenodoRequest - Fetching https://zenodo.org/api/records?q=recid:6385747&size=10&page=1&all_versions=1
## [zen4R][INFO] ZenodoManager - Successfully fetched list of published records - page 1
## [zen4R][INFO] ZenodoManager - Successfully fetched list of published records!
## [zen4R][INFO] ZenodoManager - Successfully fetched record for id '6385747'!
files <- rec$listFiles(pretty = TRUE)
## download files
infolder <- "download_zenodo/"
dir.create(infolder, showWarnings = FALSE)
if(!all(files$filename %in% list.files(infolder))){
rec$downloadFiles(path = infolder)
}
## [zen4R][INFO] ZenodoRecord - Download in sequential mode
## [zen4R][INFO] ZenodoRecord - Will download 4 files from record '6385747' (doi: '10.5281/zenodo.6385747') - total size: 35.7 MiB
## [zen4R][INFO] Downloading file 'Cervical_Spinal_Cord.tar.gz' - size: 15.8 MiB
## [zen4R][INFO] Downloading file 'gencode.v30.gene_meta.tsv.gz' - size: 441.9 KiB
## [zen4R][INFO] Downloading file 'Lumbar_Spinal_Cord.tar.gz' - size: 14.2 MiB
## [zen4R][INFO] Downloading file 'Thoracic_Spinal_Cord.tar.gz' - size: 5.4 MiB
## [zen4R][INFO] Files downloaded at '/Volumes/GoogleDrive/My Drive/Work/Mount_Sinai/NYGC_ALS/docs/download_zenodo'.
## [zen4R][INFO] ZenodoRecord - Verifying file integrity...
## [zen4R][INFO] File 'Cervical_Spinal_Cord.tar.gz': integrity verified (md5sum: ef2509aa14b14a539f30a93e31a8626c)
## [zen4R][INFO] File 'gencode.v30.gene_meta.tsv.gz': integrity verified (md5sum: d9f5386bb6a20a8761b06619feb20cab)
## [zen4R][INFO] File 'Lumbar_Spinal_Cord.tar.gz': integrity verified (md5sum: 4eb7b9be356230c36e31ca987e10f912)
## [zen4R][INFO] File 'Thoracic_Spinal_Cord.tar.gz': integrity verified (md5sum: 8fa6a81bad47eff1de9d411be0c563ca)
## [zen4R][INFO] ZenodoRecord - End of download
Each spinal cord region has a separate tar.gz archive. Today we will work with the Cervical spinal cord region.
We have to first unzip the archive:
# unzip the Cervical Spinal Cord
tissue <- "Cervical_Spinal_Cord"
system( paste0("cd ", infolder,"; tar -xzf ", paste0(tissue, ".tar.gz" ) ))
list.files(file.path(infolder,tissue) )
## [1] "Cervical_Spinal_Cord_gene_counts.tsv.gz" "Cervical_Spinal_Cord_gene_tpm.tsv.gz" "Cervical_Spinal_Cord_metadata.tsv.gz"
Now we can read in the data
counts_file <- file.path(infolder, tissue, paste0(tissue, "_gene_counts.tsv.gz") )
tpm_file <- file.path(infolder, tissue, paste0(tissue, "_gene_tpm.tsv.gz") )
metadata_file <- file.path(infolder, tissue, paste0(tissue, "_metadata.tsv.gz") )
gene_meta <- read_tsv("download_zenodo/gencode.v30.gene_meta.tsv.gz")
support_loc <- read_tsv(metadata_file) %>% as.data.frame()
tpm_loc <- read_tsv(tpm_file)
counts_loc <- read_tsv(counts_file)
row.names(support_loc) <- support_loc$rna_id
genes_loc <- counts_loc %>% select(-gene_name) %>%
column_to_rownames("ensembl_id")
tpm_loc <- tpm_loc %>% select(-gene_name) %>%
column_to_rownames("ensembl_id")
support_loc %>% group_by(disease) %>% tally() %>% gt::gt()
disease | n |
---|---|
ALS | 138 |
Control | 36 |
We use a liberal cut-off for lowly expressed genes (median TPM > 0) in order to capture genes originating from lowly abundant cell-types, such as motor neurons.
keep.exp <- rowSums(tpm_loc > 0) >= ceiling(0.5 * ncol(genes_loc))
genes_loc <- genes_loc[keep.exp, row.names(support_loc)]
print(nrow(genes_loc))
## [1] 25365
We perform variable selection for our linear model.
support_loc$rin_squared <- support_loc$rin^2
support_loc$age_squared <- support_loc$age_rounded^2
cols <- c("sex", "site_id", "disease", "library_prep")
support_loc <- support_loc %>% mutate_at(cols, funs(factor(.)))
support_loc$disease <- factor(support_loc$disease, levels = c("Control", "ALS"))
support_loc$age_squared <- support_loc$age_rounded^2
design_formula <- as.formula("~ disease + sex + rin + rin_squared + age_rounded + age_squared + library_prep + site_id + gPC1 + gPC2 + gPC3 + gPC4 + gPC5 + pct_mrna_bases")
design_loc <- model.matrix(design_formula, data = support_loc)
stopifnot( nrow(support_loc) == nrow(design_loc))
We now perform differential expression using Limma Voom
norm_loc <- calcNormFactors(genes_loc, method = "TMM")
dge <- DGEList(counts=genes_loc, samples=support_loc, norm.factors = norm_loc)
v <- voom(dge, design_loc)
vfit <- lmFit(v, design_loc)
efit <- eBayes(vfit)
res <-
topTable(efit, coef=2, number = Inf, adjust.method = "BH") %>%
as.data.frame() %>%
rownames_to_column(var = "geneid") %>%
left_join(gene_meta, by = "geneid") %>%
janitor::clean_names() %>%
as_tibble() %>%
select(genename, geneid, everything() ) %>%
arrange(adj_p_val)
table(res$adj_p_val < 0.05)
##
## FALSE TRUE
## 17787 7602
table(res$adj_p_val < 0.05 & abs(res$log_fc) > 1)
##
## FALSE TRUE
## 25036 353
We have identified thousands of differentially expressed genes. What are the top associations by P-value?
head(res, 10) %>% gt::gt()
genename | geneid | log_fc | ave_expr | t | p_value | adj_p_val | b |
---|---|---|---|---|---|---|---|
ASAH1 | ENSG00000104763 | 0.9625602 | 8.144089 | 10.938567 | 4.576936e-21 | 1.160940e-16 | 37.28058 |
GPNMB | ENSG00000136235 | 2.6689097 | 7.791121 | 10.416971 | 1.187241e-19 | 1.505719e-15 | 34.08229 |
CTSS | ENSG00000163131 | 1.6698520 | 5.915641 | 10.103238 | 8.310189e-19 | 7.026265e-15 | 32.17182 |
PLEKHA2 | ENSG00000169499 | 0.6180703 | 6.131446 | 9.661946 | 1.257360e-17 | 6.641558e-14 | 29.50611 |
RAP2B | ENSG00000181467 | 0.7505910 | 5.472909 | 9.655350 | 1.309197e-17 | 6.641558e-14 | 29.46948 |
APOC1 | ENSG00000130208 | 2.0522311 | 6.133638 | 9.536245 | 2.712333e-17 | 1.146639e-13 | 28.75712 |
APOE | ENSG00000130203 | 1.1095442 | 10.322321 | 9.436343 | 4.988064e-17 | 1.807461e-13 | 28.15658 |
HGF | ENSG00000019991 | 1.4004274 | 3.648112 | 9.379773 | 7.037858e-17 | 2.231441e-13 | 27.77269 |
SOAT1 | ENSG00000057252 | 1.1127349 | 5.613011 | 9.294720 | 1.179733e-16 | 3.324882e-13 | 27.31602 |
TYROBP | ENSG00000011600 | 1.2309111 | 4.673832 | 9.273163 | 1.344489e-16 | 3.410296e-13 | 27.19099 |
We can plot the results as a volcano plot.
highlight <- c("CCL18", "CHIT1", "GPNMB", "MNX1", "MOBP")
res %>%
mutate(case = case_when(
adj_p_val < 0.05 & log_fc >= 1 ~ "A",
adj_p_val < 0.05 & log_fc <1 & log_fc >0 ~ "B",
adj_p_val < 0.05 & log_fc < -1 ~ "C",
adj_p_val < 0.05 & log_fc > -1 & log_fc < 0 ~ "D",
TRUE ~ "E"
)) %>%
mutate(label = ifelse(genename %in% highlight, genename, "" )) %>%
ggplot(aes(x = log_fc, y = -log10(p_value))) +
geom_point(aes(colour = case), size = 0.8) +
ggrepel::geom_text_repel(aes(label = label), max.overlaps = Inf ) +
scale_colour_manual(values = c( "red", "salmon", "navy", "dodgerblue3", "gray" )) +
theme_bw() +
theme(axis.text = element_text(colour = "black")) +
guides(colour = "none") +
labs(y = expression(-log[10]~P), x = expression(log[2]~fold~change), title = "Cervical Spinal Cord", subtitle = "ALS vs Control" )