Assess the functions of the differentially expressed genes using the following tools:

Gene Ontology enrichment using clusterProfiler

#load(file = here::here("data/differential_expression/ALS/ALS_Spinal_Cord_de_res.RData"))
#de_res <- de_res_final
#load(file = here::here("data/differential_expression/ALS/Model_8d_limma_res.RData"))
#load(here::here("data/differential_expression/ALS/Model_9e_limma_res.RData")) 
#de_res <- de_res_9e
#names(de_res) <- c("LSC", "CSC", "TSC")
#de_res <- model_8_final
setEnrichment <- function(set1, set2, universe = 20000){
  a = sum(set1 %in% set2)
  c = length(set1) - a
  b = length(set2) - a
  d = universe - length(set2) - c
  contingency_table = matrix(c(a, c, b, d), nrow = 2)
  # one-tailed test for enrichment only
  fisher_results = fisher.test(contingency_table, alternative = "greater")
  # returns data.frame containing the lengths of the two sets, the overlap, the enrichment ratio (odds ratio) and P value
  df <- tibble::tibble( set1_length = length(set1), set2_length = length(set2), overlap = a, ratio = fisher_results$estimate, p.value = fisher_results$p.value)
  return(df)
}


#de_res <- de_res_final
load_results <- function(file){
  load(file)
  de_res <- de_res_final
  tissues <- names(de_res)

  # add tissue name as a column 
  de_res <- 
    purrr::map2(
      .x = de_res,
      .y = names(de_res), ~{
        .x$tissue <- .y
        return(.x)
      })
  return(de_res)
}

make_gene_sets <- function(de_res){
  
  # for each result split by direction and get out lists of genes
  
  # output only significant up and downregulated genes
  gene_sets_sig <- 
    map_df(de_res, ~{.x}) %>%
    mutate(direction = ifelse( log_fc > 0, "up", "down") ) %>%
    mutate( set = paste(tissue, direction, sep = ":")) %>%
    filter(adj_p_val < 0.05) %>%
    split(.$set) %>%
    map("genename") #%>%
  #map( ~{ head(.x, 1000)})
  
  # output top X genes sorted by P-value, even if not sig at FDR < 0.05
  gene_sets_top <-
    map_df(de_res, ~{.x}) %>%
    mutate(direction = ifelse( log_fc > 0, "up", "down") ) %>%
    mutate( set = paste(tissue, direction, sep = ":")) %>%
    split(.$set) %>%
    map("genename") %>%
    map( ~{ head(.x, 250)})
  
  tissues <- names(de_res)
  
  tissue_sets <- expand.grid(tissues, c("up","down"), stringsAsFactors = FALSE)
  names(tissue_sets) <- c("tissue", "direction")
  tissue_sets$set <- paste(tissue_sets$tissue, tissue_sets$direction, sep = ":")
  
  gene_sets_top <- gene_sets_top[ tissue_sets$set]
  
  gene_sets_tstat <-
    map(de_res, ~{
      .x <- 
        .x %>%
        #filter(adj_p_val < 0.05) %>%
        arrange( desc(t) )
      tstat <- .x$t
      names(tstat) <- .x$genename
      return(tstat)
    })
  
  gene_sets_lfc <-
    map(de_res, ~{
      .x <- .x %>%
        filter(p_value < 0.05, !duplicated(genename) ) %>% ## OOOH
        arrange( desc(log_fc) )
      lfc <- .x$log_fc
      names(lfc) <- .x$genename
      return(lfc)
    })
  
  return(
    list(sig = gene_sets_sig, top = gene_sets_top, t = gene_sets_tstat, lfc = gene_sets_lfc)
  ) 
}

ma_plot <- function(de_res, title = NULL, annotate_by = NULL){
   de_res <- 
    mutate(de_res,
           class = case_when(
      adj_p_val >= 0.05 ~ "non_sig",
      adj_p_val < 0.05 & abs(log_fc) < 1 ~ "sig",
      adj_p_val < 0.05 & abs(log_fc) >= 1 ~ "sig - strong"
    )) 
  
  plot <- de_res %>%
    ggplot(aes(x = ave_expr, y = log_fc)) + 
    rasterise(geom_point(aes(colour = class ), size = 0.5), dpi = 300) +
    scale_colour_manual(values = c("gray", "orange", "red")) +
    theme_bw() +
        labs(x = "log(average expression)", y =  expression(log[2]~"(fold change)"), title = title) +
    guides(colour = FALSE) +
    #xlim(-2,4.6) +
    ylim(-4.5,4.5) +
    theme_jh()
  
  if(!is.null(annotate_by)){
    plot <- plot + 
      ggrepel::geom_text_repel(
        fontface = "italic",
        data = filter(de_res, genename %in% annotate_by), 
        aes(x = log_fc, y = -log10(p_value), label = genename), 
        min.segment.length = unit(0, "lines"),
        size = 2.5) +
      geom_point(
        data = filter(de_res, genename %in% annotate_by), size = 0.8, colour = "black"
      ) +
      geom_point(aes(colour = class ),
        data = filter(de_res, genename %in% annotate_by), size = 0.6
      )
    
  }
  return(plot)
  
}

# threshold p-values at 1e-16
# threshold max LFC at 3
volcano_plot <- function(de_res, title = NULL, subtitle = NULL, annotate_by = NULL, type = "ALS"){
    if( type == "ALS"){
    xpos <- 0.5
    ymax <- 17
    xlim <- c(-3,3)
  }else{
    xpos <- 0.025
    ymax <- 9
    xlim <- c(-0.042, 0.042)
  }
  
  de_res <- 
    mutate(de_res,
           sig = case_when(
      adj_p_val >= 0.05 ~ "non_sig",
      adj_p_val < 0.05 & abs(log_fc) < 1 ~ "sig",
      adj_p_val < 0.05 & abs(log_fc) >= 1 ~ "sig - strong"
    )) %>%
    mutate(direction = ifelse(log_fc > 0, "up", "down")) %>%
    mutate(log_fc = case_when(
      log_fc >= 3 ~ Inf,
    log_fc < -3 ~ -Inf,
    TRUE ~ log_fc
    )) %>%
    mutate(p_value = ifelse(p_value < 10^-(ymax-1), 10^-(ymax-1), p_value) ) %>%
    mutate(class = paste(sig, direction))
  

  
  de_tally <- group_by(de_res, sig, direction, class) %>% tally() %>%
    filter(sig != "non_sig") %>%
    mutate( position = ifelse(sig == "sig", xpos, 2) ) %>%
    mutate(position = ifelse( direction == "down", -1 * position, position)) %>%
    mutate(n = formatC(n, format="f", big.mark=",", digits=0))
  
  plot <- de_res %>%
    mutate( p_value = ifelse( p_value < 1e-16, Inf, p_value)) %>% #threshold at 1e16
    ggplot(aes(x = log_fc, y = -log10(p_value))) + 
    #geom_point(aes(colour = class ), size = 0.5) +
    rasterise(geom_point(aes(colour = class ), size = 0.5), dpi = 300) +
    scale_colour_manual(values = c("non_sig up" = "gray", 
                                   "non_sig down" = "gray",
                                   "sig up" = "#EB7F56", 
                                   "sig - strong up" = "#B61927",
                                   "sig down" = "#4F8FC4",
                                   "sig - strong down" = "dodgerblue4"
                                   )) +
    theme_bw() +
    labs(y = expression(-log[10]~P~value), x = expression(log[2]~"(fold change)"), title = title, subtitle = subtitle) +
    guides(colour = FALSE) +
    scale_y_continuous(expand = c(0,0), limits = c(0,ymax)) +
    theme_jh() +
    theme(
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
      panel.border = element_blank(),
      axis.ticks = element_line(colour = "black")
    ) +
    geom_text(fontface = "bold", data = de_tally, aes(x = position, y = ymax - 0.5, label = n, colour = class), size = 2.5 ) +
    scale_x_continuous(limits = xlim)
  
  if(!is.null(annotate_by)){
    plot <- plot + 
      ggrepel::geom_text_repel(
        fontface = "italic",
        data = filter(de_res, genename %in% annotate_by), 
        aes(x = log_fc, y = -log10(p_value), label = genename), 
        min.segment.length = unit(0, "lines"),
        size = 2.5) +
      geom_point(
        data = filter(de_res, genename %in% annotate_by), size = 0.8, colour = "black"
      ) +
      geom_point(aes(colour = class ),
        data = filter(de_res, genename %in% annotate_by), size = 0.6
      )
    
  }
  return(plot)
  
}

## GSEA
named_list_to_df <- function(named_list){
  map2_df( named_list, names(named_list), ~{
    tibble( term_id = .y, gene = .x)
  } )
}



gsea_plot <- function(data){
  if("set" %in% names(data)){
    data$tissue <- data$set
  } 
  data %>%
  #filter(tissue %in% tissues ) %>%
  #mutate(tissue = factor(tissue, levels =) %>%
  mutate(padj = p.adjust(pvalue, method = "bonferroni")) %>%
  mutate(padj_label = case_when(
    padj < 0.0001 ~ "***",
    padj < 0.001 ~ "**",
    padj < 0.05 ~ "*",
    TRUE ~ "."
    )) %>%
  mutate(ID = fct_rev(ID)) %>%
  #mutate(sig = ifelse(padj < 0.05, "*", "")) %>%
  mutate(NES_label = signif(NES, digits = 2)) %>%
  ggplot(aes( x = tissue, y = ID)) + 
  geom_tile(aes(fill = NES)) + 
  scale_fill_distiller(type = "div", palette = "RdBu", limits = c(-4.2,4.2)) + 
  #eom_tile(aes(colour = padj < 0.05), fill = NA, size = 2 ) +
  scale_colour_manual(values = c(NA, "white")) +
  geom_text(aes(label = padj_label)) +
  #geom_text(aes(label = padj_label), nudge_x = 0.25, nudge_y = 0.25, size = 5) +
  scale_y_discrete(expand = c(0,0)) +
  theme_jh() +
  labs(x = "", y = "") +
  scale_x_discrete(expand = c(0,0), position = "top") +
    theme(axis.line = element_blank() )
}

# plot log2 fold changes of marker genes (nominal P < 0.05)
gsea_box_plots <- function(res, markers, measure = "log_fc"){
  if( !is.data.frame(markers)){
    markers <- named_list_to_df(markers)
  }
  if( measure == "log_fc"){
    measure_string <- expression(log[2]~Fold~Change)
  }else{
    measure_string <- measure
  }
  d <- res %>%
    bind_rows() %>%
    inner_join(markers, by = c("genename" = "gene")) %>%
    filter( p_value < 0.05)
  
  d %>%
  mutate( FDR = ifelse(adj_p_val < 0.05, yes = "< 0.05", no = "> 0.05") ) %>%
  #mutate(term_id = str_sub(term_id, start = 1, end = 1)) %>%
  ggplot(aes_string(x = "term_id", y = measure )) + 
  geom_jitter(width = 0.25, size = 1, aes(colour = FDR) ) +
  geom_boxplot(outlier.color = NA, fill = NA) + 
  scale_colour_manual( values = c("> 0.05" = "gray", "< 0.05" = "maroon") ) + 
  theme_jh() +
  geom_hline(yintercept = 0, linetype = 2) +
  labs( x = "", y = measure_string) +
  coord_flip() +
    facet_wrap(~tissue)
}

Load Marker Genes

# Neuroexpresso
load(here::here("data/misc/neuroexpresso_spinal_cord_human.RData"))

nxp_gsea  <- named_list_to_df(nxp)

## Kelley
load(here::here("data/markers/kelley_markers.RData"))

kelley_gsea <- named_list_to_df(kelley)

# Panglaodb
load(here::here("data/markers/PanglaoDB_markers.RData"))

panglao <- named_list_to_df(pg_list) %>%
  filter(term_id %in% c("Endothelial cells", "Ependymal cells", "Microglia", "Neurons", "Motor neurons", "Oligodendrocytes", "Pericytes", "Astrocytes", "Cholinergic neurons"))

## activation sets
load(here::here("data/markers/activation_markers.RData"))

act_sets <- c( "Disease-associated microglia", "Disease-associated astrocytes", "Reactive astrocytes - MCAO", "Reactive astrocytes - LPS", "Plaque-induced genes") 


activation_gsea <- named_list_to_df(activation_sets) #%>%
  #filter( term_id %in% act_sets)
load(here::here("data/markers/Mathys_single_nucleus.RData"))


mathys_conversion_table <- data.frame(short = c("Ast","End", "Mic", "Ex", "Oli", "Per" ),
                               long = c("Astrocytes", "Endothelial", "Microglia", "Neurons", "Oligos", "Pericytes"),
                               mono = c("A","E","M","N","O","P")
  )

mathys_gsea <- named_list_to_df(mathys) %>%
  left_join(mathys_conversion_table, by = c("term_id" = "short") ) %>%
  mutate(term_id = long) %>%
  select(term_id, gene) %>%
  drop_na()

load(here::here("data/markers/Darmanis_single_cell.RData"))
darmanis_gsea <- named_list_to_df(darmanis)

load(here::here("data/markers/blum_spinal_cord.RData"))
blum_gsea <- named_list_to_df(blum_markers)


load( here::here("data/markers/syngo_1.1.RData"))

syngo_gsea <- named_list_to_df(syngo)

MSigDB - hallmark gene sets

hallmark_file <- here::here("data/MSigDB/h.all.v7.2.symbols.gmt")
hallmarks <- read.gmt(hallmark_file) %>%
  mutate(term = gsub("HALLMARK|_", " ", term)) %>%
  mutate(term = gsub("INTERFERON", "IFN", term)) %>%
  mutate(term = gsub("EPITHELIAL MESENCHYMAL", "E-M", term)) %>%
  mutate(term = gsub("UNFOLDED PROTEIN RESPONSE", "UPR", term)) %>%
  mutate(term = str_trim(term))

hallmark_enrichr <- function(gene_set){
  
  hallmark_res <- 
    map_df(gene_set, ~{
      as.data.frame(enricher(.x, TERM2GENE=hallmarks))
    }, .id = "set"
    )
  hallmark_res %>%
    select(set, ID, qvalue) %>%
    pivot_wider(names_from = set, values_from = qvalue)

  return(hallmark_res)
}

# use t stat or log fc
hallmark_gsea <- function(gene_set){

  hallmark_gsea_res <- map_df(gene_set, ~{
    as.data.frame(GSEA(.x, TERM2GENE = hallmarks, minGSSize = 5, pvalueCutoff = 1, eps = 0))
  }, .id = "set")
  
  return(hallmark_gsea_res)
}

#als_gsea <- hallmark_gsea(als_genes$lfc)


# plot hallmarks
hallmark_gsea_plot <- function(df, plot_all = FALSE){
  df$ID <- gsub("^\\s+", "", df$ID)
  
  if( plot_all == TRUE){
    sig_level <- 1
  }else{
    sig_level <- 0.05
  }
  
  hallmark_levels <- 
    df %>%
    filter(p.adjust < sig_level) %>%
    select(set, ID, NES) %>%
    pivot_wider(names_from = set, values_from = NES) %>%
    rowwise() %>%
    mutate(mean_nes = mean( c(C,L), na.rm = TRUE)) %>%
    arrange( mean_nes) %>%
    mutate(ID = factor(ID, levels = ID)) 
    
  if( plot_all == FALSE){
    hallmark_levels <- hallmark_levels %>% tidyr::drop_na()
  }

  to_plot <-
    df %>%
    mutate(padj = p.adjust) %>%
    mutate(padj_label = case_when(
    padj < 0.0001 ~ "***",
    padj < 0.001 ~ "**",
    padj < 0.05 ~ "*",
    TRUE ~ "."
    )) 
  
  
    to_plot %>%
      filter(ID %in% hallmark_levels$ID) %>%
      mutate(ID = factor(ID, levels = hallmark_levels$ID)) %>%
      mutate(set = factor(set) ) %>% #, levels = c("Cervical", "Thoracic", "Lumbar"))) %>%
    ggplot(aes(x = set, y = ID, label = padj_label, fill = NES)) + 
    geom_tile() + 
    geom_text() +
    scale_fill_distiller(na.value = "lightgray", palette = "RdBu", limits = c(-4,4) ) + 
    #geom_text(aes(label = signif(value, digits = 2) ) ) +
    scale_x_discrete(expand = c(0,0), position = "top") +
    scale_y_discrete(expand = c(0,0)) +
    theme_jh() +
    labs(subtitle = "MSigDB Hallmark gene set enrichment analysis", x = "", y = "", fill = "NES") + theme(plot.subtitle = element_text(hjust = 0))

}

ALS Case vs Control

Updated for March 2022 revisions

#load(here::here("data/differential_expression/ALS/ALS_Spinal_Cord_de_res_Mar2022.RData"))
#de_res_final <- de_res_mar2022
#save(de_res_final,file =  here::here("data/differential_expression/ALS/ALS_Spinal_Cord_de_res_Mar2022.RData"))
als_res <- load_results(here::here("data/differential_expression/ALS/ALS_Spinal_Cord_de_res_Mar2022.RData"))

# for main figure plots ignore the thoracic
als_genes <- make_gene_sets(als_res)
pair_genes <- make_gene_sets(list(C = als_res$CSC, L = als_res$LSC))


# Table 2 - DEG discovery rates
map_df( als_res, ~{ 
  sig_all <- filter(.x, adj_p_val < 0.05 ) %>% nrow()
  sig_med <- filter(.x, adj_p_val < 0.05 & abs(log_fc) >= 1) %>% nrow()
  sig_strong <- filter(.x, adj_p_val < 0.05 & abs(log_fc) >= 2) %>% nrow()
  tibble( sig_all, sig_med, sig_strong)
  }, .id = "tissue")
## # A tibble: 3 x 4
##   tissue sig_all sig_med sig_strong
##   <chr>    <int>   <int>      <int>
## 1 CSC       7349     377         29
## 2 TSC        256      65          9
## 3 LSC       4694     233          7

overlapping genes - upregulated

  • strong genes - LFC > 1

  • Extreme - Genes with log2 fold change > 2

# read in lFCs from full DE and from shared donor DE - restricted to 130 shared donors
deg_combo <- read_tsv(here::here("ALS_SC/tables/als_control_deg_combined.tsv") )
shared_donor_df <- read_tsv( here::here("ALS_SC/tables/shared_donor_als_control_deg_combined.tsv"))

strong_genes <- 
  deg_combo %>% 
  filter( (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc > 1 | lumbar.lfc > 1) ) %>% 
  pull(genename)
length(strong_genes)
## [1] 238
if( rerun){
  gencode <- rtracklayer::import("~/GENCODE/gencode.v30.annotation.gtf.gz")
  gene_type_df <- tibble(gene_id = gencode$gene_id, gene = gencode$gene_name, type = gencode$gene_type) %>% distinct()
  write_tsv(gene_type_df,file ="~/GENCODE/gencode.v30.gene.type.tsv.gz" )
}else{
  gene_type_df <- read_tsv("~/GENCODE/gencode.v30.gene.type.tsv.gz")
}

strong_down_genes <- 
  deg_combo %>% 
  filter( (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc < -1 | lumbar.lfc < -1) ) %>% 
  mutate(mean.lfc = (cervical.lfc + lumbar.lfc)/2) %>% 
  arrange((mean.lfc)) %>%
  left_join(gene_type_df, by = c("genename" = "gene"))

strong_up_genes <-
  deg_combo %>%
  filter( (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc > 1 | lumbar.lfc > 1) ) %>% 
  mutate(mean.lfc = (cervical.lfc + lumbar.lfc)/2) %>% 
  arrange((mean.lfc)) %>%
  left_join(gene_type_df, by = c("genename" = "gene"))

strong_down_genes %>% group_by(type == "protein_coding") %>% tally()
## # A tibble: 2 x 2
##   `type == "protein_coding"`     n
##   <lgl>                      <int>
## 1 FALSE                         37
## 2 TRUE                          30
strong_up_genes %>% group_by(type == "protein_coding") %>% tally()
## # A tibble: 2 x 2
##   `type == "protein_coding"`     n
##   <lgl>                      <int>
## 1 FALSE                         38
## 2 TRUE                         208
## scatter plot strongest genes!
strong_up_gene_plot <- function(data, 
                             highlight = c("GPNMB", "CHIT1", "CCL18",   "DNAJC5B", "CHRNA1", "AQP9")  
                             ){

  to_plot <- data %>%
    select(!contains("thoracic")) %>%
  mutate( gene_set = case_when(
    (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc > 1 & lumbar.lfc > 1) ~ "FDR < 0.05 in both; LFC > 1 in both",
    (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc > 1 & lumbar.lfc <= 1) ~ "FDR < 0.05 in both; LFC > 1 in Cervical",
       (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc <= 1 & lumbar.lfc > 1) ~ "FDR < 0.05 in both; LFC > 1 in Lumbar",
    ) ) %>%
    filter(!is.na(gene_set)) %>%
    mutate(genename = ifelse( genename %in% highlight, genename, "")) 
  
  label_df <- to_plot %>% 
    select(gene_set) %>%
    drop_na() %>%
    group_by(gene_set) %>% tally() %>%
    mutate(x = c(2,0.5,2), 
           y = c(4.5,2,0.5))
  
  # linear model
  lm_df <- coef(lm(cervical.lfc ~ lumbar.lfc, data = to_plot))
  lm_string <- paste0("\u03b2 = ", signif(lm_df[2],2) )
  cor_string <- paste0("\u03c1 = ", signif(cor(to_plot$cervical.lfc, to_plot$lumbar.lfc, method ="pearson"),2) )
  
  to_plot %>%
  ggplot(aes(x = lumbar.lfc, y = cervical.lfc)) + 
  labs(x = expression(Lumbar~log[2]~fold~change), y =expression(Cervical~log[2]~fold~change), colour = "") +
  geom_abline(linetype = 2, colour = "gray") +
  theme_jh() +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  theme(legend.position = "bottom") +
  geom_point() +
  geom_point(aes(colour = gene_set), size = 0.8) +  
  geom_text_repel( aes(label = genename), 
                   fontface = "italic", size = 2.5, min.segment.length = 0, force = 10, max.overlaps = 100 ) +
  scale_colour_manual(values = c("firebrick3","salmon", "peachpuff")) +
  geom_text(data = label_df, aes(x = x, y = y, label = n, colour = gene_set), show.legend = FALSE, size = 3 ) +
    xlim(0,3.5) +
    ylim(0,5) +
    annotate(geom = "text", x = 0.5, y = 4.5, label = paste0(cor_string, "\n", lm_string), size = 3.5)

}

## EXTREME GENES
extreme_genes <- map( als_res, ~{ 
  sig_strong <- filter(.x, adj_p_val < 0.05 & abs(log_fc) > 1) %>%  pull(genename)
  } ) 

UpSetR::upset(UpSetR::fromList(extreme_genes))

#intersect(intersect(strong_genes$CSC, strong_genes$TSC), strong_genes$LSC)
# 2 extreme genes in common

extreme_gene_df <- map_df(als_res, ~{filter(.x, genename %in% unlist(extreme_genes) ) })

extreme_gene_table <- 
  deg_combo %>% 
  filter(genename %in% unlist(extreme_genes) & !duplicated(genename) ) %>%
    filter(!is.na(lumbar.lfc) & !is.na(cervical.lfc)) %>%
   mutate(mean_lfc = (cervical.lfc + lumbar.lfc) / 2  ) %>%
  arrange(desc(mean_lfc)) %>%
  mutate(gene = factor(genename, levels = .$genename))

extreme_up_gene_heatmap <- 
  extreme_gene_df %>%
  inner_join(extreme_gene_table, by = "genename") %>%
  filter(gene %in% head(extreme_gene_table, 20)$gene ) %>%
  #filter(mean_lfc > 0) %>%
  mutate(gene = fct_rev(gene)) %>%
  mutate(p_label = ifelse(adj_p_val < 0.05, "*", "") ) %>%
  left_join(tissue_conversion_table, by = c("tissue" = "tissue_short")) %>%
  filter(tissue_mono %in% c("C", "L")) %>%
  ggplot(aes(x = tissue_mono, y = gene)) + 
  geom_tile(aes(fill = log_fc)) +
  geom_text(aes(label = p_label), nudge_y = -0.25) +
  scale_fill_gradient2(low = "white", mid = "#EB7F56", high = "#B61927", limits = c(0,5), na.value = "gray", midpoint = 2   ) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  theme_jh() +
  labs(fill = expression(log[2]~fold~change), x = "", y = "")  +
  theme(axis.text.y = element_text(face = 'italic'),
        legend.key.size = unit(0.5, "cm"),
        panel.background=element_rect(fill="gray") )

extreme_up_gene_heatmap

LYZ, CHIT1 and GPNMB are strongly upregulated in all 3 tissues (lfc > 2), but also CCL18, DNAJC5B and CHRNA1, HTRA4 all very strong.

Downregulated gene sharing

strong_down_gene_plot <- function(data, highlight = c("MOBP", "AC009133.6", "LINC00323", "SNORD3C", "ISL1", "MNX1", "MYO1A", "HBD" )
                             ){
  to_plot <- data %>%
    select(!contains("thoracic")) %>%
  mutate( gene_set = case_when(
    (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc < -1 & lumbar.lfc < -1) ~ "FDR < 0.05 in both; LFC < -1 in both",
    (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc < -1 & lumbar.lfc > -1) ~ "FDR < 0.05 in both; LFC < -1 in Cervical",
       (cervical.fdr < 0.05 & lumbar.fdr < 0.05) & (cervical.lfc > -1 & lumbar.lfc < -1) ~ "FDR < 0.05 in both; LFC < -1 in Lumbar",
    ) ) %>%
    filter(!is.na(gene_set)) %>%
    mutate(gene = ifelse( genename %in% highlight, genename, "")) 
  
  label_df <- to_plot %>% 
    select(gene_set) %>%
    drop_na() %>%
    group_by(gene_set) %>% tally() %>%
    mutate(x = c(-2,-0.75, -1.5), 
           y = c(-1.75,-1.5,-0.5))
  
  # linear model
  lm_df <- coef(lm(cervical.lfc ~ lumbar.lfc, data = to_plot))
  lm_string <- paste0("\u03b2 = ", signif(lm_df[2],2) )
  cor_string <- paste0("\u03c1 = ", signif(cor(to_plot$cervical.lfc, to_plot$lumbar.lfc, method ="pearson"),2) )
  
  to_plot %>%
  ggplot(aes(x = lumbar.lfc, y = cervical.lfc)) + 
  labs(x = expression(Lumbar~log[2]~fold~change), y =expression(Cervical~log[2]~fold~change), colour = "") +
  geom_abline(linetype = 2, colour = "gray") +
  theme_jh() +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  theme(legend.position = "bottom") +
  geom_point() +
  geom_point(aes(colour = gene_set), size = 0.8) +  
  geom_text_repel( aes(label = gene), 
                   fontface = "italic", size = 2.5, min.segment.length = 0, force = 10, max.overlaps = 100 ) +
  scale_colour_manual(values = c("navy", "dodgerblue", "skyblue" ) ) +
  geom_text(data = label_df, aes(x = x, y = y, label = n, colour = gene_set), show.legend = FALSE, size = 3 ) +
    xlim(-2.8,-0.45) +
    ylim(-2.2,-0.45) +
    annotate(geom = "text", x = -2.5, y = -0.6, label = paste0(cor_string, "\n", lm_string), size = 3.5)

}

strong_down_gene_plot(deg_combo)

#strong_down_gene_plot(shared_donor_df)

extreme_down_gene_heatmap <- 
  extreme_gene_df %>%
  inner_join(extreme_gene_table, by = "genename") %>%
  filter(gene %in% tail(extreme_gene_table, 20)$gene ) %>%
  mutate(p_label = ifelse(adj_p_val < 0.05, "*", "") ) %>%
  left_join(tissue_conversion_table, by = c("tissue" = "tissue_short")) %>%
  filter(tissue_mono %in% c("C", "L")) %>%
  ggplot(aes(x = tissue_mono, y = gene)) + 
  geom_tile(aes(fill = log_fc)  ) +
  geom_text(aes(label = p_label), nudge_y = -0.25) +
  scale_fill_gradient2(low = "dodgerblue4", mid = "#4F8FC4", high = "white", limits = c(-3, 0), na.value = "gray", midpoint = -1.5 ) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  theme_jh() +
  labs(fill = expression(log[2]~fold~change), x = "", y = "")  +
  theme(axis.text.y = element_text(face = 'italic'),
        legend.key.size = unit(0.5, "cm"),
        panel.background=element_rect(fill="gray")
  )

extreme_down_gene_heatmap

ggsave(plot = extreme_down_gene_heatmap, file = here::here("ALS_SC/plots/downregulated_gene_heatmap.pdf"), width = 2.5, height = 3.5 )

ggsave(plot = extreme_up_gene_heatmap, file = here::here("ALS_SC/plots/upregulated_gene_heatmap.pdf"), width = 2.5, height = 3.5 )


#    "sig down" = "#4F8FC4",
#                                   "sig - strong down" = "dodgerblue4"
strong_gene_multiplot <- 
  (
  strong_up_gene_plot(deg_combo) + labs(title = "Full cohorts") +
  strong_up_gene_plot(shared_donor_df) + labs(title = "Restricted to shared donors") +
  extreme_up_gene_heatmap + labs(title = "Top 20 upregulated genes") +
    plot_layout(guides = "collect")
  ) / (
  strong_down_gene_plot(deg_combo) + labs(title = "Full cohorts") +
  strong_down_gene_plot(shared_donor_df) + labs(title = "Restricted to shared donors") +
  extreme_down_gene_heatmap + labs(title = "Top 20 downregulated genes") +
    plot_layout(guides = "collect")
  ) +
  plot_layout(nrow = 2, widths = c(2,2,1)) +
  plot_annotation(tag_levels = "a") &
    theme(plot.tag = element_text(face = "bold"), legend.position = "bottom", plot.title = element_text(hjust = 0.5)) 

#strong_gene_multiplot

#strong_gene_multiplot
ggsave(plot = strong_gene_multiplot, filename = here::here("ALS_SC/plots/strong_gene_multiplot.pdf"), width = 9, height = 8 )
# 109 have LFC > 1 in both CSC and LSC and FDR < 0.05 in both
# 238 genes have LFC > 1 in at least 1 of 2 but FDR < 0.05 in both

ggsave(plot = strong_up_gene_plot(deg_combo) + guides(colour = FALSE), file =here::here("ALS_SC/plots/shared_upregulated_genes.pdf"), height = 2.5, width = 2.5)#, device = cairo_pdf )

ggsave(plot = strong_down_gene_plot(deg_combo) + guides(colour = FALSE), file =here::here("ALS_SC/plots/shared_downregulated_genes.pdf"), height = 2.5, width = 2.5)#, device = cairo_pdf )

## combo
both_extremes <- 
strong_up_gene_plot(deg_combo) +
  strong_down_gene_plot(deg_combo) +
  plot_layout(ncol =2) &  
  guides(colour = FALSE)
  
ggsave(plot = both_extremes,  file =here::here("ALS_SC/plots/shared_genes_up_down.pdf"), height = 2.2, width = 4.5, device = cairo_pdf )

Volcano and MA plots

volcano_multiplot <- 
  volcano_plot(als_res$CSC, "Cervical Spinal Cord", 
               subtitle = "35 Controls vs 139 ALS",
               annotate_by = c("LYZ", "GPNMB", "CHIT1", "C3", "MOBP", "MNX1","CCL18")) +
#  volcano_plot(als_res$TSC, "Thoracic Spinal Cord",
 #              subtitle = "10 Controls vs 42 ALS",
 #              annotate_by = c("LYZ", "GPNMB", "CHIT1", "C3", "MOBP", "ISL1", "MNX1")) +
  volcano_plot(als_res$LSC, "Lumbar Spinal Cord",
               subtitle = "32 Controls vs 122 ALS",
               annotate_by = c("LYZ", "GPNMB", "CHIT1", "C3", "MOBP", "ISL1", "MNX1","CCL18")) +
  plot_layout(nrow = 1)
#volcano_multiplot
ggsave(plot = volcano_multiplot, filename = here::here("ALS_SC/plots/als_volcano_multiplot.pdf"), width = 4.5, height = 2.2)

MA plots - annotation broken

if(rerun){
ma_multiplot <- 
  ma_plot(als_res$CSC, title = "Cervical Spinal Cord",annotate_by = c("LYZ", "GPNMB", "CCL18", "C3", "MOBP", "CHIT1")) + 
ma_plot(als_res$LSC, title = "Lumbar Spinal Cord",annotate_by = c("LYZ", "GPNMB", "CCL18", "C3", "MOBP", "CHIT1")) +
  ma_plot(als_res$TSC, title = "Thoracic Spinal Cord",annotate_by = c("LYZ", "GPNMB", "CCL18", "C3", "MOBP", "CHIT1")) +
  plot_layout(ncol = 3)

ma_multiplot

ggsave(plot =ma_multiplot, filename = here::here("ALS_SC/plots/als_ma_multiplot.pdf"), width = 8, height = 3)
}

Pathways

als_hallmark <- hallmark_gsea(pair_genes$lfc)

h_plot <- hallmark_gsea_plot(als_hallmark) + guides(fill = FALSE) + 
  theme(plot.subtitle = element_blank(), plot.background = element_blank() )
h_plot

#als_hallmark <- hallmark_gsea(als_genes$lfc)
 
#hallmark_gsea_plot(als_hallmark, plot_all = TRUE)

ggsave(plot = h_plot, 
       filename = here::here("ALS_SC/plots/als_hallmark_selected.pdf"), width = 2.5, height = 3.5)

Supplementary Figure - all hallmark pathway results

#all_hallmark <- hallmark_gsea(als_genes$lfc)
h_plot_all <- 
  hallmark_gsea_plot(als_hallmark, plot_all = TRUE) + labs(subtitle = "")

h_plot_all

ggsave(plot = h_plot_all, 
       filename = here::here("ALS_SC/plots/als_hallmark_all.png"), width = 4, height = 10)

Neuroexpresso

nxp_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = nxp_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")
  
gsea_plot(nxp_gsea_res) 

gsea_box_plots(als_res, markers = nxp )

Using Neuroexpresso markers for Cholinergic Spinal Cord - all 3 tissues show modest downregulation using GSEA. This is not observed using the Kelley markers - smaller set but not specific to motor neurons.

NEW - Synaptic Gene Ontologies (SynGO)

syngo_gsea_res <- map_df(als_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = syngo_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")


gsea_plot(syngo_gsea_res) 

gsea_box_plots(als_res, markers = syngo )

New - Mathys Markers

Top 100 most specific markers between each cluster and every other

mathys_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = mathys_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")

mathys_gsea_plot <- gsea_plot(mathys_gsea_res)

New - Darmanis Markers

darmanis_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = darmanis_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")

gsea_plot(darmanis_gsea_res)

Kelley Markers

kelley_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = kelley_gsea,pvalueCutoff = 1))
}, .id = "set")
  
kelley_gsea_plot <- gsea_plot(kelley_gsea_res) 

kelley_gsea_plot + labs(title = "Kelley et al")

Supplementary plot

gsea_box_plots(als_res, kelley)

als_res$CSC %>%
  filter(genename %in% kelley$Oligos)
## # A tibble: 99 x 9
##    geneid          log_fc ave_expr     t  p_value adj_p_val     b genename tissue
##    <chr>            <dbl>    <dbl> <dbl>    <dbl>     <dbl> <dbl> <chr>    <chr> 
##  1 ENSG00000170271 -0.463     6.50 -7.56 3.18e-12  7.48e-10  17.3 FAXDC2   CSC   
##  2 ENSG00000137221 -0.498     5.26 -7.33 1.11e-11  2.03e- 9  16.1 TJAP1    CSC   
##  3 ENSG00000072864 -0.516     5.47 -7.32 1.20e-11  2.17e- 9  16.0 NDE1     CSC   
##  4 ENSG00000244274 -0.606     7.91 -6.92 1.10e-10  1.20e- 8  13.8 DBNDD2   CSC   
##  5 ENSG00000092871 -0.391     6.65 -6.72 3.12e-10  2.74e- 8  12.8 RFFL     CSC   
##  6 ENSG00000150510 -0.577     5.75 -6.70 3.51e-10  2.98e- 8  12.7 FAM124A  CSC   
##  7 ENSG00000147488 -0.571     7.58 -6.70 3.49e-10  2.98e- 8  12.7 ST18     CSC   
##  8 ENSG00000154493 -0.529     5.09 -6.65 4.51e-10  3.68e- 8  12.5 C10orf90 CSC   
##  9 ENSG00000169247 -0.588     6.34 -6.62 5.32e-10  4.16e- 8  12.3 SH3TC2   CSC   
## 10 ENSG00000170775 -0.655     7.10 -6.59 6.26e-10  4.79e- 8  12.1 GPR37    CSC   
## # … with 89 more rows

Neuron upregulation enrichment in CSC is driven by GABA genes.

Panglaodb

panglao_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = panglao,pvalueCutoff = 1))
}, .id = "set")
  

gsea_plot(panglao_gsea_res) 

gsea_box_plots(als_res, panglao ) 

New Supplementary Plots - compare marker overlap and NES of each cell type across datasets

## Plot NES of all datasets 

all_gsea <- bind_rows(
panglao_gsea_res %>% mutate(dataset = "PanglaoDB"),
nxp_gsea_res %>% mutate(dataset = "Neuroexpresso") ,
kelley_gsea_res %>% mutate(dataset = "Kelley"),
darmanis_gsea_res %>% mutate(dataset = "Darmanis"),
mathys_gsea_res %>% mutate(dataset = "Mathys")
)

# make ID conversion table
all_ids <- unique(all_gsea$ID)
all_ids <- all_ids[order(all_ids)]

conversion_df <- tibble(
  ID = all_ids,
  cell = c(
    rep("Astrocyte", 3),
    rep("Endothelial", 3),
    NA,
    rep("Microglia", 2),
    rep("Neuron", 3),
    rep("Oligodendrocyte", 4),
    rep("Pericyte", 1)
  )
)


marker_nes_plot <-
all_gsea %>%
  left_join(conversion_df, by = "ID") %>%
  filter(!is.na(cell)) %>%
 #mutate(dataset = fct_rev(dataset) ) %>%
  mutate(padj = pvalue) %>%  ##p.adjust(pvalue, method = "bonferroni")) %>%
  mutate(padj_label = case_when(
    padj < 0.0001 ~ "***",
    padj < 0.001 ~ "**",
    padj < 0.05 ~ "*",
    TRUE ~ "."
    )) %>%
  mutate(NES_label = signif(NES, digits = 2)) %>%
  ggplot(aes(x = dataset, y = NES)) + 
  geom_col(aes(fill = NES)) +
  facet_wrap(~set + cell, nrow = 2) +
  #coord_flip() +
  scale_fill_distiller(type = "div", palette = "RdBu", limits = c(-4.2,4.2)) + scale_colour_manual(values = c(NA, "white")) +
  theme_classic() +
  geom_text(aes(label = padj_label), nudge_y = 0.1 ) +
  geom_hline(yintercept = 0) +
  labs(y = "Normalised Enrichment Score (NES)", x = "") +
  theme(strip.background = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1),
        axis.text = element_text(colour = "black"),
        axis.ticks = element_line(colour = "black"))

ggsave(plot = marker_nes_plot, filename = here::here("ALS_SC/plots/marker_gene_nes_plot.pdf"), width = 8, height =6)

marker_nes_plot

## UpSet plot for all marker gene sets used
all_gsea_genes <- bind_rows(
panglao %>% mutate(dataset = "PanglaoDB"),
nxp_gsea %>% mutate(dataset = "Neuroexpresso") ,
kelley_gsea %>% mutate(dataset = "Kelley"),
darmanis_gsea %>% mutate(dataset = "Darmanis"),
mathys_gsea %>% mutate(dataset = "Mathys"),
activation_gsea %>% mutate(dataset = "Activation")
)



all_gsea_gene_list <- 
  all_gsea_genes %>%
  left_join(conversion_df, by = c("term_id" = "ID") ) %>%
  filter(!is.na(cell)) %>%
  mutate(set = paste(dataset, cell, sep = " ")) %>%
  split(.$set) %>%
  map(~{.x$gene}) 


## Jaccard index between each set
mark_expand <- crossing(Var1 = names(all_gsea_gene_list), Var2 = names(all_gsea_gene_list) )

mark_jac_res <- map2_df(mark_expand$Var1,mark_expand$Var2, ~{
    int <- length(intersect(all_gsea_gene_list[[.x]], all_gsea_gene_list[[.y]]) ) 
    uni <- length(unique(c(all_gsea_gene_list[[.x]], all_gsea_gene_list[[.y]]) ) ) 
    jac <- int / uni
    
    tibble(x = .x, y = .y, int = int, uni = uni, jac = jac)
  })

annotation_df <- 
  tibble(x = names(all_gsea_gene_list) ) %>%
  distinct() %>%
  tidyr::separate(col = x, into = c("Dataset", "Cell-type"), sep = " ", remove = FALSE) %>%
  #select(-Dataset) %>%
  column_to_rownames("x")

ann_colors = list(
    "Cell-type" = viridis::viridis(n = 6),
    "Dataset" = viridis::plasma(n =5)
)
names(ann_colors$`Cell-type`) <- unique(annotation_df$`Cell-type`)
names(ann_colors$Dataset) <- unique(annotation_df$Dataset)

marker_jaccard_plot <-
  mark_jac_res %>%
  select(x,y, jac) %>%
  mutate(jac = ifelse(jac == 1, NA, jac)) %>%
  pivot_wider(names_from = y, values_from = jac) %>%
  column_to_rownames("x") %>%
  pheatmap::pheatmap(annotation_col = annotation_df, 
                     show_colnames = FALSE, show_rownames = FALSE, 
                     color = colorRampPalette(c("white", "navy"))(50)
                     , annotation_colors =ann_colors  )
marker_jaccard_plot

ggsave(plot = marker_jaccard_plot, filename = here::here("ALS_SC/plots/marker_gene_jaccard_plot.pdf"), width = 6, height =4)

# 
# activation_gene_list <-   
#   activation_gsea %>%
#   split(.$term_id) %>%
#   map(~{.x$gene}) 
# 
# act_expand <- crossing(Var1 = names(activation_gene_list), Var2 = names(activation_gene_list) )
# 
# act_jac_res <- map2_df(mark_expand$Var1,mark_expand$Var2, ~{
#     int <- length(intersect(activation_gene_list[[.x]], activation_gene_list[[.y]]) ) 
#     uni <- length(unique(c(activation_gene_list[[.x]], activation_gene_list[[.y]]) ) ) 
#     jac <- int / uni
#     
#     tibble(x = .x, y = .y, int = int, uni = uni, jac = jac)
#   })
# 
# 
# #act_jaccard_plot <-
#   act_jac_res %>%
#   select(x,y, jac) %>%
#   mutate(jac = ifelse(jac == 1, NA, jac)) %>%
#   pivot_wider(names_from = y, values_from = jac) %>%
#   column_to_rownames("x") %>%
#   pheatmap::pheatmap(
#     #annotation_col = annotation_df, 
#                      show_colnames = FALSE, show_rownames = TRUE, 
#                      color = colorRampPalette(c("white", "navy"))(50)
#                      #, annotation_colors =ann_colors  
#                      )
#   
# annotation_df <- tibble(Dataset = act_jac_res$x)
#   

Activation marker sets

compiled from multiple papers

activation_gsea_res <- map_df(pair_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = activation_gsea,pvalueCutoff = 1))
}, .id = "set")



#activation_gsea_res$ID <- factor(activation_gsea_res$ID, levels = rev(act_sets))

activation_gsea_plot <- gsea_plot(activation_gsea_res) 
activation_gsea_plot 

gsea_box_plots(als_res, markers = activation_sets)

Combine tissue and activation together

als_multiplot <- mathys_gsea_plot + 
  activation_gsea_plot + 
  plot_layout(ncol =2) &
  guides(fill = FALSE) &
  theme_jh()

als_multiplot

ggsave(plot = als_multiplot, filename = here::here("ALS_SC/plots/als_cell_activation.pdf"), width = 3, height = 2)

New for Revisions - compare with Oeckl proteomics results

CSF

load(here::here("other_datasets/Oeckl_Proteomics/Oeckl_Proteomics.RData"))
csf <- oeckl$csf
# csf$fdr <- p.adjust(csf$pval, method = "fdr")
# csf_sig <- filter(csf, fdr < 0.05)
csf_sig <- oeckl$csf_sig

csf_sig_genes <- str_trim(unlist(strsplit(csf_sig$gene,split = ",")))
# 35 significant genes

csf_sig <- filter(csf_sig, !grepl(",", csf_sig$gene))
# 30 non-overlapping genes

filter(als_res$CSC, genename %in% csf_sig$gene) %>% filter(adj_p_val < 0.05)
## # A tibble: 17 x 9
##    geneid          log_fc ave_expr     t  p_value adj_p_val      b genename tissue
##    <chr>            <dbl>    <dbl> <dbl>    <dbl>     <dbl>  <dbl> <chr>    <chr> 
##  1 ENSG00000136235  2.76      7.79 10.6  2.85e-20  7.23e-16 35.5   GPNMB    CSC   
##  2 ENSG00000042493  1.38      5.74  8.52 1.22e-14  7.96e-12 22.8   CAPG     CSC   
##  3 ENSG00000133063  4.53      3.84  6.97 8.31e-11  9.94e- 9 14.2   CHIT1    CSC   
##  4 ENSG00000234906  1.34      3.63  5.92 1.92e- 8  7.53e- 7  8.99  APOC2    CSC   
##  5 ENSG00000100979  0.655     6.47  5.92 1.93e- 8  7.57e- 7  8.81  PLTP     CSC   
##  6 ENSG00000196136  1.62      8.72  5.88 2.43e- 8  9.24e- 7  8.58  SERPINA3 CSC   
##  7 ENSG00000064886  1.79      5.92  5.67 6.64e- 8  2.08e- 6  7.67  CHI3L2   CSC   
##  8 ENSG00000095970  0.961     4.92  5.39 2.55e- 7  6.23e- 6  6.39  TREM2    CSC   
##  9 ENSG00000170458  1.06      5.80  4.99 1.61e- 6  2.83e- 5  4.56  CD14     CSC   
## 10 ENSG00000133048  1.52      6.52  4.54 1.11e- 5  1.39e- 4  2.71  CHI3L1   CSC   
## 11 ENSG00000184500  0.594     4.20  4.49 1.37e- 5  1.66e- 4  2.61  PROS1    CSC   
## 12 ENSG00000131095  0.303    15.0   4.28 3.18e- 5  3.34e- 4  1.84  GFAP     CSC   
## 13 ENSG00000144810  0.795     4.05  4.14 5.54e- 5  5.29e- 4  1.30  COL8A1   CSC   
## 14 ENSG00000154277 -0.419     7.00 -3.69 3.10e- 4  2.23e- 3 -0.476 UCHL1    CSC   
## 15 ENSG00000131386  0.352     9.05  3.41 8.24e- 4  5.05e- 3 -1.37  GALNT15  CSC   
## 16 ENSG00000073754  0.841    -3.72  2.53 1.26e- 2  4.45e- 2 -2.88  CD5L     CSC   
## 17 ENSG00000224389  0.401     8.61  2.80 5.75e- 3  2.40e- 2 -3.16  C4B      CSC
filter(als_res$LSC, genename %in% csf_sig$gene) %>% filter(adj_p_val < 0.05)
## # A tibble: 11 x 9
##    geneid          log_fc ave_expr     t     p_value adj_p_val       b genename tissue
##    <chr>            <dbl>    <dbl> <dbl>       <dbl>     <dbl>   <dbl> <chr>    <chr> 
##  1 ENSG00000136235  1.73      7.00  5.61 0.000000104 0.0000103  7.43   GPNMB    LSC   
##  2 ENSG00000234906  1.32      3.31  5.59 0.000000114 0.0000112  7.39   APOC2    LSC   
##  3 ENSG00000042493  0.855     5.34  4.98 0.00000183  0.0000833  4.71   CAPG     LSC   
##  4 ENSG00000095970  0.845     4.62  4.37 0.0000239   0.000567   2.33   TREM2    LSC   
##  5 ENSG00000133063  2.73      2.85  4.02 0.0000939   0.00159    1.22   CHIT1    LSC   
##  6 ENSG00000144810  0.603     4.57  3.75 0.000260    0.00354    0.0936 COL8A1   LSC   
##  7 ENSG00000184500  0.610     4.30  3.54 0.000548    0.00620   -0.560  PROS1    LSC   
##  8 ENSG00000170458  0.796     5.62  3.39 0.000922    0.00915   -1.12   CD14     LSC   
##  9 ENSG00000278637  0.663    -1.89  2.89 0.00445     0.0294    -2.09   HIST1H4A LSC   
## 10 ENSG00000196136  0.925     8.38  2.82 0.00545     0.0341    -2.73   SERPINA3 LSC   
## 11 ENSG00000100979  0.316     6.31  2.66 0.00872     0.0481    -3.16   PLTP     LSC
filter(als_res$TSC, genename %in% csf_sig$gene) %>% filter(adj_p_val < 0.05)
## # A tibble: 1 x 9
##   geneid          log_fc ave_expr     t    p_value adj_p_val     b genename tissue
##   <chr>            <dbl>    <dbl> <dbl>      <dbl>     <dbl> <dbl> <chr>    <chr> 
## 1 ENSG00000136235   3.17     8.28  5.62 0.00000132    0.0256  5.27 GPNMB    TSC
setEnrichment(set1 = filter(als_res$CSC, adj_p_val < 0.05) %>% pull(genename),
              set2 = csf_sig$gene,
              universe = nrow(als_res$CSC) 
              )
## # A tibble: 1 x 5
##   set1_length set2_length overlap ratio p.value
##         <int>       <int>   <int> <dbl>   <dbl>
## 1        7349          30      17  3.21 0.00137

32 proteins (35 genes) differentially expressed in ALS CSF, of which 17 are significant in CSC, 11 in LSC and 1 in TSC. GPNMB is top in all 3.

Spinal Cord proteomics

sp <- oeckl$sp
#sp$fdr <- p.adjust(sp$pval, method = "fdr")


#sp_sig <- filter(sp, fdr < 0.05)
sp_sig <- oeckl$sp_sig
sp_sig_genes <- unlist(strsplit(sp_sig$gene,split = ";"))
# 297 genes
table( !grepl(";", sp_sig$gene) )
## 
## FALSE  TRUE 
##     5   287
# 287 non-overlapping genes
sp_sig <- filter(sp_sig, !grepl(";", sp_sig$gene))

filter(als_res$CSC, genename %in% sp_sig_genes) %>% filter(adj_p_val < 0.05)
## # A tibble: 158 x 9
##    geneid          log_fc ave_expr     t  p_value adj_p_val     b genename tissue
##    <chr>            <dbl>    <dbl> <dbl>    <dbl>     <dbl> <dbl> <chr>    <chr> 
##  1 ENSG00000136235  2.76      7.79 10.6  2.85e-20  7.23e-16  35.5 GPNMB    CSC   
##  2 ENSG00000130203  1.18     10.3  10.0  1.44e-18  9.14e-15  31.6 APOE     CSC   
##  3 ENSG00000204287  1.24      7.97  9.05 5.04e-16  7.99e-13  25.9 HLA-DRA  CSC   
##  4 ENSG00000197746  0.532    10.4   9.00 6.85e-16  9.14e-13  25.6 PSAP     CSC   
##  5 ENSG00000145703  1.76      4.42  8.80 2.25e-15  2.38e-12  24.4 IQGAP2   CSC   
##  6 ENSG00000131981  1.39      5.22  8.76 2.88e-15  2.71e-12  24.2 LGALS3   CSC   
##  7 ENSG00000042493  1.38      5.74  8.52 1.22e-14  7.96e-12  22.8 CAPG     CSC   
##  8 ENSG00000081237  0.977     6.05  8.45 1.89e-14  1.09e-11  22.3 PTPRC    CSC   
##  9 ENSG00000120594  0.784     7.62  8.11 1.31e-13  5.34e-11  20.4 PLXDC2   CSC   
## 10 ENSG00000177119  0.844     6.52  8.00 2.59e-13  9.26e-11  19.8 ANO6     CSC   
## # … with 148 more rows
# 158 overlap
#matrix(c(4455, 112, 287, nrow(als_res$CSC) ), nrow = 2 )

setEnrichment(set1 = filter(als_res$CSC, adj_p_val < 0.05) %>% pull(genename),
              set2 = sp_sig$gene,
              universe = nrow(als_res$CSC) 
              )
## # A tibble: 1 x 5
##   set1_length set2_length overlap ratio  p.value
##         <int>       <int>   <int> <dbl>    <dbl>
## 1        7349         287     153  2.84 3.27e-18
#filter(als_res$LSC, genename %in% sp_sig_genes) %>% filter(adj_p_val < 0.05)
#filter(als_res$TSC, genename %in% sp_sig_genes) %>% filter(adj_p_val < 0.05)


csc_prot <- 
  filter(als_res$CSC, genename %in% c(sp_sig_genes, csf_sig_genes) ) %>% 
  filter(adj_p_val < 0.05) %>%
  left_join(sp_sig, by = c("genename" = "gene") ) %>%
  rename(sp_lfc = lfc, sp_p = pval) %>%
  left_join(csf_sig, by = c("genename" = "gene")) %>%
  rename(csf_lfc = lfc, csf_p = pval) %>%
  mutate(darmanis = darmanis_gsea$term_id[match(genename, darmanis_gsea$gene)]) %>%
  mutate(mathys = mathys_gsea$term_id[match(genename, mathys_gsea$gene)]) %>%
  mutate(kelley = kelley_gsea$term_id[match(genename, kelley_gsea$gene)]) %>%
  mutate(panglao = panglao$term_id[match(genename, panglao$gene)]) %>%
   mutate(nxp = nxp_gsea$term_id[match(genename, nxp_gsea$gene)])

292 peptides (297 unique genes) are differentially expressed between ALS and control in spinal cord tissue, of which 158, 107 and 19 are DEGs in spinal cord RNA-seq

of the 158 CSC overlapping genes, 16 (91%) go in the opposite direction

compare fold changes

#library(ggrepel)
#library(patchwork)

compare_rna_prot_plot <- function(prot, rna, mytitle, highlight = c("CHIT1", "GPNMB", "IQGAP2", "LYZ", "MOBP", "PEX5L", "APOE", "SERPINA3", "UCHL1"), cross_zero = TRUE){
  to_plot <- 
  inner_join(prot, rna, by = c("gene"= "genename"), suffix = c(".prot", ".gene") ) %>%
  filter(adj_p_val < 0.05) 
  print(table(rna = to_plot$lfc > 0, prot = to_plot$log_fc > 0))
  p <- to_plot %>%
  ggplot(aes(y = lfc, x = log_fc, label = gene)) +
  geom_point(size = 1, colour = "black") +
  geom_point(size = 0.8, colour = "purple") +
  geom_text_repel(data = filter(to_plot, gene %in% highlight),max.overlaps = 10, size = 2.5, fontface = "italic", min.segment.length = unit(x = 0, units = "lines")) + 
    labs(y = expression(~log[2]~"(Protein)"), x = expression(~log[2]~"(mRNA)"), title = mytitle) +
  geom_abline(slope = 1, intercept = 0, linetype = 3) +
  theme_jh() +
  theme(panel.border = element_blank(), 
        axis.ticks = element_line(colour = "black"),
        plot.title = element_text(hjust = 0, size = 8)
        ) +
    scale_x_continuous(n.breaks = 3, breaks = waiver()) +
    scale_y_continuous(n.breaks = 3, breaks = waiver())
  
  if(cross_zero == TRUE){
    p <- p +  geom_hline(yintercept = 0, linetype = 3) + geom_vline(xintercept = 0, linetype = 3)
  }
  return(p)
}

## main figure!
#library(ggrepel)
cervical_compare_plot <- 
  compare_rna_prot_plot(prot = sp_sig, rna = als_res$CSC, "Spinal Cord" ) +
compare_rna_prot_plot(prot = csf_sig, rna = als_res$CSC, "Cerebrospinal Fluid", cross_zero = FALSE) + 
  plot_layout(ncol = 2)
##        prot
## rna     FALSE TRUE
##   FALSE    40   13
##   TRUE      3   97
##       prot
## rna    FALSE TRUE
##   TRUE     1   16
cervical_compare_plot 

 ggsave(cervical_compare_plot, filename = here::here("ALS_SC/plots/cervical_rna_protein_compare.pdf"), width = 4.5, height = 2.2)



## supplement
lumbar_compare_plot <-
  compare_rna_prot_plot(prot = sp_sig, rna = als_res$LSC, "Spinal Cord") +
compare_rna_prot_plot(prot = csf_sig, rna = als_res$LSC, "Cerebrospinal Fluid") + 
  plot_layout(ncol = 2)
##        prot
## rna     FALSE TRUE
##   FALSE    28    8
##   TRUE      2   64
##        prot
## rna     TRUE
##   FALSE    1
##   TRUE    10
ggsave(lumbar_compare_plot, filename = here::here("ALS_SC/plots/lumbar_rna_protein_compare.pdf"), width = 6, height = 3)
# put Cervical in main figure

# put Lumbar in supplement

# make table with fold changes of protein and RNA

rna_prot_df <- bind_rows(
  csf_sig %>% 
    mutate(cohort = "CSF_protein"),
  sp_sig %>% mutate(cohort = "SC_protein"),
  als_res$CSC %>% filter(adj_p_val < 0.05) %>% 
    select(gene = genename, lfc = log_fc, pval = p_value) %>%
    mutate(cohort = "Cervical_mRNA"),
  als_res$LSC %>% filter(adj_p_val < 0.05) %>% 
    select(gene = genename, lfc = log_fc, pval = p_value) %>% 
    mutate(cohort = "Lumbar_mRNA")
) %>%
  filter( gene %in% .$gene[duplicated(.$gene)] & gene %in% c(csf_sig$gene, sp_sig$gene) ) %>%
  filter( gene != "SOD2") # weird duplicates


rna_prot_df_wide <- rna_prot_df  %>% 
  pivot_wider(names_from = cohort, 
              values_from = c(lfc, pval), 
              names_glue =  "{cohort}.{.value}", names_sort = TRUE, 
              ) %>%
  select(gene, starts_with("SC"), starts_with("CSF"), starts_with("Cervical"), starts_with("Lumbar") ) %>%
  arrange((SC_protein.pval))

#View(rna_prot_df_wide)


write_tsv(rna_prot_df_wide, file = here::here("ALS_SC/tables/oeckl_protein_comparison.tsv"))

New - run GSEA with markers using Oeckl Spinal Cord proteomics

rank_oeckl <- function(data, p = 0.05){
  genes <- data %>% 
    filter( !grepl(";|,", gene), !is.na(lfc), pval < p ) %>%
    filter(!duplicated(gene) ) %>%
    select(gene,lfc) %>%
    distinct() %>%
    arrange(desc(lfc)) 
  ranked <- setNames(genes$lfc, genes$gene)
  
}

nxp_compare <- 
nxp_gsea %>%
  left_join( inner_join(sp, csf, by = "gene",suffix = c(".sp", ".csf") ),
             by = "gene") 

nxp_compare %>%
  filter(pval.sp < 0.05) %>%
  ggplot(aes(x = term_id, y = lfc.sp)) + geom_jitter(aes(colour = pval.sp < 0.05), width = 0.25) +
  geom_text_repel( aes(label = gene))

nxp_compare %>%
    filter(pval.csf < 0.05) %>%
  ggplot(aes(x = term_id, y = lfc.csf)) + geom_jitter(aes(colour = pval.csf < 0.05), width = 0.25) +
  geom_text_repel(data = filter(nxp_compare, pval.csf < 0.05), aes(label = gene))

sp_ranked <- rank_oeckl(sp, p = 1) # 1201 genes
csf_ranked <- rank_oeckl(csf, p = 1) #392 genes

oeckl_nxp_gsea_res <- map_df(list("CSF" = csf_ranked, "Spinal Cord" = sp_ranked ), ~{
  as.data.frame(GSEA(.x, TERM2GENE = nxp_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")
  
gsea_plot(oeckl_nxp_gsea_res) 

oeckl_kelley_gsea_res <- map_df(list("CSF" = csf_ranked, "Spinal Cord" = sp_ranked ), ~{
  as.data.frame(GSEA(.x, TERM2GENE = kelley_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")
  
gsea_plot(oeckl_kelley_gsea_res) 

oeckl_activation_gsea_res <- map_df(list("CSF" = csf_ranked, "Spinal Cord" = sp_ranked ), ~{
  as.data.frame(GSEA(.x, TERM2GENE = activation_gsea,pvalueCutoff = 1,eps = 0))
}, .id = "set")

gsea_plot(oeckl_activation_gsea_res) 

Disease Duration ————-

dur_res <- load_results(here::here("data/differential_expression/ALS/Model_8_limma_res_Mar2022.RData"))
dur_genes <- make_gene_sets(dur_res)
dur_genes$lfc$TSC <- NULL

dur_anno <-  c("CHIT1", "C3", "GPNMB", "CHRNA1", "PON3", "DEFA1","MNX1")
duration_volcano <- 
  volcano_plot(dur_res$CSC, title = "Cervical Spinal Cord", subtitle = "139 ALS",  
               annotate_by = dur_anno, type = "duration") +
#  volcano_plot(dur_res$TSC, title = "Thoracic Spinal Cord", subtitle = "42 ALS",  
#               annotate_by = c("CHIT1", "C3", "LYZ", "GPNMB", "MOBP", "GFAP"),type = "duration") + 
  volcano_plot(dur_res$LSC, title = "Lumbar Spinal Cord", subtitle = "122 ALS", 
               annotate_by = dur_anno,type = "duration") 

#duration_volcano 

ggsave(plot = duration_volcano, filename = here::here("ALS_SC/plots/duration_volcano.pdf"), width = 5, height = 2.2)

Cell types and activation

dur_activation_gsea_res <- 
  map_df(dur_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = activation_gsea,pvalueCutoff = 1, minGSSize = 5, eps = 0))
}, .id = "set")
  
gsea_plot(dur_activation_gsea_res) 

gsea_box_plots(dur_res, markers = activation_sets)

dur_kelley_gsea_res <- map_df(dur_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = kelley_gsea,pvalueCutoff = 1, minGSSize = 5, eps = 0))
}, .id = "set")
  
gsea_box_plots(dur_res, kelley)

dur_panglao_gsea_res <- map_df(dur_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = panglao,pvalueCutoff = 1, minGSSize = 5, eps = 0))
}, .id = "set")
  
gsea_plot(dur_panglao_gsea_res) 

gsea_box_plots(dur_res, panglao)

dur_nxp_gsea_res <- map_df(dur_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = nxp_gsea,pvalueCutoff = 1, minGSSize = 5, eps = 0))
}, .id = "set")
  
gsea_plot(dur_nxp_gsea_res) 

dur_mathys_gsea_res <- map_df(dur_genes$lfc, ~{
  as.data.frame(GSEA(.x, TERM2GENE = mathys_gsea,pvalueCutoff = 1, minGSSize = 5, eps = 0))
}, .id = "set")
  
dur_mathys_gsea_plot <- gsea_plot(dur_mathys_gsea_res)


dur_activation_gsea_plot <- gsea_plot(dur_activation_gsea_res) 

dur_gsea_multiplot <- 
  dur_mathys_gsea_plot +
  dur_activation_gsea_plot +
  plot_layout(ncol = 2, guides = "collect") #&
 # guides(fill = "none")

dur_gsea_multiplot

ggsave(plot = dur_gsea_multiplot, filename = here::here("ALS_SC/plots/dur_cell_activation.pdf"), width = 4, height = 2)

Duration Deconvolution

deconv_res <- read_tsv(here::here("data/deconvolution/Mathys_MuSiC_residual_results.tsv"))

conversion_table <- data.frame(short = c("Ast","End", "Mic", "Ex", "Oli", "Per" ),
                               long = c("Astrocytes", "Endothelial", "Microglia", "Neurons", "Oligos", "Pericytes")
  )

mathys_music_df <- deconv_res %>%
  inner_join(conversion_table, by = c("cell" = "short") ) %>%
  select(-cell) %>%
  rename(cell = long) %>%
  select(sample, disease, duration, cell, tissue, deconv, resid, p_nom, p_resid)


duration_deconv_cor_df <- 
  mathys_music_df %>%
  filter( tissue == "Cervical_Spinal_Cord", disease == "ALS") %>%
  mutate(duration = as.numeric(duration)) %>%
  split(.$cell) %>%
  map_df( ~{ cor.test(.x$deconv, .x$duration, method = "spearman" ) %>%
      broom::tidy() }, .id = "cell") %>%
  mutate(padj = p.adjust(p.value, method = "bonferroni")) %>%
  mutate(cor = paste0("R = ",signif(estimate, digits =  2)) ) %>%
  mutate(padj_label = case_when(
    padj < 0.0001 ~ "***",
    padj < 0.001 ~ "**",
    padj < 0.05 ~ "*",
    TRUE ~ "."
    )) 

duration_deconv_plot <- 
mathys_music_df %>%
  filter( tissue == "Cervical_Spinal_Cord", disease == "ALS") %>%
  mutate(duration = as.numeric(duration)) %>%
  filter(!is.na(duration)) %>%
  left_join(duration_deconv_cor_df, by = "cell" ) %>%
  mutate(cell = paste0(cell, "\n", padj_label)) %>%
 ggplot(aes(x = duration, y = deconv)) +
    rasterise(
      geom_point(size = 0.3, colour = "blue" ),
      dpi = 300
      )+
  geom_smooth(method = "lm", colour = "black") +
    #geom_boxplot(fill = NA,notch = FALSE, na.rm = TRUE, outlier.color = NA) +
    facet_wrap( ~ cell, scales = "free_y",  nrow = 2 ) +
    #scale_colour_manual(values = c("#4F8FC4", "#B61927")) +
    labs( x = "", y = "" ) +
    guides(fill = FALSE, colour = FALSE) +
    theme_jh() +
    theme(
      plot.title = element_blank(),
      strip.background = element_blank(),
      #strip.switch.pad.grid = unit(0,units ="pt"),
      #panel.spacing.x = unit(0,units = "pt"), 
      strip.placement = "outside",
      panel.border = element_blank(),
      strip.text.x = element_text( face = 'plain', hjust = 0.5, vjust = 1, margin = margin(t =5, b = 0, unit = "pt")),
      strip.text.y.left = element_text(angle = 0)#,
      #plot.margin = margin()
    ) +
    labs(y = "") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1) ) +
  ggpubr::stat_cor(aes(label = ..r.label..), method = "spearman", label.y.npc = 1, label.x.npc = 0.33, size = 3)

  duration_deconv_plot

ggsave( plot = duration_deconv_plot,filename = here::here("ALS_SC/plots/dur_deconvolution.pdf"), width = 4.5, height = 3.5 )