正文

前言

前面我们演示了生信工程师Roman Hillje分享的scRNA-seq workflow的部分（你还缺scRNA-seq的workflow吗？）, 今天继续后面的学习。

Roman Hillje的github主页：
https://github.com/romanhaa
workflow：
https://romanhaa.github.io/projects/scrnaseq_workflow/

12 Perform analysis for Cerebro

为了进一步研究Cerebro https://github.com/romanhaa/Cerebro 中的数据集，我们现在添加了更多的meta data，计算标记基因(marker genes)，标记基因的通路富集，并导出数据集。这个过程在很大程度上遵循了cerebroApp网站 https://romanhaa.github.io/cerebroApp/articles/cerebroApp_workflow_Seurat.html 上描述的工作流程。

12.1 Add meta data

load("labels_assinged_seurat.Rdata")
load("thresholds.RData")
seurat@misc$experiment <- list(
  experiment_name = 'bone_marrow_donor_cells',
  organism = 'hg',
  date_of_analysis = Sys.Date()
)

seurat@misc$parameters <- list(
  gene_nomenclature = 'gene_name',
  discard_genes_expressed_in_fewer_cells_than = 6,
  keep_mitochondrial_genes = TRUE,
  variables_to_regress_out = 'nCount',
  number_PCs = 15,
  cluster_resolution = 0.8
)

seurat@misc$parameters$filtering <- list(
  UMI_min = thresholds_nCount[1],
  UMI_max = thresholds_nCount[2],
  genes_min = thresholds_nFeature[1],
  genes_max = thresholds_nFeature[2],
  pct_mito_min = thresholds_percent_MT[1],
  pct_mito_max = thresholds_percent_MT[2]
)

#github上的，自己科学上网下吧
if(!require("cerebroApp")){
  BiocManager::install('romanhaa/cerebroApp')
}
library(cerebroApp)

seurat@misc$technical_info$cerebroApp_version <- utils::packageVersion('cerebroApp')
seurat@misc$technical_info$Seurat <- utils::packageVersion('Seurat')
seurat@misc$technical_info <- list(
  'R' = capture.output(devtools::session_info())
)

12.2 Calculate relationship trees

这里，我们为所有三个分组变量:样本、聚类和细胞类型计算关系树——就像我们之前为聚类所做的那样。它们中的每一个都被添加到Seurat对象的@misc槽中，这样它就可以被提取到Cerebro中。

#Samples:

Idents(seurat) <- "sample"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$sample <- seurat@tools$BuildClusterTree
#Clusters:

Idents(seurat) <- "seurat_clusters"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$seurat_clusters <- seurat@tools$BuildClusterTree
#Cell types:

Idents(seurat) <- "cell_type_singler_blueprintencode_main"
seurat <- BuildClusterTree(
  seurat,
  dims = 1:15,
  reorder = FALSE,
  reorder.numeric = FALSE
)
seurat@misc$trees$cell_type_singler_blueprintencode_main <- seurat@tools$BuildClusterTree

12.3 cerebroApp steps

在这里，我们计算线粒体和核糖体转录本的百分比，得到最多表达的基因和标记基因，使用富集过的通路进行测试，并进行基因集富集分析GSEA。因为cereBroAPP作者提桶跑路，已经不支持seurat5了，所以没办法只能自己改函数，改过的函数存在cereBroFunction.R脚本里面

#感觉是cerebroApp不能兼容Seurat5，找不到原来位置的counts(因为v5在layers下面了)
#检查源代码发现确实是这样的，没办法想继续用就得自己修改源代码再定义一个函数
#修改过的函数在cereBroFunction_forSeuratV5.R脚本里面，source一下
source("cereBroFunction_forSeuratV5.R")
seurat <- addPercentMtRibo(
  seurat,
  assay = "RNA",
  organism = 'hg',
  gene_nomenclature = 'name'
)

seurat <- getMostExpressedGenes(
  seurat,
  assay = 'RNA',
  groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main')
)

#Marker Gene结果保存在object@misc[["marker_genes"]][["cerebro_seurat"]][[groups[1]]]
seurat <- getMarkerGenes(
  seurat,
  organism = 'hg',
  groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
  name = 'cerebro_seurat',
  only_pos = TRUE
)

#富集分析结果存在seurat@misc[["enriched_pathways"]]中
seurat <- cerebroApp::getEnrichedPathways(
  seurat,
  marker_genes_input = 'cerebro_seurat',
  adj_p_cutoff = 0.01,
  max_terms = 100
)

#GSEA富集分析,做不了，有坑，自己把基因提出来做好了
 seurat <- performGeneSetEnrichmentAnalysis(
   seurat,
   assay = 'RNA',
   GMT_file = 'h.all.v2023.2.Hs.symbols.gmt',
   groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'),
   thresh_p_val = 0.05,
   thresh_q_val = 0.1,
   parallel.sz = 1,
   verbose = FALSE
 )

cereBroFunction.R:

#函数 addPercentMtRibo
addPercentMtRibo <- function(object, assay = "RNA", organism ="hg" ,gene_nomenclature = 'name'){
  
  calculatePercentGenes<-function (object, assay = "RNA", genes ,message) {
    result <- pbapply::pblapply(genes, function(x) {
      genes_here <- intersect(x, rownames(object@assays[[assay]]$counts))
      if (length(genes_here) == 1) {
        object@assays[[assay]]$counts[genes_here, ]/Matrix::colSums(object@assays[[assay]]$counts)
        print(paste0("Only one ", message,"gene was found"))
      }
      else {
        Matrix::colSums(object@assays[[assay]]$counts[genes_here, 
        ])/Matrix::colSums(object@assays[[assay]]$counts)




    

      }
    })
  }
  
  #配置文件中记录了线粒体和核糖体基因
  genes_mt <- readr::read_tsv(system.file(paste0("extdata/genes_mt_", 
                                                 organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                              col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  genes_ribo <- readr::read_tsv(system.file(paste0("extdata/genes_ribo_", 
                                                   organism, "_", gene_nomenclature, ".tsv.gz"), package = "cerebroApp"), 
                                col_types = readr::cols(), col_names = FALSE) %>% dplyr::select(1) %>% 
    t() %>% as.vector()
  
  genes_mt_here <- intersect(genes_mt, rownames(object@assays[[assay]]$counts))
  genes_ribo_here <- intersect(genes_ribo, rownames(object@assays[[assay]]$counts))
  
  values_mt <- calculatePercentGenes(object, assay = assay, message="mitochondrial", list(genes_mt = genes_mt_here))
  values_ribo <- calculatePercentGenes(object, assay = assay, message="ribosomal", list(genes_ribo = genes_ribo_here))
  
  object@meta.data$mt_percent <- as.vector(values_mt[[1]])
  object@meta.data$ribo_percent <- as.vector(values_ribo[[1]])
  
  return(object)
}

#函数 getMostExpressedGenes
getMostExpressedGenes<-function (object, assay = "RNA", groups = NULL) {
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor(object@meta.data[[current_group]])) {
      group_levels <- levels(object@meta.data[[current_group]])
    }
    else if (is.character(object@meta.data[[current_group]])) {
      group_levels <- unique(object@meta.data[[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment  (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Group `", current_group, "` contains only one subgroup. Will calculate most expressed genes ", 
                     "across all cells of this group..."))
      transcripts_counts_per_gene <- Matrix::rowSums(object@assays[[assay]]$counts)
      total_transcript_count <- sum(transcripts_counts_per_gene)
      transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
      transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                           decreasing = TRUE)
      table <- tibble::tibble(group = group_levels, gene = names(transcripts_percent_per_gene)[1:100], 
                              pct = transcripts_percent_per_gene[1:100])
      most_expressed_genes <- table %>% dplyr::mutate(group = factor(group, 
                                                                     levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                    group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get most expressed genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      results <- pbapply::pblapply(group_levels, function(x) {
        cells_of_current_group_level <- rownames(object@meta.data)[which(object@meta.data[[current_group]] == 
                                                                           x)]
        transcript_count_matrix <- object@assays[[assay]]$counts[,cells_of_current_group_level]
        if (is.vector(transcript_count_matrix) == TRUE) {
          transcripts_counts_per_gene <- transcript_count_matrix
        }
        else {
          transcripts_counts_per_gene <- Matrix::rowSums(transcript_count_matrix)
        }
        total_transcript_count <- sum(transcripts_counts_per_gene)
        transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count
        transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, 
                                             decreasing = TRUE)
        table <- tibble::tibble(group = x, gene = names(transcripts_percent_per_gene)[1:100], 
                                pct = transcripts_percent_per_gene[1:100])
      })
      most_expressed_genes <- do.call(rbind, results) %>% 
        dplyr::mutate(group = factor(group, levels = group_levels)) %>% 
        dplyr::rename(`:=`(!!current_group, group))
      object@misc[["most_expressed_genes"]][[current_group]] <- most_expressed_genes
    }
  }
  return(object)
}

#函数 getMarkerGenes
getMarkerGenes <- function (object, organism = NULL, groups = NULL, 
          name = "cerebro_seurat", only_pos = TRUE, min_pct = 0.7, 
          thresh_logFC = 0.25, thresh_p_val = 0.01, test = "wilcox", 
          verbose = TRUE, ...) {
  if (is.null(object@misc$marker_genes)) {
    object@misc$marker_genes <- list()
  }
  if (organism %in% c("hg", "mm") == FALSE) {
    message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] No information about genes on cell surface because organism is ", 
                   "either not specified or not human/mouse."))
  }
  else {
    if (organism == "hg" || organism == "human") {
      temp_attributes <- "hgnc_symbol"
      temp_dataset <- "hsapiens_gene_ensembl"
    }
    else if (organism == "mm" || organism == "mouse") {
      temp_attributes <- "external_gene_name"
      temp_dataset <- "mmusculus_gene_ensembl"
    }
    attempt <- 1
    while (!exists("genes_on_cell_surface") && attempt <= 
           3) {
      try(genes_on_cell_surface <- biomaRt::getBM(attributes = temp_attributes, 
                                                  filters = "go", values = "GO:0009986", mart = biomaRt::useMart("ensembl", 
                                                                                                                 dataset = temp_dataset))[, 1])
    }
    if (!exists("genes_on_cell_surface")) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Genes in GO term \"cell surface\" (GO:0009986) could not be ", 
                     "retrieved, possibly due to the server not being reachable at the ", 
                     "moment."))
    }
  }
  object@misc$marker_genes[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor(object@meta.data[[current_group]])) {
      group_levels <- levels(object@meta.data[[current_group]])
    }
    else if (is.character(object@meta.data[[current_group]])) {
      group_levels <- unique(object@meta.data[[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) without group assignment (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    if (length(group_levels) == 1) {
      warning(paste0("Only one group level found for group `", 
                     current_group, "`. Will skip this group and proceed to next."), 
              call. = FALSE)
    }
    else if (length(group_levels) > 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Get marker genes for ", length(group_levels), 
                     " groups in `", current_group, "`..."))
      Seurat::Idents(object) <- current_group
      results <- Seurat::FindAllMarkers(object, 
                                        only.pos = only_pos, min.pct = min_pct, logfc.threshold = thresh_logFC, 
                                        return.thresh = thresh_p_val, test.use = test, 
                                        verbose = verbose, ...)
      if (nrow(results) == 0) {
        message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                       "] No marker genes found for any of the level of `", 
                       current_group, "`."))
        results <- "no_markers_found"
      }
      else if (nrow(results) > 0) {
        if (exists("genes_on_cell_surface") && "gene" %in% 
            colnames(results)) {
          results <- results %>% dplyr::mutate(on_cell_surface = .data$gene %in% 
                                                 genes_on_cell_surface)
        }
      }
      if ("cluster" %in% colnames(results)) {
        results <- results %>% dplyr::rename(`:=`(!!current_group, 
                                                  .data$cluster)) %>% dplyr::select(tidyselect::all_of(current_group), 
                                                                                    tidyselect::any_of("gene"), dplyr::everything())
      }




    

      object@misc[["marker_genes"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

#函数 performGeneSetEnrichmentAnalysis
performGeneSetEnrichmentAnalysis<-function (object, assay = "RNA", GMT_file, 
                                            groups = NULL, name = "cerebro_GSVA", 
                                            thresh_p_val = 0.05, thresh_q_val = 0.1, ...) {
  
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loading gene sets..."))
  read_GMT_file<-function (file) 
  {
    gmt <- readr::read_delim(file, delim = ";", col_names = c("X1"), 
                             col_types = readr::cols())
    gene_set_genes <- list()
    for (i in seq_len(nrow(gmt))) {
      temp_genes <- strsplit(gmt$X1[i], split = "\t")[[1]] %>% 
        unlist()
      temp_genes <- temp_genes[3:length(temp_genes)]
      gene_set_genes[[i]] <- temp_genes
    }
    gene_set_loaded <- list(genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, 
                                                                                       split = "\t"), "[", 1) %>% unlist(), geneset.description = lapply(strsplit(gmt$X1, 
                                                                                                                                                                  split = "\t"), "[", 2) %>% unlist())
    return(gene_set_loaded)
  }
  gene_sets <- read_GMT_file(GMT_file)
  names(gene_sets$genesets) <- gene_sets$geneset.names
  gene_sets_tibble <- tibble::tibble(name = gene_sets$geneset.names, 
                                     description = gene_sets$geneset.description, length = NA, 
                                     genes = NA)
  for (i in seq_along(gene_sets$genesets)) {
    gene_sets_tibble$length[i] <- gene_sets$genesets[[i]] %>% 
      length()
    gene_sets_tibble$genes[i] <- gene_sets$genesets[[i]] %>% 
      unlist() %>% paste(.data, collapse = ",")
  }
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Loaded ", 
                 length(gene_sets$genesets), " gene sets from GMT file."))
  expressed_genes <- Matrix::rowSums(object@assays[[assay]]$counts)
  expressed_genes <- which(expressed_genes != 0)
  message(paste0("[", format(Sys.time(), "%H:%M:%S"), "] Extracting transcript counts ", 
                 "from `data` slot of `", assay, "` assay..."))
  matrix_full <- object@assays[[assay]]$counts[expressed_genes, 
  ]
  object@misc$enriched_pathways[[name]] <- list()
  for (i in seq_along(groups)) {
    current_group <- groups[i]
    if (is.factor(object@meta.data[[current_group]])) {
      group_levels <- levels(object@meta.data[[current_group]])
    }
    else if (is.character(object@meta.data[[current_group]])) {
      group_levels <- unique(object@meta.data[[current_group]])
      if (any(is.na(group_levels))) {
        number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% 
          is.na() %>% which(. == TRUE) %>% length()
        group_levels <- stats::na.omit(group_levels)
        warning(paste0("Found ", number_of_cells_without_group_assignment, 
                       " cell(s) ", "without group assignment (NA) for `", 
                       current_group, "`. These cells will be ignored during the analysis."), 
                call. = FALSE)
      }
    }
    message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                       "] Performing analysis for ", "{length(group_levels)} subgroups of group `{current_group}`..."))
    if (length(group_levels) == 1) {
      message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                     "] Only one group level found ", "for group `", 
                     current_group, "`. Will skip this group and proceed ", 
                     "to next."), call. = FALSE)
      results <- NULL
    }
    else if (length(group_levels) > 1) {
      matrix_mean_by_group <- future.apply::future_sapply(group_levels, 
                                                          USE.NAMES = TRUE, simplify = TRUE, future.globals = FALSE, 
                                                          function(x) {
                                                            cells_in_this_group <- which(object@meta.data[[current_group]] == 
                                                                                           x)
                                                            if (length(cells_in_this_group) == 1) {
                                                              matrix_full[, cells_in_this_group]
                                                            }
                                                            else {
                                                              Matrix::rowMeans(matrix_full[, cells_in_this_group])
                                                            }
                                                          })
      enrichment_scores <- GSVA::gsva(expr = matrix_mean_by_group, 
                                      gset.idx.list = gene_sets$genesets, ...)
      results <- tibble::tibble(group = character(), name = character(), 
                                enrichment_score = numeric(), p_value = numeric(), 
                                q_value = numeric())
      for (i in seq_len(ncol(enrichment_scores))) {
        temp_results <- tibble::tibble(group = colnames(matrix_mean_by_group)[i], 
                                       name = rownames(enrichment_scores), enrichment_score = enrichment_scores[, 
                                                                                                                i])
        if (nrow(temp_results) < 2) {
          message(paste0("[", format(Sys.time(), "%H:%M:%S"), 
                         "] Only 1 gene set ", "available, therefore p- and q-values cannot be calculated."))
          temp_results <- temp_results %>% dplyr::mutate(p_value = NA, 
                                                         q_value = NA)
        }
        else if (nrow(temp_results) >= 2) {
          temp_p_values <- stats::pnorm(-abs(scale(temp_results$enrichment_score)[, 
                                                                                  1]))
          temp_q_values <- suppressWarnings(qvalue::qvalue(temp_p_values, 
                                                           pi0 = 1, ties = min)$lfdr)
          temp_results <- temp_results %>% dplyr::mutate(p_value = temp_p_values, 
                                                         q_value = temp_q_values) %>% dplyr::filter(.data$p_value <= 
                                                                                                      thresh_p_val, .data$q_value <= thresh_q_val)
        }
        results <- dplyr::bind_rows(results, temp_results) %>% 
          dplyr::arrange(.data$q_value)
      }
      results <- dplyr::left_join(results, gene_sets_tibble, 
                                  by = "name") %>% dplyr::select(c("group", "name", 
                                                                   "description", "length", "genes", "enrichment_score", 
                                                                   "p_value", "q_value")) %>% dplyr::rename(`:=`(!!current_group, 
                                                                                                                 .data$group))
      group_levels_with_results <- group_levels[group_levels %in% 
                                                  results[[current_group]]]
      results[[current_group]] <- factor(results[[current_group]], 
                                         levels = group_levels_with_results)
      message(glue::glue("[", format(Sys.time(), "%H:%M:%S"), 
                         "] {nrow(results)} gene sets ", "passed the thresholds across all subgroups of group `{current_group}`."))
      if (nrow(results) == 0) {
        results <- "no_gene_sets_enriched"
      }
      object@misc[["enriched_pathways"]][[name]][[current_group]] <- results
    }
  }
  return(object)
}

13 单基因的表达

13.1单基因的小提琴图

首先，我们发现用宽度而不是面积来衡量小提琴是很有用的。
其次，我们建议对小提琴进行修剪，以避免小提琴延伸到负尺度，这是没有意义的，因为没有任何负的表达值。
第三，在数据中添加一些噪声可能是有意义的，以避免突出显示刻度底部的不同值，例如log(1)， log(2)等。在使用Seurat对表达量进行可视化时，这是默认情况下完成的(甚至没有选项)。

在这种情况下，差异非常细微，如下图所示(红细胞和单核细胞)。

p1 <- seurat@meta.data %>%
  mutate(expression = seurat@assays$SCT@data['CD69',]) %>%
  ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +




    

  geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
  labs(title = "CD69 expression (without noise)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p2 <- seurat@meta.data %>%
  mutate(
    expression = seurat@assays$SCT@data['CD69',],
    expression = expression + rnorm(nrow(.))/200
  ) %>%
  ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) +
  labs(title = "CD69 expression (with noise)") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p1 + p2 + plot_layout(ncol = 1)

ggsave(
  'plots/expression_cd69_violin_plot.png',
  p1 + p2 + plot_layout(ncol = 1),
  height = 6, width = 9
)

13.2 多基因点图

点图可以展示多个基因的表达，通过一个特定的分组变量。接下来的例子中，我们将使用检测到的细胞类型和聚类的标记基因。

13.2.1 根据细胞类型划分

# cells will be grouped by samples that they have been assigned to
group_ids <- unique(seurat@meta.data$cell_type_singler_blueprintencode_main)

# select a set of genes for which we want to show expression
genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(cell_type_singler_blueprintencode_main) %>%
  pull(gene)


#这里若没有DC种类的细胞，则
group_ids = group_ids[group_ids!="DC"]
# for every sample-gene combination, calculate the average expression across
# all cells and then transform the data into a data frame
expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene))

# for every sample-gene combination, calculate the percentage of cells in the
# respective group that has at least 1 transcript (this means we consider it
# as expressing the gene) and then transform the data into a data frame
percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>%
  select(cell_type_singler_blueprintencode_main, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    seurat@meta.data %>%
      group_by(cell_type_singler_blueprintencode_main) %>%
      tally(),
    by = 'cell_type_singler_blueprintencode_main') %>%
  mutate(
    id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene),
    percent_cells = cell_count / n
  )

# merge the two data frames created before and plot the data
p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cell_type_singler_blueprintencode_main)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cell type', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p

ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)

13.2.2 根据聚类结果进行划分

group_ids <- levels(seurat@meta.data$seurat_clusters)

genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>%
  group_by(seurat_clusters) %>%
  arrange(p_val_adj) %>%
  filter(row_number() == 1) %>%
  arrange(seurat_clusters) %>%
  pull(gene)
genes_to_show <- genes_to_show[!duplicated(genes_to_show)]

expression_levels_per_group <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$seurat_clusters == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group])
    }
  ) %>%
  t() %>%
  as.data.frame()
expression_levels_per_group <- expression_levels_per_group[,!duplicated(colnames(expression_levels_per_group))] %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(expression = value) %>%
  mutate(id_to_merge = paste0(cluster, '_', gene))

percentage_of_cells_expressing_gene <- vapply(
    group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) {
      cells_in_current_group <- which(seurat@meta.data$seurat_cluster == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() 
percentage_of_cells_expressing_gene <- percentage_of_cells_expressing_gene[,!duplicated(colnames(percentage_of_cells_expressing_gene))] %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  dplyr::rename(cell_count = value) %>%
  left_join(
    .,
    seurat@meta.data %>%
      group_by(seurat_clusters) %>%
      tally() %>%
      dplyr::rename(cluster = seurat_clusters),
    by = 'cluster') %>%
  mutate(
    id_to_merge = paste0(cluster, '_', gene),
    percent_cells = cell_count / n
  )

p <- left_join(
    expression_levels_per_group,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(
    cluster = factor(cluster, levels = rev(group_ids)),
    gene = factor(gene, levels = genes_to_show)
  ) %>%
  arrange(gene) %>%
  ggplot(aes(gene, cluster)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cluster', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

p

ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)

14 基因集中的表达量

我们根据细胞类型划分检查GO通路B_CELL_ACTIVATION中基因的表达量

library(msigdbr)




    


gene_set <- msigdbr(species = 'Homo sapiens', category = 'C5', subcategory = 'BP') %>%
  filter(gs_name == 'GOBP_B_CELL_ACTIVATION') %>%
  pull(gene_symbol) %>%
  unique()

gene_set_present <- gene_set[which(gene_set %in% rownames(seurat@assays$SCT@data))]

head(gene_set_present)
# [1] "ABL1"    "ADA"     "ADAM17"  "ADGRG3"  "AHR"     "AKAP17A"

14.1 Mean log-expression

temp_labels <- seurat@meta.data %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  tally() %>%
  dplyr::rename('cell_type' = cell_type_singler_blueprintencode_main)

p <- tibble(
  cell_type = seurat@meta.data$cell_type_singler_blueprintencode_main,
  mean_expression = Matrix::colMeans(seurat@assays$SCT@data[gene_set_present,])
  ) %>%
  ggplot(aes(x = cell_type, y = mean_expression, fill = cell_type)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_text(
    data = temp_labels,
    aes(
      x = cell_type,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Mean log expression', labels = scales::comma, expand = c(0.08, 0)) +
  labs(title = "GO term 'B_CELL_ACTIVATION'") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p

ggsave('plots/gene_set_expression_b_cell_activation_mean.png', p, height = 4, width = 7)

14.2 Module score

seurat <- AddModuleScore(
  seurat,
  features = list(gene_set_present),
  name = 'GO_B_CELL_ACTIVATION',
  assay = 'SCT'
)

temp_labels <- seurat@meta.data %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  tally()

p <- seurat@meta.data %>%
  ggplot(
    aes(
      x = cell_type_singler_blueprintencode_main,
      y = GO_B_CELL_ACTIVATION1,
      fill = cell_type_singler_blueprintencode_main
    )
  ) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_text(
    data = temp_labels,
    aes(
      x = cell_type_singler_blueprintencode_main,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete() +
  scale_y_continuous(name = 'Module score', labels = scales::comma, expand = c(0.08, 0)) +
  labs(title = "GO term 'B_CELL_ACTIVATION'") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'none'
  )

p

ggsave('plots/gene_set_expression_b_cell_activation_module_score.png', p, height = 4, width = 7)

15 GSEA富集分析

15.1 GSVA

为了说明原因，我们将使用来自MSigDB的50个Hallmark基因集，并将单核细胞与造血干细胞(HSC;控制)进行比较。感谢Alessandro Raveane的代码。

# function to read GMT file
read_GMT_file <- function(file) {
  gmt <- readr::read_delim(
    file,
    delim = ';',
    col_names = c('X1'),
    col_types = readr::cols()
  )

  gene_set_genes <- list()
  for ( i in seq_len(nrow(gmt)) )
  {
    temp_genes <- strsplit(gmt$X1[i], split = '\t')[[1]] %>% unlist()
    temp_genes <- temp_genes[3:length(temp_genes)]
    gene_set_genes[[i]] <- temp_genes
  }
  gene_set_loaded <- list(
    genesets = gene_set_genes,
    geneset.names = lapply(strsplit(gmt$X1, split = '\t'), '[', 1) %>% unlist(),
    geneset.description = lapply(
        strsplit(gmt$X1, split = '\t'), '[', 2
      ) %>% unlist()
  )

  return(gene_set_loaded)
}

# load gene sets from GMT file
gene_sets <- read_GMT_file('h.all.v2023.2.Hs.symbols.gmt')

# set gene set names
names(gene_sets$genesets) <- gene_sets$geneset.names

# get indices of cells which are either HSC or monocytes
cells_to_analyze <- seurat@meta.data %>%
  mutate(row_number = row_number()) %>%
  filter(grepl(cell_type_singler_blueprintencode_main, pattern = 'HSC|Monocytes')) %>%
  arrange(cell_type_singler_blueprintencode_main) %>%
  pull(row_number)

# get list of genes unique genes across all gene sets
genes_to_analyze <- gene_sets$genesets %>% unlist() %>% unique()
# filter gene list for those which are present in the data set
genes_to_analyze <- genes_to_analyze[which(genes_to_analyze %in% rownames(seurat@assays$SCT@counts))]

# get expression matrix and reduce it to cells and genes of interest
expression_matrix <- seurat@assays$SCT@counts[ genes_to_analyze , cells_to_analyze] %>% as.matrix()

# perform GSVA
gsva <- GSVA::gsva(
  expression_matrix,
  gset.idx.list = gene_sets$genesets,
  parallel.sz = 1
)

# load limma library for statistical testing
library(limma)

#HSC细胞和Monocytes细胞的差异表达分析
# generate design matrix
design_matrix <- tibble(
  control = 1,
  test = c(
    rep(0, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'HSC') %>% nrow()),
    rep(1, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'Monocytes') %>% nrow())
  )
)

head(design_matrix)
# A tibble: 6 x 2
#   control  test
#      
# 1       1     0
# 2       1     0
# 3       1     0
# 4       1     0
# 5       1     0
# 6       1     0

# fit linear model, followed by empirical Bayes statistics for differential
# enrichment analysis
fit <- lmFit(gsva, design_matrix)
fit <- eBayes(fit)

# prepare data for plotting
data <- topTable(fit, coef = 'test', number = 50) %>%
  mutate(gene_set = rownames(fit$t)) %>%
  arrange(t) %>%
  mutate(
    gene_set = factor(gene_set, levels = gene_set),
    just = ifelse(t < 0, 0, 1),
    nudge_y = ifelse(t < 0, 1, -1),
    color = ifelse(t < -5 | t > 5, 'black', 'grey')
  )

DataFrame(data)
# DataFrame with 50 rows and 10 columns
#         logFC     AveExpr         t      P.Value    adj.P.Val         B                            gene_set      just   nudge_y       color
#                                                 
# 1   -0.369443  -0.1569690  -35.1619 1.26066e-186 1.05055e-185   415.980  HALLMARK_TGF_BETA_SIGNALING                0         1       black
# 2   -0.251413  -0.0820012  -27.2261 1.96152e-127 8.91598e-127   279.761  HALLMARK_NOTCH_SIGNALING                   0         1       black
# 3   -0.288169  -0.1202293  -26.1464 1.40712e-119 5.41201e-119   261.689  HALLMARK_ESTROGEN_RESPONSE_EARLY           0         1       black
# 4   -0.135034  -0.1511146  -22.3565  8.50194e-93  2.36165e-92   200.094  HALLMARK_INTERFERON_ALPHA_RESPONSE         0         1       black
# 5   -0.203651  -0.1049498  -22.0728  7.44006e-91  1.95791e-90   195.628  HALLMARK_INTERFERON_GAMMA_RESPONSE         0         1       black
# ...       ...         ...       ...          ...          ...       ...                                 ...       ...       ...         ...
# 46   0.200259  0.25341594   35.9250 2.31901e-192 2.31901e-191   429.182 HALLMARK_WNT_BETA_CATENIN_SIGNALING         1        -1       black
# 47   0.293113 -0.08115610   36.1995 2.00784e-194 2.50980e-193   433.929 HALLMARK_MITOTIC_SPINDLE                    1        -1       black
# 48   0.338449  0.11583774   36.2560 7.56196e-195 1.26033e-193   434.906 HALLMARK_CHOLESTEROL_HOMEOSTASIS            1        -1       black
# 49   0.251678  0.00262136   37.5117 2.86772e-204 7.16930e-203   456.592 HALLMARK_HYPOXIA                            1        -1       black
# 50   0.282907 -0.05021404   48.5971 1.72523e-285 8.62616e-284   643.571 HALLMARK_TNFA_SIGNALING_VIA_NFKB            1        -1       black

# plot t-value
p <- ggplot(data = data, aes(x = gene_set, y = t, fill = t)) +
  geom_col() +
  geom_hline(yintercept = c(-5,5), linetype = 'dashed', color = 'grey80') +
  geom_text(
    aes(
      x = gene_set,
      y = 0,
      label = gene_set,
      hjust = just,
      color = color
    ),
    nudge_y = data$nudge_y, size = 3
  ) +
  scale_x_discrete(name = '', labels = NULL) +
  scale_y_continuous(name = 't-value', limits = c(-55,55)) +
  scale_fill_distiller(palette = 'Spectral', limits = c(-max(data$t), max(data$t))) +
  scale_color_manual(values = c('black' = 'black', 'grey' = 'grey')) +
  coord_flip() +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.ticks.y =  element_blank(),
    legend.position = 'none'
  )

p

ggsave('plots/gsva_hallmark_gene_sets_monocytes_vs_hsc.png', height = 7.5, width = 7.5)

16 保存结果和Session info

最后我们将seurat对象保存，可以用于后续分析

saveRDS(seurat, 'data/seurat.rds')

保存我们使用的R和R包版本信息，以便后续进行可重复性分析。

devtools::session_info()

文末友情宣传

强烈建议你推荐给身边的博士后以及年轻生物学PI，多一点数据认知，让他们的科研上一个台阶：

你还缺scRNA-seq的workflow吗（二）？

正文

(adsbygoogle = window.adsbygoogle || []).push({}); 前言