前言 前面我们演示了生信工程师Roman Hillje分享的scRNA-seq workflow的部分(你还缺scRNA-seq的workflow吗? ), 今天继续后面的学习。
Roman Hillje的github主页:
https://github.com/romanhaa workflow:
https://romanhaa.github.io/projects/scrnaseq_workflow/ 12 Perform analysis for Cerebro 为了进一步研究Cerebro https://github.com/romanhaa/Cerebro 中的数据集,我们现在添加了更多的meta data,计算标记基因(marker genes),标记基因的通路富集,并导出数据集。这个过程在很大程度上遵循了cerebroApp网站 https://romanhaa.github.io/cerebroApp/articles/cerebroApp_workflow_Seurat.html 上描述的工作流程。
12.1 Add meta data load("labels_assinged_seurat.Rdata") load("thresholds.RData") seurat@misc$experiment <- list( experiment_name = 'bone_marrow_donor_cells', organism = 'hg', date_of_analysis = Sys.Date() ) seurat@misc$parameters <- list( gene_nomenclature = 'gene_name', discard_genes_expressed_in_fewer_cells_than = 6, keep_mitochondrial_genes = TRUE, variables_to_regress_out = 'nCount', number_PCs = 15, cluster_resolution = 0.8 ) seurat@misc$parameters$filtering <- list( UMI_min = thresholds_nCount[1], UMI_max = thresholds_nCount[2], genes_min = thresholds_nFeature[1], genes_max = thresholds_nFeature[2], pct_mito_min = thresholds_percent_MT[1], pct_mito_max = thresholds_percent_MT[2] ) #github上的,自己科学上网下吧 if(!require("cerebroApp")){ BiocManager::install('romanhaa/cerebroApp') } library(cerebroApp) seurat@misc$technical_info$cerebroApp_version <- utils::packageVersion('cerebroApp') seurat@misc$technical_info$Seurat <- utils::packageVersion('Seurat') seurat@misc$technical_info <- list( 'R' = capture.output(devtools::session_info()) )
12.2 Calculate relationship trees 这里,我们为所有三个分组变量:样本、聚类和细胞类型计算关系树——就像我们之前为聚类所做的那样。它们中的每一个都被添加到Seurat对象的@misc
槽中,这样它就可以被提取到Cerebro中。
#Samples: Idents(seurat) <- "sample" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$sample <- seurat@tools$BuildClusterTree #Clusters: Idents(seurat) <- "seurat_clusters" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$seurat_clusters <- seurat@tools$BuildClusterTree #Cell types: Idents(seurat) <- "cell_type_singler_blueprintencode_main" seurat <- BuildClusterTree( seurat, dims = 1:15, reorder = FALSE, reorder.numeric = FALSE ) seurat@misc$trees$cell_type_singler_blueprintencode_main <- seurat@tools$BuildClusterTree
12.3 cerebroApp steps 在这里,我们计算线粒体和核糖体转录本的百分比,得到最多表达的基因和标记基因,使用富集过的通路进行测试,并进行基因集富集分析GSEA。因为cereBroAPP作者提桶跑路,已经不支持seurat5了,所以没办法只能自己改函数,改过的函数存在cereBroFunction.R
脚本里面
#感觉是cerebroApp不能兼容Seurat5,找不到原来位置的counts(因为v5在layers下面了) #检查源代码发现确实是这样的,没办法想继续用就得自己修改源代码再定义一个函数 #修改过的函数在cereBroFunction_forSeuratV5.R脚本里面,source一下 source("cereBroFunction_forSeuratV5.R") seurat <- addPercentMtRibo( seurat, assay = "RNA", organism = 'hg', gene_nomenclature = 'name' ) seurat <- getMostExpressedGenes( seurat, assay = 'RNA', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main') ) #Marker Gene结果保存在object@misc[["marker_genes"]][["cerebro_seurat"]][[groups[1]]] seurat <- getMarkerGenes( seurat, organism = 'hg', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'), name = 'cerebro_seurat', only_pos = TRUE ) #富集分析结果存在seurat@misc[["enriched_pathways"]]中 seurat <- cerebroApp::getEnrichedPathways( seurat, marker_genes_input = 'cerebro_seurat', adj_p_cutoff = 0.01, max_terms = 100 ) #GSEA富集分析,做不了,有坑,自己把基因提出来做好了 seurat <- performGeneSetEnrichmentAnalysis( seurat, assay = 'RNA', GMT_file = 'h.all.v2023.2.Hs.symbols.gmt', groups = c('sample','seurat_clusters','cell_type_singler_blueprintencode_main'), thresh_p_val = 0.05, thresh_q_val = 0.1, parallel.sz = 1, verbose = FALSE )
cereBroFunction.R
:
#函数 addPercentMtRibo addPercentMtRibo <- function (object, assay = "RNA" , organism ="hg" ,gene_nomenclature = 'name' ){ calculatePercentGenes<-function (object, assay = "RNA" , genes ,message) { result <- pbapply::pblapply(genes, function (x) { genes_here <- intersect(x, rownames(object@assays[[assay]]$counts)) if (length(genes_here) == 1 ) { object@assays[[assay]]$counts[genes_here, ]/Matrix::colSums(object@assays[[assay]]$counts) print(paste0("Only one " , message,"gene was found" )) } else { Matrix::colSums(object@assays[[assay]]$counts[genes_here, ])/Matrix::colSums(object@assays[[assay]]$counts) } }) } #配置文件中记录了线粒体和核糖体基因 genes_mt <- readr::read_tsv(system.file(paste0("extdata/genes_mt_" , organism, "_" , gene_nomenclature, ".tsv.gz" ), package = "cerebroApp" ), col_types = readr::cols(), col_names = FALSE ) %>% dplyr::select(1 ) %>% t() %>% as.vector() genes_ribo <- readr::read_tsv(system.file(paste0("extdata/genes_ribo_" , organism, "_" , gene_nomenclature, ".tsv.gz" ), package = "cerebroApp" ), col_types = readr::cols(), col_names = FALSE ) %>% dplyr::select(1 ) %>% t() %>% as.vector() genes_mt_here <- intersect(genes_mt, rownames(object@assays[[assay]]$counts)) genes_ribo_here <- intersect(genes_ribo, rownames(object@assays[[assay]]$counts)) values_mt <- calculatePercentGenes(object, assay = assay, message="mitochondrial" , list(genes_mt = genes_mt_here)) values_ribo <- calculatePercentGenes(object, assay = assay, message="ribosomal" , list(genes_ribo = genes_ribo_here)) object@meta.data$mt_percent <- as.vector(values_mt[[1 ]]) object@meta.data$ribo_percent <- as.vector(values_ribo[[1 ]]) return (object) }#函数 getMostExpressedGenes getMostExpressedGenes<-function (object, assay = "RNA" , groups = NULL ) { for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor(object@meta.data[[current_group]])) { group_levels <- levels(object@meta.data[[current_group]]) } else if (is.character(object@meta.data[[current_group]])) { group_levels <- unique(object@meta.data[[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } if (length(group_levels) == 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Group `" , current_group, "` contains only one subgroup. Will calculate most expressed genes " , "across all cells of this group..." )) transcripts_counts_per_gene <- Matrix::rowSums(object@assays[[assay]]$counts) total_transcript_count <- sum(transcripts_counts_per_gene) transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, decreasing = TRUE ) table <- tibble::tibble(group = group_levels, gene = names(transcripts_percent_per_gene)[1 :100 ], pct = transcripts_percent_per_gene[1 :100 ]) most_expressed_genes <- table %>% dplyr::mutate(group = factor(group, levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, group)) object@misc[["most_expressed_genes" ]][[current_group]] <- most_expressed_genes } else if (length(group_levels) > 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Get most expressed genes for " , length(group_levels), " groups in `" , current_group, "`..." )) results <- pbapply::pblapply(group_levels, function (x) { cells_of_current_group_level <- rownames(object@meta.data)[which(object@meta.data[[current_group]] == x)] transcript_count_matrix <- object@assays[[assay]]$counts[,cells_of_current_group_level] if (is.vector(transcript_count_matrix) == TRUE ) { transcripts_counts_per_gene <- transcript_count_matrix } else { transcripts_counts_per_gene <- Matrix::rowSums(transcript_count_matrix) } total_transcript_count <- sum(transcripts_counts_per_gene) transcripts_percent_per_gene <- transcripts_counts_per_gene/total_transcript_count transcripts_percent_per_gene <- sort(transcripts_percent_per_gene, decreasing = TRUE ) table <- tibble::tibble(group = x, gene = names(transcripts_percent_per_gene)[1 :100 ], pct = transcripts_percent_per_gene[1 :100 ]) }) most_expressed_genes <- do.call(rbind, results) %>% dplyr::mutate(group = factor(group, levels = group_levels)) %>% dplyr::rename(`:=`(!!current_group, group)) object@misc[["most_expressed_genes" ]][[current_group]] <- most_expressed_genes } } return (object) }#函数 getMarkerGenes getMarkerGenes <- function (object, organism = NULL , groups = NULL , name = "cerebro_seurat" , only_pos = TRUE , min_pct = 0.7 , thresh_logFC = 0.25 , thresh_p_val = 0.01 , test = "wilcox" , verbose = TRUE , ... ) { if (is.null(object@misc$marker_genes)) { object@misc$marker_genes <- list() } if (organism %in % c("hg" , "mm" ) == FALSE ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] No information about genes on cell surface because organism is " , "either not specified or not human/mouse." )) } else { if (organism == "hg" || organism == "human" ) { temp_attributes <- "hgnc_symbol" temp_dataset <- "hsapiens_gene_ensembl" } else if (organism == "mm" || organism == "mouse" ) { temp_attributes <- "external_gene_name" temp_dataset <- "mmusculus_gene_ensembl" } attempt <- 1 while (!exists("genes_on_cell_surface" ) && attempt <= 3 ) { try (genes_on_cell_surface <- biomaRt::getBM(attributes = temp_attributes, filters = "go" , values = "GO:0009986" , mart = biomaRt::useMart("ensembl" , dataset = temp_dataset))[, 1 ]) } if (!exists("genes_on_cell_surface" )) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Genes in GO term \"cell surface\" (GO:0009986) could not be " , "retrieved, possibly due to the server not being reachable at the " , "moment." )) } } object@misc$marker_genes[[name]] <- list() for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor(object@meta.data[[current_group]])) { group_levels <- levels(object@meta.data[[current_group]]) } else if (is.character(object@meta.data[[current_group]])) { group_levels <- unique(object@meta.data[[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } if (length(group_levels) == 1 ) { warning (paste0("Only one group level found for group `" , current_group, "`. Will skip this group and proceed to next." ), call. = FALSE ) } else if (length(group_levels) > 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Get marker genes for " , length(group_levels), " groups in `" , current_group, "`..." )) Seurat::Idents(object) <- current_group results <- Seurat::FindAllMarkers(object, only.pos = only_pos, min.pct = min_pct, logfc.threshold = thresh_logFC, return.thresh = thresh_p_val, test.use = test, verbose = verbose, ... ) if (nrow(results) == 0 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] No marker genes found for any of the level of `" , current_group, "`." )) results <- "no_markers_found" } else if (nrow(results) > 0 ) { if (exists("genes_on_cell_surface" ) && "gene" %in % colnames(results)) { results <- results %>% dplyr::mutate(on_cell_surface = .data$gene %in % genes_on_cell_surface) } } if ("cluster" %in % colnames(results)) { results <- results %>% dplyr::rename(`:=`(!!current_group, .data$cluster)) %>% dplyr::select(tidyselect::all_of(current_group), tidyselect::any_of("gene" ), dplyr::everything()) } object@misc[["marker_genes" ]][[name]][[current_group]] <- results } } return (object) }#函数 performGeneSetEnrichmentAnalysis performGeneSetEnrichmentAnalysis<-function (object, assay = "RNA" , GMT_file, groups = NULL , name = "cerebro_GSVA" , thresh_p_val = 0.05 , thresh_q_val = 0.1 , ... ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Loading gene sets..." )) read_GMT_file<-function (file) { gmt <- readr::read_delim(file, delim = ";" , col_names = c("X1" ), col_types = readr::cols()) gene_set_genes <- list() for (i in seq_len(nrow(gmt))) { temp_genes <- strsplit(gmt$X1[i], split = "\t" )[[1 ]] %>% unlist() temp_genes <- temp_genes[3 :length(temp_genes)] gene_set_genes[[i]] <- temp_genes } gene_set_loaded <- list(genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, split = "\t" ), "[" , 1 ) %>% unlist(), geneset.description = lapply(strsplit(gmt$X1, split = "\t" ), "[" , 2 ) %>% unlist()) return (gene_set_loaded) } gene_sets <- read_GMT_file(GMT_file) names(gene_sets$genesets) <- gene_sets$geneset.names gene_sets_tibble <- tibble::tibble(name = gene_sets$geneset.names, description = gene_sets$geneset.description, length = NA , genes = NA ) for (i in seq_along(gene_sets$genesets)) { gene_sets_tibble$length[i] <- gene_sets$genesets[[i]] %>% length() gene_sets_tibble$genes[i] <- gene_sets$genesets[[i]] %>% unlist() %>% paste(.data, collapse = "," ) } message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Loaded " , length(gene_sets$genesets), " gene sets from GMT file." )) expressed_genes <- Matrix::rowSums(object@assays[[assay]]$counts) expressed_genes <- which(expressed_genes != 0 ) message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Extracting transcript counts " , "from `data` slot of `" , assay, "` assay..." )) matrix_full <- object@assays[[assay]]$counts[expressed_genes, ] object@misc$enriched_pathways[[name]] <- list() for (i in seq_along(groups)) { current_group <- groups[i] if (is.factor(object@meta.data[[current_group]])) { group_levels <- levels(object@meta.data[[current_group]]) } else if (is.character(object@meta.data[[current_group]])) { group_levels <- unique(object@meta.data[[current_group]]) if (any(is.na(group_levels))) { number_of_cells_without_group_assignment <- object@meta.data[[current_group]] %>% is.na() %>% which(. == TRUE ) %>% length() group_levels <- stats::na.omit(group_levels) warning (paste0("Found " , number_of_cells_without_group_assignment, " cell(s) " , "without group assignment (NA) for `" , current_group, "`. These cells will be ignored during the analysis." ), call. = FALSE ) } } message(glue::glue("[" , format(Sys.time(), "%H:%M:%S" ), "] Performing analysis for " , "{length(group_levels)} subgroups of group `{current_group}`..." )) if (length(group_levels) == 1 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Only one group level found " , "for group `" , current_group, "`. Will skip this group and proceed " , "to next." ), call. = FALSE ) results <- NULL } else if (length(group_levels) > 1 ) { matrix_mean_by_group <- future.apply::future_sapply(group_levels, USE.NAMES = TRUE , simplify = TRUE , future.globals = FALSE , function (x) { cells_in_this_group <- which(object@meta.data[[current_group]] == x) if (length(cells_in_this_group) == 1 ) { matrix_full[, cells_in_this_group] } else { Matrix::rowMeans(matrix_full[, cells_in_this_group]) } }) enrichment_scores <- GSVA::gsva(expr = matrix_mean_by_group, gset.idx.list = gene_sets$genesets, ... ) results <- tibble::tibble(group = character(), name = character(), enrichment_score = numeric(), p_value = numeric(), q_value = numeric()) for (i in seq_len(ncol(enrichment_scores))) { temp_results <- tibble::tibble(group = colnames(matrix_mean_by_group)[i], name = rownames(enrichment_scores), enrichment_score = enrichment_scores[, i]) if (nrow(temp_results) < 2 ) { message(paste0("[" , format(Sys.time(), "%H:%M:%S" ), "] Only 1 gene set " , "available, therefore p- and q-values cannot be calculated." )) temp_results <- temp_results %>% dplyr::mutate(p_value = NA , q_value = NA ) } else if (nrow(temp_results) >= 2 ) { temp_p_values <- stats::pnorm(-abs(scale(temp_results$enrichment_score)[, 1 ])) temp_q_values <- suppressWarnings(qvalue::qvalue(temp_p_values, pi0 = 1 , ties = min)$lfdr) temp_results <- temp_results %>% dplyr::mutate(p_value = temp_p_values, q_value = temp_q_values) %>% dplyr::filter(.data$p_value <= thresh_p_val, .data$q_value <= thresh_q_val) } results <- dplyr::bind_rows(results, temp_results) %>% dplyr::arrange(.data$q_value) } results <- dplyr::left_join(results, gene_sets_tibble, by = "name" ) %>% dplyr::select(c("group" , "name" , "description" , "length" , "genes" , "enrichment_score" , "p_value" , "q_value" )) %>% dplyr::rename(`:=`(!!current_group, .data$group)) group_levels_with_results <- group_levels[group_levels %in % results[[current_group]]] results[[current_group]] <- factor(results[[current_group]], levels = group_levels_with_results) message(glue::glue("[" , format(Sys.time(), "%H:%M:%S" ), "] {nrow(results)} gene sets " , "passed the thresholds across all subgroups of group `{current_group}`." )) if (nrow(results) == 0 ) { results <- "no_gene_sets_enriched" } object@misc[["enriched_pathways" ]][[name]][[current_group]] <- results } } return (object) }
13 单基因的表达 13.1单基因的小提琴图 首先,我们发现用宽度而不是面积来衡量小提琴是很有用的。 其次,我们建议对小提琴进行修剪,以避免小提琴延伸到负尺度,这是没有意义的,因为没有任何负的表达值。 第三,在数据中添加一些噪声可能是有意义的,以避免突出显示刻度底部的不同值,例如log(1), log(2)等。在使用Seurat对表达量进行可视化时,这是默认情况下完成的(甚至没有选项)。 在这种情况下,差异非常细微,如下图所示(红细胞和单核细胞)。
p1 <- seurat@meta.data %>% mutate(expression = seurat@assays$SCT@data['CD69',]) %>% ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) + geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) + labs(title = "CD69 expression (without noise)") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p2 <- seurat@meta.data %>% mutate( expression = seurat@assays$SCT@data['CD69',], expression = expression + rnorm(nrow(.))/200 ) %>% ggplot(aes(x = cell_type_singler_blueprintencode_main, y = expression, fill = cell_type_singler_blueprintencode_main)) + geom_violin(draw_quantiles = c(0.5), scale = 'width', trim = TRUE) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Log-normalized expression', labels = scales::comma) + labs(title = "CD69 expression (with noise)") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p1 + p2 + plot_layout(ncol = 1) ggsave( 'plots/expression_cd69_violin_plot.png', p1 + p2 + plot_layout(ncol = 1), height = 6, width = 9 )
13.2 多基因点图 点图可以展示多个基因的表达,通过一个特定的分组变量。接下来的例子中,我们将使用检测到的细胞类型和聚类的标记基因。
13.2.1 根据细胞类型划分 # cells will be grouped by samples that they have been assigned to group_ids <- unique(seurat@meta.data$cell_type_singler_blueprintencode_main) # select a set of genes for which we want to show expression genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$cell_type_singler_blueprintencode_main %>% group_by(cell_type_singler_blueprintencode_main) %>% arrange(p_val_adj) %>% filter(row_number() == 1) %>% arrange(cell_type_singler_blueprintencode_main) %>% pull(gene) #这里若没有DC种类的细胞,则 group_ids = group_ids[group_ids!="DC"] # for every sample-gene combination, calculate the average expression across # all cells and then transform the data into a data frame expression_levels_per_group <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x) Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group]) } ) %>% t() %>% as.data.frame() %>% mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>% select(cell_type_singler_blueprintencode_main, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(expression = value) %>% mutate(id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene)) # for every sample-gene combination, calculate the percentage of cells in the # respective group that has at least 1 transcript (this means we consider it # as expressing the gene) and then transform the data into a data frame percentage_of_cells_expressing_gene <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which(seurat@meta.data$cell_type_singler_blueprintencode_main == x) Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0) } ) %>% t() %>% as.data.frame() %>% mutate(cell_type_singler_blueprintencode_main = rownames(.)) %>% select(cell_type_singler_blueprintencode_main, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(cell_count = value) %>% left_join( ., seurat@meta.data %>% group_by(cell_type_singler_blueprintencode_main) %>% tally(), by = 'cell_type_singler_blueprintencode_main') %>% mutate( id_to_merge = paste0(cell_type_singler_blueprintencode_main, '_', gene), percent_cells = cell_count / n ) # merge the two data frames created before and plot the data p <- left_join( expression_levels_per_group, percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells), by = 'id_to_merge' ) %>% mutate( cell_type_singler_blueprintencode_main = factor(cell_type_singler_blueprintencode_main, levels = rev(group_ids)), gene = factor(gene, levels = genes_to_show) ) %>% arrange(gene) %>% ggplot(aes(gene, cell_type_singler_blueprintencode_main)) + geom_point(aes(color = expression, size = percent_cells)) + scale_color_distiller( palette = 'Reds', direction = 1, name = 'Log-normalised\nexpression', guide = guide_colorbar(frame.colour = "black", ticks.colour = "black") ) + scale_size(name = 'Percent\nof cells', labels = scales::percent) + labs(y = 'Cell type', color = 'Expression') + coord_fixed() + theme_bw() + theme( axis.title = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p ggsave('plots/marker_genes_by_cell_type_as_dot_plot.png', p, height = 5, width = 6)
13.2.2 根据聚类结果进行划分 group_ids <- levels(seurat@meta.data$seurat_clusters) genes_to_show <- seurat@misc$marker_genes$cerebro_seurat$seurat_clusters %>% group_by(seurat_clusters) %>% arrange(p_val_adj) %>% filter(row_number() == 1) %>% arrange(seurat_clusters) %>% pull(gene) genes_to_show <- genes_to_show[!duplicated(genes_to_show)] expression_levels_per_group <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which(seurat@meta.data$seurat_clusters == x) Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_group]) } ) %>% t() %>% as.data.frame() expression_levels_per_group <- expression_levels_per_group[,!duplicated(colnames(expression_levels_per_group))] %>% mutate(cluster = rownames(.)) %>% select(cluster, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(expression = value) %>% mutate(id_to_merge = paste0(cluster, '_', gene)) percentage_of_cells_expressing_gene <- vapply( group_ids, FUN.VALUE = numeric(length(genes_to_show)), function(x) { cells_in_current_group <- which(seurat@meta.data$seurat_cluster == x) Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_group] != 0) } ) %>% t() %>% as.data.frame() percentage_of_cells_expressing_gene <- percentage_of_cells_expressing_gene[,!duplicated(colnames(percentage_of_cells_expressing_gene))] %>% mutate(cluster = rownames(.)) %>% select(cluster, everything()) %>% pivot_longer( cols = c(2:ncol(.)), names_to = 'gene' ) %>% dplyr::rename(cell_count = value) %>% left_join( ., seurat@meta.data %>% group_by(seurat_clusters) %>% tally() %>% dplyr::rename(cluster = seurat_clusters), by = 'cluster') %>% mutate( id_to_merge = paste0(cluster, '_', gene), percent_cells = cell_count / n ) p <- left_join( expression_levels_per_group, percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells), by = 'id_to_merge' ) %>% mutate( cluster = factor(cluster, levels = rev(group_ids)), gene = factor(gene, levels = genes_to_show) ) %>% arrange(gene) %>% ggplot(aes(gene, cluster)) + geom_point(aes(color = expression, size = percent_cells)) + scale_color_distiller( palette = 'Reds', direction = 1, name = 'Log-normalised\nexpression', guide = guide_colorbar(frame.colour = "black", ticks.colour = "black") ) + scale_size(name = 'Percent\nof cells', labels = scales::percent) + labs(y = 'Cluster', color = 'Expression') + coord_fixed() + theme_bw() + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p ggsave('plots/marker_genes_by_cluster_as_dot_plot.png', p, height = 7, width = 8)
14 基因集中的表达量 我们根据细胞类型划分检查GO通路B_CELL_ACTIVATION中基因的表达量
library(msigdbr) gene_set <- msigdbr(species = 'Homo sapiens', category = 'C5', subcategory = 'BP') %>% filter(gs_name == 'GOBP_B_CELL_ACTIVATION') %>% pull(gene_symbol) %>% unique() gene_set_present <- gene_set[which(gene_set %in% rownames(seurat@assays$SCT@data))] head(gene_set_present) # [1] "ABL1" "ADA" "ADAM17" "ADGRG3" "AHR" "AKAP17A"
14.1 Mean log-expression temp_labels <- seurat@meta.data %>% group_by(cell_type_singler_blueprintencode_main) %>% tally() %>% dplyr::rename('cell_type' = cell_type_singler_blueprintencode_main) p <- tibble( cell_type = seurat@meta.data$cell_type_singler_blueprintencode_main, mean_expression = Matrix::colMeans(seurat@assays$SCT@data[gene_set_present,]) ) %>% ggplot(aes(x = cell_type, y = mean_expression, fill = cell_type)) + geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) + geom_text( data = temp_labels, aes( x = cell_type, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1 ), color = 'black', size = 2.8 ) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Mean log expression', labels = scales::comma, expand = c(0.08, 0)) + labs(title = "GO term 'B_CELL_ACTIVATION'") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p ggsave('plots/gene_set_expression_b_cell_activation_mean.png', p, height = 4, width = 7)
14.2 Module score seurat <- AddModuleScore( seurat, features = list(gene_set_present), name = 'GO_B_CELL_ACTIVATION', assay = 'SCT' ) temp_labels <- seurat@meta.data %>% group_by(cell_type_singler_blueprintencode_main) %>% tally() p <- seurat@meta.data %>% ggplot( aes( x = cell_type_singler_blueprintencode_main, y = GO_B_CELL_ACTIVATION1, fill = cell_type_singler_blueprintencode_main ) ) + geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) + geom_text( data = temp_labels, aes( x = cell_type_singler_blueprintencode_main, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1 ), color = 'black', size = 2.8 ) + theme_bw() + scale_fill_manual(values = custom_colors$discrete) + scale_x_discrete() + scale_y_continuous(name = 'Module score', labels = scales::comma, expand = c(0.08, 0)) + labs(title = "GO term 'B_CELL_ACTIVATION'") + theme( axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), legend.position = 'none' ) p ggsave('plots/gene_set_expression_b_cell_activation_module_score.png', p, height = 4, width = 7)
15 GSEA富集分析 15.1 GSVA 为了说明原因,我们将使用来自MSigDB的50个Hallmark基因集,并将单核细胞与造血干细胞(HSC;控制)进行比较。感谢Alessandro Raveane的代码。
# function to read GMT file read_GMT_file <- function(file) { gmt <- readr::read_delim( file, delim = ';', col_names = c('X1'), col_types = readr::cols() ) gene_set_genes <- list() for ( i in seq_len(nrow(gmt)) ) { temp_genes <- strsplit(gmt$X1[i], split = '\t')[[1]] %>% unlist() temp_genes <- temp_genes[3:length(temp_genes)] gene_set_genes[[i]] <- temp_genes } gene_set_loaded <- list( genesets = gene_set_genes, geneset.names = lapply(strsplit(gmt$X1, split = '\t'), '[', 1) %>% unlist(), geneset.description = lapply( strsplit(gmt$X1, split = '\t'), '[', 2 ) %>% unlist() ) return(gene_set_loaded) } # load gene sets from GMT file gene_sets <- read_GMT_file('h.all.v2023.2.Hs.symbols.gmt') # set gene set names names(gene_sets$genesets) <- gene_sets$geneset.names # get indices of cells which are either HSC or monocytes cells_to_analyze <- seurat@meta.data %>% mutate(row_number = row_number()) %>% filter(grepl(cell_type_singler_blueprintencode_main, pattern = 'HSC|Monocytes')) %>% arrange(cell_type_singler_blueprintencode_main) %>% pull(row_number) # get list of genes unique genes across all gene sets genes_to_analyze <- gene_sets$genesets %>% unlist() %>% unique() # filter gene list for those which are present in the data set genes_to_analyze <- genes_to_analyze[which(genes_to_analyze %in% rownames(seurat@assays$SCT@counts))] # get expression matrix and reduce it to cells and genes of interest expression_matrix <- seurat@assays$SCT@counts[ genes_to_analyze , cells_to_analyze] %>% as.matrix() # perform GSVA gsva <- GSVA::gsva( expression_matrix, gset.idx.list = gene_sets$genesets, parallel.sz = 1 ) # load limma library for statistical testing library(limma) #HSC细胞和Monocytes细胞的差异表达分析 # generate design matrix design_matrix <- tibble( control = 1, test = c( rep(0, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'HSC') %>% nrow()), rep(1, seurat@meta.data %>% filter(cell_type_singler_blueprintencode_main == 'Monocytes') %>% nrow()) ) ) head(design_matrix) # A tibble: 6 x 2 # control test # # 1 1 0 # 2 1 0 # 3 1 0 # 4 1 0 # 5 1 0 # 6 1 0 # fit linear model, followed by empirical Bayes statistics for differential # enrichment analysis fit <- lmFit(gsva, design_matrix) fit <- eBayes(fit) # prepare data for plotting data <- topTable(fit, coef = 'test', number = 50) %>% mutate(gene_set = rownames(fit$t)) %>% arrange(t) %>% mutate( gene_set = factor(gene_set, levels = gene_set), just = ifelse(t < 0, 0, 1), nudge_y = ifelse(t < 0, 1, -1), color = ifelse(t < -5 | t > 5, 'black', 'grey') ) DataFrame(data) # DataFrame with 50 rows and 10 columns # logFC AveExpr t P.Value adj.P.Val B gene_set just nudge_y color # # 1 -0.369443 -0.1569690 -35.1619 1.26066e-186 1.05055e-185 415.980 HALLMARK_TGF_BETA_SIGNALING 0 1 black # 2 -0.251413 -0.0820012 -27.2261 1.96152e-127 8.91598e-127 279.761 HALLMARK_NOTCH_SIGNALING 0 1 black # 3 -0.288169 -0.1202293 -26.1464 1.40712e-119 5.41201e-119 261.689 HALLMARK_ESTROGEN_RESPONSE_EARLY 0 1 black # 4 -0.135034 -0.1511146 -22.3565 8.50194e-93 2.36165e-92 200.094 HALLMARK_INTERFERON_ALPHA_RESPONSE 0 1 black # 5 -0.203651 -0.1049498 -22.0728 7.44006e-91 1.95791e-90 195.628 HALLMARK_INTERFERON_GAMMA_RESPONSE 0 1 black # ... ... ... ... ... ... ... ... ... ... ... # 46 0.200259 0.25341594 35.9250 2.31901e-192 2.31901e-191 429.182 HALLMARK_WNT_BETA_CATENIN_SIGNALING 1 -1 black # 47 0.293113 -0.08115610 36.1995 2.00784e-194 2.50980e-193 433.929 HALLMARK_MITOTIC_SPINDLE 1 -1 black # 48 0.338449 0.11583774 36.2560 7.56196e-195 1.26033e-193 434.906 HALLMARK_CHOLESTEROL_HOMEOSTASIS 1 -1 black # 49 0.251678 0.00262136 37.5117 2.86772e-204 7.16930e-203 456.592 HALLMARK_HYPOXIA 1 -1 black # 50 0.282907 -0.05021404 48.5971 1.72523e-285 8.62616e-284 643.571 HALLMARK_TNFA_SIGNALING_VIA_NFKB 1 -1 black # plot t-value p <- ggplot(data = data, aes(x = gene_set, y = t, fill = t)) + geom_col() + geom_hline(yintercept = c(-5,5), linetype = 'dashed', color = 'grey80') + geom_text( aes( x = gene_set, y = 0, label = gene_set, hjust = just, color = color ), nudge_y = data$nudge_y, size = 3 ) + scale_x_discrete(name = '', labels = NULL) + scale_y_continuous(name = 't-value', limits = c(-55,55)) + scale_fill_distiller(palette = 'Spectral', limits = c(-max(data$t), max(data$t))) + scale_color_manual(values = c('black' = 'black', 'grey' = 'grey')) + coord_flip() + theme_bw() + theme( panel.grid = element_blank(), axis.ticks.y = element_blank(), legend.position = 'none' ) p ggsave('plots/gsva_hallmark_gene_sets_monocytes_vs_hsc.png', height = 7.5, width = 7.5)
16 保存结果和Session info 最后我们将seurat对象保存,可以用于后续分析
saveRDS(seurat, 'data/seurat.rds')
保存我们使用的R和R包版本信息,以便后续进行可重复性分析。
devtools::session_info()