一、总体代码
Seurat包官方文档https://satijalab.org/seurat/articles/pbmc3k_tutorial
#加载所需R包library(Seurat)library(tidyverse)library(dplyr)library(patchwork)library(ggplot2)#读取10X数据scRNA.counts=Read10X(“~/path)#创建Seurat对象scRNA <- CreateSeuratObject(scRNA.counts ,min.cells = 3,project="os", min.features = 300)#去除线粒体和红细胞相关的基因##去除线粒体scRNA[["percent.mt"]] <- PercentageFeatureSet(scRNA, pattern = "^MT-")##红细胞基因比例HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")HB_matched <- match(HB.genes, rownames(scRNA@assays$RNA))HB.genes <- rownames(scRNA@assays$RNA)[HB_matched]HB.genes <- HB.genes[!is.na(HB.genes)]scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes)#观察数据结构levels(scRNA@active.ident)col.num <- length(levels(scRNA@active.ident))violin <- VlnPlot(scRNA,features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"),cols =rainbow(col.num),pt.size = 0.01,ncol = 4) +theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())#均一化数据scRNAa <- NormalizeData(scRNAa, normalization.method = "LogNormalize", scale.factor = 10000)#寻找高变基因scRNAa <- FindVariableFeatures(scRNAa, selection.method = "vst", nfeatures = 2000)top10 <- head(VariableFeatures(scRNA1), 10)plot1 <- VariableFeaturePlot(scRNA1)plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5)plot <- CombinePlots(plots = list(plot1, plot2),legend="bottom")#数据缩放scale.genes <- rownames(scRNAa)scRNAb <- ScaleData(scRNAa, features = scale.genes)#去除周期的影响CaseMatch(c(cc.genes$s.genes,cc.genes$g2m.genes),VariableFeatures(scRNA1))g2m_genes = cc.genes$g2m.genesg2m_genes = CaseMatch(search = g2m_genes, match = rownames(scRNA1))s_genes = cc.genes$s.geness_genes = CaseMatch(search = s_genes, match = rownames(scRNA1))scRNA11 <- CellCycleScoring(object=scRNA11, g2m.features=g2m_genes, s.features=s_genes)scRNA2 <- CellCycleScoring(object=scRNA2, g2m.features=g2m_genes, s.features=s_genes)#去除周期的影响library(future)plan("multicore",workers = 128)plan("multisession", workers = 64)options(future.globals.maxSize= 2147483648)scRNAb <- ScaleData(scRNA11, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(scRNA1))#数据降维scRNAb1 <- RunPCA(scRNAb, features = VariableFeatures(scRNAb))plot1 <- DimPlot(scRNAb1, reduction = "pca", group.by="orig.ident")plot1ElbowPlot(scRNAb1, ndims=20, reduction="pca")pc.num=1:20scRNAb1 <- FindNeighbors(scRNAb1, dims = pc.num)scRNAb1 <- FindClusters(scRNAb1, resolution = 1.0)scRNAb1<-BuildClusterTree(scRNAb1)install.packages('ape')PlotClusterTree(scRNAb1)scRNAb1 = RunTSNE(scRNAb1, dims = pc.num)embed_tsne <- Embeddings(scRNAb1, 'tsne')write.csv(embed_tsne,'embed_tsne.csv')plot1 = DimPlot(scRNAb1, reduction = "tsne")plot1DimPlot(scRNAb1, reduction = "tsne",label = TRUE)ggsave("tSNE.pdf", plot = plot1, width = 8, height = 7)scRNAb1 <- RunUMAP(scRNAb1, dims = pc.num)embed_umap <- Embeddings(scRNAb1, 'umap')write.csv(embed_umap,'embed_umap.csv')plot2 = DimPlot(scRNAb1, reduction = "umap")plot2#加载所需R包 library(Seurat) library(tidyverse) library(dplyr) library(patchwork) library(ggplot2) #读取10X数据 scRNA.counts=Read10X(“~/path) #创建Seurat对象 scRNA <- CreateSeuratObject(scRNA.counts ,min.cells = 3,project="os", min.features = 300) #去除线粒体和红细胞相关的基因 ##去除线粒体 scRNA[["percent.mt"]] <- PercentageFeatureSet(scRNA, pattern = "^MT-") ##红细胞基因比例 HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") HB_matched <- match(HB.genes, rownames(scRNA@assays$RNA)) HB.genes <- rownames(scRNA@assays$RNA)[HB_matched] HB.genes <- HB.genes[!is.na(HB.genes)] scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes) #观察数据结构 levels(scRNA@active.ident) col.num <- length(levels(scRNA@active.ident)) violin <- VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), cols =rainbow(col.num), pt.size = 0.01, ncol = 4) + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) #均一化数据 scRNAa <- NormalizeData(scRNAa, normalization.method = "LogNormalize", scale.factor = 10000) #寻找高变基因 scRNAa <- FindVariableFeatures(scRNAa, selection.method = "vst", nfeatures = 2000) top10 <- head(VariableFeatures(scRNA1), 10) plot1 <- VariableFeaturePlot(scRNA1) plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) plot <- CombinePlots(plots = list(plot1, plot2),legend="bottom") #数据缩放 scale.genes <- rownames(scRNAa) scRNAb <- ScaleData(scRNAa, features = scale.genes) #去除周期的影响 CaseMatch(c(cc.genes$s.genes,cc.genes$g2m.genes),VariableFeatures(scRNA1)) g2m_genes = cc.genes$g2m.genes g2m_genes = CaseMatch(search = g2m_genes, match = rownames(scRNA1)) s_genes = cc.genes$s.genes s_genes = CaseMatch(search = s_genes, match = rownames(scRNA1)) scRNA11 <- CellCycleScoring(object=scRNA11, g2m.features=g2m_genes, s.features=s_genes) scRNA2 <- CellCycleScoring(object=scRNA2, g2m.features=g2m_genes, s.features=s_genes) #去除周期的影响 library(future) plan("multicore",workers = 128) plan("multisession", workers = 64) options(future.globals.maxSize= 2147483648) scRNAb <- ScaleData(scRNA11, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(scRNA1)) #数据降维 scRNAb1 <- RunPCA(scRNAb, features = VariableFeatures(scRNAb)) plot1 <- DimPlot(scRNAb1, reduction = "pca", group.by="orig.ident") plot1 ElbowPlot(scRNAb1, ndims=20, reduction="pca") pc.num=1:20 scRNAb1 <- FindNeighbors(scRNAb1, dims = pc.num) scRNAb1 <- FindClusters(scRNAb1, resolution = 1.0) scRNAb1<-BuildClusterTree(scRNAb1) install.packages('ape') PlotClusterTree(scRNAb1) scRNAb1 = RunTSNE(scRNAb1, dims = pc.num) embed_tsne <- Embeddings(scRNAb1, 'tsne') write.csv(embed_tsne,'embed_tsne.csv') plot1 = DimPlot(scRNAb1, reduction = "tsne") plot1 DimPlot(scRNAb1, reduction = "tsne",label = TRUE) ggsave("tSNE.pdf", plot = plot1, width = 8, height = 7) scRNAb1 <- RunUMAP(scRNAb1, dims = pc.num) embed_umap <- Embeddings(scRNAb1, 'umap') write.csv(embed_umap,'embed_umap.csv') plot2 = DimPlot(scRNAb1, reduction = "umap") plot2#加载所需R包 library(Seurat) library(tidyverse) library(dplyr) library(patchwork) library(ggplot2) #读取10X数据 scRNA.counts=Read10X(“~/path) #创建Seurat对象 scRNA <- CreateSeuratObject(scRNA.counts ,min.cells = 3,project="os", min.features = 300) #去除线粒体和红细胞相关的基因 ##去除线粒体 scRNA[["percent.mt"]] <- PercentageFeatureSet(scRNA, pattern = "^MT-") ##红细胞基因比例 HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") HB_matched <- match(HB.genes, rownames(scRNA@assays$RNA)) HB.genes <- rownames(scRNA@assays$RNA)[HB_matched] HB.genes <- HB.genes[!is.na(HB.genes)] scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes) #观察数据结构 levels(scRNA@active.ident) col.num <- length(levels(scRNA@active.ident)) violin <- VlnPlot(scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), cols =rainbow(col.num), pt.size = 0.01, ncol = 4) + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) #均一化数据 scRNAa <- NormalizeData(scRNAa, normalization.method = "LogNormalize", scale.factor = 10000) #寻找高变基因 scRNAa <- FindVariableFeatures(scRNAa, selection.method = "vst", nfeatures = 2000) top10 <- head(VariableFeatures(scRNA1), 10) plot1 <- VariableFeaturePlot(scRNA1) plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, size=2.5) plot <- CombinePlots(plots = list(plot1, plot2),legend="bottom") #数据缩放 scale.genes <- rownames(scRNAa) scRNAb <- ScaleData(scRNAa, features = scale.genes) #去除周期的影响 CaseMatch(c(cc.genes$s.genes,cc.genes$g2m.genes),VariableFeatures(scRNA1)) g2m_genes = cc.genes$g2m.genes g2m_genes = CaseMatch(search = g2m_genes, match = rownames(scRNA1)) s_genes = cc.genes$s.genes s_genes = CaseMatch(search = s_genes, match = rownames(scRNA1)) scRNA11 <- CellCycleScoring(object=scRNA11, g2m.features=g2m_genes, s.features=s_genes) scRNA2 <- CellCycleScoring(object=scRNA2, g2m.features=g2m_genes, s.features=s_genes) #去除周期的影响 library(future) plan("multicore",workers = 128) plan("multisession", workers = 64) options(future.globals.maxSize= 2147483648) scRNAb <- ScaleData(scRNA11, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(scRNA1)) #数据降维 scRNAb1 <- RunPCA(scRNAb, features = VariableFeatures(scRNAb)) plot1 <- DimPlot(scRNAb1, reduction = "pca", group.by="orig.ident") plot1 ElbowPlot(scRNAb1, ndims=20, reduction="pca") pc.num=1:20 scRNAb1 <- FindNeighbors(scRNAb1, dims = pc.num) scRNAb1 <- FindClusters(scRNAb1, resolution = 1.0) scRNAb1<-BuildClusterTree(scRNAb1) install.packages('ape') PlotClusterTree(scRNAb1) scRNAb1 = RunTSNE(scRNAb1, dims = pc.num) embed_tsne <- Embeddings(scRNAb1, 'tsne') write.csv(embed_tsne,'embed_tsne.csv') plot1 = DimPlot(scRNAb1, reduction = "tsne") plot1 DimPlot(scRNAb1, reduction = "tsne",label = TRUE) ggsave("tSNE.pdf", plot = plot1, width = 8, height = 7) scRNAb1 <- RunUMAP(scRNAb1, dims = pc.num) embed_umap <- Embeddings(scRNAb1, 'umap') write.csv(embed_umap,'embed_umap.csv') plot2 = DimPlot(scRNAb1, reduction = "umap") plot2
二、重点函数解释
1.Read10X
Read10X() 函数从 10X 读取 cellranger流程的输出,返回唯一分子识别(UMI)计数矩阵。该矩阵中的值代表在每个细胞(列)中检测到的每个特征(即基因;行)的分子数。
2.CreateSeuratObject
使用计数矩阵创建一个 Seurat 对象。该对象可作为一个容器,同时包含单细胞数据集的数据(如计数矩阵)和分析(如 PCA 或聚类结果)。
3.PercentageFeatureSet()
PercentageFeatureSet()函数计算线粒体QC指标,该函数计算源自一组特征的计数百分比。
nFeature_RNA: 可以理解为被检测到的基因数
nCount_RNA: 可以理解为被测序到的转录本的数量
R语言正则表达式常用符号:
4.NormalizeData()
从数据集中移除不需要的细胞后,下一步就是对数据进行归一化处理。默认情况下,我们采用全局缩放归一化方法 "LogNormalize"(对数归一化),将每个细胞的特征表达测量值按总表达量归一化,再乘以缩放因子(默认为 10,000),然后对结果进行对数转换。归一化值存储在 [["RNA"]]@data。
5.FindVariableFeatures()
通过直接模拟单细胞数据固有的均值-方差关系改进了之前的版本,并在 FindVariableFeatures() 函数中实现。默认情况下,会返回每个数据集的 2000 个特征。这些特征将用于下游分析,如 PCA。
6.ScaleData()
应用线性变换("缩放"),这是 PCA 等降维技术之前的标准预处理步骤。ScaleData() 函数
· 移动每个基因的表达量,使细胞间的平均表达量为 0
· 调整每个基因的表达量,使细胞间的方差为 1
· 这一步在下游分析中给予同等权重,因此高表达基因不会占主导地位
结果存储在 [["RNA"]]@scale.data中
在 Seurat v2 中,可以使用 ScaleData() 函数移除单细胞数据集中不需要的变异源。例如,可以 "回归 "与细胞周期阶段或线粒体污染相关的异质性。在 Seurat v3 中,ScaleData()仍然支持这些特征,即
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
7.RunPCA()
决定数据的维度:
JackStrawPlot() 函数提供了一种可视化工具,用于比较每个 PC 的 p 值分布与均匀分布(虚线)。显著 "的 PC 将显示出低 p 值特征的强烈富集(虚线上方的实心曲线)。在这种情况下,在前 10-12 个 PC 之后,显著性会急剧下降。
另一种启发式方法是生成 "肘图":根据每个主成分所解释的方差百分比对主成分进行排序(ElbowPlot() 函数)。可以观察到 PC9-10 周围出现了一个 "弯头",表明前 10 个 PC 中捕捉到了大部分真实信号。
8.FindNeighbors/FindClusters
这些方法将细胞嵌入一个图结构–例如 K 近邻(KNN)图,在具有相似特征表达模式的细胞之间划出边,然后尝试将该图划分为高度相互关联的 "准小区 "或 "社区"。
与 PhenoGraph 一样,首先根据 PCA 空间中的欧氏距离构建一个 KNN 图,然后根据两个单元的局部邻域中的共享重叠(Jaccard 相似性)来细化它们之间的边权重。这一步骤使用 FindNeighbors() 函数执行,并将之前定义的数据集维度(前 10 个 PCs)作为输入。
为了对细胞进行聚类,我们接下来会应用模块化优化技术,如卢万算法(默认值)或 SLM,对细胞进行迭代聚类,目的是优化标准模块化函数。FindClusters() 函数实现了这一过程,并包含一个分辨率参数,用于设置下游聚类的 "粒度",数值越大,聚类数量越多。我们发现,将该参数设置在 0.4-1.2 之间,通常能为 3K 左右的单细胞数据集带来良好的结果。对于更大的数据集,最佳分辨率通常会提高。可以使用 Idents() 函数找到聚类。
- 最新
- 最热
只看作者