如何使用ggplot函数将内容添加到cnetplot中?

7
我有一个R中的数据集,它是“Formal class enrichResult”的类别。我使用DOSE包中的cnetplot()绘制了该数据集中的基因 - 该函数应该是基于ggplot图形的。它会以一个相互作用通路中的基因网络为图表展示: enter image description here 我的代码如下:
kegg_organism = "hsa"
kegg_enrich <-  enrichKEGG(gene   = df$geneID,
                   organism     = 'hsa',
                   pvalueCutoff = 0.05,
                   pAdjustMethod = 'fdr')

kegg <- setReadable(kegg_enrich, 'org.Hs.eg.db', 'ENTREZID')
kegg_genes <- kegg[,]

gene_of_interest <- dplyr::filter(kegg_genes, grepl('CALML6', geneID))
gene_of_interest <- enrichDF2enrichResult(gene_of_interest)

gene_list_scores <- df$Score
names(gene_list_scores) <- df$geneID
gene_list_scores <- na.omit(gene_list_scores)
gene_list_scores <- sort(gene_list_scores, decreasing = TRUE)

plot <- cnetplot(gene_of_interest, foldChange = gene_list_scores)

plot <- plot + scale_color_gradient2(name='Score', low='steelblue', high='firebrick')

我希望在这个图形上叠加表示药物类型类别的形状,以表示基因所对应的药物类型类别,但我在尝试时遇到了困难。

我的药物数据与enrichResult数据是分开的,我的药物数据如下:

drugs <- structure(list(Gene = c("ACE", "AQP1", "IRS2", "SMAD3", 
"HLA-B"), Druggability = c("KINASE", "DRUGGABLE GENOME", "CLINICALLY ACTIONABLE", 
"KINASE", "CLINICALLY ACTIONABLE")), row.names = c(NA, -5L), class = c("data.table", 
"data.frame"))

#drugs data is 2 columns like:
Gene        Druggability
TLN2        KINASE
PDGFC       DRUGGABLE GENOME 

我正在编写代码来叠加药物形状在图表上,像这样:
drugs <- fread('genes_dgidb_export.tsv')
drugs <- dplyr::select(drugs, Gene, Druggability)
drugs <- drugs[1:80,] #making data same size otherwise I get an Aesthetics number mismatch error
Druggability <- drugs$Druggability
names(Druggability) <- drugs$Gene

options(ggrepel.max.overlaps = Inf)
pother <- cnetplot(gene_of_interest,
                   categorySize ='pvalue', 
                   foldChange = gene_list_scores, 
                  )

pother <- pother + scale_color_gradient2(name='Score', low='steelblue', high='red') +
  scale_size_continuous(range = c(2, 8)) 

#Overlaying shapes by drug:
library(ggraph)

pother + geom_node_point(aes(shape=Druggability)) +
  scale_shape_manual(values=c(2, 5, 3, 4))

在这里输入图片描述

但是,这里叠加的形状不匹配/位置错误,并且将药物分配给了通路节点中的米色部分,而它们应该只被分配给红色基因节点。

除了现有函数之外,是否还有其他函数可用于在此图中叠加形状点以正确对应基因?

示例输入数据和用于获取enrichResult的软件包:

library(data.table)
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(DOSE)
library(clusterProfiler)
library(multienrichjam)
library(RColorBrewer)

df <- structure(list(geneID = c(107986084L, 100874369L, 100506380L, 100288797L, 
100137049L, 100130742L, 723788L, 643136L, 641649L, 497258L), 
    Score = c(0.809859097, 0.913054347, 0.423823357, 0.369738668, 
    0.798110485, 0.78013134, 0.764999211, 0.231925398, 0.317150593, 
    0.754656732)), row.names = c(NA, -10L), class = c("data.table", 
"data.frame"))

kegg_genes <- structure(list(ID = c("hsa04924", "hsa04925", "hsa04022", "hsa04934", 
"hsa05166", "hsa04218", "hsa05410", "hsa04024", "hsa05414", "hsa04933"
), Description = c("Renin secretion", "Aldosterone synthesis and secretion", 
"cGMP-PKG signaling pathway", "Cushing syndrome", "Human T-cell leukemia virus 1 infection", 
"Cellular senescence", "Hypertrophic cardiomyopathy", "cAMP signaling pathway", 
"Dilated cardiomyopathy", "AGE-RAGE signaling pathway in diabetic complications"
), GeneRatio = c("22/381", "25/381", "31/381", "28/381", "33/381", 
"27/381", "20/381", "32/381", "20/381", "20/381"), BgRatio = c("69/8093", 
"98/8093", "167/8093", "155/8093", "222/8093", "156/8093", "90/8093", 
"219/8093", "96/8093", "100/8093"), pvalue = c(2.67752556162864e-13, 
1.75172505637096e-12, 3.06146322777988e-11, 5.54589140412457e-10, 
2.99051127478473e-09, 3.0449722225371e-09, 4.26754375382907e-09, 
8.08219152477885e-09, 1.39375994107875e-08, 2.90351315318027e-08
), p.adjust = c(8.19322821858365e-11, 2.68013933624758e-10, 3.12269249233547e-09, 
4.2426069241553e-08, 1.55293583349392e-07, 1.55293583349392e-07, 
1.86552626953099e-07, 3.09143825822791e-07, 4.73878379966775e-07, 
8.88475024873162e-07), qvalue = c(4.70680809254719e-11, 1.53967412849448e-10, 
1.79391003171663e-09, 2.43727332760211e-08, 8.92123440638062e-08, 
8.92123440638062e-08, 1.0716989577285e-07, 1.77595524294483e-07, 
2.72231473871522e-07, 5.10407049032742e-07), geneID = c("CALML6/CALML4/PLCB1/CLCA2/PPP3CA/PLCB3/PDE3A/PDE1A/NPR1/NPPA/GNAS/EDNRA/EDN1/ACE/CACNA1D/CACNA1C/AQP1/AGT/ADRB2/ADRB1/ADORA1/ADCY5", 
"CALML6/CALML4/KCNK9/PRKD3/PLCB1/CREB5/PRKD1/PRKCE/PLCB3/NPR1/NPPA/LDLR/KCNK3/GNAS/CYP21A2/CYP11B2/CAMK2B/CACNA1D/CACNA1C/ATP2B1/ATF1/AGT/ADCY9/ADCY5/ADCY3", 
"CALML6/CALML4/PLCB1/CREB5/IRS2/PDE5A/SLC8A1/MAP2K2/PRKCE/PPP3CA/PLCB3/PDE3A/NPR1/NPPB/NPPA/NFATC2/MEF2A/INSR/GTF2I/EDNRA/CACNA1D/CACNA1C/ATP2B1/AKT2/ADRB2/ADRB1/ADRA2B/ADORA1/ADCY9/ADCY5/ADCY3", 
"WNT3A/LEF1/PDE11A/PLCB1/CREB5/TCF7L2/MAP2K2/PLCB3/PBX1/LDLR/KCNK3/GNAS/DVL1/CYP21A2/CYP17A1/CYP11B1/CDKN2C/CDKN1A/CDK6/CCNE1/CAMK2B/CACNA1D/CACNA1C/CCND1/AGT/ADCY9/ADCY5/ADCY3", 
"TLN2/VAC14/CREB5/CDC16/KAT2B/PIK3R3/MAD1L1/XPO1/TLN1/TGFB2/TGFB1/TERT/STAT5B/RELA/PTEN/MAP2K2/PPP3CA/NFKBIA/NFATC2/SMAD3/IL6/HLA-DQB1/HLA-DQA1/HLA-B/CDKN2C/CDKN1A/CDC20/CCNE1/CCND1/AKT2/ADCY9/ADCY5/ADCY3", 
"CALML6/CALML4/TRPM7/HIPK2/BTRC/PIK3R3/TGFB2/TGFB1/RRAS/RELA/PTEN/MAP2K2/PPP3CA/NFATC2/SMAD3/IL6/IGFBP3/HLA-B/FOXO3/CDKN1A/CDK6/CDC25A/CCNE1/CACNA1D/ZFP36L1/CCND1/AKT2", 
"PRKAG2/CACNA2D2/TGFB2/TGFB1/SLC8A1/SGCD/PRKAG1/ITGB5/ITGA9/ITGA1/IL6/IGF1/EDN1/ACE/DAG1/CACNB4/CACNB2/CACNA1D/CACNA1C/AGT", 
"CALML6/CALML4/PDE10A/CREB5/PIK3R3/RRAS/RELA/MAP2K2/PDE3A/NPR1/NPPA/NFKBIA/HTR1D/GRIN2B/GNAS/GIPR/GIP/EDNRA/EDN1/CHRM2/CAMK2B/CACNA1D/CACNA1C/ATP2B1/AKT2/ADRB2/ADRB1/ADORA1/ADCY9/ADCY5/ADCY3/ACOX1", 
"CACNA2D2/TGFB2/TGFB1/SLC8A1/SGCD/ITGB5/ITGA9/ITGA1/IGF1/GNAS/DAG1/CACNB4/CACNB2/CACNA1D/CACNA1C/AGT/ADRB1/ADCY9/ADCY5/ADCY3", 
"PLCD3/NOX4/PLCB1/PIK3R3/VEGFA/TGFB2/TGFB1/STAT5B/RELA/PRKCE/PLCB3/SMAD3/IL6/FN1/EDN1/COL4A4/BCL2/CCND1/AKT2/AGT"
), Count = c(22L, 25L, 31L, 28L, 33L, 27L, 20L, 32L, 20L, 20L
)), row.names = c("hsa04924", "hsa04925", "hsa04022", "hsa04934", 
"hsa05166", "hsa04218", "hsa05410", "hsa04024", "hsa05414", "hsa04933"
), class = "data.frame")

使用上面示例数据运行后输出的图表(之前的图表是整个数据的):

enter image description here

我在红色下划线处用Paint标出了已分配可药性的基因,但由于某种原因,这些形状会连接到通路节点。


无法重现,gene_of_interest为空。 - zx8754
但我猜问题出在这一行:geom_node_point,尝试在绘图之前对数据进行子集化,仅包括在“可药性”中存在的基因。 - zx8754
谢谢你的帮助,可能样本不够大无法通过kegg富集分析。我已经尝试提供kegg_genes作为样本,现在应该可以运行到绘图了(输出不同的通路但原理相同)。代码现在应该都能运行并得出结果,唯一的问题是在df中只给出了10个样本基因和它们的Scores,所以绘图点将不会被标记为红色,但我知道这一步对我来说是有效的,所以我确保我的Druggability数据中的基因应该在示例图上显示形状。 - DN1
尽管我的真实数据中的Druggability基因列表仅由进入cnetplot的基因组成(没有具有Druggability的基因在该列中被分配为“NA”),但没有任何未出现在图中的额外基因。 - DN1
查看cnetplot函数的源代码,它看起来很复杂,但是可以修改该函数以获得您想要的效果。也许可以尝试联系作者:于广闯(guangchuangyu@gmail.com / https://twitter.com/guangchuangyu)。 - jared_mamrot
显示剩余3条评论
1个回答

1
  1. 我使用了clusterProfiler的示例来使代码可重现(https://yulab-smu.top/biomedical-knowledge-mining-book/universal-api.html)

  2. 我使用了https://www.dgidb.org/downloads中的categories.tsv文件

library(clusterProfiler)
library(dplyr)
library(ggraph)
library(msigdbr)

data(geneList, package="DOSE")
cat_table = read.table("categories.tsv",sep="\t",header = T,quote = "" )
geneList=geneList[c(1:50,12476:12495)] # a 70 gene sub list to simplify the plot
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>% 
  dplyr::select(gs_name, entrez_gene) 
em2 <- GSEA(geneList, TERM2GENE = m_t2g)
em2 = setReadable(em2, 'org.Hs.eg.db', 'ENTREZID')
p = cnetplot(em2,foldChange = geneList)
m = match(p$data$name ,cat_table$entrez_gene_symbol)
category = cat_table$category[m]
p + geom_node_point(aes(shape= category))

enter image description here

这里的重点是将基因名称与包含基因名称和通路名称的cnetplot对象(p$data$name)中的名称相匹配,因此匹配至关重要。
进行验证。
cat_table[m[!is.na(m)],c(1,4)]

       entrez_gene_symbol              category
6211               KIF23                ENZYME
13765              CENPE                KINASE

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接