小提琴图是用来展示多组数据的分布状态以及概率密度 。现在生物信息学数据展示最常用的可视化方式之一。

关于小提琴的代码之前已经介绍了很多,大家可以直接通过本站的搜索框搜索。本文直接通过示例绘制分组小提琴绘制,解决大样本数据分析难点。


library(ggplot2)
library(dplyr)

# 来源https://github.com/KPLab/SCS_CAF

# violin plots represent gene expression for each subpopulation. The color of each violin
# represents the mean gene expression after log2 transformation.

# gene: Gene name of interest as string. DATAuse: Gene expression matrix with rownames containing 
# gene names. tsne.popus = dbscan output, axis= if FALSE no axis is printed. legend_position= default "none" 
# indicates where legend is plotted. gene_name = if FALSE gene name will not be included in the plot.

plot.violin2 <-
    function(gene,
             Data,
             Group,
             axis = FALSE,
             legend_position = "none",
             gene_name = FALSE) {
        testframe <-
            data.frame(expression = as.numeric(Data[paste(gene), ]),
                       Population = Group)
        testframe$Population <- as.factor(testframe$Population)
        colnames(testframe) <- c("expression", "Population")
        
        col.mean <- vector()
        for (i in levels(testframe$Population)) {
            col.mean <-
                c(col.mean, mean(testframe$expression[which(testframe$Population == i)]))
        }
        col.mean <- log2(col.mean + 1)
        
        col.means <- vector()
        
        for (i in testframe$Population) {
            col.means <- c(col.means, col.mean[as.numeric(i)])
        }
        testframe$Mean <- col.means
        testframe$expression <- log2(testframe$expression + 1)
        
        p <-
            ggplot(testframe,
                   aes(
                       x = Population,
                       y = expression,
                       fill = Mean,
                       color = Mean
                   )) +
            geom_violin(scale = "width") +
            labs(title = paste(gene),
                 y = "log2(expression)",
                 x = "Population") +
            theme_classic() +
            
            scale_color_gradientn(
                colors = c("#FFFF00", "#FFD000", "#FF0000", "#360101"),
                limits = c(0, 14)
            ) +
            scale_fill_gradientn(
                colors = c("#FFFF00", "#FFD000", "#FF0000", "#360101"),
                limits = c(0, 14)
            ) +
            
            #theme(axis.title.y =  element_blank()) +
            theme(axis.ticks.y =  element_blank()) +
            theme(axis.line.y =   element_blank()) +
            theme(axis.text.y =   element_blank()) +
            theme(axis.title.x = element_blank()) +
            
            theme(legend.position = legend_position)
        
        if (axis == FALSE) {
            p <- p +
                theme(
                    axis.line.x = element_blank(),
                    axis.text.x = element_blank(),
                    axis.ticks.x = element_blank()
                )
        }
        if (gene_name == FALSE) {
            p <- p +  theme(plot.title = element_blank())
        } else{
            p <- p + theme(plot.title = element_text(size = 10, face = "bold"))
        }
        p + ylab(gene)
    }

# 演示数据
data <- read.csv("data.csv", header = T, row.names = 1)
group <- read.csv("cluster.csv", header = F)$V1


gs <- list(NULL)

gs[[1]] <- plot.violin2(gene = "Nuf2",
                        Data = data,
                        Group = group)

gs[[2]] <- plot.violin2(gene = "Mki67",
                        Data = data,
                        Group = group)

gs[[3]] <- plot.violin2(gene = "Top2a",
                        Data = data,
                        Group = group)

gs[[4]] <- plot.violin2(gene = "Cep55",
                        Data = data,
                        Group = group,
                        axis = T)
# 图片分组
pdf("violin.pdf", width = 3, height = 6)
gridExtra::grid.arrange(grobs=gs,ncol = 1)
dev.off()

参考资料:

1.https://github.com/KPLab/SCS_CAF