Biological Interpretability
In this tutorial, we will show how to explain the extracted features using CellMirror feature embeddings and linear decoder weights of cLDVAE. We continue with the previous example of tumros and cell lines data integration.
Gene set enrichment analysis for extracted features (R)
1library(here)
2library(magrittr)
3library(tidyverse)
4source(here::here('CellMirror_utils','CellMirror_methods.R'))
5
6s.loadings<-read.csv('C:\\Users\\我的电脑\\Desktop\\待办\\en[1000]_de[1000]_cLDVAE_only_salient_loadings_matrix_lr3e-06_beta1_gamma-100_bs128_time2023-03-22 12_31_29.075603.csv')
7gene_stats<-read.csv('C:\\Users\\我的电脑\\Desktop\\待办\\common_HVGs_noScale_stats_SD_mean_time2023-01-20 13_48_31.884973.csv')
8colnames(s.loadings)[1]<-'ensembl_gene_id'
9colnames(gene_stats)[1]<-'ensembl_gene_id'
10df<-s.loadings%>%dplyr::left_join(gene_stats[,c('ensembl_gene_id','symbol')],by='ensembl_gene_id')
11rownames(df)<-df$ensembl_gene_id
12df<-df[,-1]
13
14gsc_data <- GSEABase::getGmt("C:\\Users\\我的电脑\\Desktop\\待办\\c5.go.v2022.1.Hs.symbols.gmt")
15
16i <- 2
17gene_stat<-set_names(df[,i],df$symbol)
18
19cPCA_GSEA <- run_fGSEA(gsc = gsc_data,
20 gene_stat = gene_stat,
21 nperm = 1e5,
22 perm_type = 'gene') %>%
23dplyr::arrange(dplyr::desc(NES)) %>%
24dplyr::select(-leadingEdge)
25
26cPCA_GSEA_data <- rbind.data.frame(cPCA_GSEA %>% dplyr::arrange(dplyr::desc(NES)) %>% head(8))
27cPCA_GSEA_data$pathway <- factor(cPCA_GSEA_data$pathway, levels = cPCA_GSEA_data$pathway)
28cPCA_GSEA_data$pathway <- factor(cPCA_GSEA_data$pathway, levels = unique(cPCA_GSEA_data$pathway))
29cPCA_GSEA_data <- as.data.frame(cPCA_GSEA_data)
30cPCA_GSEA_data$pval <- signif(cPCA_GSEA_data$pval, 3)
31cPCA_GSEA_data$`adjusted pval` <- signif(cPCA_GSEA_data$padj, 3)
32cPCA_GSEA_data$NES <- signif(cPCA_GSEA_data$NES, 3)
33
34cPCA_1<-cPCA_GSEA_data[,c('pathway','NES','pval')]
35colnames(cPCA_1)<-c('pathway','cPCA salient feature 1 NES','cPCA salient feature 1 pval')
36
37cPCA_2<-cPCA_GSEA_data[,c('pathway','NES','pval')]
38colnames(cPCA_2)<-c('pathway','cPCA salient feature 2 NES','cPCA salient feature 2 pval')
39
40cLDVAE_1<-cPCA_GSEA_data[,c('pathway','NES','pval')]
41colnames(cLDVAE_1)<-c('pathway','cLDVAE salient feature 1 NES','cLDVAE salient feature 1 pval')
42
43cLDVAE_2<-cPCA_GSEA_data[,c('pathway','NES','pval')]
44colnames(cLDVAE_2)<-c('pathway','cLDVAE salient feature 2 NES','cLDVAE salient feature 2 pval')
45
46temp<-cPCA_1 %>% dplyr::full_join(cPCA_2, by="pathway") %>% dplyr::full_join(cLDVAE_1, by="pathway") %>% dplyr::full_join(cLDVAE_2, by="pathway")
47
48data<-c()
49for (p in temp$pathway){
50for (f in c("cPCA salient feature 1 NES","cPCA salient feature 2 NES","cLDVAE salient feature 1 NES","cLDVAE salient feature 2 NES")){
51 if(!is.na(temp[temp$pathway==p,f])){
52 add<-c(p,f,temp[temp$pathway==p,f])
53 data<-rbind(data,add)
54 }
55}
56}
57data<-as.data.frame(data)
58colnames(data)<-c("pathway","feature","NES")
59for (p in data$pathway){
60for (f in data$feature){
61 data[(data$pathway==p & data$feature==f),"pval"]<-temp[temp$pathway==p,str_replace(f,"NES","pval")]
62}
63}
64data$feature<-sapply(data$feature,function(x) str_replace(x,pattern = "NES",replacement = ""))
65data$NES<-as.numeric(data$NES)
66
67ggplot(data = data, aes(x=feature,y=pathway))+
68geom_point(aes(color=pval,size=NES))+
69scale_color_continuous(high="#fff5ee",low="#8b0000")+
70#scale_color_continuous(high="#FFFFCC",low="#FD6E32")+
71#scale_color_continuous(high="#55B0F5",low="#28547A")+
72theme_bw()+
73guides()+
74theme(axis.text.x = element_text(size=9),
75 axis.text.y = element_text(size=9))+
76scale_y_discrete(limits=data$pathway)
Scatter plot between salient features and tumor purity (R)
1library(magrittr)
2ann<-read.csv('C:\\Users\\我的电脑\\Desktop\\待办\\en[1000]_de[1000]_s2_z100_beta1_gamma-100_lr3e-6_4th_cLDVAE_mnn_k1_80_k2_100_comb_Ann_agg0.612_time2023-03-23 01_50_42.573336.csv')
3tg_s<-read.csv('C:\\Users\\我的电脑\\Desktop\\待办\\en[1000]_de[1000]_cLDVAE_only_TCGA_salient_features_lr3e-06_beta1_gamma-100_bs128_dim2_time2023-03-22 12_31_29.044376.csv')
4tg_s_cPCA<-read.csv('C:\\Users\\我的电脑\\Desktop\\待办\\cPCA_only_salient_features_s2_z100 2023-03-23 02_19_53 .csv')
5colnames(tg_s)[1]<-'sampleID'
6colnames(tg_s_cPCA)[1]<-'sampleID'
7tumor_ann<-ann[!is.na(ann$purity) & ann$type=='tumor', ]
8
9df <- tg_s %>% dplyr::inner_join(tumor_ann[,c('sampleID','purity')], by = 'sampleID') %>% dplyr::left_join(tg_s_cPCA, by='sampleID')
10
11purity<-df$purity
12salients<-df[,c('s1','s2','PC1','PC2')]
13
14library(lars)
15
16model.lasso<-lars(as.matrix(salients),purity,type='lasso')
17plot(model.lasso)
18#summary(model.lasso)
19
20cv.model.lasso<-cv.lars(as.matrix(salients),purity,K=10)
21select<-cv.model.lasso$index[which.min(cv.model.lasso$cv)]
22coef<-coef.lars(model.lasso,mode='fraction',s=select)
23coef[which(coef!=0)]
24coef[which.min(coef)]
25
26
27p4<-ggplot2::ggplot(salients,
28 ggplot2::aes(salients[,'PC2'], purity)) +
29ggplot2::geom_point(size=0.5,col='#EFD092') +
30ggplot2::ylab('Tumor purity estimate') +
31ggplot2::xlab('Salient feature 2') +
32ggpubr::stat_cor(label.y.npc = 'bottom', size=5) +
33ggplot2::geom_smooth(method = 'lm',col='Red') +
34ggplot2::theme_classic() +
35ggplot2::theme(axis.text=ggplot2::element_text(size=12),
36 axis.title.x = ggplot2::element_text(size=12),
37 axis.title.y = ggplot2::element_text(size=12),
38 axis.line = element_line(size=1.1))