library(HoneyBADGER)
library(biomaRt)
library(pheatmap)
library(uwot)
library(ggfortify)
Loading required package: ggplot2
Reading data from GEO.
exp_norm<- read.table("data/RSEM_TPM_240_NormalCells.txt")
exp_cancer<- read.table("data/GSE118389_tpm_rsem.txt")
meta_celltypes<- read.table("data/cell_types_tab_S9.txt", stringsAsFactors = FALSE, row.names=2)
meta_celltypes[,1]<- NULL
dim(meta_celltypes)
[1] 1112 1
names(meta_celltypes)<- "cell"
length(intersect(rownames(meta_celltypes), colnames(exp_cancer)))
[1] 1112
Taking a look at the data
dim(exp_norm)
[1] 23368 240
head(exp_norm[,1:10])
dim(exp_cancer)
[1] 21785 1534
head(exp_cancer[,1:10])
exp_cancer<- exp_cancer[, rownames(meta_celltypes)]
hist(colSums(exp_cancer))
hist(colSums(exp_norm))
In this case we are going to use a TPM filter of 10 for both the reference and test datasets.
mean_gene_norm<-rowMeans(exp_norm)
summary(mean_gene_norm)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.01 1.34 30.80 14.81 45114.25
length(which(mean_gene_norm> 10))
[1] 7221
exp_norm<- exp_norm[mean_gene_norm>10,]
mean_gene_cancer<- rowMeans(exp_cancer)
summary(mean_gene_cancer)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.375 5.749 44.176 25.944 21644.406
length(which(mean_gene_cancer> 10))
[1] 9041
exp_cancer<- exp_cancer[mean_gene_cancer>10,]
inter_genes<- intersect(rownames(exp_cancer), rownames(exp_norm))
length(inter_genes)
[1] 5560
exp_cancer<- exp_cancer[inter_genes,]
exp_norm <- exp_norm[inter_genes,]
set.seed(43)
sce_cells_umap <- umap(t(exp_cancer),
n_components = 2,
#a=1,
#b=0.6,
min_dist=0.01,
metric="cosine",
verbose = TRUE,
n_threads = max(1, RcppParallel::defaultNumThreads()/2) )
11:54:53 UMAP embedding parameters a = 1.896 b = 0.8006
11:54:53 Read 1112 rows and found 5560 numeric columns
11:54:53 Using Annoy for neighbor search, n_neighbors = 15
11:54:53 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:54:54 Writing NN index file to temp file /tmp/Rtmp4veicN/filef10477e73ea
11:54:54 Searching Annoy index using 4 threads, search_k = 1500
11:54:56 Annoy recall = 100%
11:54:56 Commencing smooth kNN distance calibration using 4 threads
11:54:57 Initializing from normalized Laplacian + noise
11:54:57 Commencing optimization for 500 epochs, with 24190 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:54:59 Optimization finished
colnames(sce_cells_umap)<- c("UMAP1", "UMAP2")
rownames(sce_cells_umap)<- colnames(exp_cancer)
sce_cells_umap<- cbind(sce_cells_umap, meta)
head(sce_cells_umap)
NA
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
geom_point()
meta<- data.frame( row.names=colnames(exp_cancer),
patient= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[1]),
plate= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[2]),
well = sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[3]),
stringsAsFactors=FALSE)
length(intersect(rownames(meta), rownames(meta_celltypes)))
[1] 1112
meta<- cbind(meta, meta_celltypes)
head(meta)
NA
mart.obj <- useMart(biomart = "ensembl", dataset = 'hsapiens_gene_ensembl', host = "feb2014.archive.ensembl.org") ## version used in manuscript
PT039_meta<- meta[meta$patient == "PT039",]
## Subsetting cells
set.seed(43)
#cells_keep<- sample(rownames(PT039_meta), 100)
cells_keep<- rownames(PT039_meta)
PT039_meta<- PT039_meta[cells_keep,]
head(PT039_meta)
PT039_exp<- exp_cancer[,cells_keep]
Making a new HoneyBadger object
hb <- new('HoneyBADGER', name='PT039')
hb$setGexpMats(log2(as.matrix(PT039_exp)+1), log2(exp_norm+1), mart.obj, filter=FALSE, scale=TRUE, verbose=TRUE)
Initializing expression matrices ...
Scaling coverage ...
Normalizing gene expression for 5560 genes and 273 cells ...
`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
Batch submitting query [================>--------------------------------------------------------------------------------------] 17% eta: 4s
Batch submitting query [=================================>---------------------------------------------------------------------] 33% eta: 3s
Batch submitting query [===================================================>---------------------------------------------------] 50% eta: 2s
Batch submitting query [====================================================================>----------------------------------] 67% eta: 1s
Batch submitting query [=====================================================================================>-----------------] 83% eta: 1s
Done setting initial expression matrices!
hb$plotGexpProfile() ## initial visualization
hb$setMvFit(verbose=TRUE) ## model variance
Modeling expected variance ... Done!
hb$setGexpDev(verbose=TRUE) ## model necessary expression deviation to identify CNVs
Optimal deviance: 0.6496971
hb$calcGexpCnvBoundaries(init=TRUE, verbose=FALSE) ## HMM
Loading required package: rjags
Loading required package: coda
Linked to JAGS 4.3.0
Loaded modules: basemod,bugs
NULL
Double check what CNVs were identified
bgf <- hb$bound.genes.final
genes <- hb$genes
regions.genes <- range(genes[unlist(bgf)])
print(regions.genes)
GRanges object with 3 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr12 861759-33049774 *
[2] chr2 75873909-242668893 *
[3] chr3 3168600-57307496 *
-------
seqinfo: 58 sequences from an unspecified genome; no seqlengths
The initial HMM step identified a number of candidate CNVs to test.
Now, we can use the bayesian model to derive the final posterior probability of each CNV in each cell.
Indeed, our initial HMM has identified a number of candidate CNVs to test. We can now retest all identified CNVs on all cells to derive the final posterior probability of each CNV in each cell.
hb$retestIdentifiedCnvs(retestBoundGenes = TRUE, retestBoundSnps = FALSE, verbose=FALSE)
## look at final results
results <- hb$summarizeResults(geneBased=TRUE, alleleBased=FALSE)
print(head(results[,1:5]))
Cluster cells on these posterior probabilities and visualize them as a heatmap.
trees <- hb$visualizeResults(geneBased=TRUE, alleleBased=FALSE, details=TRUE, margins=c(5,5))
We can again visualize our results, this time, ordering the cells based on their posterior probabilities of harboring CNVs.
## order cells
hc <- trees$hc
order <- hc$labels[hc$order]
## plot all chromosomes
hb$plotGexpProfile(cellOrder=hc$order)
We can compare this result with the one from the paper (Figure 2D,E):
We observe also a highlighted amplification in chr12, that was later confirmed by WES (Figure 2E)
## plot just identified cnvs
hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['amp']])
hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['del']])
Finally, we can also visualize the results from HoneyBadger into our UMAPs.
head(results)
prob_res<- data.frame(t(results[,8:ncol(results)]))
colnames(prob_res)<- paste0("prob_", results$seqnames, "_", results$start, "_", results$end)
length(intersect(rownames(prob_res), rownames(sce_cells_umap)))
[1] 273
sce_cells_umap<- cbind(sce_cells_umap,prob_res[rownames(sce_cells_umap),])
head(sce_cells_umap)
NA
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
geom_point()
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=patient) )+
geom_point()
sessionInfo()