Loading libraries


library(HoneyBADGER)
library(biomaRt)
library(pheatmap)
library(uwot)
library(ggfortify)
Loading required package: ggplot2

Loading data

Reading data from GEO.

exp_norm<- read.table("data/RSEM_TPM_240_NormalCells.txt")
exp_cancer<- read.table("data/GSE118389_tpm_rsem.txt")
meta_celltypes<- read.table("data/cell_types_tab_S9.txt", stringsAsFactors = FALSE, row.names=2)
meta_celltypes[,1]<- NULL
dim(meta_celltypes)
[1] 1112    1
names(meta_celltypes)<- "cell"
length(intersect(rownames(meta_celltypes), colnames(exp_cancer)))
[1] 1112

Taking a look at the data

dim(exp_norm)
[1] 23368   240
head(exp_norm[,1:10])
dim(exp_cancer)
[1] 21785  1534
head(exp_cancer[,1:10])
exp_cancer<- exp_cancer[, rownames(meta_celltypes)]
hist(colSums(exp_cancer))

hist(colSums(exp_norm))

Filtering for genes

In this case we are going to use a TPM filter of 10 for both the reference and test datasets.

mean_gene_norm<-rowMeans(exp_norm)
summary(mean_gene_norm)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.00     0.01     1.34    30.80    14.81 45114.25 
length(which(mean_gene_norm> 10))
[1] 7221
exp_norm<- exp_norm[mean_gene_norm>10,]
mean_gene_cancer<- rowMeans(exp_cancer)
summary(mean_gene_cancer)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
    0.000     0.375     5.749    44.176    25.944 21644.406 
length(which(mean_gene_cancer> 10))
[1] 9041
exp_cancer<- exp_cancer[mean_gene_cancer>10,]
inter_genes<- intersect(rownames(exp_cancer), rownames(exp_norm))
length(inter_genes)
[1] 5560
exp_cancer<- exp_cancer[inter_genes,]
exp_norm <- exp_norm[inter_genes,]

Generating UMAP of the data

set.seed(43)
sce_cells_umap <- umap(t(exp_cancer),                       
                       n_components = 2,
                       #a=1,
                       #b=0.6,
                       min_dist=0.01,
                       metric="cosine",
                       verbose = TRUE,
                       n_threads = max(1, RcppParallel::defaultNumThreads()/2) )
11:54:53 UMAP embedding parameters a = 1.896 b = 0.8006
11:54:53 Read 1112 rows and found 5560 numeric columns
11:54:53 Using Annoy for neighbor search, n_neighbors = 15
11:54:53 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:54:54 Writing NN index file to temp file /tmp/Rtmp4veicN/filef10477e73ea
11:54:54 Searching Annoy index using 4 threads, search_k = 1500
11:54:56 Annoy recall = 100%
11:54:56 Commencing smooth kNN distance calibration using 4 threads
11:54:57 Initializing from normalized Laplacian + noise
11:54:57 Commencing optimization for 500 epochs, with 24190 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:54:59 Optimization finished
colnames(sce_cells_umap)<- c("UMAP1", "UMAP2")
rownames(sce_cells_umap)<- colnames(exp_cancer)
sce_cells_umap<- cbind(sce_cells_umap, meta)

head(sce_cells_umap)
NA
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
  geom_point()

Creating meta data

meta<- data.frame( row.names=colnames(exp_cancer), 
                   patient= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[1]),
                   plate= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[2]),
                   well = sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[3]),
                   stringsAsFactors=FALSE)
length(intersect(rownames(meta), rownames(meta_celltypes)))
[1] 1112
meta<- cbind(meta, meta_celltypes)
head(meta)
NA

Running HoneyBadger analysis for PT039

mart.obj <- useMart(biomart = "ensembl", dataset = 'hsapiens_gene_ensembl', host = "feb2014.archive.ensembl.org") ## version used in manuscript
PT039_meta<- meta[meta$patient  == "PT039",]
## Subsetting cells
set.seed(43)
#cells_keep<- sample(rownames(PT039_meta), 100)
cells_keep<- rownames(PT039_meta)
PT039_meta<- PT039_meta[cells_keep,]
head(PT039_meta)
PT039_exp<- exp_cancer[,cells_keep]

Making a new HoneyBadger object

hb <- new('HoneyBADGER', name='PT039')

hb$setGexpMats(log2(as.matrix(PT039_exp)+1), log2(exp_norm+1), mart.obj, filter=FALSE, scale=TRUE, verbose=TRUE)
Initializing expression matrices ... 
Scaling coverage ... 
Normalizing gene expression for 5560 genes and 273 cells ... 
`select_()` is deprecated as of dplyr 0.7.0.
Please use `select()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
Batch submitting query [================>--------------------------------------------------------------------------------------]  17% eta:  4s
Batch submitting query [=================================>---------------------------------------------------------------------]  33% eta:  3s
Batch submitting query [===================================================>---------------------------------------------------]  50% eta:  2s
Batch submitting query [====================================================================>----------------------------------]  67% eta:  1s
Batch submitting query [=====================================================================================>-----------------]  83% eta:  1s
                                                                                                                                              
Done setting initial expression matrices! 
hb$plotGexpProfile() ## initial visualization

hb$setMvFit(verbose=TRUE) ## model variance
Modeling expected variance ... Done!
hb$setGexpDev(verbose=TRUE) ## model necessary expression deviation to identify CNVs
Optimal deviance: 0.6496971
hb$calcGexpCnvBoundaries(init=TRUE, verbose=FALSE) ## HMM
Loading required package: rjags
Loading required package: coda
Linked to JAGS 4.3.0
Loaded modules: basemod,bugs
NULL

Double check what CNVs were identified

bgf <- hb$bound.genes.final
genes <- hb$genes
regions.genes <- range(genes[unlist(bgf)])
print(regions.genes)
GRanges object with 3 ranges and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]    chr12    861759-33049774      *
  [2]     chr2 75873909-242668893      *
  [3]     chr3   3168600-57307496      *
  -------
  seqinfo: 58 sequences from an unspecified genome; no seqlengths

The initial HMM step identified a number of candidate CNVs to test.

Now, we can use the bayesian model to derive the final posterior probability of each CNV in each cell.

Indeed, our initial HMM has identified a number of candidate CNVs to test. We can now retest all identified CNVs on all cells to derive the final posterior probability of each CNV in each cell.

hb$retestIdentifiedCnvs(retestBoundGenes = TRUE, retestBoundSnps = FALSE, verbose=FALSE)
## look at final results
results <- hb$summarizeResults(geneBased=TRUE, alleleBased=FALSE)
print(head(results[,1:5]))

Cluster cells on these posterior probabilities and visualize them as a heatmap.

trees <- hb$visualizeResults(geneBased=TRUE, alleleBased=FALSE, details=TRUE, margins=c(5,5))

We can again visualize our results, this time, ordering the cells based on their posterior probabilities of harboring CNVs.

## order cells
hc <- trees$hc
order <- hc$labels[hc$order]
## plot all chromosomes
hb$plotGexpProfile(cellOrder=hc$order)

We can compare this result with the one from the paper (Figure 2D,E):

Karaayvaz 2018 2D,E

We observe also a highlighted amplification in chr12, that was later confirmed by WES (Figure 2E)

## plot just identified cnvs
hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['amp']])

hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['del']])

Finally, we can also visualize the results from HoneyBadger into our UMAPs.

head(results)
prob_res<- data.frame(t(results[,8:ncol(results)]))
colnames(prob_res)<- paste0("prob_", results$seqnames, "_", results$start, "_", results$end)
length(intersect(rownames(prob_res), rownames(sce_cells_umap)))
[1] 273
sce_cells_umap<- cbind(sce_cells_umap,prob_res[rownames(sce_cells_umap),]) 
head(sce_cells_umap)
NA
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
  geom_point()

ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=patient) )+
  geom_point()

sessionInfo()
---
title: "HoneyBadger on Karaayvaz et al 2018"
author: "Ariel Madrigal"
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: no
  html_document:
    toc: yes
    df_print: paged
---

## Loading libraries

```{r}
library(HoneyBADGER)
library(biomaRt)
library(pheatmap)
library(uwot)
library(ggfortify)
```

## Loading data

Reading data from GEO. 

```{r}
exp_norm<- read.table("data/RSEM_TPM_240_NormalCells.txt")
exp_cancer<- read.table("data/GSE118389_tpm_rsem.txt")
meta_celltypes<- read.table("data/cell_types_tab_S9.txt", stringsAsFactors = FALSE, row.names=2)
meta_celltypes[,1]<- NULL
dim(meta_celltypes)
names(meta_celltypes)<- "cell"
length(intersect(rownames(meta_celltypes), colnames(exp_cancer)))
```

Taking a look at the data

```{r}
dim(exp_norm)
head(exp_norm[,1:10])
```

```{r}
dim(exp_cancer)
head(exp_cancer[,1:10])
```

```{r}
exp_cancer<- exp_cancer[, rownames(meta_celltypes)]
```


```{r}
hist(colSums(exp_cancer))
hist(colSums(exp_norm))
```
## Filtering for genes

In this case we are going to use a TPM filter of 10 for both the reference and test datasets. 

```{r}
mean_gene_norm<-rowMeans(exp_norm)
summary(mean_gene_norm)
length(which(mean_gene_norm> 10))
exp_norm<- exp_norm[mean_gene_norm>10,]
```

```{r}
mean_gene_cancer<- rowMeans(exp_cancer)
summary(mean_gene_cancer)
length(which(mean_gene_cancer> 10))
exp_cancer<- exp_cancer[mean_gene_cancer>10,]
```

```{r}
inter_genes<- intersect(rownames(exp_cancer), rownames(exp_norm))
length(inter_genes)
exp_cancer<- exp_cancer[inter_genes,]
exp_norm <- exp_norm[inter_genes,]
```
## Generating UMAP of the data

```{r}
set.seed(43)
sce_cells_umap <- umap(t(exp_cancer),                       
                       n_components = 2,
                       #a=1,
                       #b=0.6,
                       min_dist=0.01,
                       metric="cosine",
                       verbose = TRUE,
                       n_threads = max(1, RcppParallel::defaultNumThreads()/2) )
colnames(sce_cells_umap)<- c("UMAP1", "UMAP2")
rownames(sce_cells_umap)<- colnames(exp_cancer)
sce_cells_umap<- cbind(sce_cells_umap, meta)

head(sce_cells_umap)

```

```{r}
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
  geom_point()
```


## Creating meta data

```{r}
meta<- data.frame( row.names=colnames(exp_cancer), 
                   patient= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[1]),
                   plate= sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[2]),
                   well = sapply(colnames(exp_cancer), function(X) unlist( strsplit(X, "_"))[3]),
                   stringsAsFactors=FALSE)
length(intersect(rownames(meta), rownames(meta_celltypes)))
meta<- cbind(meta, meta_celltypes[rownames(meta),,drop=FALSE])
head(meta)

```


## Running HoneyBadger analysis for PT039

```{r}
mart.obj <- useMart(biomart = "ensembl", dataset = 'hsapiens_gene_ensembl', host = "feb2014.archive.ensembl.org") ## version used in manuscript
```

```{r}
PT039_meta<- meta[meta$patient  == "PT039",]
## Subsetting cells
set.seed(43)
#cells_keep<- sample(rownames(PT039_meta), 100)
cells_keep<- rownames(PT039_meta)
PT039_meta<- PT039_meta[cells_keep,]
head(PT039_meta)
PT039_exp<- exp_cancer[,cells_keep]
```

Making a new HoneyBadger object

```{r}
hb <- new('HoneyBADGER', name='PT039')

hb$setGexpMats(log2(as.matrix(PT039_exp)+1), log2(exp_norm+1), mart.obj, filter=FALSE, scale=TRUE, verbose=TRUE)
```
```{r, width=15, height=10}
hb$plotGexpProfile() ## initial visualization
```
```{r}
hb$setMvFit(verbose=TRUE) ## model variance

hb$setGexpDev(verbose=TRUE) ## model necessary expression deviation to identify CNVs
```
```{r}
hb$calcGexpCnvBoundaries(init=TRUE, verbose=FALSE) ## HMM
```
Double check what CNVs were identified

```{r}
bgf <- hb$bound.genes.final
genes <- hb$genes
regions.genes <- range(genes[unlist(bgf)])
print(regions.genes)

```
The initial HMM step identified a number of candidate CNVs to test. 

Now, we can use the bayesian model to derive the final posterior probability of each CNV in each cell. 

Indeed, our initial HMM has identified a number of candidate CNVs to test. We can now retest all identified CNVs on all cells to derive the final posterior probability of each CNV in each cell.

```{r}
hb$retestIdentifiedCnvs(retestBoundGenes = TRUE, retestBoundSnps = FALSE, verbose=FALSE)
```

```{r}
## look at final results
results <- hb$summarizeResults(geneBased=TRUE, alleleBased=FALSE)
print(head(results[,1:5]))
```

Cluster cells on these posterior probabilities and visualize them as a heatmap.
 
```{r}
trees <- hb$visualizeResults(geneBased=TRUE, alleleBased=FALSE, details=TRUE, margins=c(5,5))
```

We can again visualize our results, this time, ordering the cells based on their posterior probabilities of harboring CNVs.

```{r}
## order cells
hc <- trees$hc
order <- hc$labels[hc$order]
## plot all chromosomes
hb$plotGexpProfile(cellOrder=hc$order)
```

We can compare this result with the one from the paper (Figure 2D,E):

![Karaayvaz 2018 2D,E](Karaayvaz_2018_2DE.png)

We observe also a highlighted amplification in chr12, that was later confirmed by WES (Figure 2E)

```{r}
## plot just identified cnvs
hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['amp']])
```


```{r}
hb$plotGexpProfile(cellOrder=hc$order, region=hb$cnvs[['gene-based']][['del']])
```

Finally, we can also visualize the results from HoneyBadger into our UMAPs. 

```{r}
head(results)
prob_res<- data.frame(t(results[,8:ncol(results)]))
colnames(prob_res)<- paste0("prob_", results$seqnames, "_", results$start, "_", results$end)
length(intersect(rownames(prob_res), rownames(sce_cells_umap)))
sce_cells_umap<- cbind(sce_cells_umap,prob_res[rownames(sce_cells_umap),]) 
head(sce_cells_umap)

```
```{r}
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=cell) )+
  geom_point()
```

```{r}
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=patient) )+
  geom_point()
```   

```{r, fig.width=20, fig.height=8}
ggplot(sce_cells_umap, aes(UMAP1, UMAP2, color=prob_chr12_861759_33049774) )+
  geom_point()+
  scale_color_viridis_c()
```


```{r}
sessionInfo()
```