Empty drops Analysis
Cell Ranger whitelsit plus 5000 empty drops with editdist >4
plots <- makedgcmatrix(gene_CR_pmth_emptydrops)
outs.ddcmatrix <- plots[[3]]
br.out <- barcodeRanks(outs.ddcmatrix)
# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")
abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
abline(h=1000)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"),
legend=c("knee", "inflection"))

e.out <- emptyDrops(outs.ddcmatrix,
lower = 100,
niters = 10000,
test.ambient = TRUE,
ignore = NULL,
alpha = NULL,
round = TRUE,
by.rank = NULL,
BPPARAM = SerialParam())
summary(e.out$FDR < 0.01)
## Mode FALSE TRUE NA's
## logical 1037 1182 3802
#diagnostic plot
is.cell <- e.out$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1182
plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),
xlab="Total UMI count", ylab="-Log Probability")

#QC to check if permutations should be increased
table(Limited=e.out$Limited, Significant=is.cell)
## Significant
## Limited FALSE TRUE
## FALSE 1037 83
## TRUE 0 1099
#Check the Pvalu distributions
set.seed(100)
limit <- 200
hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0],
xlab="P-value", main="", col="grey80")

Sockeyewhitelsit plus 5000 empty drops with editdist >4
plots <- makedgcmatrix(gene_S_pmth_empty)
outs.ddcmatrix <- plots[[3]]
br.out <- barcodeRanks(outs.ddcmatrix)
# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")
abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"),
legend=c("knee", "inflection"))

e.out_S <- emptyDrops(outs.ddcmatrix,
lower = 100,
niters = 10000,
test.ambient = TRUE,
ignore = NULL,
alpha = NULL,
round = TRUE,
by.rank = NULL,
BPPARAM = SerialParam())
#summary stats
summary(e.out_S$FDR < 0.01)
## Mode FALSE TRUE NA's
## logical 1257 1488 3772
#diagnostic plot
is.cell <- e.out_S$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1488
plot(e.out_S$Total, -e.out_S$LogProb, col=ifelse(is.cell, "red", "black"),
xlab="Total UMI count", ylab="-Log Probability")

#QC to check if permutations should be increased
table(Limited=e.out_S$Limited, Significant=is.cell)
## Significant
## Limited FALSE TRUE
## FALSE 1257 143
## TRUE 0 1345
BLAZE plus 5000 empty drops with editdist >4
plots <- makedgcmatrix(gene_B_emptydrops)
outs.ddcmatrix <- plots[[3]]
br.out <- barcodeRanks(outs.ddcmatrix)
# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")
abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"),
legend=c("knee", "inflection"))

e.out_B <- emptyDrops(outs.ddcmatrix,
lower = 100,
niters = 10000,
test.ambient = TRUE,
ignore = NULL,
alpha = NULL,
round = TRUE,
by.rank = NULL,
BPPARAM = SerialParam())
#summary stats
summary(e.out_B$FDR < 0.01)
## Mode FALSE TRUE NA's
## logical 946 1124 3733
#diagnostic plot
is.cell.B <- e.out_B$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1488
plot(e.out_B$Total, -e.out_B$LogProb, col=ifelse(is.cell, "red", "black"),
xlab="Total UMI count", ylab="-Log Probability")

Function for UMAP
plot_umap <- function(count.matrix, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7, fig_name = ''){
# init
rst_figures <- list()
rst_table = data.frame()
counts <- read.csv(count.matrix)
seurat_object <- CreateSeuratObject(counts = counts, project = "singlecell", min.cells = 3, min.features = 0.5)
rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
"median feature per Cell"=median(seurat_object$nFeature_RNA), row.names = paste0('min features > 0'),check.names = FALSE))
#remove unwanted cells. below are default settings but you can modify these
seurat_object <- subset(seurat_object, subset = nFeature_RNA > min.features & nFeature_RNA < max.feature)
rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
"median feature per Cell"=median(seurat_object$nFeature_RNA), row.names = paste0('min features > 0', min.features),check.names = FALSE))
#now you have removed unwanted cells, it is time to normalize the data. By default, Seurat employs a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.
seurat_object <- NormalizeData(seurat_object, normalization.method = "LogNormalize", scale.factor = 10000)
#you can alternatively input seurat_object <- NormalizeData(seurat_object) instead of above.
#we now identify highly variable features in order to determine a subset of features that exhibit high cell-to-cell
#variation in the dataset.
seurat_object <- FindVariableFeatures(seurat_object, selection.method = "vst", nfeatures = 2000)
#now we apply a linear transformation (scaling) that is a standard pre-processing step prior to dimensional reduction techniques like PCA
all.genes <- rownames(seurat_object)
seurat_object <- ScaleData(seurat_object, features = all.genes)
#we can visualise both cells and features that define the PCA
seurat_object <- RunPCA(seurat_object, features = VariableFeatures(object = seurat_object))
#this can be visualised using an Elbow Plot
#rst_figures <- append(rst_figures, ElbowPlot(seurat_object))
#to cluster cells, I have used the number of sig. PCs that I observed in the above plots. The findneighbors function is for constuction of a KNN graph based on euclidean distance in PCA space and refines edge weights between any two cells based on the shared overlap in their local neighborhoods (jaccard similarity). It uses the input of previously defined dimensionality of the dataset.
seurat_object <- FindNeighbors(seurat_object, dims = 1:npc)
#now to actually cluster the cells, we apply modularity optimisation techniques (default is Louvain algorithm). The findclusters function contains a resolution parameter which sets the granularity of downstream clustering. Settings are recommended between 0.4-1.2, but may need to be increased for larger datasets.
seurat_object <- FindClusters(seurat_object, resolution = cluster_res)
#to View metadata
#seurat_object@meta.data
#run non-linear dimensional reduction (UMAP/tSNE)
seurat_object <- RunUMAP(seurat_object, dims = 1:npc)
rst_figures <- append(rst_figures, list(DimPlot(seurat_object, reduction = "umap") + labs(color = "cluster \n(from PCA)", title = '') + theme(text = element_text(size = 10)) ))
rst_figures <- append(rst_figures, list(
FeaturePlot(seurat_object, reduction = "umap", features = 'nCount_RNA')+labs(color = "UMI count",title = '')+ theme(text = element_text(size = 10)),
FeaturePlot(seurat_object, reduction = "umap", features = 'nFeature_RNA')+labs(color = str_wrap("Feature count (isoform/gene)",15),title = '')+ theme(text = element_text(size = 10))
))
plot_pc <- ElbowPlot(seurat_object)+labs(title = 'SD explained by each PC') + theme(text = element_text(size = 10))
plot_umap <- grid.arrange(plot_pc, tableGrob(rst_table), rst_figures[[1]], rst_figures[[2]], rst_figures[[3]], ncol=2, top=textGrob(fig_name))
list(plot_umap, seurat_object)
}
Make seurat object for Cell ranger Sockeye and BLAZE
Seurat objects
# Cellranger
plots <- plot_umap(gene_CR_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 898
## Number of edges: 24680
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8284
## Number of communities: 8
## Elapsed time: 0 seconds

umap_object_C <- plots[[2]]
plots <- plot_umap(gene_S_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1379
## Number of edges: 39464
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8201
## Number of communities: 10
## Elapsed time: 0 seconds

umap_object_S <- plots[[2]]
plots <- plot_umap(gene_B_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 803
## Number of edges: 21735
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7958
## Number of communities: 7
## Elapsed time: 0 seconds

umap_object_B <- plots[[2]]
Summary stats for ovelap between empty drops and seurat objects
#Make a df with FDR of TRUE cells
#CR
is.true.cell_CR <- as.data.frame(e.out@listData[["FDR"]], e.out@rownames)
is.true.cell_CR <- is.true.cell_CR %>% filter(is.true.cell_CR$`e.out@listData[["FDR"]]` <= 0.01)
is.true.cell_CR <- tibble::rownames_to_column(is.true.cell_CR, "cell_id")
#Sockeye
is.true.cell_S <- as.data.frame(e.out_S@listData[["FDR"]], e.out_S@rownames)
is.true.cell_S <- is.true.cell_S %>% filter(is.true.cell_S$`e.out_S@listData[["FDR"]]` <= 0.01)
is.true.cell_S <- tibble::rownames_to_column(is.true.cell_S, "cell_id")
#BLAZE
is.true.cell_B <- as.data.frame(e.out_B@listData[["FDR"]], e.out_B@rownames)
is.true.cell_B <- is.true.cell_B %>% filter(is.true.cell_B$`e.out_B@listData[["FDR"]]` <= 0.01)
is.true.cell_B <- tibble::rownames_to_column(is.true.cell_B, "cell_id")
#Fucntion for retriving the suerat cells and clsuter in D.f
overlap_true_cell <- function(seurat_object){
seurat_cluster.df = as.data.frame(seurat_object$seurat_clusters)
seurat_cluster.df <- tibble::rownames_to_column(seurat_cluster.df, "cell_id")
list(seurat_cluster.df)
}
#obtain cluster d.f from seurat object
overlap_CR <- overlap_true_cell(umap_object_C)
overlap_B <- overlap_true_cell(umap_object_B)
overlap_S <- overlap_true_cell(umap_object_S)
#Check overlaps between Seurat object and true cells
summary(overlap_CR[[1]]$cell_id %in% is.true.cell_CR$cell_id)
## Mode FALSE TRUE
## logical 37 861
summary(overlap_B[[1]]$cell_id %in% is.true.cell_B$cell_id)
## Mode FALSE TRUE
## logical 3 800
summary(overlap_S[[1]]$cell_id %in% is.true.cell_S$cell_id)
## Mode FALSE TRUE
## logical 211 1168
Plot UMAP coloured by empty drops
#Plot Empty drops on Gene UMAP
grid.arrange( DimPlot(umap_object_C, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'CellRanger') + theme(text = element_text(size = 12)),
DimPlot(umap_object_B, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'Sockeye') + theme(text = element_text(size = 12)),
DimPlot(umap_object_S, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 12)),
top=textGrob('Empty drops vs real cells'), ncol=3)

Plot BLAZE UMAP coloured by empty drops and BLAZE cells not found by cell Ranger
#looking at barcodes only found by BLAZE and Sockeye
only <- read.csv("data/barcodeB+Sonly.csv")
only <- data.frame(only[,-1], row.names=only[,1]) #make col row names
umap_object_B <- AddMetaData(umap_object_B, metadata = only, col.name = 'B.F.only')
grid.arrange(DimPlot(umap_object_B, reduction = "umap", group.by = 'is.cell') + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 10)),
DimPlot(umap_object_B, reduction = "umap", group.by = 'B.F.only') + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 10)),
top=textGrob('Empty drops vs real cells'), ncol=2)

Plot UMAPS coloured by edit distacne and empty drops.
edit_dist_S <- read.csv("data/addition_sy_bc_ed.csv", header = T)
edit_dist_S_2 <- dplyr::filter(edit_dist_S, edit_dist_S$ED <= 2)
edit_dist_S_2 <- data.frame(edit_dist_S_2[,-1], row.names=edit_dist_S_2[,1]) #make col row names
names(edit_dist_S_2)[1] <- "ed"
edit_dist_S_2$ed <- "TRUE"
umap_object_S <- AddMetaData(umap_object_S, metadata = edit_dist_S_2, col.name = 'ed')
#Pull out cells that are empty drops and have small edit distance
cells_S_real <- dplyr::filter(cells_S, cells_S$FDR == "Cells")
cells_S_real <- rownames_to_column(cells_S_real, "Barcode")
edit_dist_S_2 <- rownames_to_column(edit_dist_S_2, "Barcode")
#join objects from
df <- join_all(list(cells_S_real,edit_dist_S_2), by = 'Barcode', type = 'inner')
#prepare meta data
df <- data.frame(df[,-1], row.names=df[,1]) #make col row names
df$ed_cell <- "ed_cell"
df <- df[3]
#add meta data
umap_object_S <- AddMetaData(umap_object_S, metadata = df, col.name = 'ed_cell')
#filter for empty drops
cells_S_empty <- dplyr::filter(cells_S, cells_S$FDR == "Empty_drops")
cells_S_empty <- rownames_to_column(cells_S_empty, "Barcode")
#make combined data frame
df2<- as.data.frame(c(cells_S_empty$Barcode, edit_dist_S_2$Barcode))
#remove duplicated rows and add row names
df2 <- df2 %>% distinct(df2[1])
df2$Union <- "True"
df2 <- data.frame(df2[,-1], row.names=df2[,1]) #make col row names
#add meta data
umap_object_S <- AddMetaData(umap_object_S, metadata = df2, col.name = 'U')
#Plot
grid.arrange( DimPlot(umap_object_S, reduction = "umap", group.by = 'is.cell') + labs(color = "Empty_drops", title = 'Sockeye') + theme(text = element_text(size = 10)),
DimPlot(umap_object_S, reduction = "umap", group.by = 'ed', cols = c("red"), na.value = "grey") + labs(color = "ed <= 2", title = 'Sockeye') + theme(text = element_text(size = 10)),
DimPlot(umap_object_S, reduction = "umap", group.by = 'ed_cell', cols = c("blue"), na.value = "grey") + labs(color = "'True' Cells \n with ed <=2", title = 'Sockeye') + theme(text = element_text(size = 10)),
DimPlot(umap_object_S, reduction = "umap", group.by = 'U', cols = c("orange"), na.value = "grey") + labs(color = "ed <=2 or \n Empty drops", title = 'Sockeye') + theme(text = element_text(size = 10)) ,ncol=2)
