1 Input files

#Want to see if importing the data does better 
gene_CR_pmth_emptydrops <- 'data/transcript_count_Emptydrops_edit4_CR.csv'
gene_CR_pmth <- 'data/trans_count_C+F_promith.csv'

#####
gene_B_pmth <- 'data/trans_count_B+F_promith.csv'
gene_B_emptydrops <- 'data/transcript_count_Emptydrops_edit4_B.csv'

gene_S_pmth_empty <- "data/transcript_count_Emptydrops_edit4_S.csv"
gene_S_pmth <- 'data/trans_count_S+F_promith.csv'

2 Define function for dgc matrix as input to empty drops fucntion

makedgcmatrix <- function(count.matrix){

    counts <- read.csv(count.matrix)
    
    seurat_object <- CreateSeuratObject(counts = counts, project = "singlecell", min.cells = 3, min.features = 0.5)
    
      list(makedgcmatrix, seurat_object, seurat_object@assays[["RNA"]]@counts, seurat_object@assays[["RNA"]]@data)
}

3 Empty drops Analysis

3.1 Cell Ranger whitelsit plus 5000 empty drops with editdist >4

plots <- makedgcmatrix(gene_CR_pmth_emptydrops)

outs.ddcmatrix <- plots[[3]]

br.out <- barcodeRanks(outs.ddcmatrix)

# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")

abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
abline(h=1000)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
    legend=c("knee", "inflection"))

e.out <- emptyDrops(outs.ddcmatrix, 
 lower = 100,
  niters = 10000,
  test.ambient = TRUE,
  ignore = NULL,
  alpha = NULL,
  round = TRUE,
  by.rank = NULL,
  BPPARAM = SerialParam())

summary(e.out$FDR < 0.01)
##    Mode   FALSE    TRUE    NA's 
## logical    1037    1182    3802
#diagnostic plot
is.cell <- e.out$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1182
plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),
    xlab="Total UMI count", ylab="-Log Probability")

#QC to check if permutations should be increased
table(Limited=e.out$Limited, Significant=is.cell)
##        Significant
## Limited FALSE TRUE
##   FALSE  1037   83
##   TRUE      0 1099
#Check the Pvalu distributions 
set.seed(100)
limit <- 200   

hist(e.out$PValue[e.out$Total <= limit & e.out$Total > 0],
    xlab="P-value", main="", col="grey80") 

3.2 Sockeyewhitelsit plus 5000 empty drops with editdist >4

plots <- makedgcmatrix(gene_S_pmth_empty)

outs.ddcmatrix <- plots[[3]]

br.out <- barcodeRanks(outs.ddcmatrix)

# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")

abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
    legend=c("knee", "inflection"))

e.out_S <- emptyDrops(outs.ddcmatrix, 
 lower = 100,
  niters = 10000,
  test.ambient = TRUE,
  ignore = NULL,
  alpha = NULL,
  round = TRUE,
  by.rank = NULL,
  BPPARAM = SerialParam())

#summary stats
summary(e.out_S$FDR < 0.01)
##    Mode   FALSE    TRUE    NA's 
## logical    1257    1488    3772
#diagnostic plot
is.cell <- e.out_S$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1488
plot(e.out_S$Total, -e.out_S$LogProb, col=ifelse(is.cell, "red", "black"),
    xlab="Total UMI count", ylab="-Log Probability")

#QC to check if permutations should be increased
table(Limited=e.out_S$Limited, Significant=is.cell)
##        Significant
## Limited FALSE TRUE
##   FALSE  1257  143
##   TRUE      0 1345

3.3 BLAZE plus 5000 empty drops with editdist >4

plots <- makedgcmatrix(gene_B_emptydrops)

outs.ddcmatrix <- plots[[3]]

br.out <- barcodeRanks(outs.ddcmatrix)

# Making a plot.
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")

abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
    legend=c("knee", "inflection"))

e.out_B <- emptyDrops(outs.ddcmatrix, 
 lower = 100,
  niters = 10000,
  test.ambient = TRUE,
  ignore = NULL,
  alpha = NULL,
  round = TRUE,
  by.rank = NULL,
  BPPARAM = SerialParam())

#summary stats
summary(e.out_B$FDR < 0.01)
##    Mode   FALSE    TRUE    NA's 
## logical     946    1124    3733
#diagnostic plot
is.cell.B <- e.out_B$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 1488
plot(e.out_B$Total, -e.out_B$LogProb, col=ifelse(is.cell, "red", "black"),
    xlab="Total UMI count", ylab="-Log Probability")

4 Function for UMAP

plot_umap <- function(count.matrix, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7, fig_name = ''){

  # init 
  rst_figures <- list()
  rst_table = data.frame()

    counts <- read.csv(count.matrix)
    
    seurat_object <- CreateSeuratObject(counts = counts, project = "singlecell", min.cells = 3, min.features = 0.5)
    rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
                                             "median feature per Cell"=median(seurat_object$nFeature_RNA), row.names = paste0('min features > 0'),check.names = FALSE))
    
    #remove unwanted cells. below are default settings but you can modify these
    seurat_object <- subset(seurat_object, subset = nFeature_RNA > min.features & nFeature_RNA < max.feature) 
        rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
                                             "median feature per Cell"=median(seurat_object$nFeature_RNA), row.names = paste0('min features > 0', min.features),check.names = FALSE))
    
    #now you have removed unwanted cells, it is time to normalize the data. By default, Seurat employs a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.
    seurat_object <- NormalizeData(seurat_object, normalization.method = "LogNormalize", scale.factor = 10000)
    #you can alternatively input seurat_object <- NormalizeData(seurat_object) instead of above.
    #we now identify highly variable features in order to determine a subset of features that exhibit high cell-to-cell 
    
    #variation in the dataset.
    seurat_object <- FindVariableFeatures(seurat_object, selection.method = "vst", nfeatures = 2000)
    
    #now we apply a linear transformation (scaling) that is a standard pre-processing step prior to dimensional reduction techniques like PCA
    all.genes <- rownames(seurat_object)
    seurat_object <- ScaleData(seurat_object, features = all.genes)
    #we can visualise both cells and features that define the PCA
    seurat_object <- RunPCA(seurat_object, features = VariableFeatures(object = seurat_object))
    #this can be visualised using an Elbow Plot
    #rst_figures <- append(rst_figures, ElbowPlot(seurat_object))
    
    #to cluster cells, I have used the number of sig. PCs that I observed in the above plots. The findneighbors function is for constuction of a KNN graph based on euclidean distance in PCA space and refines edge weights between any two cells based on the shared overlap in their local neighborhoods (jaccard similarity). It uses the input of previously defined dimensionality of the dataset.
    seurat_object <- FindNeighbors(seurat_object, dims = 1:npc)
    #now to actually cluster the cells, we apply modularity optimisation techniques (default is Louvain algorithm). The findclusters function contains a resolution parameter which sets the granularity of downstream clustering. Settings are recommended between 0.4-1.2, but may need to be increased for larger datasets.
    seurat_object <- FindClusters(seurat_object, resolution = cluster_res)
    
    #to View metadata
    #seurat_object@meta.data
    #run non-linear dimensional reduction (UMAP/tSNE)
    seurat_object <- RunUMAP(seurat_object, dims = 1:npc)
    
    rst_figures <- append(rst_figures, list(DimPlot(seurat_object, reduction = "umap") + labs(color = "cluster \n(from PCA)", title = '') + theme(text = element_text(size = 10))  )) 
    
    
    rst_figures <- append(rst_figures, list(
      FeaturePlot(seurat_object, reduction = "umap", features = 'nCount_RNA')+labs(color = "UMI count",title = '')+ theme(text = element_text(size = 10)),
    FeaturePlot(seurat_object, reduction = "umap", features = 'nFeature_RNA')+labs(color = str_wrap("Feature count (isoform/gene)",15),title = '')+ theme(text = element_text(size = 10))
    ))
    
    
    plot_pc <- ElbowPlot(seurat_object)+labs(title = 'SD explained by each PC') + theme(text = element_text(size = 10))
    plot_umap <- grid.arrange(plot_pc, tableGrob(rst_table), rst_figures[[1]], rst_figures[[2]], rst_figures[[3]], ncol=2, top=textGrob(fig_name))
    list(plot_umap, seurat_object)
}

5 Make seurat object for Cell ranger Sockeye and BLAZE

5.1 Seurat objects

# Cellranger
plots <- plot_umap(gene_CR_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 898
## Number of edges: 24680
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8284
## Number of communities: 8
## Elapsed time: 0 seconds

umap_object_C <- plots[[2]]

plots <- plot_umap(gene_S_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1379
## Number of edges: 39464
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8201
## Number of communities: 10
## Elapsed time: 0 seconds

umap_object_S <- plots[[2]]

plots <- plot_umap(gene_B_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene counts, non-Q20 PromethIon data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 803
## Number of edges: 21735
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7958
## Number of communities: 7
## Elapsed time: 0 seconds

umap_object_B <- plots[[2]]

6 Summary stats for ovelap between empty drops and seurat objects

#Make a df with FDR of TRUE cells 
#CR
is.true.cell_CR <- as.data.frame(e.out@listData[["FDR"]], e.out@rownames)
is.true.cell_CR <- is.true.cell_CR %>% filter(is.true.cell_CR$`e.out@listData[["FDR"]]` <= 0.01)
is.true.cell_CR <- tibble::rownames_to_column(is.true.cell_CR, "cell_id")

#Sockeye
is.true.cell_S <- as.data.frame(e.out_S@listData[["FDR"]], e.out_S@rownames)
is.true.cell_S <- is.true.cell_S %>% filter(is.true.cell_S$`e.out_S@listData[["FDR"]]` <= 0.01)
is.true.cell_S <- tibble::rownames_to_column(is.true.cell_S, "cell_id")

#BLAZE
is.true.cell_B <- as.data.frame(e.out_B@listData[["FDR"]], e.out_B@rownames)
is.true.cell_B <- is.true.cell_B %>% filter(is.true.cell_B$`e.out_B@listData[["FDR"]]` <= 0.01)
is.true.cell_B <- tibble::rownames_to_column(is.true.cell_B, "cell_id")

#Fucntion for retriving the suerat cells and clsuter in D.f 
overlap_true_cell <- function(seurat_object){
  seurat_cluster.df = as.data.frame(seurat_object$seurat_clusters) 
  seurat_cluster.df <- tibble::rownames_to_column(seurat_cluster.df, "cell_id")
  list(seurat_cluster.df)
}

#obtain cluster d.f from seurat object
overlap_CR <- overlap_true_cell(umap_object_C)
overlap_B <- overlap_true_cell(umap_object_B)
overlap_S <- overlap_true_cell(umap_object_S)

#Check overlaps between Seurat object and true cells 
summary(overlap_CR[[1]]$cell_id %in% is.true.cell_CR$cell_id)
##    Mode   FALSE    TRUE 
## logical      37     861
summary(overlap_B[[1]]$cell_id %in% is.true.cell_B$cell_id)
##    Mode   FALSE    TRUE 
## logical       3     800
summary(overlap_S[[1]]$cell_id %in% is.true.cell_S$cell_id)
##    Mode   FALSE    TRUE 
## logical     211    1168

7 Add meta data to seurat objects so UMAP plots can coloured.

#Function That takes dgc matrix and outputs d.f c(barcode, Cell or emptydrop)
True.cells <- function(e.out){
  cells.1= as.data.frame(e.out@rownames)
  cells.2= as.data.frame(e.out$FDR)
  T.F.cells = cbind(cells.1, cells.2)
  T.F.cells <- data.frame(T.F.cells[,-1], row.names=T.F.cells[,1]) #make col row names
    #change col anmes 
  setnames(T.F.cells, c('FDR'))
  T.F.cells %>%
    mutate(FDR = case_when(FDR < 0.01 ~ "Cells",
                           FDR > 0.01 ~ "Empty_drops"
                           ))
}

 
cells_CR <- True.cells(e.out)
cells_S <- True.cells(e.out_S)
cells_B <- True.cells(e.out_B)

#this needs some fixing need to swap around the TRU FALSEW statment for FDR.
umap_object_C <- AddMetaData(umap_object_C,  metadata = cells_CR, col.name = 'is.cell')
umap_object_S <- AddMetaData(umap_object_S,  metadata = cells_S, col.name = 'is.cell')
umap_object_B <- AddMetaData(umap_object_B,  metadata = cells_B, col.name = 'is.cell')

8 Plot UMAP coloured by empty drops

#Plot Empty drops on Gene UMAP
grid.arrange( DimPlot(umap_object_C, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'CellRanger') + theme(text = element_text(size = 12)),
              DimPlot(umap_object_B, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'Sockeye') + theme(text = element_text(size = 12)),
              DimPlot(umap_object_S, reduction = "umap", group.by = 'is.cell', cols = c("grey", "blue")) + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 12)),
              top=textGrob('Empty drops vs real cells'), ncol=3)

9 Plot BLAZE UMAP coloured by empty drops and BLAZE cells not found by cell Ranger

#looking at barcodes only found by BLAZE and Sockeye 

only <- read.csv("data/barcodeB+Sonly.csv")
only <- data.frame(only[,-1], row.names=only[,1]) #make col row names

umap_object_B <- AddMetaData(umap_object_B,  metadata = only, col.name = 'B.F.only')

grid.arrange(DimPlot(umap_object_B, reduction = "umap", group.by = 'is.cell') + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 10)),
              DimPlot(umap_object_B, reduction = "umap", group.by = 'B.F.only') + labs(color = "is.cell", title = 'BLAZE') + theme(text = element_text(size = 10)),
              top=textGrob('Empty drops vs real cells'), ncol=2)

10 Plot UMAPS coloured by edit distacne and empty drops.

edit_dist_S <- read.csv("data/addition_sy_bc_ed.csv", header = T)

edit_dist_S_2 <- dplyr::filter(edit_dist_S, edit_dist_S$ED <= 2)
edit_dist_S_2 <- data.frame(edit_dist_S_2[,-1], row.names=edit_dist_S_2[,1]) #make col row names
names(edit_dist_S_2)[1] <- "ed"
edit_dist_S_2$ed <- "TRUE"

umap_object_S <- AddMetaData(umap_object_S,  metadata = edit_dist_S_2, col.name = 'ed')

#Pull out cells that are empty drops and have small edit distance
cells_S_real <- dplyr::filter(cells_S, cells_S$FDR == "Cells")
cells_S_real <- rownames_to_column(cells_S_real, "Barcode")

edit_dist_S_2 <- rownames_to_column(edit_dist_S_2, "Barcode")

#join objects from 
df <- join_all(list(cells_S_real,edit_dist_S_2), by = 'Barcode', type = 'inner')

#prepare meta data
df <- data.frame(df[,-1], row.names=df[,1]) #make col row names
df$ed_cell <- "ed_cell"
df <- df[3]

#add meta data
umap_object_S <- AddMetaData(umap_object_S,  metadata = df, col.name = 'ed_cell')

#filter for empty drops
cells_S_empty <- dplyr::filter(cells_S, cells_S$FDR == "Empty_drops")
cells_S_empty <- rownames_to_column(cells_S_empty, "Barcode")
#make combined data frame 
df2<- as.data.frame(c(cells_S_empty$Barcode, edit_dist_S_2$Barcode))

#remove duplicated rows and add row names
df2 <- df2 %>% distinct(df2[1])
df2$Union <- "True"
df2 <- data.frame(df2[,-1], row.names=df2[,1]) #make col row names

#add meta data
umap_object_S <- AddMetaData(umap_object_S,  metadata = df2, col.name = 'U')

#Plot
grid.arrange( DimPlot(umap_object_S, reduction = "umap", group.by = 'is.cell') + labs(color = "Empty_drops", title = 'Sockeye') + theme(text = element_text(size = 10)),
              DimPlot(umap_object_S, reduction = "umap", group.by = 'ed', cols = c("red"), na.value = "grey") + labs(color = "ed <= 2", title = 'Sockeye') + theme(text = element_text(size = 10)),
              DimPlot(umap_object_S, reduction = "umap", group.by = 'ed_cell', cols = c("blue"), na.value = "grey") + labs(color = "'True' Cells \n with ed <=2", title = 'Sockeye') + theme(text = element_text(size = 10)),
              DimPlot(umap_object_S, reduction = "umap", group.by = 'U', cols = c("orange"), na.value = "grey") + labs(color = "ed <=2 or \n Empty drops", title = 'Sockeye') + theme(text = element_text(size = 10)) ,ncol=2)