0.1 Introduction

This file documents the Identification of marker genes between clusters. UMAP is generated from gene counts using Cell Ranger whitelsit. FLAMES was used for quantification.

# BLAZE q20
trans_B_q20 <- '../data/trans_count_B+F_q20.csv'
gene_B_q20 <- '../data/gene_count_B+F_q20.csv'
# BLAZE non q20 GridIon
trans_B_nonq20_gd <- '../data/trans_count_B+F_non_q20.csv'
gene_B_nonq20_gd <- '../data/gene_count_B+F_non_q20.csv'
# BLAZE non q20 PromithIon
trans_B_nonq20_pmth <- '../data/trans_count_B+F_promith.csv'
gene_B_nonq20_pmth <- '../data/gene_count_B+F_promith.csv'


# Cellranger q20
trans_C_q20 <- '../data/trans_count_C+F_q20.csv'
gene_C_q20 <- '../data/gene_count_C+F_q20.csv'
# Cellranger non q20 GridIon
trans_C_nonq20_gd <- '../data/trans_count_C+F_non_q20.csv'
gene_C_nonq20_gd <- '../data/gene_count_C+F_non_q20.csv'
# Cellranger non q20 PromithIon
trans_C_nonq20_pmth <- '../data/trans_count_C+F_promith.csv'
gene_C_nonq20_pmth <- '../data/gene_count_C+F_promith.csv'

0.2 Part 1: Define function for ploting

plot_umap <- function(count.matrix, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7, fig_name = ''){

 # init 
  rst_figures <- list()
  rst_table = data.frame()

    counts <- read.csv(count.matrix, row.names = 1)
    seurat_object <- CreateSeuratObject(counts = counts, project = "singlecell", min.cells = 3,min.features=1)
    rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
                                             "median feature per Cell"=median(seurat_object$nFeature_RNA), "Meadian reads per feature"= median(seurat_object$nCount_RNA), row.names = paste0('min features > 0'),check.names = FALSE))
    seurat_object <- CreateSeuratObject(counts = counts, project = "singlecell", min.cells = 3, min.features = min.features)

    
    #remove unwanted cells. below are default settings but you can modify these
    seurat_object <- subset(seurat_object, subset = nFeature_RNA > min.features & nFeature_RNA < max.feature) 
        rst_table <- rbind(rst_table, data.frame("Cells"=dim(seurat_object)[2],
                                             "median feature per Cell"=median(seurat_object$nFeature_RNA), "Meadian reads per feature"= median(seurat_object$nCount_RNA), row.names = paste0('min features > ', min.features),check.names = FALSE))
    
    #now you have removed unwanted cells, it is time to normalize the data. By default, Seurat employs a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.
    seurat_object <- NormalizeData(seurat_object, normalization.method = "LogNormalize", scale.factor = 10000)
    #you can alternatively input seurat_object <- NormalizeData(seurat_object) instead of above.
    #we now identify highly variable features in order to determine a subset of features that exhibit high cell-to-cell 
    
    #variation in the dataset.
    seurat_object <- FindVariableFeatures(seurat_object, selection.method = "vst", nfeatures = 2000)
    
    #now we apply a linear transformation (scaling) that is a standard pre-processing step prior to dimensional reduction techniques like PCA
    all.genes <- rownames(seurat_object)
    seurat_object <- ScaleData(seurat_object, features = all.genes)
    #we can visualise both cells and features that define the PCA
    seurat_object <- RunPCA(seurat_object, features = VariableFeatures(object = seurat_object))
    #this can be visualised using an Elbow Plot
    #rst_figures <- append(rst_figures, ElbowPlot(seurat_object))
    
    #to cluster cells, I have used the number of sig. PCs that I observed in the above plots. The findneighbors function is for constuction of a KNN graph based on euclidean distance in PCA space and refines edge weights between any two cells based on the shared overlap in their local neighborhoods (jaccard similarity). It uses the input of previously defined dimensionality of the dataset.
    seurat_object <- FindNeighbors(seurat_object, dims = 1:npc)
    #now to actually cluster the cells, we apply modularity optimisation techniques (default is Louvain algorithm). The findclusters function contains a resolution parameter which sets the granularity of downstream clustering. Settings are recommended between 0.4-1.2, but may need to be increased for larger datasets.
    seurat_object <- FindClusters(seurat_object, resolution = cluster_res)
    
    #to View metadata
    #seurat_object@meta.data
    #run non-linear dimensional reduction (UMAP/tSNE)
    seurat_object <- RunUMAP(seurat_object, dims = 1:npc)
    
    rst_figures <- append(rst_figures, list(DimPlot(seurat_object, reduction = "umap") + labs(color = "cluster \n(from PCA)", title = '') + theme(text = element_text(size = 10))  )) 
    
    
    rst_figures <- append(rst_figures, list(
      FeaturePlot(seurat_object, reduction = "umap", features = 'nCount_RNA')+labs(color = "UMI count",title = '')+ theme(text = element_text(size = 10)),
    FeaturePlot(seurat_object, reduction = "umap", features = 'nFeature_RNA')+labs(color = str_wrap("Feature count (isoform/gene)",15),title = '') + theme(text = element_text(size = 10))
    ))

    plot_pc <- ElbowPlot(seurat_object)+labs(title = 'SD explained by each PC') + theme(text = element_text(size = 10))
    plot_umap <- grid.arrange( plot_pc, tableGrob(rst_table), rst_figures[[1]], rst_figures[[2]], rst_figures[[3]], ncol=2, top=textGrob(fig_name))
    
    
    list(plot_umap, seurat_object)
    
}

0.3 Part 2: Promith DATA -> BLAZE whitelsit Genes Counts

# Cellranger
plots <- plot_umap(gene_B_nonq20_pmth, min.features = 200,max.feature = 999999999, npc = 5, cluster_res = 0.7,fig_name = 'Cellranger+FLAME (Gene Counts, Q20 data)')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 803
## Number of edges: 21861
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7817
## Number of communities: 6
## Elapsed time: 0 seconds

umap_object_B <- plots[[2]]

#Find MArkers 
markers <- FindAllMarkers(object = umap_object_B, 
                          only.pos = TRUE,
                          logfc.threshold = 0.25)

write.csv(markers, "B+F_promith_Marker_genes_per_cluster.csv")

Markers_C3 <- markers %>%
  dplyr::filter(cluster == 3 & avg_log2FC > 1 & p_val_adj < 0.05)

write.csv(Markers_C3, "BLAZE_Markers_C3_lFC>1_padj<0.05.csv")


#Known Marker genes for Cortical Neuronal differntiation 
#Gene Features 
#Pax6, 
#FOXG1
#VIM
#CUX1
#VGlut1
#GFAP
FeaturePlot(umap_object_B, features = c("ENSG00000007372.23", "ENSG00000176165.10", "ENSG00000026025.16", "ENSG00000257923.11", "ENSG00000104888.10", "ENSG00000131095.13","ENSG00000124785.9"))

# Transcription factors known to be upregualted in differentiating cells. 
FeaturePlot(umap_object_B, features = c("ENSG00000162374.17", "ENSG00000171786.6", "ENSG00000124785.9", "ENSG00000148123.15"))