In [1]:
import pandas as pd
import numpy as np 
from tqdm import tqdm
from matplotlib import pyplot as plt
import collections
import upsetplot

import analysis_functions as af
In [2]:
# config filename:

# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"

# BLAZE output
blaze_raw_bc_fn = '../data/hiPSC_diff/blaze/gridion_lsk110_raw_bc.csv'

# sockeye
sock_whitelist = '../data/hiPSC_diff/sockeye/gridion_lsk110_whitelist.tsv'
sock_count = '../data/hiPSC_diff/sockeye/gridion_lsk110_uncorrected_bc_counts.tsv'

# this list is obtain from emtryDrops run in R script
cr_empty = {"ACTATCTAGTAGACAT",
"TCCCACACACTCCCTA",
"AATGCCATCACTGGTA",
"CCGGACACACACAGCC",
"ATGCCTCAGACTCGAG",
"CTAGACATCCCTCATG",
"ACACGCGGTCTCCTGT",
"TCATTGTGTTCTCCCA",
"CTCAACCTCCCATGGG",
"ACGGAAGCATTGAGCT",
"TCCCAGTGTCCGCAGT",
"AGTACCATCCCTCAAC",
"TAGCACATCAACTGAC",
"GAGTTACGTAACCCTA",
"TCGACCTCATGACTGT",
"TATTGGGTCCCATGGG",
"TCACTCGCATGTACGT",
"TGAGCATGTTCTCCCA",
"TCCCATGCACATACTG",
"GATGAGGTCTAAGGAA",
"TCCCATGGTCTCAAGT",
"CTGGCAGAGTATAGAC",
"CTTACCGGTACACTCA",
"AGTCACAGTGCCGTAC",
"TCGGGTGTCCCATGGG",
"TTCTGTAGTCCCTCAT",
"TATATCCCATAGCACT",
"TCCCATGGTCGTACTA",
"TGTTACTTCTAGAACC",
"GTTGTAGCACTATCCC",
"TGCTTGCTCCTCTCTT",
"GGAATCTAGTTCCGTA",
"CTTCCTTGTAGACACG",
"GATGCTACACATTCGA",
"GGGCTACCACCAGTTA",
"CAACGATTCTGACGCG",
"CAAAGAAAGTCTCTGA"}

Cellranger metrics¶

In [3]:
# get cell ranger matrix
class cr_matrix:
    def __init__(self, mat_file):
        self.mat = af.get_matrix_from_h5(mat_file)
    
    def get_barcode_umi_count(self):
        '''
        Get the barcode umi count (columns sum) of the matrix
        return:
            np.array
        '''
        return np.array(self.mat.matrix.sum(0)).reshape(-1)
    def get_barcode_str(self, as_str = True):
        '''
        get the barcode stored in the matrix.hs
        
        Option:
            as_str: <bool> whether convert the barcode to str (True by default)
        return:
            np.array of barcodes
        '''
        if as_str:
            return np.array([x.decode('UTF-8').split('-')[0] for x in self.mat.barcodes])
        else:
            return self.mat.barcodes


filtered_feature_bc_matrix = cr_matrix(filtered_matrix_h5)
raw_feature_bc_matrix = cr_matrix(raw_matrix_h5)
In [26]:
cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
    "BC": raw_feature_bc_matrix.get_barcode_str(),
    "counts": raw_feature_bc_matrix.get_barcode_umi_count()
})
all_cr_bc = set(cr_count_d.BC)
In [5]:
cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()

BLAZE¶

In [6]:
# blaze raw bc table
raw_bc_d = pd.read_csv(blaze_raw_bc_fn)

minQ distribution for raw bc with or without errors¶

In [7]:
set_true = raw_bc_d[raw_bc_d.raw_bc.isin(cr_filtered_bc)].raw_bc_min_q
set_false = raw_bc_d[~raw_bc_d.raw_bc.isin(set(cr_count_d.BC))].raw_bc_min_q
In [8]:
plt.hist(set_true, label = 'Raw BCs exectly match short read barcode', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
plt.hist(set_false,label = 'Raw BCs do not match any short read barcode', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
#plt.hist(set_false2, label = '3', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
plt.xlabel('minimum base quality')
plt.ylabel('Density')
plt.legend()
# plt.savefig('minQ_grid_nonq20.svg',format = 'svg')
plt.show()

10X BC (ground true)¶

In [9]:
whole_whitelist = []
with open("../data/3M-february-2018.txt", 'r') as f:
    for line in f:
        whole_whitelist.append(line.strip())
whole_whitelist = set(whole_whitelist)

Thresholding strategy¶

Default threshod of minQ: 15¶

In [10]:
# threshold for minQ
def tune_minQ(minQ, d):
    raw_bc_d_HQ = d[d.raw_bc_min_q >= minQ].raw_bc.value_counts().to_frame()
    raw_bc_d_HQ['bc_rank'] = list(range(1,len(raw_bc_d_HQ)+1))
    raw_bc_d_HQ.reset_index(inplace=True)
    raw_bc_d_HQ = raw_bc_d_HQ.rename(columns = {'index':'BC', 'raw_bc':'counts'})
    rst_d = raw_bc_d_HQ[raw_bc_d_HQ.BC.isin(whole_whitelist)]
    rst_d.reset_index(inplace=True)
    print(rst_d.counts.sum())
    return rst_d

# use default threshold 15 
raw_bc_d_minQ15_filtered = tune_minQ(15, d = raw_bc_d)
2826133

Default threshod of count Quantile (95-percentile of top N cell / 20)¶

In [11]:
#### Number of barcode vs Expected number of cells# 99-percentile of top N cell / 10
def percentile_count_thres(count_array, exp_cells=None):
    if exp_cells:
        top_count = np.sort(count_array)[::-1][:exp_cells]
        return np.quantile(top_count, 0.95)/20
    else:
        exp_cells = 0
        new_exp_cells = int(len(count_array)/2)
        count_array = np.sort(count_array)[::-1]
        while exp_cells != new_exp_cells:  
            exp_cells = new_exp_cells
            
            top_count = count_array[:exp_cells]
            t = np.quantile(top_count, 0.95)/20

            new_exp_cells = (count_array >= t).sum()
            print(exp_cells,new_exp_cells, t)
        return t

threshold = percentile_count_thres(raw_bc_d_minQ15_filtered.counts,500)
In [12]:
# 95-percentile of top N cell / 200
def percentile_count_thres_high_sensitivity(count_array, exp_cells=None):
    if exp_cells:
        top_count = np.sort(count_array)[::-1][:exp_cells]
        return np.quantile(top_count, 0.95)/200
    else:
        exp_cells = 0
        new_exp_cells = int(len(count_array)/2)
        count_array = np.sort(count_array)[::-1]
        while exp_cells != new_exp_cells:  
            exp_cells = new_exp_cells
            
            top_count = count_array[:exp_cells]
            t = np.quantile(top_count, 0.95)/50

            new_exp_cells = (count_array >= t).sum()
            print(exp_cells,new_exp_cells, t)
        return t

threshold_high_sen = percentile_count_thres_high_sensitivity(raw_bc_d_minQ15_filtered.counts,500)
In [30]:
threshold_high_sen
Out[30]:
44.64099999999997
In [29]:
sum(raw_bc_d_minQ15_filtered.counts > 150)
Out[29]:
862

Number of barcode vs Expected number of cells¶

In [14]:
x = np.arange(1,4000,50)
y = [sum(raw_bc_d_minQ15_filtered.counts >= percentile_count_thres(raw_bc_d_minQ15_filtered.counts,i)) for i in x ]
plt.plot(x,y)
plt.xlabel("N value (specified number of cells)")
plt.ylabel("Number of cells passed the threshold")
plt.ylim(0,1000)
# plt.savefig('tuneN_nonq20.svg', format='svg')
plt.show()

BLAZE accuracy (compared to Cellranger whitelist)¶

In [15]:
# Agreement BLAZE and cellranger
bc_pass_idx = raw_bc_d_minQ15_filtered.counts >= threshold
passed_bc = set(raw_bc_d_minQ15_filtered[bc_pass_idx].BC)
from upsetplot import from_contents
a = from_contents({'BLAZE':passed_bc, 'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.show()
In [16]:
# Barcode rank plot
fig, ax = plt.subplots(1,1, figsize = (8,5))
ax.loglog(list(raw_bc_d_minQ15_filtered.counts), marker="o", linestyle="")

ax.set_xlabel('Barcodes')
ax.set_ylabel('High confident read counts')
ax.set_title('Barcode rank plot')
ax.axhline(y=threshold, color='r', linestyle='--', label = 'cell calling threshold')
ax.legend()

fig.show()

Sockeye¶

Loading sockeye whitelist and check the agreement¶

In [ ]:
 
In [17]:
# load sockeye whitelist
sock_whitelist = set(pd.read_csv(sock_whitelist, header = None)[0])

# sockeye raw count
sock_count = pd.read_csv(sock_count, header = None, sep = '\t', names = ['BC', 'counts'])

# sort the count
sock_count = sock_count.sort_values(by=['counts'], ascending = False)

Overlap BLAZE x Sockeye x Cellranger¶

In [18]:
# upset plot
from upsetplot import from_contents

a = from_contents({'BLAZE':passed_bc, 
                   'Sockeye':sock_whitelist, 
                   'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.title("PromithIon")
# plt.savefig('upset_nonq20.svg',format='svg')
plt.show()
In [32]:
bc_pass_idx_high_sen = raw_bc_d_minQ15_filtered.counts >= threshold_high_sen
passed_bc_high_sen = set(raw_bc_d_minQ15_filtered[bc_pass_idx_high_sen].BC)
# upset plot
from upsetplot import from_contents

a = from_contents({'BLAZE HS':passed_bc_high_sen, 
                   'Sockeye':sock_whitelist, 
                   'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.title("GridIon")
plt.show()

I plot the following three categories:

  • Cat1: Found by Sockeye only
  • Cat2: Found by CellRanger only
  • Cat3: Found by Sockeye and CellRanger but not BLAZE
  • Cat4: Found by all of three methods
In [20]:
# 4 categories of cells on  range plot from three methods
cat1 = sock_whitelist-cr_filtered_bc-passed_bc
cat2 = cr_filtered_bc-sock_whitelist-passed_bc
cat3 = (cr_filtered_bc &  sock_whitelist) - passed_bc
cat4= (cr_filtered_bc &  sock_whitelist & passed_bc)
# One CellRanger Knee plot
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))

ax1.set_title('Knee plot (count using CellRanger)')
ax1.loglog(cr_count_d[~cr_count_d.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
           cr_count_d[~cr_count_d.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
           marker = 'o', linestyle="", label="Other",  alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat1)].index+1)*1.2,
           cr_count_d[cr_count_d.BC.isin(cat1)].counts,
           marker = 'o', linestyle="", label="Found by Sockeye only",  alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat2)].index+1)*1.2**2,
           cr_count_d[cr_count_d.BC.isin(cat2)].counts,
           marker = 'o', linestyle="", label="Found by CellRanger only",  alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat3)].index+1)*1.2**3,
           cr_count_d[cr_count_d.BC.isin(cat3)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE",  alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat4)].index+1),
           cr_count_d[cr_count_d.BC.isin(cat4)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE & BLAZE",  alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax1.legend()
ax1.set_xlabel('Barcodes')
ax1.set_ylabel('UNI counts')
ax1.set_xlim(1,1000000)



ax2.set_title('Knee plot (count using Sockeye)')
ax2.loglog(sock_count[~sock_count.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
           sock_count[~sock_count.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
           marker = 'o', linestyle="", label="Other",  alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat1)].index+1)*1.2,
           sock_count[sock_count.BC.isin(cat1)].counts,
           marker = 'o', linestyle="", label="Found by Sockeye only",  alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat2)].index+1)*1.2**2,
           sock_count[sock_count.BC.isin(cat2)].counts,
           marker = 'o', linestyle="", label="Found by CellRanger only",  alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat3)].index+1)*1.2**3,
           sock_count[sock_count.BC.isin(cat3)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE",  alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat4)].index+1),
           sock_count[sock_count.BC.isin(cat4)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE & BLAZE",  alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax2.legend(loc = 'lower left')
ax2.set_xlabel('Barcodes')
ax2.set_ylabel('Read counts')
ax2.set_xlim(1,1000000)


ax3.set_title('Knee plot (count using BLAZE minQ>=15)')
ax3.loglog(raw_bc_d_minQ15_filtered[~raw_bc_d_minQ15_filtered.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
           raw_bc_d_minQ15_filtered[~raw_bc_d_minQ15_filtered.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
           marker = 'o', linestyle="", label="Other",  alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat1)].index+1)*1.2,
           raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat1)].counts,
           marker = 'o', linestyle="", label="Found by Sockeye only",  alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat2)].index+1)*1.2**2,
           raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat2)].counts,
           marker = 'o', linestyle="", label="Found by CellRanger only",  alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat3)].index+1)*1.2**3,
           raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat3)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE",  alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat4)].index+1),
           raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat4)].counts,
           marker = 'o', linestyle="", label="Found by CR & SE & BLAZE",  alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax3.legend()
ax3.set_xlabel('Barcodes')
ax3.set_ylabel('Read counts')
ax3.set_xlim(1,1000000)


fig.show()

PRC plot¶

In [21]:
from sklearn.metrics import precision_recall_curve

#calculate precision and recall
# add count 0 for true BC if not exsit in the df
# ground true 
true_bc = cr_filtered_bc - cr_empty
# blaze


missing_bc =  [1]* len(true_bc - set(raw_bc_d_minQ15_filtered.BC))
missing_bc_count = [0] * len(missing_bc)
blaze_pre,blaze_recall , blaze_thresholds =\
                            precision_recall_curve([int(x in true_bc) for x in raw_bc_d_minQ15_filtered.BC] + missing_bc,
                                                   list(raw_bc_d_minQ15_filtered.counts) + missing_bc_count)

missing_bc =  [1]* len(true_bc - set(sock_count.BC))
missing_bc = [1] * len(missing_bc)
missing_bc_count = [0] * len(missing_bc)
sockeye_pre,sockeye_recall , sockeye_thresholds =\
                            precision_recall_curve([int(x in true_bc) for x in sock_count.BC] + missing_bc,
                                                   list(sock_count.counts)+missing_bc_count)
In [24]:
from sklearn.metrics import auc
#create precision recall curve

sock_p  = len(true_bc&sock_whitelist)/len(sock_whitelist)
sock_re = len(true_bc&sock_whitelist)/len(true_bc)
blz_p = len(true_bc&passed_bc)/len(passed_bc)
blz_re = len(true_bc&passed_bc)/len(true_bc)
blz_hi_p = len(true_bc&passed_bc_high_sen)/len(passed_bc_high_sen)
blz_hi_re = len(true_bc&passed_bc_high_sen)/len(true_bc)


fig, ax = plt.subplots(figsize = (5,5))
ax.plot(blaze_recall, blaze_pre, 
        label = f'BLAZE (AUC:{auc(blaze_recall,blaze_pre):.4f})', color='c')

ax.plot(sockeye_recall, sockeye_pre, 
        label = f'Sockeye (AUC:{auc(sockeye_recall, sockeye_pre):.4f})', color='orange')


ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
ax.legend()
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')



#display plot
plt.savefig('Grid_full.svg', format='svg')
plt.show()
/tmp/ipykernel_23689/1350324195.py:20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
/tmp/ipykernel_23689/1350324195.py:21: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
/tmp/ipykernel_23689/1350324195.py:22: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
In [25]:
from sklearn.metrics import auc
#create precision recall curve

sock_p  = len(true_bc&sock_whitelist)/len(sock_whitelist)
sock_re = len(true_bc&sock_whitelist)/len(true_bc)
blz_p = len(true_bc&passed_bc)/len(passed_bc)
blz_re = len(true_bc&passed_bc)/len(true_bc)
blz_hi_p = len(true_bc&passed_bc_high_sen)/len(passed_bc_high_sen)
blz_hi_re = len(true_bc&passed_bc_high_sen)/len(true_bc)


fig, ax = plt.subplots(figsize = (5,5))
ax.plot(blaze_recall, blaze_pre, 
        label = f'BLAZE (AUC:{auc(blaze_recall,blaze_pre):.4f})', color='c')

ax.plot(sockeye_recall, sockeye_pre, 
        label = f'Sockeye (AUC:{auc(sockeye_recall, sockeye_pre):.4f})', color='orange')


ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
ax.legend()
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_xlim(0.75,1.01)


#display plot
plt.savefig('Grid_zooming.svg', format='svg')
plt.show()
/tmp/ipykernel_23689/1140288598.py:20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
/tmp/ipykernel_23689/1140288598.py:21: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
/tmp/ipykernel_23689/1140288598.py:22: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )