import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import collections
import upsetplot
import analysis_functions as af
# config filename:
# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"
# BLAZE output
blaze_raw_bc_fn = '../data/hiPSC_diff/blaze/gridion_q20_raw_bc.csv'
# sockeye
sock_whitelist = '../data/hiPSC_diff/sockeye/gridion_q20_whitelist.tsv'
sock_count = '../data/hiPSC_diff/sockeye/gridion_q20_uncorrected_bc_counts.tsv'
# this list is obtain from emtryDrops run in R script
cr_empty = {"ACTATCTAGTAGACAT",
"TCCCACACACTCCCTA",
"AATGCCATCACTGGTA",
"CCGGACACACACAGCC",
"ATGCCTCAGACTCGAG",
"CTAGACATCCCTCATG",
"ACACGCGGTCTCCTGT",
"TCATTGTGTTCTCCCA",
"CTCAACCTCCCATGGG",
"ACGGAAGCATTGAGCT",
"TCCCAGTGTCCGCAGT",
"AGTACCATCCCTCAAC",
"TAGCACATCAACTGAC",
"GAGTTACGTAACCCTA",
"TCGACCTCATGACTGT",
"TATTGGGTCCCATGGG",
"TCACTCGCATGTACGT",
"TGAGCATGTTCTCCCA",
"TCCCATGCACATACTG",
"GATGAGGTCTAAGGAA",
"TCCCATGGTCTCAAGT",
"CTGGCAGAGTATAGAC",
"CTTACCGGTACACTCA",
"AGTCACAGTGCCGTAC",
"TCGGGTGTCCCATGGG",
"TTCTGTAGTCCCTCAT",
"TATATCCCATAGCACT",
"TCCCATGGTCGTACTA",
"TGTTACTTCTAGAACC",
"GTTGTAGCACTATCCC",
"TGCTTGCTCCTCTCTT",
"GGAATCTAGTTCCGTA",
"CTTCCTTGTAGACACG",
"GATGCTACACATTCGA",
"GGGCTACCACCAGTTA",
"CAACGATTCTGACGCG",
"CAAAGAAAGTCTCTGA"}
# get cell ranger matrix
class cr_matrix:
def __init__(self, mat_file):
self.mat = af.get_matrix_from_h5(mat_file)
def get_barcode_umi_count(self):
'''
Get the barcode umi count (columns sum) of the matrix
return:
np.array
'''
return np.array(self.mat.matrix.sum(0)).reshape(-1)
def get_barcode_str(self, as_str = True):
'''
get the barcode stored in the matrix.hs
Option:
as_str: <bool> whether convert the barcode to str (True by default)
return:
np.array of barcodes
'''
if as_str:
return np.array([x.decode('UTF-8').split('-')[0] for x in self.mat.barcodes])
else:
return self.mat.barcodes
filtered_feature_bc_matrix = cr_matrix(filtered_matrix_h5)
raw_feature_bc_matrix = cr_matrix(raw_matrix_h5)
cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
"BC": raw_feature_bc_matrix.get_barcode_str(),
"counts": raw_feature_bc_matrix.get_barcode_umi_count()
})
cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()
# blaze raw bc table
raw_bc_d = pd.read_csv(blaze_raw_bc_fn)
set_true = raw_bc_d[raw_bc_d.raw_bc.isin(cr_filtered_bc)].raw_bc_min_q
set_false = raw_bc_d[~raw_bc_d.raw_bc.isin(set(cr_count_d.BC))].raw_bc_min_q
plt.hist(set_true, label = 'Raw BCs exectly match short read barcode', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
plt.hist(set_false,label = 'Raw BCs do not match any short read barcode', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
#plt.hist(set_false2, label = '3', density = True, bins = np.arange(0,40,3), alpha = 0.5, histtype='step')
plt.xlabel('minimum base quality')
plt.ylabel('Density')
plt.legend()
#plt.savefig('minQ_gird_q20.svg',format = 'svg')
plt.show()
whole_whitelist = []
with open("../data/3M-february-2018.txt", 'r') as f:
for line in f:
whole_whitelist.append(line.strip())
whole_whitelist = set(whole_whitelist)
# threshold for minQ
def tune_minQ(minQ, d):
raw_bc_d_HQ = d[d.raw_bc_min_q >= minQ].raw_bc.value_counts().to_frame()
raw_bc_d_HQ['bc_rank'] = list(range(1,len(raw_bc_d_HQ)+1))
raw_bc_d_HQ.reset_index(inplace=True)
raw_bc_d_HQ = raw_bc_d_HQ.rename(columns = {'index':'BC', 'raw_bc':'counts'})
rst_d = raw_bc_d_HQ[raw_bc_d_HQ.BC.isin(whole_whitelist)]
rst_d.reset_index(inplace=True)
print(rst_d.counts.sum())
return rst_d
# use default threshold 15
raw_bc_d_minQ15_filtered = tune_minQ(15, d = raw_bc_d)
1862658
# 95-percentile of top N cell / 20
def percentile_count_thres(count_array, exp_cells=None):
if exp_cells:
top_count = np.sort(count_array)[::-1][:exp_cells]
return np.quantile(top_count, 0.95)/20
else:
exp_cells = 0
new_exp_cells = int(len(count_array)/2)
count_array = np.sort(count_array)[::-1]
while exp_cells != new_exp_cells:
exp_cells = new_exp_cells
top_count = count_array[:exp_cells]
t = np.quantile(top_count, 0.95)/20
new_exp_cells = (count_array >= t).sum()
print(exp_cells,new_exp_cells, t)
return t
threshold = percentile_count_thres(raw_bc_d_minQ15_filtered.counts,500)
# 95-percentile of top N cell / 200
def percentile_count_thres_high_sensitivity(count_array, exp_cells=None):
if exp_cells:
top_count = np.sort(count_array)[::-1][:exp_cells]
return np.quantile(top_count, 0.95)/200
else:
exp_cells = 0
new_exp_cells = int(len(count_array)/2)
count_array = np.sort(count_array)[::-1]
while exp_cells != new_exp_cells:
exp_cells = new_exp_cells
top_count = count_array[:exp_cells]
t = np.quantile(top_count, 0.95)/50
new_exp_cells = (count_array >= t).sum()
print(exp_cells,new_exp_cells, t)
return t
threshold_high_sen = percentile_count_thres_high_sensitivity(raw_bc_d_minQ15_filtered.counts,500)
x = np.arange(1,10000,50)
y = [sum(raw_bc_d_minQ15_filtered.counts >= percentile_count_thres(raw_bc_d_minQ15_filtered.counts,i)) for i in x ]
plt.plot(x,y)
plt.xlabel("N value (specified number of cells)")
plt.ylabel("Number of cells passed the threshold")
plt.ylim(0,1000)
#plt.savefig('tuneN_q20.svg', format='svg')
plt.show()
# Agreement BLAZE and cellranger
bc_pass_idx = raw_bc_d_minQ15_filtered.counts >= threshold
passed_bc = set(raw_bc_d_minQ15_filtered[bc_pass_idx].BC)
from upsetplot import from_contents
a = from_contents({'BLAZE':passed_bc, 'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.show()
# Barcode rank plot
fig, ax = plt.subplots(1,1, figsize = (8,5))
ax.loglog(list(raw_bc_d_minQ15_filtered.counts), marker="o", linestyle="")
ax.set_xlabel('Barcodes')
ax.set_ylabel('High confident read counts')
ax.set_title('Barcode rank plot')
ax.axhline(y=threshold, color='r', linestyle='--', label = 'cell calling threshold')
ax.legend()
fig.show()
# load sockeye whitelist
sock_whitelist = set(pd.read_csv(sock_whitelist, header = None)[0])
# sockeye raw count
sock_count = pd.read_csv(sock_count, header = None, sep = '\t', names = ['BC', 'counts'])
# sort the count
sock_count = sock_count.sort_values(by=['counts'], ascending = False)
# upset plot
from upsetplot import from_contents
a = from_contents({'BLAZE':passed_bc,
'Sockeye':sock_whitelist,
'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.title("PromithIon")
# plt.savefig('upset_q20.svg',format='svg')
plt.show()
bc_pass_idx_high_sen = raw_bc_d_minQ15_filtered.counts >= threshold_high_sen
passed_bc_high_sen = set(raw_bc_d_minQ15_filtered[bc_pass_idx_high_sen].BC)
# upset plot
from upsetplot import from_contents
a = from_contents({'BLAZE HS':passed_bc_high_sen,
'Sockeye':sock_whitelist,
'CellRanger':cr_filtered_bc})
upsetplot.plot(a, show_counts=True)
plt.title("Q20")
plt.show()
I plot the following three categories:
# # upset plot
# from upsetplot import from_contents
# a = from_contents({'Q20':q20_cat2,
# 'GridION':grid_cat2,
# 'PromethION':pmth_cat2})
# upsetplot.plot(a, show_counts=True)
# plt.title("CellRange-specific cells")
# # plt.savefig('upset_q20.svg',format='svg')
# plt.show()
# 4 categories of cells on range plot from three methods
cat1 = sock_whitelist-cr_filtered_bc-passed_bc
cat2 = cr_filtered_bc-sock_whitelist-passed_bc
cat3 = (cr_filtered_bc & sock_whitelist) - passed_bc
cat4= (cr_filtered_bc & sock_whitelist & passed_bc)
# One CellRanger Knee plot
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))
ax1.set_title('Knee plot (count using CellRanger)')
ax1.loglog(cr_count_d[~cr_count_d.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
cr_count_d[~cr_count_d.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
marker = 'o', linestyle="", label="Other", alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat1)].index+1)*1.2,
cr_count_d[cr_count_d.BC.isin(cat1)].counts,
marker = 'o', linestyle="", label="Found by Sockeye only", alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat2)].index+1)*1.2**2,
cr_count_d[cr_count_d.BC.isin(cat2)].counts,
marker = 'o', linestyle="", label="Found by CellRanger only", alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat3)].index+1)*1.2**3,
cr_count_d[cr_count_d.BC.isin(cat3)].counts,
marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE", alpha = 0.5, markersize=6)
ax1.loglog((cr_count_d[cr_count_d.BC.isin(cat4)].index+1),
cr_count_d[cr_count_d.BC.isin(cat4)].counts,
marker = 'o', linestyle="", label="Found by CR & SE & BLAZE", alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax1.legend()
ax1.set_xlabel('Barcodes')
ax1.set_ylabel('UNI counts')
ax1.set_xlim(1,1000000)
ax2.set_title('Knee plot (count using Sockeye)')
ax2.loglog(sock_count[~sock_count.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
sock_count[~sock_count.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
marker = 'o', linestyle="", label="Other", alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat1)].index+1)*1.2,
sock_count[sock_count.BC.isin(cat1)].counts,
marker = 'o', linestyle="", label="Found by Sockeye only", alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat2)].index+1)*1.2**2,
sock_count[sock_count.BC.isin(cat2)].counts,
marker = 'o', linestyle="", label="Found by CellRanger only", alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat3)].index+1)*1.2**3,
sock_count[sock_count.BC.isin(cat3)].counts,
marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE", alpha = 0.5, markersize=6)
ax2.loglog((sock_count[sock_count.BC.isin(cat4)].index+1),
sock_count[sock_count.BC.isin(cat4)].counts,
marker = 'o', linestyle="", label="Found by CR & SE & BLAZE", alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax2.legend(loc = 'lower left')
ax2.set_xlabel('Barcodes')
ax2.set_ylabel('Read counts')
ax2.set_xlim(1,1000000)
ax3.set_title('Knee plot (count using BLAZE minQ>=15)')
ax3.loglog(raw_bc_d_minQ15_filtered[~raw_bc_d_minQ15_filtered.BC.isin(cat1 | cat2 | cat3 | cat4)].index+1,
raw_bc_d_minQ15_filtered[~raw_bc_d_minQ15_filtered.BC.isin(cat1 | cat2 | cat3 | cat4)].counts,
marker = 'o', linestyle="", label="Other", alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat1)].index+1)*1.2,
raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat1)].counts,
marker = 'o', linestyle="", label="Found by Sockeye only", alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat2)].index+1)*1.2**2,
raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat2)].counts,
marker = 'o', linestyle="", label="Found by CellRanger only", alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat3)].index+1)*1.2**3,
raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat3)].counts,
marker = 'o', linestyle="", label="Found by CR & SE but not BLAZE", alpha = 0.5, markersize=6)
ax3.loglog((raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat4)].index+1),
raw_bc_d_minQ15_filtered[raw_bc_d_minQ15_filtered.BC.isin(cat4)].counts,
marker = 'o', linestyle="", label="Found by CR & SE & BLAZE", alpha = 0.5, markersize=6)
#ax1.axhline(y=threshold, color='r', linestyle='--', label = 'BLAZE cell calling threshold')
ax3.legend()
ax3.set_xlabel('Barcodes')
ax3.set_ylabel('Read counts')
ax3.set_xlim(1,1000000)
fig.show()
from sklearn.metrics import precision_recall_curve
#calculate precision and recall
# add count 0 for true BC if not exsit in the df
# ground true
true_bc = cr_filtered_bc - cr_empty
# blaze
missing_bc = [1]* len(true_bc - set(raw_bc_d_minQ15_filtered.BC))
missing_bc_count = [0] * len(missing_bc)
blaze_pre,blaze_recall , blaze_thresholds =\
precision_recall_curve([int(x in true_bc) for x in raw_bc_d_minQ15_filtered.BC] + missing_bc,
list(raw_bc_d_minQ15_filtered.counts) + missing_bc_count)
missing_bc = [1]* len(true_bc - set(sock_count.BC))
missing_bc = [1] * len(missing_bc)
missing_bc_count = [0] * len(missing_bc)
sockeye_pre,sockeye_recall , sockeye_thresholds =\
precision_recall_curve([int(x in true_bc) for x in sock_count.BC] + missing_bc,
list(sock_count.counts)+missing_bc_count)
from sklearn.metrics import auc
#create precision recall curve
sock_p = len(true_bc&sock_whitelist)/len(sock_whitelist)
sock_re = len(true_bc&sock_whitelist)/len(true_bc)
blz_p = len(true_bc&passed_bc)/len(passed_bc)
blz_re = len(true_bc&passed_bc)/len(true_bc)
blz_hi_p = len(true_bc&passed_bc_high_sen)/len(passed_bc_high_sen)
blz_hi_re = len(true_bc&passed_bc_high_sen)/len(true_bc)
fig, ax = plt.subplots(figsize = (5,5))
ax.plot(blaze_recall, blaze_pre,
label = f'BLAZE (AUC:{auc(blaze_recall,blaze_pre):.4f})', color='c')
ax.plot(sockeye_recall, sockeye_pre,
label = f'Sockeye (AUC:{auc(sockeye_recall, sockeye_pre):.4f})', color='orange')
ax.plot([blz_re], [blz_p],'o', color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
ax.plot([sock_re], [sock_p],'o', color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
ax.legend()
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_xlim(0.75,1.01)
#display plot
plt.savefig('Q20_zooming.svg', format='svg')
plt.show()
/tmp/ipykernel_4682/3268917005.py:20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([blz_re], [blz_p],'o', color='c',markersize=6, label='BLAZE default threshold', marker = 'x' ) /tmp/ipykernel_4682/3268917005.py:21: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([sock_re], [sock_p],'o', color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' ) /tmp/ipykernel_4682/3268917005.py:22: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )