# load 10x barcode whitelist
whole_whitelist = []
with open("../data/3M-february-2018.txt", 'r') as f:
for line in f:
whole_whitelist.append(line.strip())
whole_whitelist = set(whole_whitelist)
# Use BC information from short reads (by Cellranger)'
# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"
cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
"BC": raw_feature_bc_matrix.get_barcode_str(),
"counts": raw_feature_bc_matrix.get_barcode_umi_count()
})
cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()
cr_count_d = cr_count_d[cr_count_d.counts > 0]
# save the UMI count per BC from short read
cr_count_d.to_csv('../data/SR_bc.csv',header=True)
The following code was run on ternimal:
PERFECT_READ=/data/projects/punim0614/yupei/project/SC_demultiplex_proj/simulation/perfect_read/1000cell_perfect_read.fa
python3 ~/github_repo/youyupei/sc_long_read_sim/bin/perfect_read_generator.py -o $PERFECT_READ -i ~/github_repo/youyupei/sc_long_read_sim/data/1000_simulated_cell_umi.csv --thread 32 -r
cd /data/projects/punim0614/yupei/project/SC_demultiplex_proj/simulation/SLsim_sample
python3 ~/github_repo/youyupei/sc_long_read_sim/bin/error_simulator.py --thread 32 --badread-identity 94,1.35,100 -o 94_1.35_100_ideal.fq -t ../perfect_read/1000cell_perfect_read.fa --badread-qscore-model ideal --batch-size 5000
NOTE: The read accruacy distribution is close to what we have observed in real data. We used ideal qscore model from badread because the other models didn't give us realistic distribution of the q score. Although the "ideal qscore" model was also not the best, it did show different distribution between the minimum Qscores of error-free putative barcodes and those from error-containing putative barcodes (This is what we have observed in the real data).
Command line output:
Loading error model from /home/youyupei/.conda/envs/blaze/lib/python3.7/site-packages/badread/error_models/nanopore2020.gz
done: loaded error distributions for 16384 7-mers
Using an ideal qscore model
Generating read identities from a beta distribution:
mean = 94% shape parameters:
max = 100% alpha = 2.8996e+02
stdev = 1.4% beta = 1.8508e+01
│ ▌
│ ▌▌
│ ▌▌
│ ▌▌▌
│ ▌▌▌▌
│ ▌▌▌▌
│ ▖▌▌▌▌
│ ▖▌▌▌▌▌▌
├─────────┬─────────┬─────────┬─────────┬─────────┐
50 60 70 80 90 100
python3 ~/github_repo/shimlab/BLAZE/bin/blaze.py --expect-cell 1000 --threads 32 template.fa
snakemake --use-conda --core all -pr all
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import collections
import upsetplot
import analysis_functions as af
# config filename:
# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"
# BLAZE output
blaze_raw_bc_fn = '../data/simulation/blaze/putative_bc.csv'
# sockeye
sock_whitelist = '../data/simulation/sockeye/whitelist.tsv'
sock_count = '../data/simulation/sockeye/uncorrected_bc_counts.tsv'
# output
out_dir = '../data/simulation/'
def reverse_complement(seq):
'''
Args: <str>
queried seq
Returns: <str>
reverse_complement seq
'''
comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
letters = \
[comp[base] if base in comp.keys() else base for base in seq]
return ''.join(letters)[::-1]
Note: In this analysis the ground truth is exactly the cell ranger result. Because all the simulation is based on the short reads barcodes and their frequencies.
# get cell ranger matrix
class cr_matrix:
def __init__(self, mat_file):
self.mat = af.get_matrix_from_h5(mat_file)
def get_barcode_umi_count(self):
'''
Get the barcode umi count (columns sum) of the matrix
return:
np.array
'''
return np.array(self.mat.matrix.sum(0)).reshape(-1)
def get_barcode_str(self, as_str = True):
'''
get the barcode stored in the matrix.hs
Option:
as_str: <bool> whether convert the barcode to str (True by default)
return:
np.array of barcodes
'''
if as_str:
return np.array([x.decode('UTF-8').split('-')[0] for x in self.mat.barcodes])
else:
return self.mat.barcodes
filtered_feature_bc_matrix = cr_matrix(filtered_matrix_h5)
raw_feature_bc_matrix = cr_matrix(raw_matrix_h5)
cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
"BC": raw_feature_bc_matrix.get_barcode_str(),
"counts": raw_feature_bc_matrix.get_barcode_umi_count()
})
cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()
# load BLAZE result and correctness of putative barcode
# blaze raw bc table
putative_bc_d = pd.read_csv(blaze_raw_bc_fn)
# add ground truth
putative_bc_d['true_BC'] = putative_bc_d.read_id.apply(lambda x: x.split('_')[0])
putative_bc_d['BC_correctness'] = putative_bc_d.putative_bc == putative_bc_d.true_BC
# Summary
print(f"""
Putative barcode accruacy: {putative_bc_d.BC_correctness.mean()*100:.2f}%
Proportion of putative barcodes with minQ >= 15: {np.mean(putative_bc_d.putative_bc_min_q>=15)*100:.2f}%
Accuracy of putative barcodes with minQ >= 15: {putative_bc_d[putative_bc_d.putative_bc_min_q>=15].BC_correctness.mean()*100:.2f}%
Proportion of putative barcodes with minQ >= 7: {np.mean(putative_bc_d.putative_bc_min_q>=7)*100:.2f}%
Accuracy of putative barcodes with minQ >= 7: {putative_bc_d[putative_bc_d.putative_bc_min_q>=7].BC_correctness.mean()*100:.2f}%
""")
Putative barcode accruacy: 38.60% Proportion of putative barcodes with minQ >= 15: 38.69% Accuracy of putative barcodes with minQ >= 15: 88.31% Proportion of putative barcodes with minQ >= 7: 42.20% Accuracy of putative barcodes with minQ >= 7: 85.27%
The default minQ threshold 15 for real data. However, the 15 seems not to be a realitic value for this simulted data. I use try 7.
# threshold for minQ
def tune_minQ(minQ, d):
raw_bc_d_HQ = d[d.putative_bc_min_q >= minQ].putative_bc.value_counts().to_frame()
raw_bc_d_HQ['bc_rank'] = list(range(1,len(raw_bc_d_HQ)+1))
raw_bc_d_HQ.reset_index(inplace=True)
raw_bc_d_HQ = raw_bc_d_HQ.rename(columns = {'index':'BC', 'raw_bc':'counts'})
rst_d = raw_bc_d_HQ[raw_bc_d_HQ.BC.isin(whole_whitelist)]
rst_d.reset_index(inplace=True)
return rst_d
# use default threshold 15
putative_bc_d_minQ15_filtered = tune_minQ(15, d = putative_bc_d)
Applying count threshold manually
# Threshold
# Default threshod of count Quantile (95-percentile of top N cell / 20)
def percentile_count_thres(count_array, exp_cells=None):
if exp_cells:
top_count = np.sort(count_array)[::-1][:exp_cells]
return np.quantile(top_count, 0.95)/20
else:
exp_cells = 0
new_exp_cells = int(len(count_array)/2)
count_array = np.sort(count_array)[::-1]
while exp_cells != new_exp_cells:
exp_cells = new_exp_cells
top_count = count_array[:exp_cells]
t = np.quantile(top_count, 0.95)/20
new_exp_cells = (count_array >= t).sum()
print(exp_cells,new_exp_cells, t)
return t
threshold = percentile_count_thres(putative_bc_d_minQ15_filtered.putative_bc,500)
# high_sensitivity threshold
def percentile_count_thres_high_sensitivity(count_array, exp_cells=None):
if exp_cells:
top_count = np.sort(count_array)[::-1][:exp_cells]
return np.quantile(top_count, 0.95)/200
else:
exp_cells = 0
new_exp_cells = int(len(count_array)/2)
count_array = np.sort(count_array)[::-1]
while exp_cells != new_exp_cells:
exp_cells = new_exp_cells
top_count = count_array[:exp_cells]
t = np.quantile(top_count, 0.95)/200
new_exp_cells = (count_array >= t).sum()
print(exp_cells,new_exp_cells, t)
return t
threshold_high_sen = percentile_count_thres_high_sensitivity(putative_bc_d_minQ15_filtered.putative_bc,500)
The expected number of cell is set to 500 but the result is not sensitive to the number:
# load sockeye whitelist
sock_whitelist = set(pd.read_csv(sock_whitelist, header = None)[0])
# sockeye raw count
sock_count = pd.read_csv(sock_count, header = None, sep = '\t', names = ['BC', 'counts'])
# sort the count
sock_count = sock_count.sort_values(by=['counts'], ascending = False)
from sklearn.metrics import precision_recall_curve
missing_bc = [1]* len(cr_filtered_bc - set(putative_bc_d_minQ15_filtered.BC))
missing_bc_count = [0] * len(missing_bc)
blaze_pre,blaze_recall , blaze_thresholds =\
precision_recall_curve([int(x in cr_filtered_bc) for x in putative_bc_d_minQ15_filtered.BC] + missing_bc,
list(putative_bc_d_minQ15_filtered.putative_bc) + missing_bc_count)
missing_bc = [1]* len(cr_filtered_bc - set(sock_count.BC))
missing_bc = [1] * len(missing_bc)
missing_bc_count = [0] * len(missing_bc)
sockeye_pre,sockeye_recall , sockeye_thresholds =\
precision_recall_curve([int(x in cr_filtered_bc) for x in sock_count.BC] + missing_bc,
list(sock_count.counts)+missing_bc_count)
from sklearn.metrics import auc
#create precision recall curve
sock_p = len(cr_filtered_bc&sock_whitelist)/len(sock_whitelist)
sock_re = len(cr_filtered_bc&sock_whitelist)/len(cr_filtered_bc)
blz_p = len(cr_filtered_bc&passed_bc)/len(passed_bc)
blz_re = len(cr_filtered_bc&passed_bc)/len(cr_filtered_bc)
blz_hi_p = len(cr_filtered_bc&passed_bc_hs)/len(passed_bc_hs)
blz_hi_re = len(cr_filtered_bc&passed_bc_hs)/len(cr_filtered_bc)
fig, ax = plt.subplots(figsize = (5,5))
ax.plot(blaze_recall, blaze_pre,
label = f'BLAZE (AUC:{auc(blaze_recall,blaze_pre):.4f})', color='c')
ax.plot(sockeye_recall, sockeye_pre,
label = f'Sockeye (AUC:{auc(sockeye_recall, sockeye_pre):.4f})', color='orange')
ax.plot([blz_re], [blz_p],'o', color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
ax.plot([sock_re], [sock_p],'o', color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
ax.legend()
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_xlim(0.75,1.01)
#display plot
plt.savefig('sim_zoomin.svg', format='svg')
plt.show()
/tmp/ipykernel_13238/3493766576.py:20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([blz_re], [blz_p],'o', color='c',markersize=6, label='BLAZE default threshold', marker = 'x' ) /tmp/ipykernel_13238/3493766576.py:21: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([sock_re], [sock_p],'o', color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' ) /tmp/ipykernel_13238/3493766576.py:22: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence. ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )