# load 10x barcode whitelist
whole_whitelist = []
with open("../data/3M-february-2018.txt", 'r') as f:
    for line in f:
        whole_whitelist.append(line.strip())
whole_whitelist = set(whole_whitelist)

# Use BC information from short reads (by Cellranger)'

# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"

cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
    "BC": raw_feature_bc_matrix.get_barcode_str(),
    "counts": raw_feature_bc_matrix.get_barcode_umi_count()
})
cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()
cr_count_d = cr_count_d[cr_count_d.counts > 0]


# save the UMI count per BC from short read
cr_count_d.to_csv('../data/SR_bc.csv',header=True)

PERFECT_READ=/data/projects/punim0614/yupei/project/SC_demultiplex_proj/simulation/perfect_read/1000cell_perfect_read.fa

python3 ~/github_repo/youyupei/sc_long_read_sim/bin/perfect_read_generator.py -o $PERFECT_READ -i ~/github_repo/youyupei/sc_long_read_sim/data/1000_simulated_cell_umi.csv --thread 32 -r


import pandas as pd
import numpy as np 
from tqdm import tqdm
from matplotlib import pyplot as plt
import collections
import upsetplot

import analysis_functions as af


# config filename:
# Cellranger output
filtered_matrix_h5 = "../data/hiPSC_diff/cellranger/filtered_feature_bc_matrix.h5"
raw_matrix_h5 = "../data/hiPSC_diff/cellranger/raw_feature_bc_matrix.h5"

# BLAZE output
blaze_raw_bc_fn = '../data/simulation/blaze/putative_bc.csv'

# sockeye
sock_whitelist = '../data/simulation/sockeye/whitelist.tsv'
sock_count = '../data/simulation/sockeye/uncorrected_bc_counts.tsv'

# output
out_dir = '../data/simulation/'


def reverse_complement(seq):
    '''
    Args: <str>
        queried seq
    Returns: <str>
        reverse_complement seq
    '''
    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 
                    'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
    letters = \
        [comp[base] if base in comp.keys() else base for base in seq]
    return ''.join(letters)[::-1]


# get cell ranger matrix
class cr_matrix:
    def __init__(self, mat_file):
        self.mat = af.get_matrix_from_h5(mat_file)
    
    def get_barcode_umi_count(self):
        '''
        Get the barcode umi count (columns sum) of the matrix
        return:
            np.array
        '''
        return np.array(self.mat.matrix.sum(0)).reshape(-1)
    def get_barcode_str(self, as_str = True):
        '''
        get the barcode stored in the matrix.hs
        
        Option:
            as_str: <bool> whether convert the barcode to str (True by default)
        return:
            np.array of barcodes
        '''
        if as_str:
            return np.array([x.decode('UTF-8').split('-')[0] for x in self.mat.barcodes])
        else:
            return self.mat.barcodes


filtered_feature_bc_matrix = cr_matrix(filtered_matrix_h5)
raw_feature_bc_matrix = cr_matrix(raw_matrix_h5)


cr_filtered_bc = set(filtered_feature_bc_matrix.get_barcode_str())
cr_count_d = pd.DataFrame({
    "BC": raw_feature_bc_matrix.get_barcode_str(),
    "counts": raw_feature_bc_matrix.get_barcode_umi_count()
})


cr_count_d['is_cell'] = cr_count_d.BC.isin(cr_filtered_bc)
cr_count_d = cr_count_d.sort_values(by=['counts'], ascending = False).reset_index()


# load BLAZE result and correctness of putative barcode

# blaze raw bc table
putative_bc_d = pd.read_csv(blaze_raw_bc_fn)
# add ground truth
putative_bc_d['true_BC'] = putative_bc_d.read_id.apply(lambda x: x.split('_')[0])
putative_bc_d['BC_correctness'] = putative_bc_d.putative_bc == putative_bc_d.true_BC


# Summary
print(f"""
Putative barcode accruacy: {putative_bc_d.BC_correctness.mean()*100:.2f}%

Proportion of putative barcodes with minQ >= 15: {np.mean(putative_bc_d.putative_bc_min_q>=15)*100:.2f}% 
Accuracy of putative barcodes with minQ >= 15: {putative_bc_d[putative_bc_d.putative_bc_min_q>=15].BC_correctness.mean()*100:.2f}%

Proportion of putative barcodes with minQ >= 7: {np.mean(putative_bc_d.putative_bc_min_q>=7)*100:.2f}% 
Accuracy of putative barcodes with minQ >= 7: {putative_bc_d[putative_bc_d.putative_bc_min_q>=7].BC_correctness.mean()*100:.2f}%
""")

Putative barcode accruacy: 38.60%

Proportion of putative barcodes with minQ >= 15: 38.69% 
Accuracy of putative barcodes with minQ >= 15: 88.31%

Proportion of putative barcodes with minQ >= 7: 42.20% 
Accuracy of putative barcodes with minQ >= 7: 85.27%


# threshold for minQ
def tune_minQ(minQ, d):
    raw_bc_d_HQ = d[d.putative_bc_min_q >= minQ].putative_bc.value_counts().to_frame()
    raw_bc_d_HQ['bc_rank'] = list(range(1,len(raw_bc_d_HQ)+1))
    raw_bc_d_HQ.reset_index(inplace=True)
    raw_bc_d_HQ = raw_bc_d_HQ.rename(columns = {'index':'BC', 'raw_bc':'counts'})
    rst_d = raw_bc_d_HQ[raw_bc_d_HQ.BC.isin(whole_whitelist)]
    rst_d.reset_index(inplace=True)
    return rst_d

# use default threshold 15 
putative_bc_d_minQ15_filtered = tune_minQ(15, d = putative_bc_d)


# Threshold
# Default threshod of count Quantile (95-percentile of top N cell / 20)
def percentile_count_thres(count_array, exp_cells=None):
    if exp_cells:
        top_count = np.sort(count_array)[::-1][:exp_cells]
        return np.quantile(top_count, 0.95)/20
    else:
        exp_cells = 0
        new_exp_cells = int(len(count_array)/2)
        count_array = np.sort(count_array)[::-1]
        while exp_cells != new_exp_cells:  
            exp_cells = new_exp_cells
            
            top_count = count_array[:exp_cells]
            t = np.quantile(top_count, 0.95)/20

            new_exp_cells = (count_array >= t).sum()
            print(exp_cells,new_exp_cells, t)
        return t

threshold = percentile_count_thres(putative_bc_d_minQ15_filtered.putative_bc,500)

# high_sensitivity threshold
def percentile_count_thres_high_sensitivity(count_array, exp_cells=None):
    if exp_cells:
        top_count = np.sort(count_array)[::-1][:exp_cells]
        return np.quantile(top_count, 0.95)/200
    else:
        exp_cells = 0
        new_exp_cells = int(len(count_array)/2)
        count_array = np.sort(count_array)[::-1]
        while exp_cells != new_exp_cells:  
            exp_cells = new_exp_cells
            
            top_count = count_array[:exp_cells]
            t = np.quantile(top_count, 0.95)/200

            new_exp_cells = (count_array >= t).sum()
            print(exp_cells,new_exp_cells, t)
        return t

threshold_high_sen = percentile_count_thres_high_sensitivity(putative_bc_d_minQ15_filtered.putative_bc,500)


# load sockeye whitelist
sock_whitelist = set(pd.read_csv(sock_whitelist, header = None)[0])

# sockeye raw count
sock_count = pd.read_csv(sock_count, header = None, sep = '\t', names = ['BC', 'counts'])

# sort the count
sock_count = sock_count.sort_values(by=['counts'], ascending = False)


from sklearn.metrics import precision_recall_curve

missing_bc =  [1]* len(cr_filtered_bc - set(putative_bc_d_minQ15_filtered.BC))
missing_bc_count = [0] * len(missing_bc)
blaze_pre,blaze_recall , blaze_thresholds =\
                            precision_recall_curve([int(x in cr_filtered_bc) for x in putative_bc_d_minQ15_filtered.BC] + missing_bc,
                                                   list(putative_bc_d_minQ15_filtered.putative_bc) + missing_bc_count)



missing_bc =  [1]* len(cr_filtered_bc - set(sock_count.BC))
missing_bc = [1] * len(missing_bc)
missing_bc_count = [0] * len(missing_bc)
sockeye_pre,sockeye_recall , sockeye_thresholds =\
                            precision_recall_curve([int(x in cr_filtered_bc) for x in sock_count.BC] + missing_bc,
                                                   list(sock_count.counts)+missing_bc_count)


from sklearn.metrics import auc
#create precision recall curve

sock_p  = len(cr_filtered_bc&sock_whitelist)/len(sock_whitelist)
sock_re = len(cr_filtered_bc&sock_whitelist)/len(cr_filtered_bc)
blz_p = len(cr_filtered_bc&passed_bc)/len(passed_bc)
blz_re = len(cr_filtered_bc&passed_bc)/len(cr_filtered_bc)
blz_hi_p = len(cr_filtered_bc&passed_bc_hs)/len(passed_bc_hs)
blz_hi_re = len(cr_filtered_bc&passed_bc_hs)/len(cr_filtered_bc)


fig, ax = plt.subplots(figsize = (5,5))
ax.plot(blaze_recall, blaze_pre, 
        label = f'BLAZE (AUC:{auc(blaze_recall,blaze_pre):.4f})', color='c')

ax.plot(sockeye_recall, sockeye_pre, 
        label = f'Sockeye (AUC:{auc(sockeye_recall, sockeye_pre):.4f})', color='orange')


ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )
ax.legend()
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
ax.set_xlim(0.75,1.01)




#display plot
plt.savefig('sim_zoomin.svg', format='svg')
plt.show()

/tmp/ipykernel_13238/3493766576.py:20: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_re], [blz_p],'o',  color='c',markersize=6, label='BLAZE default threshold', marker = 'x' )
/tmp/ipykernel_13238/3493766576.py:21: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([sock_re], [sock_p],'o',   color='orange',markersize=6, label='Sockeye default threshold', marker = 'x' )
/tmp/ipykernel_13238/3493766576.py:22: UserWarning: marker is redundantly defined by the 'marker' keyword argument and the fmt string "o" (-> marker='o'). The keyword argument will take precedence.
  ax.plot([blz_hi_re], [blz_hi_p], 'o', color='c', markersize=6, label='BLAZE HS mode' , marker = '^' )

Simulate single cell nanopore reads¶

Running the simulation¶

generate perfect reads¶

introduce nanopore error¶

run blaze¶

run sockeye¶

Simulation data analysis¶

Prepare ground truth¶

BLAZE¶

Generate the whitelist¶

Threshod of count Quantile¶

Sockeye¶

Loading sockeye whitelist and check the agreement¶

PRC plot¶