[6]:
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
# import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# import matplotlib
# matplotlib.rcParams['pdf.fonttype'] = 42
# matplotlib.rcParams['ps.fonttype'] = 42
import svgwrite
pd.options.mode.chained_assignment = None  # Turns off the warning

[2]:
df = pd.read_parquet("CHANGEseqR.mis6.parquet")
[3]:
df
[3]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13252726 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGATTTTTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 16, 18, 20] 18 18 C T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTTTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252730 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTGAGGGTTATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [3, 10, 16, 20] 3 3 C G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTGAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252732 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGCTTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 14, 16, 20] 14 14 A C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGCTTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252734 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTAAGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 11, 16, 20] 11 11 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTAAGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252735 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTAATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [9, 10, 16, 20] 9 9 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTAATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG

3921277 rows × 22 columns

[4]:
df = df[df.MM<=3]
[5]:
df
[5]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13252726 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGATTTTTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 16, 18, 20] 18 18 C T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTTTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252730 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTGAGGGTTATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [3, 10, 16, 20] 3 3 C G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTGAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252732 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGCTTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 14, 16, 20] 14 14 A C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGCTTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252734 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTAAGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 11, 16, 20] 11 11 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTAAGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG
13252735 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTAATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [9, 10, 16, 20] 9 9 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTAATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG

2602927 rows × 22 columns

[11]:

mm_scores={'rU:dT,12': 0.8, 'rU:dT,13': 0.692307692, 'rU:dC,5': 0.64, 'rG:dA,14': 0.26666666699999997, 'rG:dG,19': 0.448275862, 'rG:dG,18': 0.47619047600000003, 'rG:dG,15': 0.272727273, 'rG:dG,14': 0.428571429, 'rG:dG,17': 0.235294118, 'rG:dG,16': 0.0, 'rC:dC,20': 0.058823529000000006, 'rG:dT,20': 0.9375, 'rG:dG,13': 0.42105263200000004, 'rG:dG,12': 0.529411765, 'rU:dC,6': 0.571428571, 'rU:dG,14': 0.28571428600000004, 'rU:dT,18': 0.666666667, 'rA:dG,13': 0.21052631600000002, 'rA:dG,12': 0.263157895, 'rA:dG,11': 0.4, 'rA:dG,10': 0.333333333, 'rA:dA,19': 0.538461538, 'rA:dA,18': 0.5, 'rA:dG,15': 0.272727273, 'rA:dG,14': 0.214285714, 'rA:dA,15': 0.2, 'rA:dA,14': 0.533333333, 'rA:dA,17': 0.133333333, 'rA:dA,16': 0.0, 'rA:dA,11': 0.307692308, 'rA:dA,10': 0.882352941, 'rA:dA,13': 0.3, 'rA:dA,12': 0.333333333, 'rG:dA,13': 0.3, 'rG:dA,12': 0.384615385, 'rG:dA,11': 0.384615385, 'rG:dA,10': 0.8125, 'rG:dA,17': 0.25, 'rG:dA,16': 0.0, 'rG:dA,15': 0.14285714300000002, 'rG:dA,6': 0.666666667, 'rG:dG,20': 0.428571429, 'rG:dA,19': 0.666666667, 'rG:dA,18': 0.666666667, 'rU:dC,4': 0.625, 'rG:dT,12': 0.933333333, 'rG:dT,13': 0.923076923, 'rU:dG,11': 0.666666667, 'rC:dA,3': 0.6875, 'rC:dA,2': 0.9090909090000001, 'rC:dA,1': 1.0, 'rC:dA,7': 0.8125, 'rC:dA,6': 0.9285714290000001, 'rC:dA,5': 0.636363636, 'rC:dA,4': 0.8, 'rC:dA,9': 0.875, 'rC:dA,8': 0.875, 'rU:dT,6': 0.8666666670000001, 'rA:dG,20': 0.22727272699999998, 'rG:dT,18': 0.692307692, 'rU:dG,10': 0.533333333, 'rG:dT,19': 0.7142857140000001, 'rG:dA,20': 0.7, 'rC:dT,20': 0.5, 'rU:dC,2': 0.84, 'rG:dG,10': 0.4, 'rC:dA,17': 0.46666666700000003, 'rC:dA,16': 0.307692308, 'rC:dA,15': 0.066666667, 'rC:dA,14': 0.7333333329999999, 'rC:dA,13': 0.7, 'rC:dA,12': 0.538461538, 'rC:dA,11': 0.307692308, 'rC:dA,10': 0.9411764709999999, 'rG:dG,11': 0.428571429, 'rU:dC,20': 0.176470588, 'rG:dG,3': 0.384615385, 'rC:dA,19': 0.46153846200000004, 'rC:dA,18': 0.642857143, 'rU:dG,17': 0.705882353, 'rU:dG,16': 0.666666667, 'rU:dG,15': 0.272727273, 'rG:dG,2': 0.692307692, 'rU:dG,13': 0.7894736840000001, 'rU:dG,12': 0.947368421, 'rG:dA,9': 0.533333333, 'rG:dA,8': 0.625, 'rG:dA,7': 0.571428571, 'rG:dG,5': 0.7857142859999999, 'rG:dA,5': 0.3, 'rG:dA,4': 0.363636364, 'rG:dA,3': 0.5, 'rG:dA,2': 0.636363636, 'rG:dA,1': 1.0, 'rG:dG,4': 0.529411765, 'rG:dG,1': 0.7142857140000001, 'rA:dC,9': 0.666666667, 'rG:dG,7': 0.6875, 'rG:dT,5': 0.8666666670000001, 'rU:dT,20': 0.5625, 'rC:dC,15': 0.05, 'rC:dC,14': 0.0, 'rC:dC,17': 0.058823529000000006, 'rC:dC,16': 0.153846154, 'rC:dC,11': 0.25, 'rC:dC,10': 0.38888888899999996, 'rC:dC,13': 0.13636363599999998, 'rC:dC,12': 0.444444444, 'rC:dA,20': 0.3, 'rC:dC,19': 0.125, 'rC:dC,18': 0.133333333, 'rA:dA,1': 1.0, 'rA:dA,3': 0.705882353, 'rA:dA,2': 0.727272727, 'rA:dA,5': 0.363636364, 'rA:dA,4': 0.636363636, 'rA:dA,7': 0.4375, 'rA:dA,6': 0.7142857140000001, 'rA:dA,9': 0.6, 'rA:dA,8': 0.428571429, 'rU:dG,20': 0.090909091, 'rC:dC,9': 0.6190476189999999, 'rC:dC,8': 0.642857143, 'rU:dT,10': 0.857142857, 'rU:dT,11': 0.75, 'rU:dT,16': 0.9090909090000001, 'rU:dT,17': 0.533333333, 'rU:dT,14': 0.6190476189999999, 'rU:dT,15': 0.578947368, 'rC:dC,1': 0.913043478, 'rU:dT,3': 0.7142857140000001, 'rC:dC,3': 0.5, 'rC:dC,2': 0.695652174, 'rC:dC,5': 0.6, 'rC:dC,4': 0.5, 'rC:dC,7': 0.470588235, 'rC:dC,6': 0.5, 'rU:dT,4': 0.47619047600000003, 'rU:dT,8': 0.8, 'rU:dT,9': 0.9285714290000001, 'rA:dC,19': 0.375, 'rA:dC,18': 0.4, 'rA:dC,17': 0.176470588, 'rA:dC,16': 0.192307692, 'rA:dC,15': 0.65, 'rA:dC,14': 0.46666666700000003, 'rA:dC,13': 0.6521739129999999, 'rA:dC,12': 0.7222222220000001, 'rA:dC,11': 0.65, 'rA:dC,10': 0.5555555560000001, 'rU:dC,7': 0.588235294, 'rC:dT,8': 0.65, 'rC:dT,9': 0.857142857, 'rC:dT,6': 0.9285714290000001, 'rC:dT,7': 0.75, 'rC:dT,4': 0.842105263, 'rC:dT,5': 0.571428571, 'rC:dT,2': 0.727272727, 'rC:dT,3': 0.8666666670000001, 'rC:dT,1': 1.0, 'rA:dC,8': 0.7333333329999999, 'rU:dT,1': 1.0, 'rU:dC,3': 0.5, 'rU:dC,1': 0.956521739, 'rU:dT,2': 0.846153846, 'rU:dG,19': 0.275862069, 'rG:dT,14': 0.75, 'rG:dT,15': 0.9411764709999999, 'rG:dT,16': 1.0, 'rG:dT,17': 0.933333333, 'rG:dT,10': 0.933333333, 'rG:dT,11': 1.0, 'rA:dG,9': 0.571428571, 'rA:dG,8': 0.428571429, 'rA:dG,7': 0.4375, 'rA:dG,6': 0.454545455, 'rA:dG,5': 0.5, 'rA:dG,4': 0.352941176, 'rA:dG,3': 0.428571429, 'rA:dG,2': 0.7857142859999999, 'rA:dG,1': 0.857142857, 'rU:dT,5': 0.5, 'rG:dT,2': 0.846153846, 'rA:dC,3': 0.611111111, 'rA:dC,20': 0.764705882, 'rG:dT,1': 0.9, 'rG:dT,6': 1.0, 'rG:dT,7': 1.0, 'rG:dT,4': 0.9, 'rC:dT,19': 0.428571429, 'rG:dG,9': 0.538461538, 'rG:dG,8': 0.615384615, 'rG:dT,8': 1.0, 'rG:dT,9': 0.642857143, 'rU:dG,18': 0.428571429, 'rU:dT,7': 0.875, 'rG:dG,6': 0.681818182, 'rA:dA,20': 0.6, 'rU:dC,9': 0.6190476189999999, 'rA:dG,17': 0.176470588, 'rU:dC,8': 0.7333333329999999, 'rA:dG,16': 0.0, 'rA:dG,19': 0.20689655199999998, 'rG:dT,3': 0.75, 'rU:dG,3': 0.428571429, 'rU:dG,2': 0.857142857, 'rU:dG,1': 0.857142857, 'rA:dG,18': 0.19047619, 'rU:dG,7': 0.6875, 'rU:dG,6': 0.9090909090000001, 'rU:dG,5': 1.0, 'rU:dG,4': 0.647058824, 'rU:dG,9': 0.923076923, 'rU:dG,8': 1.0, 'rU:dC,19': 0.25, 'rU:dC,18': 0.333333333, 'rU:dC,13': 0.260869565, 'rU:dC,12': 0.5, 'rU:dC,11': 0.4, 'rU:dC,10': 0.5, 'rU:dC,17': 0.117647059, 'rU:dC,16': 0.346153846, 'rU:dC,15': 0.05, 'rU:dC,14': 0.0, 'rC:dT,10': 0.8666666670000001, 'rC:dT,11': 0.75, 'rC:dT,12': 0.7142857140000001, 'rC:dT,13': 0.384615385, 'rC:dT,14': 0.35, 'rC:dT,15': 0.222222222, 'rC:dT,16': 1.0, 'rC:dT,17': 0.46666666700000003, 'rC:dT,18': 0.538461538, 'rA:dC,2': 0.8, 'rA:dC,1': 1.0, 'rA:dC,7': 0.705882353, 'rA:dC,6': 0.7142857140000001, 'rA:dC,5': 0.72, 'rA:dC,4': 0.625, 'rU:dT,19': 0.28571428600000004} pam_scores={'AA': 0.0, 'AC': 0.0, 'GT': 0.016129031999999998, 'AG': 0.25925925899999996, 'CC': 0.0, 'CA': 0.0, 'CG': 0.107142857, 'TT': 0.0, 'GG': 1.0, 'GC': 0.022222222000000003, 'AT': 0.0, 'GA': 0.06944444400000001, 'TG': 0.038961038999999996, 'TA': 0.0, 'TC': 0.0, 'CT': 0.0} def reverse_complement(seq): base_rev_comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A','U':'A'} rc_seq = '' for base in seq: rc_seq += base_rev_comp[base] return rc_seq[::-1] def calc_cfd(wt,sg,pam): score = 1 sg = sg.replace('T','U') wt = wt.replace('T','U') s_list = list(sg) wt_list = list(wt) for i,sl in enumerate(s_list): if wt_list[i] == sl: score *= 1 else: key = 'r' + wt_list[i] + ':d' + reverse_complement(sl) + ',' + str(i+1) score *= mm_scores[key] score *= pam_scores[pam] return 100 * score
[8]:
def per_row_cfd(r):
    ref_PAM = r.control_target_seq[-2:]
    alt_PAM = r.variant_seq[-2:]
    ref = calc_cfd(r.on_target[:20],r.control_target_seq[:20],ref_PAM)
    alt = calc_cfd(r.on_target[:20],r.variant_seq[:20],alt_PAM)
    return alt-ref
df['delta_CFD'] = df.swifter.apply(per_row_cfd,axis=1)
[9]:
df
[9]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13252726 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGATTTTTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 16, 18, 20] 18 18 C T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTTTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG 0.000000
13252730 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTGAGGGTTATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [3, 10, 16, 20] 3 3 C G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTGAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG 0.000000
13252732 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTATGGCTTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 14, 16, 20] 14 14 A C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGCTTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG 0.000000
13252734 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTTAAGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [10, 11, 16, 20] 11 11 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTAAGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG 0.000000
13252735 TRAC.s1 3 GTCAGGGTTATGGATTTCTATGG 0.008117 GTCAGGGTAATGGATTTCTATGG 0.000000 4 -0.008117 0.000000 [10, 16, 20] [9, 10, 16, 20] 9 9 T A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGGTAATGGATTTCTATGG 0.008117 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTATGGATTTCTATGG 0.000000

2602927 rows × 23 columns

[14]:
df = pd.read_csv("changeseq_variant_activity_mm6_20241213.csv")
df = df[df.MM<df.variant_MM]
df.mismatch_positions = df.mismatch_positions.apply(eval)
df['var_pos'] = df['variant_position']
df['REF'] = df.swifter.apply(lambda r:r.control_target_seq[r.var_pos-1],axis=1)
df['ALT'] = df.swifter.apply(lambda r:r.variant_seq[r.var_pos-1],axis=1)
gRNA = {'AAVS1.s14': 'GGGGCCACTAGGGACAGGATNGG',
        'CTLA4.s9': 'GGACTGAGGGCCATGGACACNGG',
        'TRAC.s1': 'GTCAGGGTTCTGGATATCTGNGG',
        'LAG3.s9': 'GAAGGCTGAGATCCTGGAGGNGG',
        'CXCR4.s8': 'GTCCCCTGAGCCCATTTCCTNGG',
        'CCR5.s8': 'GGACAGTAAGAAGGAAAAACNGG'}

df['target_seq'] = df.site.map(gRNA)

df['realigned_target_seq'] = df.target_seq
df['seq'] = df.variant_seq
df['bulged_seq'] = ""
df['reads'] = df.activity_diff.abs()

df = df.drop(['Unnamed: 0'],axis=1)
df['on_target'] = df.target_seq
df['off'] = df.control_target_seq
def per_row_cfd(r):
    ref_PAM = r.control_target_seq[-2:]
    alt_PAM = r.variant_seq[-2:]
    ref = calc_cfd(r.on_target[:20],r.control_target_seq[:20],ref_PAM)
    alt = calc_cfd(r.on_target[:20],r.variant_seq[:20],alt_PAM)
    return alt-ref
df['delta_CFD'] = df.swifter.apply(per_row_cfd,axis=1)
df.to_parquet(f'CHANGEseqR.mis6.all_diff.parquet', engine='pyarrow')


[7]:
df = pd.read_parquet("CHANGEseqR.mis6.all_diff.parquet")
[8]:
df.head()
[8]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231
[12]:
def per_row_cfd_ratio(r):
    ref_PAM = r.control_target_seq[-2:]
    alt_PAM = r.variant_seq[-2:]
    ref = calc_cfd(r.on_target[:20],r.control_target_seq[:20],ref_PAM)
    alt = calc_cfd(r.on_target[:20],r.variant_seq[:20],alt_PAM)
    return (alt+1)/(ref+1)
df['ratio_CFD'] = df.swifter.apply(per_row_cfd_ratio,axis=1)
[13]:
def activity_ratio2(r):
    return (r.variant_relative_activity_percent+1)/(r.relative_activity_percent+1)
df['ratio_activity'] = df.swifter.apply(activity_ratio2,axis=1)
[14]:
df['ratio_CFD_log2'] = df['ratio_CFD'].apply(np.log2)
df['ratio_activity_log2'] = df['ratio_activity'].apply(np.log2)
[15]:
df.head()
[15]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD ratio_activity ratio_CFD ratio_CFD_log2 ratio_activity_log2
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769 0.108236 0.934681 -0.097454 -3.207752
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378 0.049956 0.643716 -0.635505 -4.323194
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485 0.017420 0.397057 -1.332582 -5.843132
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692 0.028134 0.877527 -0.188484 -5.151562
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231 0.020018 0.580093 -0.785643 -5.642592
[16]:
df['activity_ratio_log2'] = df.activity_ratio.apply(np.log2)
[17]:
df[['ratio_activity_log2','ratio_CFD_log2']].corr()
[17]:
ratio_activity_log2 ratio_CFD_log2
ratio_activity_log2 1.000000 0.258499
ratio_CFD_log2 0.258499 1.000000
[18]:
df[['activity_diff','delta_CFD']].corr()
[18]:
activity_diff delta_CFD
activity_diff 1.000000 0.362286
delta_CFD 0.362286 1.000000
[33]:
df.activity_diff.describe()
[33]:
count    5.052996e+06
mean    -2.969259e+00
std      8.534274e+00
min     -2.791489e+02
25%     -2.984412e+00
50%     -8.731075e-01
75%     -6.343969e-02
max      2.216153e+02
Name: activity_diff, dtype: float64
[35]:
sns.violinplot(df.activity_diff)
plt.yscale("log")
../_images/jupyter_notebooks_suprising_results_19_0.png
[81]:
df.head()
[81]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD abs_diff ratio_CFD ratio_activity ratio_CFD_log2 ratio_activity_log2
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769 96.769231 0.934681 0.108236 -0.097454 -3.207752
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378 82.377622 0.643716 0.049956 -0.635505 -4.323194
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485 70.177515 0.397057 0.017420 -1.332582 -5.843132
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692 93.942308 0.877527 0.028134 -0.188484 -5.151562
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231 79.230769 0.580093 0.020018 -0.785643 -5.642592
[19]:
def define_suprising_category(r):
    if abs(r.activity_diff)<10:
        return "Skip"
    if r.ratio_CFD_log2<=-2:
        if r.ratio_activity_log2<=-2:
            return "Captured_high_impact"
        elif    -2<r.ratio_activity_log2<2:
            return "Surprising_low_impact"
        else:
            return "Surprising_increase"
    else:
        if r.ratio_activity_log2<=-2:
            return "Surprising_high_impact"
        elif    -2<r.ratio_activity_log2<2:
            return "Captured_low_impact"
        else:
            return "Surprising_increase"
df['suprising_category'] = df.swifter.apply(define_suprising_category,axis=1)
[20]:
df.ratio_activity_log2.describe()
[20]:
count    5.052996e+06
mean    -7.236468e-01
std      9.448308e-01
min     -7.424908e+00
25%     -1.207113e+00
50%     -5.389062e-01
75%     -5.889734e-02
max      5.155138e+00
Name: ratio_activity_log2, dtype: float64
[21]:
df.groupby(['site','suprising_category']).size()
[21]:
site       suprising_category
AAVS1.s14  Captured_high_impact        17150
           Captured_low_impact         47419
           Skip                      1257750
           Surprising_high_impact      68822
           Surprising_increase           644
           Surprising_low_impact        6313
CCR5.s8    Captured_high_impact         3536
           Captured_low_impact          3716
           Skip                       815377
           Surprising_high_impact       9468
           Surprising_increase            77
           Surprising_low_impact         419
CTLA4.s9   Captured_high_impact         8259
           Captured_low_impact          4388
           Skip                       680830
           Surprising_high_impact      26173
           Surprising_increase            17
           Surprising_low_impact         346
LAG3.s9    Captured_high_impact        10089
           Captured_low_impact         15353
           Skip                       935553
           Surprising_high_impact      42830
           Surprising_increase           101
           Surprising_low_impact        1464
TRAC.s1    Captured_high_impact         3635
           Captured_low_impact          4145
           Skip                      1074562
           Surprising_high_impact      14106
           Surprising_increase            55
           Surprising_low_impact         399
dtype: int64
[22]:
df_3 = df[df.MM<=3]
[30]:
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
# import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# import matplotlib
# matplotlib.rcParams['pdf.fonttype'] = 42
# matplotlib.rcParams['ps.fonttype'] = 42
import svgwrite
pd.options.mode.chained_assignment = None  # Turns off the warning

boxWidth = 10
box_size = 15
v_spacing = 3

# colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#B3B3B3'}
for c in ['Y','S','W','K','M','B','D','H','V','.']:
    colors[c] = "#B3B3B3"

def find_PAM(seq,PAM):
    PAM_index=20
    return PAM_index


def is_transition(base1, base2):
    """Check if the mismatch is a transition (purine ↔ purine or pyrimidine ↔ pyrimidine)."""
    purines = {'A', 'G'}
    pyrimidines = {'C', 'T'}
    return ((base1 in purines and base2 in purines) or
            (base1 in pyrimidines and base2 in pyrimidines))
def visualizeOfftargets_variant_effect(offtargets, target_seq, outfile, title, PAM):
    # Initiate canvas
    genome=None
    total_seq = len(offtargets)
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(box_size*25+200, total_seq*(box_size + 1)*2+100))
    if title is not None:
        # Define top and left margins
        x_offset = 20
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Helvetica"))
    else:
        # Define top and left margins
        x_offset = 20
        y_offset = 20

    tick_locations = []
    tick_legend = []
    # PAM_index = target_seq.index(PAM)
    PAM_index = find_PAM(target_seq,PAM)
    count = 0
    for i in range(PAM_index,0,-1):
        count = count+1
        if count % 10 == 0:
            tick_legend.append(count)
            # print (count,i)
            tick_locations.append(i)
    if len(PAM)>=3:
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
    else:
        tick_legend+=["PAM"]+['-']*(len(PAM)-3)
    tick_locations+=range(PAM_index+1,len(target_seq)+1)
    if PAM_index == 0:
        tick_legend = []
        tick_locations = []
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
        tick_locations+=range(1,len(PAM)+1)
        count = 0
        for i in range(len(PAM)+1,len(target_seq)+1):
            count = count+1
            if count % 10 == 0 or count == 1:
                tick_legend.append(count)
                # print (count,i)
                tick_locations.append(i)
    # print (zip(tick_locations, tick_legend))
    for x,y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Helvetica"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:15px; font-family:Helvetica"))

    dwg.add(dwg.text('Variant Effect', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:15px; font-family:Helvetica"))
    # dwg.add(dwg.text('Mismatches', insert=(box_size * (len(target_seq) + 1) + 150, y_offset + box_size - 3), style="font-size:15px; font-family:Helvetica"))
    # dwg.add(dwg.text('Coordinates', insert=(box_size * (len(target_seq) + 1) + 250, y_offset + box_size - 3), style="font-size:15px; font-family:Helvetica"))
    if genome!=None:
        dwg.add(dwg.text('Annotation', insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size - 3), style="font-size:15px; font-family:Helvetica"))

    # Draw aligned sequence rows
    y_offset += 0  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences



    for j, seq in enumerate(offtargets):
        realigned_target_seq = offtargets[j]['realigned_target_seq']
        realigned_target_seq = list(realigned_target_seq)
        realigned_target_seq[-3]=="N"
        realigned_target_seq = "".join(realigned_target_seq)

        no_bulge_offtarget_sequence = offtargets[j]['seq']
        bulge_offtarget_sequence = offtargets[j]['bulged_seq']
        target_seq = offtargets[j]['target_seq']
        target_seq = list(target_seq)
        target_seq[-3]="N"
        # print (target_seq)
        target_seq = "".join(target_seq)
        # print (target_seq)
        REF = offtargets[j]['REF']
        try:
            strand = offtargets[j]['strand']
        except:
            strand = "+"
        ALT = offtargets[j]['ALT']
        var_pos = offtargets[j]['var_pos']


        if no_bulge_offtarget_sequence != '':
            k = 0
            line_number += 2
            y = y_offset + line_number * box_size
            for i, (c, r) in enumerate(zip(no_bulge_offtarget_sequence, target_seq)):
                x = x_offset + k * box_size
                if var_pos == i+1:
                    # print (seq['coord'],i,var_pos)
                    if strand == "-":
                        REF = revcomp(REF)
                        ALT = revcomp(ALT)
                    tmp = box_size/2
                    # dwg.add(dwg.rect((x+1, tmp + y), (box_size, box_size*2), fill="grey",fill_opacity=0.2, stroke="black", stroke_width=1))
                    if r!=REF:
                        dwg.add(dwg.rect((x, 3 * tmp + y-box_size), (box_size, box_size), fill=colors[REF],fill_opacity=0.99))
                    dwg.add(dwg.text(REF, insert=(x + 3, 3 * tmp + y - 3), fill='black', style="font-size:15px; font-family:Helvetica"))
                    if r!=ALT:
                        dwg.add(dwg.rect((x, 5 * tmp + y-box_size), (box_size, box_size), fill=colors[ALT],fill_opacity=0.99))
                    dwg.add(dwg.text(ALT, insert=(x + 3, 5 * tmp + y - 3), fill='black', style="font-size:15px; font-family:Helvetica;font-weight:bold"))

                if r == '-':
                    if 0 < k < len(target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Helvetica"))
                elif c == r:
                    if var_pos != i+1:
                        # dwg.add(dwg.text(".", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:20px; font-family:Helvetica;font-weight:bold"))
                        dwg.add(dwg.text(".", insert=(x + 5.5, 1.75 * box_size + y-3), fill='black', style="font-size:20px; font-family:Helvetica;font-weight:bold"))
                    k += 1
                elif r == 'N':
                    # dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Helvetica"))
                    dwg.add(dwg.text(".", insert=(x + 5.5, 1.75 * box_size + y-3), fill='black', style="font-size:20px; font-family:Helvetica;font-weight:bold"))
                    k += 1
                else:
                    if var_pos != i+1:
                        # if is_transition(r, c):
                        #     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#ce3bff"))
                        # else:
                        #     dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#fa9d46"))
                        dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
                    k += 1
        if bulge_offtarget_sequence != '':
            k = 0
            line_number += 2
            y = y_offset + line_number * box_size
            for i, (c, r) in enumerate(zip(bulge_offtarget_sequence, realigned_target_seq)):
                x = x_offset + k * box_size
                if var_pos == i+1:
                    # print (seq['coord'],i,var_pos)
                    if strand == "-":
                        REF = revcomp(REF)
                        ALT = revcomp(ALT)
                    tmp = box_size/2
                    # dwg.add(dwg.rect((x+1, tmp + y), (box_size, box_size*2), fill="grey",fill_opacity=0.2, stroke="black", stroke_width=1))
                    if r!=REF:
                        dwg.add(dwg.rect((x, 3 * tmp + y-box_size), (box_size, box_size), fill=colors[REF],fill_opacity=0.99))
                    dwg.add(dwg.text(REF, insert=(x + 3, 3 * tmp + y - 3), fill='black', style="font-size:15px; font-family:Helvetica;font-weight:bold"))
                    if r!=ALT:
                        dwg.add(dwg.rect((x, 5 * tmp + y-box_size), (box_size, box_size), fill=colors[ALT],fill_opacity=0.99))
                    dwg.add(dwg.text(ALT, insert=(x + 3, 5 * tmp + y - 3), fill='black', style="font-size:15px; font-family:Helvetica;font-weight:bold"))

                if r == '-':
                    if 0 < k < len(realigned_target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Helvetica"))
                elif c == r:
                    dwg.add(dwg.text(".", insert=(x + 5.5, 1.75 * box_size + y-3), fill='black', style="font-size:20px; font-family:Helvetica;font-weight:bold"))
                    k += 1
                elif r == 'N':
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:15px; font-family:Helvetica"))
                    k += 1
                else:
                    if var_pos != i+1:
                        if is_transition(r, c):
                            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#ce3bff"))
                        else:
                            dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill="#fa9d46"))
                    k += 1

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            reads_text = dwg.text(seq['reads'], insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
                                  fill='black', style="font-size:15px; font-family:Helvetica")
            dwg.add(reads_text)

            # Calculate the width of the horizontal bar based on read count
            # max_bar_width = 100  # Set a maximum width for the bar
            # bar_width = (seq['reads'] / 50) * max_bar_width  # Scale bar width by read count
            # bar_height = 20  # Fixed height for each horizontal bar

            # Position the bar beside the text
            # bar_x = box_size * (len(target_seq) + 1) + 50  # Adjust position to be beside the text
            # bar_y = y_offset + box_size * (line_number + 2)- bar_height

            # Draw the horizontal bar plot as a rectangle
            # bar = dwg.rect(
                # insert=(bar_x, bar_y),
                # size=(bar_width, bar_height),
                # fill="blue"
            # )
            # dwg.add(bar)


            if genome!= None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size * (line_number + 2) - 2),
                                      fill='black', style="font-size:15px; font-family:Helvetica")
                dwg.add(annot_text)
        else:
            reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
                                  fill='black', style="font-size:15px; font-family:Helvetica")
            dwg.add(reads_text)
            if genome!= None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size * (line_number + 1) + 5),
                                      fill='black', style="font-size:15px; font-family:Helvetica")
                dwg.add(annot_text)
            reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
                                  fill='black', style="font-size:23px; font-family:Helvetica")
            dwg.add(reads_text02)
    dwg.save()
# offtargets=[r.to_dict() for i,r in low.sample(n=10).sort_values("reads",ascending=False).iterrows()]
# visualizeOfftargets_variant_effect(offtargets,"AGTGCTTGAGCCCATTTCCTNGG", f"CHANGEseqR.MM{s}", f"CHANGEseqR.MM{s}", "NGG")
[45]:
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import scipy.stats as sts
import gzip as gz
import pysam
import pandas as pd
import seaborn as sns
# import matplotlib.pyplot as plt
import numpy as np
import scipy
import glob
from joblib import Parallel, delayed
import sys
import argparse
import plotly.express as px
import plotly.io as pio
import os
from Bio.Seq import Seq
from Bio import SeqIO
import swifter
import sys
import plotly.express as px
from Levenshtein import distance
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# import matplotlib
# matplotlib.rcParams['pdf.fonttype'] = 42
# matplotlib.rcParams['ps.fonttype'] = 42
import svgwrite
pd.options.mode.chained_assignment = None  # Turns off the warning

boxWidth = 10
box_size = 15
v_spacing = 3

# colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#FFFFFF'}
colors = {'G': '#F5F500', 'A': '#FF5454', 'T': '#00D118', 'C': '#26A8FF', 'N': '#B3B3B3', 'R': '#B3B3B3', '-': '#B3B3B3'}
for c in ['Y','S','W','K','M','B','D','H','V','.']:
    colors[c] = "#B3B3B3"

def find_PAM(seq,PAM):
    PAM_index=20
    return PAM_index


def is_transition(base1, base2):
    """Check if the mismatch is a transition (purine ↔ purine or pyrimidine ↔ pyrimidine)."""
    purines = {'A', 'G'}
    pyrimidines = {'C', 'T'}
    return ((base1 in purines and base2 in purines) or
            (base1 in pyrimidines and base2 in pyrimidines))
def visualizeOfftargets_seqvis_reads(offtargets, target_seq, outfile, title, PAM):
    # Initiate canvas
    genome=None
    total_seq = len(offtargets)
    dwg = svgwrite.Drawing(outfile + '.svg', profile='full', size=(box_size*25+200, total_seq*(box_size + 1)*2+100))
    if title is not None:
        # Define top and left margins
        x_offset = 20
        y_offset = 50
        dwg.add(dwg.text(title, insert=(x_offset, 30), style="font-size:20px; font-family:Courier"))
    else:
        # Define top and left margins
        x_offset = 20
        y_offset = 20

    tick_locations = []
    tick_legend = []
    # PAM_index = target_seq.index(PAM)
    PAM_index = find_PAM(target_seq,PAM)
    count = 0
    for i in range(PAM_index,0,-1):
        count = count+1
        if count % 10 == 0:
            tick_legend.append(count)
            # print (count,i)
            tick_locations.append(i)
    if len(PAM)>=3:
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
    else:
        tick_legend+=["PAM"]+['-']*(len(PAM)-3)
    tick_locations+=range(PAM_index+1,len(target_seq)+1)
    if PAM_index == 0:
        tick_legend = []
        tick_locations = []
        tick_legend+=['P', 'A', 'M']+['-']*(len(PAM)-3)
        tick_locations+=range(1,len(PAM)+1)
        count = 0
        for i in range(len(PAM)+1,len(target_seq)+1):
            count = count+1
            if count % 10 == 0 or count == 1:
                tick_legend.append(count)
                # print (count,i)
                tick_locations.append(i)
    # print (zip(tick_locations, tick_legend))
    for x,y in zip(tick_locations, tick_legend):
        dwg.add(dwg.text(y, insert=(x_offset + (x - 1) * box_size + 2, y_offset - 2), style="font-size:10px; font-family:Courier"))

    # Draw reference sequence row
    for i, c in enumerate(target_seq):
        y = y_offset
        x = x_offset + i * box_size
        dwg.add(dwg.rect((x, y), (box_size, box_size), fill=colors[c]))
        dwg.add(dwg.text(c, insert=(x + 3, y + box_size - 3), fill='black', style="font-size:10px; font-family:Courier"))

    dwg.add(dwg.text('User given info', insert=(x_offset + box_size * len(target_seq) + 16, y_offset + box_size - 3), style="font-size:10px; font-family:Courier"))
    # dwg.add(dwg.text('Mismatches', insert=(box_size * (len(target_seq) + 1) + 150, y_offset + box_size - 3), style="font-size:10px; font-family:Courier"))
    # dwg.add(dwg.text('Coordinates', insert=(box_size * (len(target_seq) + 1) + 250, y_offset + box_size - 3), style="font-size:10px; font-family:Courier"))
    if genome!=None:
        dwg.add(dwg.text('Annotation', insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size - 3), style="font-size:10px; font-family:Courier"))

    # Draw aligned sequence rows
    y_offset += 0  # leave some extra space after the reference row
    line_number = 0  # keep track of plotted sequences



    for j, seq in enumerate(offtargets):
        realigned_target_seq = offtargets[j]['realigned_target_seq']
        realigned_target_seq = list(realigned_target_seq)
        realigned_target_seq[-3]=="N"
        realigned_target_seq = "".join(realigned_target_seq)

        no_bulge_offtarget_sequence = offtargets[j]['seq']
        bulge_offtarget_sequence = offtargets[j]['bulged_seq']
        target_seq = offtargets[j]['target_seq']
        target_seq = list(target_seq)
        target_seq[-3]="N"
        # print (target_seq)
        target_seq = "".join(target_seq)
        # print (target_seq)
        # REF = offtargets[j]['REF']
        # try:
        #     strand = offtargets[j]['strand']
        # except:
        #     strand = "+"
        # ALT = offtargets[j]['ALT']
        # var_pos = offtargets[j]['var_pos']
        var_pos=999

        if no_bulge_offtarget_sequence != '':
            k = 0
            line_number += 1
            y = y_offset + line_number * box_size
            for i, (c, r) in enumerate(zip(no_bulge_offtarget_sequence, target_seq)):
                x = x_offset + k * box_size

                if r == '-':
                    if 0 < k < len(target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
                elif c == r:
                    dwg.add(dwg.text(".", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                elif r == 'N':
                    # dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:10px; font-family:Courier"))
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                else:
                    dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
                    dwg.add(dwg.text(c, insert=(x+3, 2 * box_size + y - 3), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
        if bulge_offtarget_sequence != '':
            k = 0
            line_number += 2
            y = y_offset + line_number * box_size
            for i, (c, r) in enumerate(zip(bulge_offtarget_sequence, realigned_target_seq)):
                x = x_offset + k * box_size

                if r == '-':
                    if 0 < k < len(realigned_target_seq):
                        x = x_offset + (k - 0.25) * box_size
                        dwg.add(dwg.rect((x, box_size * 1.4 + y), (box_size*0.6, box_size*0.6), fill=colors[c]))
                        dwg.add(dwg.text(c, insert=(x+1, 2 * box_size + y - 2), fill='black', style="font-size:10px; font-family:Courier"))
                elif c == r:
                    dwg.add(dwg.text(".", insert=(x + 4.5, 2 * box_size + y - 4), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                elif r == 'N':
                    dwg.add(dwg.text(c, insert=(x + 3, 2 * box_size + y - 3), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1
                else:
                    dwg.add(dwg.rect((x, box_size + y), (box_size, box_size), fill=colors[c]))
                    dwg.add(dwg.text(c, insert=(x+3, 2 * box_size + y - 3), fill='black', style="font-size:10px; font-family:Courier"))
                    k += 1

        if no_bulge_offtarget_sequence == '' or bulge_offtarget_sequence == '':
            reads_text = dwg.text(seq['reads'], insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 2) - 2),
                                  fill='black', style="font-size:10px; font-family:Courier")
            dwg.add(reads_text)


            if genome!= None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size * (line_number + 2) - 2),
                                      fill='black', style="font-size:10px; font-family:Courier")
                dwg.add(annot_text)
        else:
            reads_text = dwg.text(str(seq['reads']), insert=(box_size * (len(target_seq) + 1) + 20, y_offset + box_size * (line_number + 1) + 5),
                                  fill='black', style="font-size:10px; font-family:Courier")
            dwg.add(reads_text)
            if genome!= None:
                annot_text = dwg.text(seq['annot'], insert=(box_size * (len(target_seq) + 1) + 500, y_offset + box_size * (line_number + 1) + 5),
                                      fill='black', style="font-size:10px; font-family:Courier")
                dwg.add(annot_text)
            reads_text02 = dwg.text(u"\u007D", insert=(box_size * (len(target_seq) + 1) + 7, y_offset + box_size * (line_number + 1) + 5),
                                  fill='black', style="font-size:23px; font-family:Courier")
            dwg.add(reads_text02)
    dwg.save()
# offtargets=[r.to_dict() for i,r in low.sample(n=10).sort_values("reads",ascending=False).iterrows()]
# visualizeOfftargets_variant_effect(offtargets,"AGTGCTTGAGCCCATTTCCTNGG", f"CHANGEseqR.MM{s}", f"CHANGEseqR.MM{s}", "NGG")
[ ]:
        # realigned_target_seq = offtargets[j]['realigned_target_seq']
        # no_bulge_offtarget_sequence = offtargets[j]['seq']
        # bulge_offtarget_sequence = offtargets[j]['bulged_seq']
        # target_seq = offtargets[j]['target_seq']
        # REF = offtargets[j]['REF']
        # ALT = offtargets[j]['ALT']
        # var_pos = offtargets[j]['var_pos']
[32]:
for s,d in df_3.groupby(['site','MM','suprising_category']):
    d['reads'] = d.relative_activity_percent.astype(int).astype(str)+"_"+d.variant_relative_activity_percent.astype(int).astype(str)+"_"+d.delta_CFD.astype(int).astype(str)
    if d.shape[0]>20:
        offtargets=[r.to_dict() for i,r in d.sample(n=20).iterrows()]
    else:
        offtargets=[r.to_dict() for i,r in d.iterrows()]
    visualizeOfftargets_variant_effect(offtargets,d.target_seq.tolist()[0], f"{s[0]}.MM{s[1]}.{s[2]}", f"{s[0]}.MM{s[1]}.{s[2]}", "NGG")
[46]:
visualizeOfftargets_seqvis_reads(offtargets,d.target_seq.tolist()[0],"test", "test", "NGG")
[24]:
sns.barplot(data=merged[merged.suprising_category!="Skip"],hue="suprising_category",x="site",y="percentage")
[24]:
<Axes: xlabel='site', ylabel='percentage'>
../_images/jupyter_notebooks_suprising_results_30_1.png
[102]:
sns.barplot(data=merged[merged.suprising_category!="Skip"],hue="suprising_category",x="site",y="percentage")
[102]:
<Axes: xlabel='site', ylabel='percentage'>
../_images/jupyter_notebooks_suprising_results_31_1.png
[106]:
merged.groupby("suprising_category").percentage.mean()
[106]:
suprising_category
Captured_high_impact       1.274
Captured_low_impact        1.976
Skip                      91.984
Surprising_high_impact     4.518
Surprising_increase        0.014
Surprising_low_impact      0.236
Name: percentage, dtype: float64
[110]:
from Bio import motifs
from Bio.Seq import Seq
[114]:
import logomaker
def tologo(m,s,outfile,title):
    X=pd.DataFrame.from_dict(m.pwm,orient="index").T
    for i in range(len(s)):
        if s[i]=="N":
            continue
        X.at[i,s[i]]=0
    df_normalized = X.div(X.sum(axis=1), axis=0)
    logo = logomaker.Logo(df_normalized,
                              shade_below=.5,
                              fade_below=.5)

    # style using Logo methods
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True)
    logo.style_xticks(rotation=90, fmt='%d', anchor=0)

    # style using Axes methods
    logo.ax.set_ylabel("MisMatch Frequency", labelpad=-1)
    logo.ax.xaxis.set_ticks_position('none')
    logo.ax.xaxis.set_tick_params(pad=-1)
    logo.ax.set_title(title)
    logo.ax.set_xticks(range(23))  # adjust the number based on your data
    logo.ax.set_xticklabels(list(s), rotation=0)
    plt.savefig(outfile,bbox_inches='tight')


[33]:
df2.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [33], in <cell line: 1>()
----> 1 df2.head()

NameError: name 'df2' is not defined
[116]:
gRNA = {'AAVS1.s14': 'GGGGCCACTAGGGACAGGATNGG',
        'CTLA4.s9': 'GGACTGAGGGCCATGGACACNGG',
        'TRAC.s1': 'GTCAGGGTTCTGGATATCTGNGG',
        'LAG3.s9': 'GAAGGCTGAGATCCTGGAGGNGG',
        'CXCR4.s8': 'GTCCCCTGAGCCCATTTCCTNGG',
        'CCR5.s8': 'GGACAGTAAGAAGGAAAAACNGG'}
for s,d in df2.groupby("site"):
    size=d.shape[0]
    seq=gRNA[s]
    d=d[d.control_target_seq.str.len()==23]
    m = motifs.create([Seq(x) for x in d['control_target_seq']])
    # m.weblogo(f"{s}.CHANGE-seq-R.png",logo_title=f"N={size}")
    # from IPython.display import Image
    # Image(f"{s}.CHANGE-seq-R.png")
    tologo(m,seq,f"{s}.CHANGE-seq-R.seqlogo.png",f"N={d.shape[0]}")
../_images/jupyter_notebooks_suprising_results_36_0.png
../_images/jupyter_notebooks_suprising_results_36_1.png
../_images/jupyter_notebooks_suprising_results_36_2.png
../_images/jupyter_notebooks_suprising_results_36_3.png
../_images/jupyter_notebooks_suprising_results_36_4.png
../_images/jupyter_notebooks_suprising_results_36_5.png
[115]:
df2
[115]:
Unnamed: 0 site MM control_target_seq Cas9_counts_rep1 Cas9_counts_rep2 Cas9_counts_scaled_rep1 Cas9_counts_scaled_rep2 control_counts_rep1 control_counts_rep2 control_counts_scaled_rep1 control_counts_scaled_rep2 log2FC_rep1 log2FC_rep2 delta_log2FC FC_rep1 FC_rep2 average_log2FC spacer PAM relative_activity_percent
0 8000 AAVS1.s14 3 AGAGCCACTAGGAACAGGATCGG 736.0 516.0 61.769835 65.073526 3.0 3.0 0.104740 0.104740 9.206384 9.281429 -0.075044 589.741982 621.283672 9.244394 AGAGCCACTAGGAACAGGAT CGG 1291.337166
1 79968 AAVS1.s14 3 TCGACCACTAGGGACAGGATGGG 130.0 93.0 10.910433 11.728368 1.0 1.0 0.034913 0.034913 8.292318 8.396291 -0.103973 312.499148 335.926636 8.345241 TCGACCACTAGGGACAGGAT GGG 691.427396
2 1606 AAVS1.s14 2 GGGACCACTAGGGACAGGATGGT 88.0 65.0 7.385524 8.197246 1.0 1.0 0.034913 0.034913 7.731576 7.881343 -0.149767 211.537885 234.787434 7.808402 GGGACCACTAGGGACAGGAT GGT 475.924247
3 384446 AAVS1.s14 4 TCCGCCACTAGGGACAGCATTGG 81.0 69.0 6.798039 8.701692 1.0 1.0 0.034913 0.034913 7.612581 7.967145 -0.354564 194.711008 249.235892 7.800728 TCCGCCACTAGGGACAGCAT TGG 473.388098
4 7583 AAVS1.s14 2 TGGGCCACTAAGGACAGGATAGG 145.0 89.0 12.169329 11.223922 2.0 2.0 0.069827 0.069827 7.453504 7.337525 0.115980 174.278371 160.739089 7.396680 TGGGCCACTAAGGACAGGAT AGG 357.234792
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19121147 19121147 TRAC.s1 21 TGACTAAGGAACCCGGAAACAAG 0.0 0.0 0.000000 0.000000 3.0 3.0 0.118482 0.118482 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 TGACTAAGGAACCCGGAAAC AAG 0.000000
19121148 19121148 TRAC.s1 21 TGACTAAGGGCGTTGGAAATGAC 0.0 0.0 0.000000 0.000000 1.0 1.0 0.039494 0.039494 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 TGACTAAGGGCGTTGGAAAT GAC 0.000000
19121149 19121149 TRAC.s1 21 TGAGCCTAAGATCTTGGAGAGTT 0.0 0.0 0.000000 0.000000 15.0 15.0 0.592411 0.592411 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 TGAGCCTAAGATCTTGGAGA GTT 0.000000
19121150 19121150 TRAC.s1 22 AGGCTTAGCGCCATGGAGATGTC 0.0 0.0 0.000000 0.000000 1.0 1.0 0.039494 0.039494 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 AGGCTTAGCGCCATGGAGAT GTC 0.000000
19121151 19121151 TRAC.s1 22 TGTCTAAGGGATATGGATACGAT 0.0 0.0 0.000000 0.000000 13.0 13.0 0.513423 0.513423 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 TGTCTAAGGGATATGGATAC GAT 0.000000

19121152 rows × 21 columns

[111]:
for s,d in df2.groupby("site"):
    size=d.shape[0]
    d=d[d.control_target_seq.str.len()==23]
    m = motifs.create([Seq(x) for x in d['control_target_seq']])
    m.weblogo(f"{s}.CHANGE-seq-R.png",logo_title=f"N={size}")
    from IPython.display import Image
    Image(f"{s}.CHANGE-seq-R.png")
[112]:
Image(f"{s}.CHANGE-seq-R.png")
[112]:
../_images/jupyter_notebooks_suprising_results_39_0.png
[23]:
# Group by site and surprising_category, then count
counts = df_3.groupby(['site', 'suprising_category']).size().reset_index(name='count')

# Get total per site
total_per_site = df_3.groupby('site').size().reset_index(name='total')

# Merge the counts with the totals
merged = counts.merge(total_per_site, on='site')

# Calculate the percentage
merged['percentage'] = merged['count'] / merged['total'] * 100

# Optional: format percentage to 2 decimal places
merged['percentage'] = merged['percentage'].round(2)
merged
[23]:
site suprising_category count total percentage
0 AAVS1.s14 Captured_high_impact 15567 793711 1.96
1 AAVS1.s14 Captured_low_impact 42342 793711 5.33
2 AAVS1.s14 Skip 674193 793711 84.94
3 AAVS1.s14 Surprising_high_impact 55450 793711 6.99
4 AAVS1.s14 Surprising_increase 321 793711 0.04
5 AAVS1.s14 Surprising_low_impact 5838 793711 0.74
6 CCR5.s8 Captured_high_impact 3315 527292 0.63
7 CCR5.s8 Captured_low_impact 3509 527292 0.67
8 CCR5.s8 Skip 511544 527292 97.01
9 CCR5.s8 Surprising_high_impact 8465 527292 1.61
10 CCR5.s8 Surprising_increase 59 527292 0.01
11 CCR5.s8 Surprising_low_impact 400 527292 0.08
12 CTLA4.s9 Captured_high_impact 8242 488245 1.69
13 CTLA4.s9 Captured_low_impact 4381 488245 0.90
14 CTLA4.s9 Skip 449138 488245 91.99
15 CTLA4.s9 Surprising_high_impact 26123 488245 5.35
16 CTLA4.s9 Surprising_increase 17 488245 0.00
17 CTLA4.s9 Surprising_low_impact 344 488245 0.07
18 LAG3.s9 Captured_high_impact 9962 634116 1.57
19 LAG3.s9 Captured_low_impact 15118 634116 2.38
20 LAG3.s9 Skip 565541 634116 89.19
21 LAG3.s9 Surprising_high_impact 41983 634116 6.62
22 LAG3.s9 Surprising_increase 64 634116 0.01
23 LAG3.s9 Surprising_low_impact 1448 634116 0.23
24 TRAC.s1 Captured_high_impact 3571 684641 0.52
25 TRAC.s1 Captured_low_impact 4081 684641 0.60
26 TRAC.s1 Skip 662697 684641 96.79
27 TRAC.s1 Surprising_high_impact 13856 684641 2.02
28 TRAC.s1 Surprising_increase 49 684641 0.01
29 TRAC.s1 Surprising_low_impact 387 684641 0.06
[ ]:

[83]:
df
[83]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD abs_diff ratio_CFD ratio_activity ratio_CFD_log2 ratio_activity_log2
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769 96.769231 0.934681 0.108236 -0.097454 -3.207752
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378 82.377622 0.643716 0.049956 -0.635505 -4.323194
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485 70.177515 0.397057 0.017420 -1.332582 -5.843132
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692 93.942308 0.877527 0.028134 -0.188484 -5.151562
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231 79.230769 0.580093 0.020018 -0.785643 -5.642592
[95]:
sns.scatterplot(data=df[df.activity_diff.abs()>=10][['ratio_activity_log2','ratio_CFD_log2','site']],x="ratio_activity_log2",y="ratio_CFD_log2",hue="site")
plt.axvline(x=2, color='grey', linestyle='--')
plt.axvline(x=-2, color='grey', linestyle='--')
plt.axhline(y=-2, color='grey', linestyle='--')

[95]:
<matplotlib.lines.Line2D at 0x155073ebf670>
../_images/jupyter_notebooks_suprising_results_43_1.png
[84]:
sns.scatterplot(data=df[df.activity_diff.abs()>=10][['ratio_activity_log2','ratio_CFD_log2','site']].sample(n=10000),x="ratio_activity_log2",y="ratio_CFD_log2",hue="site")
plt.axvline(x=2, color='grey', linestyle='--')
plt.axvline(x=-2, color='grey', linestyle='--')
plt.axhline(y=-2, color='grey', linestyle='--')

[84]:
<matplotlib.lines.Line2D at 0x155073ce5160>
../_images/jupyter_notebooks_suprising_results_44_1.png
[ ]:

[32]:
sns.scatterplot(data=df[['activity_diff','delta_CFD','site']].sample(n=10000),x="activity_diff",y="delta_CFD",hue="site")
[32]:
<Axes: xlabel='activity_diff', ylabel='delta_CFD'>
../_images/jupyter_notebooks_suprising_results_46_1.png
[34]:
f="/research_jude/rgs01_jude/groups/tsaigrp/projects/Genomics/common/users/Jackie/CHANGE-seq-R-analysis/20250303_Ashley_CXCR4_250224_VH00889_244_AAGCHCVM5_oldpipeline/results_log2FC/combined_scaled_log1p_df_pivot_activity_20250325.csv"
df2 = pd.read_csv(f)
[18]:
df2.head()
[18]:
Unnamed: 0 site MM control_target_seq Cas9_counts_rep1 Cas9_counts_rep2 Cas9_counts_scaled_rep1 Cas9_counts_scaled_rep2 control_counts_rep1 control_counts_rep2 control_counts_scaled_rep1 control_counts_scaled_rep2 log2FC_rep1 log2FC_rep2 delta_log2FC FC_rep1 FC_rep2 average_log2FC spacer PAM relative_activity_percent
0 8000 AAVS1.s14 3 AGAGCCACTAGGAACAGGATCGG 736.0 516.0 61.769835 65.073526 3.0 3.0 0.104740 0.104740 9.206384 9.281429 -0.075044 589.741982 621.283672 9.244394 AGAGCCACTAGGAACAGGAT CGG 1291.337166
1 79968 AAVS1.s14 3 TCGACCACTAGGGACAGGATGGG 130.0 93.0 10.910433 11.728368 1.0 1.0 0.034913 0.034913 8.292318 8.396291 -0.103973 312.499148 335.926636 8.345241 TCGACCACTAGGGACAGGAT GGG 691.427396
2 1606 AAVS1.s14 2 GGGACCACTAGGGACAGGATGGT 88.0 65.0 7.385524 8.197246 1.0 1.0 0.034913 0.034913 7.731576 7.881343 -0.149767 211.537885 234.787434 7.808402 GGGACCACTAGGGACAGGAT GGT 475.924247
3 384446 AAVS1.s14 4 TCCGCCACTAGGGACAGCATTGG 81.0 69.0 6.798039 8.701692 1.0 1.0 0.034913 0.034913 7.612581 7.967145 -0.354564 194.711008 249.235892 7.800728 TCCGCCACTAGGGACAGCAT TGG 473.388098
4 7583 AAVS1.s14 2 TGGGCCACTAAGGACAGGATAGG 145.0 89.0 12.169329 11.223922 2.0 2.0 0.069827 0.069827 7.453504 7.337525 0.115980 174.278371 160.739089 7.396680 TGGGCCACTAAGGACAGGAT AGG 357.234792
[22]:
df2.site.unique()
[22]:
array(['AAVS1.s14', 'CCR5.s8', 'CTLA4.s9', 'CXCR4.s8', 'LAG3.s9',
       'TRAC.s1'], dtype=object)
[24]:
df2[df2.site=="CTLA4.s9"].groupby("MM").control_counts_rep1.describe().head()
[24]:
count mean std min 25% 50% 75% max
MM
0 4.0 2627.500000 3734.267041 626.0 711.50 829.0 2745.00 8226.0
1 264.0 396.174242 543.041610 2.0 80.75 146.0 369.75 2521.0
2 7006.0 78.626463 105.057933 1.0 12.00 37.0 97.00 924.0
3 65819.0 23.703216 27.764936 1.0 5.00 14.0 33.00 377.0
4 222382.0 13.459646 15.672668 1.0 3.00 8.0 18.00 208.0
[37]:
df3 = df[(df.activity_diff<=-50)|(df.activity_diff>=20)]
[41]:
sns.scatterplot(data=df3,x="activity_diff",y="delta_CFD",hue="site")
[41]:
<Axes: xlabel='activity_diff', ylabel='delta_CFD'>
../_images/jupyter_notebooks_suprising_results_52_1.png
[42]:
df.head()
[42]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231
[52]:
def get_abs_diff(r):
    if r.activity_diff<=-100:
        change_seq_r_diff = -100
    else:
        change_seq_r_diff=r.activity_diff
    return r.delta_CFD-change_seq_r_diff
df['abs_diff'] = df.swifter.apply(get_abs_diff,axis=1)
[54]:
?sns.scatterplot
Signature:
sns.scatterplot(
    data=None,
    *,
    x=None,
    y=None,
    hue=None,
    size=None,
    style=None,
    palette=None,
    hue_order=None,
    hue_norm=None,
    sizes=None,
    size_order=None,
    size_norm=None,
    markers=True,
    style_order=None,
    legend='auto',
    ax=None,
    **kwargs,
)
Docstring:
Draw a scatter plot with possibility of several semantic groupings.

The relationship between `x` and `y` can be shown for different subsets
of the data using the `hue`, `size`, and `style` parameters. These
parameters control what visual semantics are used to identify the different
subsets. It is possible to show up to three dimensions independently by
using all three semantic types, but this style of plot can be hard to
interpret and is often ineffective. Using redundant semantics (i.e. both
`hue` and `style` for the same variable) can be helpful for making
graphics more accessible.

See the :ref:`tutorial <relational_tutorial>` for more information.

The default treatment of the `hue` (and to a lesser extent, `size`)
semantic, if present, depends on whether the variable is inferred to
represent "numeric" or "categorical" data. In particular, numeric variables
are represented with a sequential colormap by default, and the legend
entries show regular "ticks" with values that may or may not exist in the
data. This behavior can be controlled through various parameters, as
described and illustrated below.

Parameters
----------
data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
    Input data structure. Either a long-form collection of vectors that can be
    assigned to named variables or a wide-form dataset that will be internally
    reshaped.
x, y : vectors or keys in ``data``
    Variables that specify positions on the x and y axes.
hue : vector or key in `data`
    Grouping variable that will produce points with different colors.
    Can be either categorical or numeric, although color mapping will
    behave differently in latter case.
size : vector or key in `data`
    Grouping variable that will produce points with different sizes.
    Can be either categorical or numeric, although size mapping will
    behave differently in latter case.
style : vector or key in `data`
    Grouping variable that will produce points with different markers.
    Can have a numeric dtype but will always be treated as categorical.
palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
    Method for choosing the colors to use when mapping the ``hue`` semantic.
    String values are passed to :func:`color_palette`. List or dict values
    imply categorical mapping, while a colormap object implies numeric mapping.
hue_order : vector of strings
    Specify the order of processing and plotting for categorical levels of the
    ``hue`` semantic.
hue_norm : tuple or :class:`matplotlib.colors.Normalize`
    Either a pair of values that set the normalization range in data units
    or an object that will map from data units into a [0, 1] interval. Usage
    implies numeric mapping.
sizes : list, dict, or tuple
    An object that determines how sizes are chosen when `size` is used.
    List or dict arguments should provide a size for each unique data value,
    which forces a categorical interpretation. The argument may also be a
    min, max tuple.
size_order : list
    Specified order for appearance of the `size` variable levels,
    otherwise they are determined from the data. Not relevant when the
    `size` variable is numeric.
size_norm : tuple or Normalize object
    Normalization in data units for scaling plot objects when the
    `size` variable is numeric.
markers : boolean, list, or dictionary
    Object determining how to draw the markers for different levels of the
    `style` variable. Setting to `True` will use default markers, or
    you can pass a list of markers or a dictionary mapping levels of the
    `style` variable to markers. Setting to `False` will draw
    marker-less lines.  Markers are specified as in matplotlib.
style_order : list
    Specified order for appearance of the `style` variable levels
    otherwise they are determined from the data. Not relevant when the
    `style` variable is numeric.
legend : "auto", "brief", "full", or False
    How to draw the legend. If "brief", numeric `hue` and `size`
    variables will be represented with a sample of evenly spaced values.
    If "full", every group will get an entry in the legend. If "auto",
    choose between brief or full representation based on number of levels.
    If `False`, no legend data is added and no legend is drawn.
ax : :class:`matplotlib.axes.Axes`
    Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
    internally.
kwargs : key, value mappings
    Other keyword arguments are passed down to
    :meth:`matplotlib.axes.Axes.scatter`.

Returns
-------
:class:`matplotlib.axes.Axes`
    The matplotlib axes containing the plot.

See Also
--------
lineplot : Plot data using lines.
stripplot : Plot a categorical scatter with jitter.
swarmplot : Plot a categorical scatter with non-overlapping points.

Examples
--------

.. include:: ../docstrings/scatterplot.rst
File:      ~/.conda/envs/jupyterlab_2024/lib/python3.8/site-packages/seaborn/relational.py
Type:      function

[58]:
?sns.scatterplot
Signature:
sns.scatterplot(
    data=None,
    *,
    x=None,
    y=None,
    hue=None,
    size=None,
    style=None,
    palette=None,
    hue_order=None,
    hue_norm=None,
    sizes=None,
    size_order=None,
    size_norm=None,
    markers=True,
    style_order=None,
    legend='auto',
    ax=None,
    **kwargs,
)
Docstring:
Draw a scatter plot with possibility of several semantic groupings.

The relationship between `x` and `y` can be shown for different subsets
of the data using the `hue`, `size`, and `style` parameters. These
parameters control what visual semantics are used to identify the different
subsets. It is possible to show up to three dimensions independently by
using all three semantic types, but this style of plot can be hard to
interpret and is often ineffective. Using redundant semantics (i.e. both
`hue` and `style` for the same variable) can be helpful for making
graphics more accessible.

See the :ref:`tutorial <relational_tutorial>` for more information.

The default treatment of the `hue` (and to a lesser extent, `size`)
semantic, if present, depends on whether the variable is inferred to
represent "numeric" or "categorical" data. In particular, numeric variables
are represented with a sequential colormap by default, and the legend
entries show regular "ticks" with values that may or may not exist in the
data. This behavior can be controlled through various parameters, as
described and illustrated below.

Parameters
----------
data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence
    Input data structure. Either a long-form collection of vectors that can be
    assigned to named variables or a wide-form dataset that will be internally
    reshaped.
x, y : vectors or keys in ``data``
    Variables that specify positions on the x and y axes.
hue : vector or key in `data`
    Grouping variable that will produce points with different colors.
    Can be either categorical or numeric, although color mapping will
    behave differently in latter case.
size : vector or key in `data`
    Grouping variable that will produce points with different sizes.
    Can be either categorical or numeric, although size mapping will
    behave differently in latter case.
style : vector or key in `data`
    Grouping variable that will produce points with different markers.
    Can have a numeric dtype but will always be treated as categorical.
palette : string, list, dict, or :class:`matplotlib.colors.Colormap`
    Method for choosing the colors to use when mapping the ``hue`` semantic.
    String values are passed to :func:`color_palette`. List or dict values
    imply categorical mapping, while a colormap object implies numeric mapping.
hue_order : vector of strings
    Specify the order of processing and plotting for categorical levels of the
    ``hue`` semantic.
hue_norm : tuple or :class:`matplotlib.colors.Normalize`
    Either a pair of values that set the normalization range in data units
    or an object that will map from data units into a [0, 1] interval. Usage
    implies numeric mapping.
sizes : list, dict, or tuple
    An object that determines how sizes are chosen when `size` is used.
    List or dict arguments should provide a size for each unique data value,
    which forces a categorical interpretation. The argument may also be a
    min, max tuple.
size_order : list
    Specified order for appearance of the `size` variable levels,
    otherwise they are determined from the data. Not relevant when the
    `size` variable is numeric.
size_norm : tuple or Normalize object
    Normalization in data units for scaling plot objects when the
    `size` variable is numeric.
markers : boolean, list, or dictionary
    Object determining how to draw the markers for different levels of the
    `style` variable. Setting to `True` will use default markers, or
    you can pass a list of markers or a dictionary mapping levels of the
    `style` variable to markers. Setting to `False` will draw
    marker-less lines.  Markers are specified as in matplotlib.
style_order : list
    Specified order for appearance of the `style` variable levels
    otherwise they are determined from the data. Not relevant when the
    `style` variable is numeric.
legend : "auto", "brief", "full", or False
    How to draw the legend. If "brief", numeric `hue` and `size`
    variables will be represented with a sample of evenly spaced values.
    If "full", every group will get an entry in the legend. If "auto",
    choose between brief or full representation based on number of levels.
    If `False`, no legend data is added and no legend is drawn.
ax : :class:`matplotlib.axes.Axes`
    Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca`
    internally.
kwargs : key, value mappings
    Other keyword arguments are passed down to
    :meth:`matplotlib.axes.Axes.scatter`.

Returns
-------
:class:`matplotlib.axes.Axes`
    The matplotlib axes containing the plot.

See Also
--------
lineplot : Plot data using lines.
stripplot : Plot a categorical scatter with jitter.
swarmplot : Plot a categorical scatter with non-overlapping points.

Examples
--------

.. include:: ../docstrings/scatterplot.rst
File:      ~/.conda/envs/jupyterlab_2024/lib/python3.8/site-packages/seaborn/relational.py
Type:      function

[57]:
sns.scatterplot(data=df.sample(n=1000000),x="abs_diff",y="delta_CFD",hue="activity_diff",palette="RdBu_r",hue_norm=(-100,100))
[57]:
<Axes: xlabel='abs_diff', ylabel='delta_CFD'>
../_images/jupyter_notebooks_suprising_results_57_1.png
[60]:
sns.scatterplot(data=df.sample(n=10000),x="abs_diff",y="delta_CFD",hue="activity_diff",palette="RdBu_r",hue_norm=(-100,100))
[60]:
<Axes: xlabel='abs_diff', ylabel='delta_CFD'>
../_images/jupyter_notebooks_suprising_results_58_1.png
[62]:
sns.distplot(df.abs_diff)
/lsf_tmp/248986378_1.tmpdir/ipykernel_1223482/864164568.py:1: UserWarning:

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df.abs_diff)
[62]:
<Axes: xlabel='abs_diff', ylabel='Density'>
../_images/jupyter_notebooks_suprising_results_59_2.png
[64]:
df[df.abs_diff<-50]
[64]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD abs_diff
190 AAVS1.s14 3 TTGGCCACTAGGGACAGGTTTGG 121.891716 TTGGCCACTAGGGACAAGTTTGG 170.838335 4 48.946619 1.401558 [1, 2, 19] [1, 2, 17, 19] 17 17 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG TTGGCCACTAGGGACAAGTTTGG 48.946619 GGGGCCACTAGGGACAGGATNGG TTGGCCACTAGGGACAGGTTTGG -2.284382 -51.231001
491 AAVS1.s14 1 GGGGCCACTAGGGGCAGGATCGG 109.546162 GGGACCACTAGGGGCAGGATCGG 157.668336 2 48.122174 1.439287 [14] [4, 14] 4 4 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGGCAGGATCGG 48.122174 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGGCAGGATCGG -4.666667 -52.788841
806 AAVS1.s14 1 GGGGCCACTAGGGACAGGTTCGG 105.462750 GGGACCACTAGGGACAGGTTCGG 193.097697 2 87.634947 1.830956 [19] [4, 19] 4 4 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG 87.634947 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGTTCGG -5.384615 -93.019562
807 AAVS1.s14 1 GGGGCCACTAGGGACAGGTTCGG 105.462750 GTGGCCACTAGGGACAGGTTCGG 150.083680 2 44.620930 1.423097 [19] [2, 19] 2 2 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GTGGCCACTAGGGACAGGTTCGG 44.620930 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGTTCGG -19.580420 -64.201349
1135 AAVS1.s14 2 AGGGCCACTAGGGACAGGTTTGG 102.230726 AAGGCCACTAGGGACAGGTTTGG 150.553124 3 48.322398 1.472680 [1, 19] [1, 2, 19] 2 2 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG AAGGCCACTAGGGACAGGTTTGG 48.322398 GGGGCCACTAGGGACAGGATNGG AGGGCCACTAGGGACAGGTTTGG -7.455621 -55.778019
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13541717 TRAC.s1 5 GTAAGAATTCTGAATATCTATGG 0.000000 GTAAGAATTCTGAATGTCTATGG 0.104366 6 0.104366 inf [3, 6, 7, 13, 20] [3, 6, 7, 13, 16, 20] 16 16 A G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTAAGAATTCTGAATGTCTATGG 0.104366 GTCAGGGTTCTGGATATCTGNGG GTAAGAATTCTGAATATCTATGG -60.576923 -60.681289
13541725 TRAC.s1 5 GTAAGAATTCTGGATATTTATGG 0.000000 GTAAGAATTCTGGATTTTTATGG 0.000000 6 0.000000 NaN [3, 6, 7, 18, 20] [3, 6, 7, 16, 18, 20] 16 16 A T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTAAGAATTCTGGATTTTTATGG 0.000000 GTCAGGGTTCTGGATATCTGNGG GTAAGAATTCTGGATATTTATGG -52.232143 -52.232143
13549448 TRAC.s1 5 GTCAAACTTCTAGATATCTATGG 0.000000 GTCAAACTTCTAGAGATCTATGG 1.880659 6 1.880659 inf [5, 6, 7, 12, 20] [5, 6, 7, 12, 15, 20] 15 15 T G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAAACTTCTAGAGATCTATGG 1.880659 GTCAGGGTTCTGGATATCTGNGG GTCAAACTTCTAGATATCTATGG -49.528646 -51.409304
13549538 TRAC.s1 5 GTCAAAGGTCTGAATATCTATGG 0.000000 GTCAAAGGTCTGAATATCTATGT 0.000000 6 0.000000 NaN [5, 6, 8, 13, 20] [5, 6, 8, 13, 20, 23] 23 23 G T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAAAGGTCTGAATATCTATGT 0.000000 GTCAGGGTTCTGGATATCTGNGG GTCAAAGGTCTGAATATCTATGG -54.112903 -54.112903
13555791 TRAC.s1 5 GTCACAGTCATGGATATCTATGG 0.000000 GTCACAGTCATGGATTTCTATGG 0.000000 6 0.000000 NaN [5, 6, 9, 10, 20] [5, 6, 9, 10, 16, 20] 16 16 A T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCACAGTCATGGATTTCTATGG 0.000000 GTCAGGGTTCTGGATATCTGNGG GTCACAGTCATGGATATCTATGG -58.928571 -58.928571

28295 rows × 24 columns

[63]:
df[df.abs_diff>50]
[63]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD abs_diff
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769 96.769231
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378 82.377622
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485 70.177515
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692 93.942308
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231 79.230769
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10764948 TRAC.s1 1 GTCAGGATTCTGGATATCTGAGG 55.040627 GTCAGGACTCTGGATATCTGAGG 4.909804 2 -50.130823 0.089203 [7] [7, 8] 8 8 T C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGGACTCTGGATATCTGAGG 50.130823 GTCAGGGTTCTGGATATCTGNGG GTCAGGATTCTGGATATCTGAGG 0.000000 50.130823
10764995 TRAC.s1 1 GGCAGGGTTCTGGATATCTGGGG 54.406738 GGCAGAGTTCTGGATATCTGGGG 3.994802 2 -50.411937 0.073425 [2] [2, 6] 6 6 G A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GGCAGAGTTCTGGATATCTGGGG 50.411937 GTCAGGGTTCTGGATATCTGNGG GGCAGGGTTCTGGATATCTGGGG 0.000000 50.411937
10765049 TRAC.s1 1 GTCAAGGTTCTGGATATCTGAGG 54.077573 GTCAAAGTTCTGGATATCTGAGG 2.750119 2 -51.327454 0.050855 [5] [5, 6] 6 6 G A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAAAGTTCTGGATATCTGAGG 51.327454 GTCAGGGTTCTGGATATCTGNGG GTCAAGGTTCTGGATATCTGAGG 0.000000 51.327454
10765254 TRAC.s1 1 GTCAAGGTTCTGGATATCTGGGG 53.848331 GTCAAAGTTCTGGATATCTGGGG 2.514930 2 -51.333401 0.046704 [5] [5, 6] 6 6 G A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAAAGTTCTGGATATCTGGGG 51.333401 GTCAGGGTTCTGGATATCTGNGG GTCAAGGTTCTGGATATCTGGGG 0.000000 51.333401
10765458 TRAC.s1 2 GTCAGGGTACTGGATATCTATGG 52.780553 GTCAGAGTACTGGATATCTATGG 0.972426 3 -51.808126 0.018424 [9, 20] [6, 9, 20] 6 6 G A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG GTCAGAGTACTGGATATCTATGG 51.808126 GTCAGGGTTCTGGATATCTGNGG GTCAGGGTACTGGATATCTATGG 0.000000 51.808126

15830 rows × 24 columns

[ ]:

[ ]:

[25]:
df2[df2.site=="LAG3.s9"].groupby("MM").control_counts_rep1.describe().head()
[25]:
count mean std min 25% 50% 75% max
MM
0 4.0 2748.750000 3638.893639 445.0 936.25 1184.5 2997.00 8181.0
1 262.0 433.305344 515.572321 5.0 110.75 202.5 490.75 3042.0
2 7298.0 73.103453 92.585903 1.0 13.25 35.0 96.00 1041.0
3 74356.0 20.551052 22.955063 1.0 5.00 13.0 28.00 340.0
4 282869.0 10.850659 11.548069 1.0 3.00 7.0 15.00 151.0
[36]:
df.head()
[36]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
7 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGAGACAGGTTCGG 20.008296 3 -173.089401 0.103617 [4, 19] [4, 12, 19] 12 12 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGAGACAGGTTCGG 173.089401 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -3.230769
9 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGATCACTAGGGACAGGTTCGG 8.696374 3 -184.401323 0.045036 [4, 19] [4, 5, 19] 5 5 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGATCACTAGGGACAGGTTCGG 184.401323 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -17.622378
10 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCACTAGTGACAGGTTCGG 2.381133 3 -190.716563 0.012331 [4, 19] [4, 12, 19] 12 12 G T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGTGACAGGTTCGG 190.716563 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -29.822485
11 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGACCATTAGGGACAGGTTCGG 4.460667 3 -188.637030 0.023101 [4, 19] [4, 8, 19] 8 8 C T GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCATTAGGGACAGGTTCGG 188.637030 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -6.057692
12 AAVS1.s14 2 GGGACCACTAGGGACAGGTTCGG 193.097697 GGGAACACTAGGGACAGGTTCGG 2.885356 3 -190.212341 0.014942 [4, 19] [4, 5, 19] 5 5 C A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGAACACTAGGGACAGGTTCGG 190.212341 GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG -20.769231
[66]:
df2[df2.control_target_seq=="TTGGCCACTAGGGACAGGTTTGG"]
[66]:
Unnamed: 0 site MM control_target_seq Cas9_counts_rep1 Cas9_counts_rep2 Cas9_counts_scaled_rep1 Cas9_counts_scaled_rep2 control_counts_rep1 control_counts_rep2 control_counts_scaled_rep1 control_counts_scaled_rep2 log2FC_rep1 log2FC_rep2 delta_log2FC FC_rep1 FC_rep2 average_log2FC spacer PAM relative_activity_percent
63 83622 AAVS1.s14 3 TTGGCCACTAGGGACAGGTTTGG 1176.0 863.0 98.697454 108.83421 52.0 52.0 1.815501 1.815501 5.79087 5.929488 -0.138618 54.363757 59.947206 5.861843 TTGGCCACTAGGGACAGGTT TGG 121.891716
[65]:
df2[df2.control_target_seq=="TTGGCCACTAGGGACAAGTTTGG"]
[65]:
Unnamed: 0 site MM control_target_seq Cas9_counts_rep1 Cas9_counts_rep2 Cas9_counts_scaled_rep1 Cas9_counts_scaled_rep2 control_counts_rep1 control_counts_rep2 control_counts_scaled_rep1 control_counts_scaled_rep2 log2FC_rep1 log2FC_rep2 delta_log2FC FC_rep1 FC_rep2 average_log2FC spacer PAM relative_activity_percent
21 406041 AAVS1.s14 4 TTGGCCACTAGGGACAAGTTTGG 176.0 149.0 14.771048 18.790611 6.0 6.0 0.209481 0.209481 6.160126 6.503044 -0.342918 70.512628 89.70084 6.34175 TTGGCCACTAGGGACAAGTT TGG 170.838335
[20]:
df2[df2.control_target_seq=="GAAGGTTGAGACCCTAGAGGAGG"]
[20]:
Unnamed: 0 site MM control_target_seq Cas9_counts_rep1 Cas9_counts_rep2 Cas9_counts_scaled_rep1 Cas9_counts_scaled_rep2 control_counts_rep1 control_counts_rep2 control_counts_scaled_rep1 control_counts_scaled_rep2 log2FC_rep1 log2FC_rep2 delta_log2FC FC_rep1 FC_rep2 average_log2FC spacer PAM relative_activity_percent
13401809 13438688 LAG3.s9 3 GAAGGTTGAGACCCTAGAGGAGG 30.0 31.0 1.845778 1.740488 6.0 6.0 0.198311 0.198311 3.36562 3.289325 0.076295 9.307481 8.776546 3.327977 GAAGGTTGAGACCCTAGAGG AGG 11.791731
[27]:
df[df.activity_diff>0]
[27]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
190 AAVS1.s14 3 TTGGCCACTAGGGACAGGTTTGG 121.891716 TTGGCCACTAGGGACAAGTTTGG 170.838335 4 48.946619 1.401558 [1, 2, 19] [1, 2, 17, 19] 17 17 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG TTGGCCACTAGGGACAAGTTTGG 48.946619 GGGGCCACTAGGGACAGGATNGG TTGGCCACTAGGGACAGGTTTGG -2.284382
350 AAVS1.s14 1 GGGGCCACTATGGACAGGATCGG 110.700418 AGGGCCACTATGGACAGGATCGG 131.758582 2 21.058165 1.190227 [11] [1, 11] 1 1 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG AGGGCCACTATGGACAGGATCGG 21.058165 GGGGCCACTAGGGACAGGATNGG GGGGCCACTATGGACAGGATCGG -3.846154
491 AAVS1.s14 1 GGGGCCACTAGGGGCAGGATCGG 109.546162 GGGACCACTAGGGGCAGGATCGG 157.668336 2 48.122174 1.439287 [14] [4, 14] 4 4 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGGCAGGATCGG 48.122174 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGGCAGGATCGG -4.666667
494 AAVS1.s14 1 GGGGCCACTAGGGGCAGGATCGG 109.546162 AGGGCCACTAGGGGCAGGATCGG 110.235641 2 0.689479 1.006294 [14] [1, 14] 1 1 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG AGGGCCACTAGGGGCAGGATCGG 0.689479 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGGCAGGATCGG -4.666667
806 AAVS1.s14 1 GGGGCCACTAGGGACAGGTTCGG 105.462750 GGGACCACTAGGGACAGGTTCGG 193.097697 2 87.634947 1.830956 [19] [4, 19] 4 4 G A GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGATNGG GGGACCACTAGGGACAGGTTCGG 87.634947 GGGGCCACTAGGGACAGGATNGG GGGGCCACTAGGGACAGGTTCGG -5.384615
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13679294 TRAC.s1 5 TTTCGGGTTCTGGTTATCTATGG 0.000000 TCTCGGGTTCTGGTTATCTATGG 0.292225 6 0.292225 inf [1, 3, 4, 14, 20] [1, 2, 3, 4, 14, 20] 2 2 T C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG TCTCGGGTTCTGGTTATCTATGG 0.292225 GTCAGGGTTCTGGATATCTGNGG TTTCGGGTTCTGGTTATCTATGG -1.733193
13679308 TRAC.s1 5 TTTGAGGTTCAGGATATCTGTGG 0.000000 TTTGAGGTTCAGGATATCTGTCG 0.253855 6 0.253855 inf [1, 3, 4, 5, 11] [1, 3, 4, 5, 11, 22] 22 22 G C GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG TTTGAGGTTCAGGATATCTGTCG 0.253855 GTCAGGGTTCTGGATATCTGNGG TTTGAGGTTCAGGATATCTGTGG -24.937221
13679339 TRAC.s1 5 TTTTGAGTTCTGGATTTCTGTGG 0.000000 TTTTGAGTTATGGATTTCTGTGG 0.075467 6 0.075467 inf [1, 3, 4, 6, 16] [1, 3, 4, 6, 10, 16] 10 10 C A GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG TTTTGAGTTATGGATTTCTGTGG 0.075467 GTCAGGGTTCTGGATATCTGNGG TTTTGAGTTCTGGATTTCTGTGG 0.000000
13679352 TRAC.s1 5 TTTTGGGTTCTGGATATTGGTGG 0.000000 TTTTGGGTTCTGTATATTGGTGG 0.058634 6 0.058634 inf [1, 3, 4, 18, 19] [1, 3, 4, 13, 18, 19] 13 13 G T GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG TTTTGGGTTCTGTATATTGGTGG 0.058634 GTCAGGGTTCTGGATATCTGNGG TTTTGGGTTCTGGATATTGGTGG -4.921875
13679354 TRAC.s1 5 TTTTGGGTTCTGGATCTCTATGG 0.000000 TTTTGGGGTCTGGATCTCTATGG 0.063527 6 0.063527 inf [1, 3, 4, 16, 20] [1, 3, 4, 8, 16, 20] 8 8 T G GTCAGGGTTCTGGATATCTGNGG GTCAGGGTTCTGGATATCTGNGG TTTTGGGGTCTGGATCTCTATGG 0.063527 GTCAGGGTTCTGGATATCTGNGG TTTTGGGTTCTGGATCTCTATGG 0.000000

854102 rows × 23 columns

[16]:
df[df.activity_diff<-200]
[16]:
site MM control_target_seq relative_activity_percent variant_seq variant_relative_activity_percent variant_MM activity_diff activity_ratio mismatch_positions variant_mismatch_positions variant_position var_pos REF ALT target_seq realigned_target_seq seq bulged_seq reads on_target off delta_CFD
8023465 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGACCCTAGAGGAGG 11.791731 3 -268.021737 0.042141 [6, 16] [6, 12, 16] 12 12 T C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGACCCTAGAGGAGG 268.021737 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -4.887218
8023467 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTAGAGATCCTAGAGGAGG 10.071949 3 -269.741518 0.035995 [6, 16] [6, 7, 16] 7 7 T A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTAGAGATCCTAGAGGAGG 269.741518 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -11.607143
8023468 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 TAAGGTTGAGATCCTAGAGGAGG 8.386182 3 -271.427285 0.029971 [6, 16] [1, 6, 16] 1 1 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG TAAGGTTGAGATCCTAGAGGAGG 271.427285 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG 0.000000
8023469 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTGGAGATCCTAGAGGAGG 2.095282 3 -277.718186 0.007488 [6, 16] [6, 7, 16] 7 7 T G GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTGGAGATCCTAGAGGAGG 277.718186 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -38.235294
8023470 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GTAGGTTGAGATCCTAGAGGAGG 6.969863 3 -272.843605 0.024909 [6, 16] [2, 6, 16] 2 2 A T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GTAGGTTGAGATCCTAGAGGAGG 272.843605 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -25.324675
8023471 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCCTATAGGAGG 6.797364 3 -273.016104 0.024292 [6, 16] [6, 16, 17] 17 17 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTATAGGAGG 273.016104 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -69.642857
8023472 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTAAGATCCTAGAGGAGG 5.726996 3 -274.086471 0.020467 [6, 16] [6, 8, 16] 8 8 G A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTAAGATCCTAGAGGAGG 274.086471 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG 0.000000
8023473 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCCTACAGGAGG 4.752846 3 -275.060621 0.016986 [6, 16] [6, 16, 17] 17 17 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTACAGGAGG 275.060621 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -71.008403
8023474 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGGGATCCTAGAGGAGG 5.527540 3 -274.285928 0.019754 [6, 16] [6, 9, 16] 9 9 A G GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGGGATCCTAGAGGAGG 274.285928 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -30.952381
8023475 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCCTAAAGGAGG 4.310561 3 -275.502907 0.015405 [6, 16] [6, 16, 17] 17 17 G A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAAAGGAGG 275.502907 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -6.190476
8023476 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGTGATCCTAGAGGAGG 3.880838 3 -275.932630 0.013869 [6, 16] [6, 9, 16] 9 9 A T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGTGATCCTAGAGGAGG 275.932630 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -37.142857
8023477 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GACGGTTGAGATCCTAGAGGAGG 3.815962 3 -275.997506 0.013638 [6, 16] [3, 6, 16] 3 3 A C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GACGGTTGAGATCCTAGAGGAGG 275.997506 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -53.061224
8023478 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATTCTAGAGGAGG 3.707332 3 -276.106135 0.013249 [6, 16] [6, 13, 16] 13 13 C T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATTCTAGAGGAGG 276.106135 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -27.857143
8023479 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GATGGTTGAGATCCTAGAGGAGG 2.777671 3 -277.035796 0.009927 [6, 16] [3, 6, 16] 3 3 A T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GATGGTTGAGATCCTAGAGGAGG 277.035796 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -27.310924
8023480 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGTTTGAGATCCTAGAGGAGG 3.387855 3 -276.425612 0.012108 [6, 16] [5, 6, 16] 5 5 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGTTTGAGATCCTAGAGGAGG 276.425612 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -65.000000
8023481 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCATAGAGGAGG 2.500840 3 -277.312628 0.008938 [6, 16] [6, 14, 16] 14 14 C A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCATAGAGGAGG 277.312628 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -60.357143
8023482 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGACATCCTAGAGGAGG 2.221278 3 -277.592190 0.007938 [6, 16] [6, 10, 16] 10 10 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGACATCCTAGAGGAGG 277.592190 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -55.714286
8023483 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCGTAGAGGAGG 0.664579 3 -279.148888 0.002375 [6, 16] [6, 14, 16] 14 14 C G GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCGTAGAGGAGG 279.148888 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -92.857143
8023484 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCCGAGAGGAGG 1.490701 3 -278.322767 0.005327 [6, 16] [6, 15, 16] 15 15 T G GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCGAGAGGAGG 278.322767 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -88.214286
8023485 LAG3.s9 2 GAAGGTTGAGATCCTAGAGGAGG 279.813468 GAAGGTTGAGATCCTAGACGAGG 1.005184 3 -278.808284 0.003592 [6, 16] [6, 16, 19] 19 19 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGACGAGG 278.808284 GAAGGCTGAGATCCTGGAGGNGG GAAGGTTGAGATCCTAGAGGAGG -51.231527
8023492 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGGAGTCGG 65.892935 3 -203.637962 0.244473 [1, 2] [1, 2, 20] 20 20 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGTCGG 203.637962 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -19.636364
8023493 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGGAGGCAG 24.032526 3 -245.498372 0.089164 [1, 2] [1, 2, 22] 22 22 G A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCAG 245.498372 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -48.484848
8023496 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGATGAGATCCTGGAGGCGG 20.156988 3 -249.373910 0.074785 [1, 2] [1, 2, 6] 6 6 C A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGATGAGATCCTGGAGGCGG 249.373910 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -4.675325
8023497 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTCAGATCCTGGAGGCGG 10.057149 3 -259.473749 0.037314 [1, 2] [1, 2, 8] 8 8 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTCAGATCCTGGAGGCGG 259.473749 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -25.174825
8023498 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCAGGAGGCGG 36.815412 3 -232.715486 0.136591 [1, 2] [1, 2, 15] 15 15 T A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCAGGAGGCGG 232.715486 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -27.559809
8023499 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGGAGGCTG 9.356937 3 -260.173961 0.034716 [1, 2] [1, 2, 22] 22 22 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCTG 260.173961 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -62.904368
8023500 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGTAGGCGG 19.593457 3 -249.937441 0.072695 [1, 2] [1, 2, 17] 17 17 G T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGTAGGCGG 249.937441 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -49.090909
8023501 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCCGGAGGCGG 17.578298 3 -251.952600 0.065218 [1, 2] [1, 2, 15] 15 15 T C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCCGGAGGCGG 251.952600 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -47.603306
8023502 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGGAGGCCG 1.924251 3 -267.606647 0.007139 [1, 2] [1, 2, 22] 22 22 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCCG 267.606647 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -58.441558
8023504 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATGCTGGAGGCGG 5.504538 3 -264.026360 0.020423 [1, 2] [1, 2, 13] 13 13 C G GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATGCTGGAGGCGG 264.026360 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -56.528926
8023505 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCATGGAGGCGG 15.847884 3 -253.683014 0.058798 [1, 2] [1, 2, 14] 14 14 C A GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCATGGAGGCGG 253.683014 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -42.545455
8023506 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGAGATCCTGGAGGCGC 8.342181 3 -261.188717 0.030951 [1, 2] [1, 2, 23] 23 23 G C GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGC 261.188717 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -64.000000
8023507 LAG3.s9 2 ATAGGCTGAGATCCTGGAGGCGG 269.530898 ATAGGCTGTGATCCTGGAGGCGG 3.255658 3 -266.275240 0.012079 [1, 2] [1, 2, 9] 9 9 A T GAAGGCTGAGATCCTGGAGGNGG GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGTGATCCTGGAGGCGG 266.275240 GAAGGCTGAGATCCTGGAGGNGG ATAGGCTGAGATCCTGGAGGCGG -26.181818
[3]:
import pandas as pd
[4]:
a = pd.read_csv("/home/yli11/dirs/MBNL1_Project_all_data_needed/yichao/bam_all/isoquant_bam_yli11_2025-04-09_b6e6d351c56d_isoQuant_isoseq_default_pacbio/OUT/40283.tsv",sep="\t",header=None)
[5]:
a[5].unique()
[5]:
array(['inconsistent_ambiguous', 'inconsistent', 'unique', 'ambiguous',
       'unique_minor_difference', 'inconsistent_non_intronic'],
      dtype=object)
[119]:
for s,d in a.groupby(5):
    d[[0]].to_csv(f"/home/yli11/dirs/MBNL1_Project_all_data_needed/yichao/bam_all/ISOseq_bam/jj40283.{s}.list",index=False,header=False)
[122]:
run_lsf.py -f bam.list --read_list BCL11A.ambiguous.read.list -p bam_read_list -j BCL11A.ambiguous.bam
run_lsf.py -f bam.list --read_list BCL11A.unique.read.list -p bam_read_list -j BCL11A.unique.bam

[122]:
0 1 2 3 4 5 6 7 8
0 a7cd1d6f-6220-4fe6-bd0b-1f5ab9086462 chr2 . JingJing40283.chr2.nnic ENSG00000119866.22 inconsistent_ambiguous extra_intron_novel:60339735-60460293,extra_int... 60188485-60188490,60188736-60188746,60339713-6... gene_assignment=inconsistent; PolyA=False; Cla...
1 d3e05f8e-f3f4-420f-b8a6-879972ad1ec7 chr2 + JingJing40283.chr2.nnic ENSG00000119866.22 inconsistent_ambiguous extra_intron_novel:60266600-60460303,antisense 60266577-60266599,60460304-60460657 gene_assignment=inconsistent; PolyA=False; Cla...
2 be4383cd-e00a-4da8-b3c3-c98825936ee2 chr2 + JingJing40283.chr2.nnic ENSG00000119866.22 inconsistent_ambiguous extra_intron_5:60268655-60284735,extra_intron_... 60268649-60268654,60284736-60284743,60448376-6... gene_assignment=inconsistent; PolyA=False; Cla...
3 b6c13913-70b6-4833-912b-574e5b200a6e chr2 + JingJing40283.chr2.nnic ENSG00000119866.22 inconsistent_ambiguous extra_intron_novel:60273789-60460303,antisense 60273766-60273788,60460304-60460671 gene_assignment=inconsistent; PolyA=False; Cla...
4 39d36047-2077-4861-80b6-5d1387852cd7 chr2 + JingJing40283.chr2.nnic ENSG00000119866.22 inconsistent_ambiguous extra_intron_novel:60281594-60460362,antisense 60281576-60281593,60460363-60460671 gene_assignment=inconsistent; PolyA=False; Cla...
[124]:
for s,d in a.groupby(5):
    print (s)
    print (d.sample(n=100)[6].unique())
ambiguous
['mono_exonic' 'ism_5' 'ism_3,tss_match:-11' 'ism_5,exon_elongation_5:3'
 'mono_exonic,antisense']
inconsistent
['terminal_exon_shift_known:60546301-60550867'
 'alternative_structure_novel:60468812-60553532'
 'alternative_structure_novel:60460366-60553937,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60460390-60553524,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60468834-60490508,tes_match_precise:0,correct_polya_site_left:60458603'
 'exon_gain_novel:60468834-60469801,60469865-60538380,tss_match:-3'
 'alternative_structure_novel:60461289-60553537,antisense'
 'terminal_exon_shift_known:60546301-60550867,exon_elongation_3:3'
 'extra_intron_novel:60461400-60461742,terminal_exon_shift_known:60546301-60550867'
 'terminal_exon_shift_known:60546301-60552497'
 'terminal_exon_shift_known:60546301-60552497,exon_elongation_3:4'
 'alternative_structure_novel:60460390-60553516,alternative_polya_site_left:60458654'
 'extra_intron_novel:60553460-60553515'
 'extra_intron_novel:60459027-60459109,tes_match_precise:0,correct_polya_site_left:60458603'
 'terminal_exon_shift_known:60546301-60550867,exon_elongation_3:4'
 'extra_intron_novel:60553394-60554839,extra_intron_5:60554966-60555025'
 'alternative_structure_novel:60460341-60553559,tes_match_precise:0,correct_polya_site_left:60458603'
 'alternative_structure_novel:60460338-60553557,tes_match_precise:0,correct_polya_site_left:60458603'
 'extra_intron_novel:60459266-60459318,tes_match_precise:0,correct_polya_site_left:60458603'
 'exon_gain_novel:60468834-60502222,60502351-60538380,terminal_exon_shift_known:60546301-60550867'
 'extra_intron_novel:60460346-60461830,tes_match_precise:0,correct_polya_site_left:60458603'
 'extra_intron_novel:60461268-60461619,terminal_exon_shift_known:60546301-60550867'
 'alternative_structure_novel:60461915-60553539'
 'extra_intron_novel:60459639-60460318,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60519168-60538380,tss_match:-5'
 'alternative_structure_novel:60460390-60553518,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60468783-60553553'
 'alternative_structure_novel:60460630-60546213,extra_intron_novel:60553460-60553515'
 'extra_intron_novel:60461394-60461606'
 'extra_intron_novel:60460381-60461809,major_exon_elongation_5:59'
 'extra_intron_novel:60461394-60461590,extra_intron_novel:60553460-60553515'
 'extra_intron_novel:60460742-60461833,terminal_exon_shift_known:60546301-60550867'
 'extra_intron_novel:60460328-60462130,alternative_structure_novel:60462425-60490508,tes_match_precise:0,correct_polya_site_left:60458603'
 'intron_alternation_novel:60546124-60553475,tss_match_precise:0'
 'alternative_structure_novel:60460283-60538840,tes_match_precise:0,correct_polya_site_left:60458603'
 'extra_intron_novel:60460464-60462341,tss_match:-6'
 'alternative_structure_novel:60461282-60553634,antisense'
 'alternative_structure_novel:60460333-60553292,tes_match_precise:0,major_exon_elongation_5:142,correct_polya_site_left:60458603'
 'alternative_structure_novel:60461810-60553634'
 'alt_acceptor_site_novel:60460717-60468731,terminal_exon_shift_known:60546301-60550867'
 'extra_intron_novel:60553454-60553538,extra_intron_5:60554036-60554244,extra_intron_5:60553809-60553913'
 'extra_intron_novel:60459753-60460246,alternative_structure_novel:60460341-60553529,tes_match_precise:0,correct_polya_site_left:60458603'
 'exon_gain_novel:60468834-60508605,60508825-60538380,terminal_exon_shift_known:60546301-60550867'
 'alternative_structure_novel:60487482-60508605,60508825-60538380'
 'alternative_structure_novel:60461655-60537387,extra_intron_known:60553394-60553877'
 'alternative_structure_novel:60460221-60553731,extra_intron_3:60553957-60553996,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'extra_intron_novel:60460327-60460383,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60475279-60478163,60478240-60538380,tss_match:-2'
 'extra_intron_novel:60460354-60461803,tes_match_precise:0,correct_polya_site_left:60458603'
 'alternative_structure_novel:60460390-60553553,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60460379-60553514,tss_match_precise:0,correct_polya_site_left:60458603,antisense'
 'alternative_structure_novel:60468812-60553552']
inconsistent_ambiguous
['alternative_structure_novel:60460845-60553277,tss_match:-4'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-2'
 'alternative_structure_novel:60461190-60553305,tes_match:-1,antisense'
 'extra_intron_novel:60460154-60460190,alternative_polya_site_left:60459264,antisense'
 'mono_exonic,alternative_polya_site_left:60459262'
 'alternative_structure_novel:60468812-60553521'
 'extra_intron_novel:60460918-60461110,antisense'
 'incomplete_intron_retention_3:60538550-60545970,extra_intron_novel:60553454-60553538'
 'mono_exonic,alternative_polya_site_left:60460285'
 'incomplete_intron_retention_3:60538550-60545970'
 'alternative_structure_novel:60460656-60523470'
 'incomplete_intron_retention_5:60546301-60553215,tes_match:-4,antisense'
 'alternative_structure_novel:60462425-60518469'
 'alternative_structure_novel:60461383-60553336,extra_intron_novel:60553367-60553877'
 'alternative_structure_novel:60461099-60553317,tes_match:-1,antisense'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-4'
 'extra_intron_novel:60460989-60461032'
 'alternative_structure_novel:60461110-60553788,antisense'
 'alternative_structure_novel:60461199-60553313,tss_match:-7'
 'alternative_structure_novel:60460858-60553430,tes_match:-6,antisense'
 'extra_intron_novel:60461220-60461744'
 'extra_intron_novel:60460711-60461092'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-5'
 'alternative_structure_novel:60461347-60553344,tes_match:-7,antisense'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-3'
 'alternative_structure_novel:60461319-60553310,tes_match:-7,antisense'
 'incomplete_intron_retention_3:60538550-60545970,tss_match_precise:0'
 'incomplete_intron_retention_5:60546301-60553215,tes_match:-8,antisense'
 'alternative_structure_novel:60462153-60553238,tes_match:-4,antisense'
 'alternative_structure_novel:60468834-60531079'
 'extra_intron_novel:60460337-60460372'
 'alternative_structure_novel:60462425-60531079'
 'incomplete_intron_retention_5:60546301-60553215,tes_match:-6,antisense'
 'alternative_structure_novel:60468847-60545970,extra_intron_novel:60553454-60553538'
 'alternative_structure_novel:60460938-60553292,tss_match:-14'
 'alternative_structure_novel:60461285-60553285,tes_match:-3,antisense'
 'mono_exonic,alternative_polya_site_left:60459278'
 'alternative_structure_novel:60460587-60462602'
 'alternative_structure_novel:60460908-60550877'
 'mono_exonic,alternative_polya_site_left:60458285'
 'alternative_structure_novel:60468834-60516426,60516504-60553535'
 'mono_exonic,alternative_polya_site_left:60459258'
 'alternative_structure_novel:60461052-60553293,tes_match:-3,antisense'
 'alternative_structure_novel:60462425-60553285,tss_match:-1'
 'alternative_structure_novel:60461345-60553315,tes_match:-2,antisense'
 'alternative_structure_novel:60461208-60553215,tss_match:-4'
 'alternative_structure_novel:60461293-60553285,tss_match:-3'
 'alternative_structure_novel:60460630-60546213,tss_match:-6'
 'incomplete_intron_retention:60462425-60468731'
 'mono_exonic,alternative_polya_site_left:60460462'
 'alt_donor_site_novel:60468834-60538039,tss_match:-11'
 'mono_exonic,alternative_polya_site_left:60458666'
 'alternative_structure_novel:60460878-60553372,tes_match:-3,antisense'
 'mono_exonic,alternative_polya_site_left:60460289'
 'extra_intron_novel:60460920-60461113,extra_intron_novel:60461220-60461792'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-6'
 'incomplete_intron_retention_3:60538550-60545970,tss_match:-1'
 'alternative_structure_novel:60460664-60516142'
 'alternative_structure_novel:60461241-60553237,tes_match:-5,antisense'
 'mono_exonic,alternative_polya_site_left:60458654'
 'alternative_structure_novel:60461287-60553287,tes_match:-1,antisense'
 'alternative_structure_novel:60460878-60553227,tes_match:-7,antisense'
 'incomplete_intron_retention_5:60546301-60553215,tes_match:-3,antisense'
 'alternative_structure_novel:60461254-60553277,tss_match:-9'
 'alternative_structure_novel:60461157-60553301,tes_match:-1,antisense'
 'alternative_structure_novel:60462143-60554336'
 'alternative_structure_novel:60460842-60553238,tes_match:-3,antisense'
 'extra_intron_novel:60460549-60460576'
 'alternative_structure_novel:60460759-60541333,antisense'
 'alternative_structure_novel:60461208-60553215,tss_match:-10'
 'extra_intron_novel:60460842-60461189,antisense'
 'alternative_structure_novel:60462425-60516426'
 'alternative_structure_novel:60461295-60553280,tss_match:-20'
 'extra_intron_novel:60460858-60462293,antisense'
 'alternative_structure_novel:60461133-60553326'
 'incomplete_intron_retention_5:60546301-60553215,tes_match:-11,antisense'
 'alternative_structure_novel:60461352-60553317,tss_match:-18'
 'alternative_structure_novel:60461208-60553393,tss_match:-4'
 'alternative_structure_novel:60461590-60553346,tes_match:-4,antisense'
 'alternative_structure_novel:60461295-60553295,tss_match:-3'
 'alternative_structure_novel:60462425-60518469,60519168-60530371'
 'extra_intron_novel:60461064-60461645'
 'mono_exonic,alternative_polya_site_left:60459148'
 'alternative_structure_novel:60462425-60530371'
 'alternative_structure_novel:60461352-60553310,tss_match:-17'
 'alternative_structure_novel:60460711-60553241,tss_match:-12'
 'extra_intron_novel:60460322-60460379,antisense'
 'alternative_structure_novel:60461287-60553288,tes_match:-7,antisense']
inconsistent_non_intronic
['major_exon_elongation_5:127' 'major_exon_elongation_5:623'
 'major_exon_elongation_5:120' 'major_exon_elongation_5:142'
 'major_exon_elongation_5:139' 'major_exon_elongation_5:138'
 'major_exon_elongation_5:141' 'major_exon_elongation_5:146'
 'major_exon_elongation_5:1543' 'major_exon_elongation_5:601'
 'major_exon_elongation_5:90' 'major_exon_elongation_5:117'
 'major_exon_elongation_5:88' 'major_exon_elongation_5:79'
 'major_exon_elongation_5:56' 'major_exon_elongation_5:157'
 'major_exon_elongation_5:128' 'major_exon_elongation_5:113'
 'major_exon_elongation_5:126' 'major_exon_elongation_5:129'
 'major_exon_elongation_5:136' 'major_exon_elongation_5:59'
 'major_exon_elongation_5:84' 'major_exon_elongation_5:199'
 'major_exon_elongation_5:71' 'major_exon_elongation_5:122'
 'exon_elongation_3:12,major_exon_elongation_5:57'
 'exon_elongation_3:12,major_exon_elongation_5:142'
 'major_exon_elongation_5:132']
unique
['mono_exonic,tes_match_precise:0,correct_polya_site_left:60458603'
 'mono_exonic,tes_match:21,correct_polya_site_left:60458580'
 'mono_exonic,tes_match:-31,correct_polya_site_left:60458631'
 'fsm,tss_match:-4' 'fsm' 'ism_5'
 'mono_exonic,tes_match_precise:0,correct_polya_site_left:60458620'
 'mono_exonic,tes_match:-26,correct_polya_site_left:60458627'
 'mono_exonic,tes_match:-1,correct_polya_site_left:60458620'
 'mono_exonic,tes_match_precise:0,correct_polya_site_left:60458631'
 'mono_exonic,tes_match:-25,correct_polya_site_left:60458625']
unique_minor_difference
['mono_exonic,tes_match_precise:0,exon_elongation_5:4,correct_polya_site_left:60458603'
 'mono_exonic,tes_match_precise:0,exon_elongation_5:3,correct_polya_site_left:60458620'
 'mono_exonic,tes_match_precise:0,exon_elongation_5:1,correct_polya_site_left:60458603'
 'ism_5,exon_elongation_5:3' 'ism_internal,exon_elongation_5:3'
 'fsm,exon_elongation_5:1'
 'mono_exonic,tes_match_precise:0,exon_elongation_5:3,correct_polya_site_left:60458603'
 'ism_5,exon_elongation_5:1' 'ism_internal,exon_elongation_3:13'
 'mono_exonic,tes_match:-26,exon_elongation_5:3,correct_polya_site_left:60458627'
 'fsm,exon_elongation_3:1,tss_match:-1'
 'ism_3,exon_elongation_3:13,tss_match:-9'
 'ism_3,exon_elongation_3:4,tss_match:-5'
 'mono_exonic,tes_match_precise:0,exon_elongation_5:1,correct_polya_site_left:60458620'
 'ism_internal,exon_elongation_3:5,exon_elongation_5:3'
 'ism_3,exon_elongation_3:13,tss_match:-4'
 'fsm,tss_match:47,exon_elongation_5:47'
 'fsm,tss_match:10,exon_elongation_5:10'
 'ism_3,exon_elongation_3:12,tss_match_precise:0'
 'fsm,exon_elongation_3:3,tss_match:-6'
 'fsm,tss_match:15,exon_elongation_5:15'
 'mono_exonic,tes_match_precise:0,exon_elongation_5:2,correct_polya_site_left:60458603'
 'ism_3,exon_elongation_3:12,tss_match:-3']
[ ]:
mono_exonic
[ ]: