Calculate DEG gene expression level in the same TAD

Output

Comparison Name | insertion location | # Up-DEGs | # Down-DEGs

[151]:
import pandas as pd
import pyranges as pr
from pybedtools import BedTool as pb
import numpy as np
import glob
import warnings
warnings.filterwarnings("ignore")
files=glob.glob("diff_gene_tables/*")
# pyrange overlap is similar to bedtools intersect -u
# pyrange intersect is similar to bedtools intersect default parameters
# to get -wa -wb, only pybedtools work, however, converting dataframe to pyrange obj without correct header need more than one line of work
[97]:
ins = pd.read_csv("../RNAseq_splicing_analysis/input.list",sep="\t",header=None)
ins = ins.drop_duplicates(2)
ins = ins.set_index(2)
myDict = ins[5].to_dict()
ins_dict = {}
for k in myDict:
    tmp = pr.read_bed(myDict[k]).sort()
    tmp = tmp.as_df().drop_duplicates("Name")
    k =k.replace("dCTCF_","").replace("Jurkat_","").split("_S")[0]
    print (k)
    ins_dict[k] = pr.PyRanges(tmp)
20copy
d3
d4
d13
hgcOPT_s17
hgcOPT_s4
hgcOPT_s5
hgcOPT_s15
hgcOPT_s22
hgcOPT_s21
[110]:
hg19_gene.head()
[110]:
Chromosome Start End Name Score Strand
0 chr1 11872 14412 . ENSG00000223972 +
1 chr1 53049 54936 . ENSG00000268020 +
2 chr1 62948 63887 . ENSG00000240361 +
3 chr1 69091 70008 . ENSG00000186092 +
4 chr1 131025 134836 . ENSG00000233750 +
5 chr1 326096 328112 . ENSG00000250575 +
6 chr1 334126 334305 . ENSG00000224813 +
7 chr1 367640 368634 . ENSG00000235249 +
[144]:
output_dict = {}
df_list = []
LFC=np.log2(1.5)
FDR=0.01
hg19_gene = pr.read_bed("hg19_gene.ensembl_v75.bed",as_df=False).sort()
hg19_tad = pr.read_bed("hg19.tad.bed",as_df=False).sort()
for deg_file in files:
    # use cutoff to define DEGs
    label = deg_file.split("/")[-1].split("_vs_")[0]
    output_dict[label] = [None,None]
    df = pd.read_csv(deg_file,index_col=0)
    deg_up = df[(df.qval<=FDR)&(df.logFC>=LFC)]
    deg_down = df[(df.qval<=FDR)&(df.logFC<=-LFC)]

    # Identify DEG coordinates
    deg_up_bed = hg19_gene[hg19_gene.Score.isin(deg_up.index)]
    deg_down_bed = hg19_gene[hg19_gene.Score.isin(deg_down.index)]

    print (label,deg_up_bed.as_df().shape[0],deg_down_bed.as_df().shape[0])

    # insertion-site TAD
    ins_bed = pb.from_dataframe(ins_dict[label].as_df())
    ab = pb.from_dataframe(hg19_tad.as_df()).intersect(ins_bed,wa=True,wb=True)
    # ab.to_dataframe(header=None).head()
    #  overlap with  insertion-site TAD
    tmp = ab.intersect(pb.from_dataframe(deg_up_bed.as_df()),wao=True).to_dataframe(header=None)
    tmp[22] = tmp[21]>0
    tmp = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
    tmp.index = tmp[9]+"_"+tmp[10].astype(str)+"_"+tmp[11].astype(str)+"_"+tmp[12].astype(str)
    tmp['#UP'] = tmp['size']
    tmp['Sample'] = label
    tmp['Insert_location'] = tmp.index.tolist()

    tmp2 = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
    tmp2[22] = tmp2[21]>0
    tmp2 = tmp2.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
    tmp2.index = tmp2[9]+"_"+tmp2[10].astype(str)+"_"+tmp2[11].astype(str)+"_"+tmp2[12].astype(str)
    tmp['#DOWN'] = tmp2['size']
    df_list.append(tmp[['Sample','Insert_location','#UP','#DOWN']])


#     output_dict[label][0] =

#     tmp = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
#     tmp[22] = tmp[21]>0
#     output_dict[label][1] = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()

    # output count table
d4 936 1374
hgcOPT_s22 60 108
hgcOPT_s4 154 265
d13 1730 1192
d3 579 536
20copy 634 277
hgcOPT_s21 218 255
hgcOPT_s5 157 221
hgcOPT_s17 244 398
hgcOPT_s15 238 277
[146]:
out = pd.concat(df_list)
[147]:
out.head()
[147]:
Sample Insert_location #UP #DOWN
chr1_36015845_36015848_NCDN,KIAA0319L d4 chr1_36015845_36015848_NCDN,KIAA0319L 0 0
chr1_111696445_111696476_CEPT1,DRAM2 d4 chr1_111696445_111696476_CEPT1,DRAM2 2 2
chr1_201814880_201814886_IPO9 d4 chr1_201814880_201814886_IPO9 0 0
chr1_215751270_215751296_KCTD3 d4 chr1_215751270_215751296_KCTD3 0 0
chr11_36760826_36760863_RAG2 d4 chr11_36760826_36760863_RAG2 1 0
[149]:
out.groupby("Sample").sum()
[149]:
#UP #DOWN
Sample
20copy 2 5
d13 18 18
d3 10 5
d4 11 20
hgcOPT_s15 5 1
hgcOPT_s17 6 8
hgcOPT_s21 2 1
hgcOPT_s22 3 3
hgcOPT_s4 3 6
hgcOPT_s5 2 2
[150]:
out.to_csv("DEG_in_insertion_TAD.csv",index=False)
[ ]:

[ ]:

[ ]:

[ ]:

[ ]:

[29]: