Calculate DEG gene expression level in the same TAD¶

Output¶

Comparison Name | insertion location | # Up-DEGs | # Down-DEGs

[151]:

import pandas as pd
import pyranges as pr
from pybedtools import BedTool as pb
import numpy as np
import glob
import warnings
warnings.filterwarnings("ignore")
files=glob.glob("diff_gene_tables/*")
# pyrange overlap is similar to bedtools intersect -u
# pyrange intersect is similar to bedtools intersect default parameters
# to get -wa -wb, only pybedtools work, however, converting dataframe to pyrange obj without correct header need more than one line of work

[97]:

ins = pd.read_csv("../RNAseq_splicing_analysis/input.list",sep="\t",header=None)
ins = ins.drop_duplicates(2)
ins = ins.set_index(2)
myDict = ins[5].to_dict()
ins_dict = {}
for k in myDict:
    tmp = pr.read_bed(myDict[k]).sort()
    tmp = tmp.as_df().drop_duplicates("Name")
    k =k.replace("dCTCF_","").replace("Jurkat_","").split("_S")[0]
    print (k)
    ins_dict[k] = pr.PyRanges(tmp)

20copy
d3
d4
d13
hgcOPT_s17
hgcOPT_s4
hgcOPT_s5
hgcOPT_s15
hgcOPT_s22
hgcOPT_s21

[110]:

hg19_gene.head()

[110]:

	Chromosome	Start	End	Name	Score	Strand
0	chr1	11872	14412	.	ENSG00000223972	+
1	chr1	53049	54936	.	ENSG00000268020	+
2	chr1	62948	63887	.	ENSG00000240361	+
3	chr1	69091	70008	.	ENSG00000186092	+
4	chr1	131025	134836	.	ENSG00000233750	+
5	chr1	326096	328112	.	ENSG00000250575	+
6	chr1	334126	334305	.	ENSG00000224813	+
7	chr1	367640	368634	.	ENSG00000235249	+

[144]:

output_dict = {}
df_list = []
LFC=np.log2(1.5)
FDR=0.01
hg19_gene = pr.read_bed("hg19_gene.ensembl_v75.bed",as_df=False).sort()
hg19_tad = pr.read_bed("hg19.tad.bed",as_df=False).sort()
for deg_file in files:
    # use cutoff to define DEGs
    label = deg_file.split("/")[-1].split("_vs_")[0]
    output_dict[label] = [None,None]
    df = pd.read_csv(deg_file,index_col=0)
    deg_up = df[(df.qval<=FDR)&(df.logFC>=LFC)]
    deg_down = df[(df.qval<=FDR)&(df.logFC<=-LFC)]

    # Identify DEG coordinates
    deg_up_bed = hg19_gene[hg19_gene.Score.isin(deg_up.index)]
    deg_down_bed = hg19_gene[hg19_gene.Score.isin(deg_down.index)]

    print (label,deg_up_bed.as_df().shape[0],deg_down_bed.as_df().shape[0])

    # insertion-site TAD
    ins_bed = pb.from_dataframe(ins_dict[label].as_df())
    ab = pb.from_dataframe(hg19_tad.as_df()).intersect(ins_bed,wa=True,wb=True)
    # ab.to_dataframe(header=None).head()
    #  overlap with  insertion-site TAD
    tmp = ab.intersect(pb.from_dataframe(deg_up_bed.as_df()),wao=True).to_dataframe(header=None)
    tmp[22] = tmp[21]>0
    tmp = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
    tmp.index = tmp[9]+"_"+tmp[10].astype(str)+"_"+tmp[11].astype(str)+"_"+tmp[12].astype(str)
    tmp['#UP'] = tmp['size']
    tmp['Sample'] = label
    tmp['Insert_location'] = tmp.index.tolist()

    tmp2 = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
    tmp2[22] = tmp2[21]>0
    tmp2 = tmp2.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
    tmp2.index = tmp2[9]+"_"+tmp2[10].astype(str)+"_"+tmp2[11].astype(str)+"_"+tmp2[12].astype(str)
    tmp['#DOWN'] = tmp2['size']
    df_list.append(tmp[['Sample','Insert_location','#UP','#DOWN']])


#     output_dict[label][0] =

#     tmp = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
#     tmp[22] = tmp[21]>0
#     output_dict[label][1] = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()

    # output count table

d4 936 1374
hgcOPT_s22 60 108
hgcOPT_s4 154 265
d13 1730 1192
d3 579 536
20copy 634 277
hgcOPT_s21 218 255
hgcOPT_s5 157 221
hgcOPT_s17 244 398
hgcOPT_s15 238 277

[146]:

out = pd.concat(df_list)

[147]:

out.head()

[147]:

	Sample	Insert_location	#UP	#DOWN
chr1_36015845_36015848_NCDN,KIAA0319L	d4	chr1_36015845_36015848_NCDN,KIAA0319L	0	0
chr1_111696445_111696476_CEPT1,DRAM2	d4	chr1_111696445_111696476_CEPT1,DRAM2	2	2
chr1_201814880_201814886_IPO9	d4	chr1_201814880_201814886_IPO9	0	0
chr1_215751270_215751296_KCTD3	d4	chr1_215751270_215751296_KCTD3	0	0
chr11_36760826_36760863_RAG2	d4	chr11_36760826_36760863_RAG2	1	0

[149]:

out.groupby("Sample").sum()

[149]:

	#UP	#DOWN
Sample
20copy	2	5
d13	18	18
d3	10	5
d4	11	20
hgcOPT_s15	5	1
hgcOPT_s17	6	8
hgcOPT_s21	2	1
hgcOPT_s22	3	3
hgcOPT_s4	3	6
hgcOPT_s5	2	2

[150]:

out.to_csv("DEG_in_insertion_TAD.csv",index=False)

[ ]:

[ ]:

[ ]:

[ ]:

[ ]:

[29]: