Calculate DEG gene expression level in the same TAD¶
Output¶
Comparison Name | insertion location | # Up-DEGs | # Down-DEGs
[151]:
import pandas as pd
import pyranges as pr
from pybedtools import BedTool as pb
import numpy as np
import glob
import warnings
warnings.filterwarnings("ignore")
files=glob.glob("diff_gene_tables/*")
# pyrange overlap is similar to bedtools intersect -u
# pyrange intersect is similar to bedtools intersect default parameters
# to get -wa -wb, only pybedtools work, however, converting dataframe to pyrange obj without correct header need more than one line of work
[97]:
ins = pd.read_csv("../RNAseq_splicing_analysis/input.list",sep="\t",header=None)
ins = ins.drop_duplicates(2)
ins = ins.set_index(2)
myDict = ins[5].to_dict()
ins_dict = {}
for k in myDict:
tmp = pr.read_bed(myDict[k]).sort()
tmp = tmp.as_df().drop_duplicates("Name")
k =k.replace("dCTCF_","").replace("Jurkat_","").split("_S")[0]
print (k)
ins_dict[k] = pr.PyRanges(tmp)
20copy
d3
d4
d13
hgcOPT_s17
hgcOPT_s4
hgcOPT_s5
hgcOPT_s15
hgcOPT_s22
hgcOPT_s21
[110]:
hg19_gene.head()
[110]:
Chromosome | Start | End | Name | Score | Strand | |
---|---|---|---|---|---|---|
0 | chr1 | 11872 | 14412 | . | ENSG00000223972 | + |
1 | chr1 | 53049 | 54936 | . | ENSG00000268020 | + |
2 | chr1 | 62948 | 63887 | . | ENSG00000240361 | + |
3 | chr1 | 69091 | 70008 | . | ENSG00000186092 | + |
4 | chr1 | 131025 | 134836 | . | ENSG00000233750 | + |
5 | chr1 | 326096 | 328112 | . | ENSG00000250575 | + |
6 | chr1 | 334126 | 334305 | . | ENSG00000224813 | + |
7 | chr1 | 367640 | 368634 | . | ENSG00000235249 | + |
[144]:
output_dict = {}
df_list = []
LFC=np.log2(1.5)
FDR=0.01
hg19_gene = pr.read_bed("hg19_gene.ensembl_v75.bed",as_df=False).sort()
hg19_tad = pr.read_bed("hg19.tad.bed",as_df=False).sort()
for deg_file in files:
# use cutoff to define DEGs
label = deg_file.split("/")[-1].split("_vs_")[0]
output_dict[label] = [None,None]
df = pd.read_csv(deg_file,index_col=0)
deg_up = df[(df.qval<=FDR)&(df.logFC>=LFC)]
deg_down = df[(df.qval<=FDR)&(df.logFC<=-LFC)]
# Identify DEG coordinates
deg_up_bed = hg19_gene[hg19_gene.Score.isin(deg_up.index)]
deg_down_bed = hg19_gene[hg19_gene.Score.isin(deg_down.index)]
print (label,deg_up_bed.as_df().shape[0],deg_down_bed.as_df().shape[0])
# insertion-site TAD
ins_bed = pb.from_dataframe(ins_dict[label].as_df())
ab = pb.from_dataframe(hg19_tad.as_df()).intersect(ins_bed,wa=True,wb=True)
# ab.to_dataframe(header=None).head()
# overlap with insertion-site TAD
tmp = ab.intersect(pb.from_dataframe(deg_up_bed.as_df()),wao=True).to_dataframe(header=None)
tmp[22] = tmp[21]>0
tmp = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
tmp.index = tmp[9]+"_"+tmp[10].astype(str)+"_"+tmp[11].astype(str)+"_"+tmp[12].astype(str)
tmp['#UP'] = tmp['size']
tmp['Sample'] = label
tmp['Insert_location'] = tmp.index.tolist()
tmp2 = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
tmp2[22] = tmp2[21]>0
tmp2 = tmp2.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
tmp2.index = tmp2[9]+"_"+tmp2[10].astype(str)+"_"+tmp2[11].astype(str)+"_"+tmp2[12].astype(str)
tmp['#DOWN'] = tmp2['size']
df_list.append(tmp[['Sample','Insert_location','#UP','#DOWN']])
# output_dict[label][0] =
# tmp = ab.intersect(pb.from_dataframe(deg_down_bed.as_df()),wao=True).to_dataframe(header=None)
# tmp[22] = tmp[21]>0
# output_dict[label][1] = tmp.groupby(list(range(14)))[22].sum().to_frame('size').reset_index()
# output count table
d4 936 1374
hgcOPT_s22 60 108
hgcOPT_s4 154 265
d13 1730 1192
d3 579 536
20copy 634 277
hgcOPT_s21 218 255
hgcOPT_s5 157 221
hgcOPT_s17 244 398
hgcOPT_s15 238 277
[146]:
out = pd.concat(df_list)
[147]:
out.head()
[147]:
Sample | Insert_location | #UP | #DOWN | |
---|---|---|---|---|
chr1_36015845_36015848_NCDN,KIAA0319L | d4 | chr1_36015845_36015848_NCDN,KIAA0319L | 0 | 0 |
chr1_111696445_111696476_CEPT1,DRAM2 | d4 | chr1_111696445_111696476_CEPT1,DRAM2 | 2 | 2 |
chr1_201814880_201814886_IPO9 | d4 | chr1_201814880_201814886_IPO9 | 0 | 0 |
chr1_215751270_215751296_KCTD3 | d4 | chr1_215751270_215751296_KCTD3 | 0 | 0 |
chr11_36760826_36760863_RAG2 | d4 | chr11_36760826_36760863_RAG2 | 1 | 0 |
[149]:
out.groupby("Sample").sum()
[149]:
#UP | #DOWN | |
---|---|---|
Sample | ||
20copy | 2 | 5 |
d13 | 18 | 18 |
d3 | 10 | 5 |
d4 | 11 | 20 |
hgcOPT_s15 | 5 | 1 |
hgcOPT_s17 | 6 | 8 |
hgcOPT_s21 | 2 | 1 |
hgcOPT_s22 | 3 | 3 |
hgcOPT_s4 | 3 | 6 |
hgcOPT_s5 | 2 | 2 |
[150]:
out.to_csv("DEG_in_insertion_TAD.csv",index=False)
[ ]:
[ ]:
[ ]:
[ ]:
[ ]:
[29]: