How to run GSEA analysis using user defined gene sets¶
[1]:
import pandas as pd
import sys
from gseapy.plot import barplot, dotplot
import gseapy as gp
from gseapy.plot import gseaplot
import matplotlib.pyplot as plt
import os
[2]:
!ls *
d11.gene.final.combined.tpm.csv
d14.gene.final.combined.tpm.csv
KO_vs_WT.transcript.final.combined.tpm.csv
M1_2_vs_Nontarget.gene.final.combined.tpm.csv
Untitled.ipynb
[28]:
!grep HBG d11.gene.final.combined.tpm.csv
"ENSG00000213934",0.000691152671456949,0.0380034665756572,"HBG1",1,"ENSG00000213934",4354.02754692137,3504.26940266044,8949.08469785786,7809.74626693033,3929.14847479091,8379.4154823941,-1.09243795506553
"ENSG00000196565",0.0110688093846302,0.221549861480019,"HBG2",4,"ENSG00000196565",7333.30876254706,5746.03084894695,15433.5686716436,49179.3445028147,6539.669805747,32306.4565872291,-2.30435688908612
"ENSG00000132677",0.83369088955457,1,"RHBG",2,"ENSG00000132677",1.86930917339403,2.74454628049257,2.58432620309884,2.50216723733193,2.3069277269433,2.54324672021539,-0.0995804097088474
"ENSG00000129214",NA,NA,"SHBG",NA,"ENSG00000129214",0.0829922009031624,0.134691961318615,0.1889591995574,0,0.108842081110889,0.0944795997787,0.0188088505323646
[29]:
def get_gene_list(f):
df = pd.read_csv(f,index_col=0)
df = df[df.qval<=0.05]
up = df[df.logFC>=1]
down = df[df.logFC<=1]
return up.ext_gene.tolist(),down.ext_gene.tolist()
[5]:
!head d14.gene.final.combined.tpm.csv
"target_id","pval","qval","ext_gene","num_aggregated_transcripts","X","SRR5890880_Adult_d14","SRR5890881_Adult_d14","SRR5890884_Fetal_d14","SRR5890885_Fetal_d14","treatment_mean","control_mean","logFC"
"ENSG00000133742",4.22869597156715e-36,7.55118239642747e-32,"CA1",21,"ENSG00000133742",1436.2391530381,2493.70265349881,69.2841083253776,67.8978085617473,1964.97090326845,68.5909584435625,4.82019828235003
"ENSG00000110888",2.08784299695339e-16,1.86413061982983e-12,"CAPRIN2",12,"ENSG00000110888",93.7687859637368,130.544414867218,9.78549492821019,17.6348799679751,112.156600415478,13.7101874480926,2.94343320363569
"ENSG00000140403",1.65792304960105e-15,9.86851063224199e-12,"DNAJA4",12,"ENSG00000140403",279.649455034164,329.749809401072,115.929031669401,97.5746798130713,304.699632217618,106.751855741236,1.50440210040088
"ENSG00000171843",7.69574293636136e-14,3.43557204036512e-10,"MLLT3",7,"ENSG00000171843",126.925018635667,98.2518562296459,457.257886961007,609.000325808402,112.588437432657,533.129106384705,-2.23337251808464
"ENSG00000223609",3.65803591684729e-13,1.30643094734284e-09,"HBD",4,"ENSG00000223609",3535.22606030737,3405.55340231687,17.0862139408139,26.2828909210616,3470.38973131212,21.6845524309377,7.25765933932053
"ENSG00000064201",1.53829811296484e-12,4.57823156720218e-09,"TSPAN32",11,"ENSG00000064201",2.38406260165873,7.00812954463309,57.8596753054163,60.6709483849889,4.69609607314591,59.2653118452026,-3.4032824543055
"ENSG00000104267",3.46532579903955e-12,8.84004611334988e-09,"CA2",5,"ENSG00000104267",392.280621101268,450.438441603829,64.2550447173632,67.1356091660377,421.359531352549,65.6953269417005,2.66281402202505
"ENSG00000104763",1.0722723119758e-11,2.39344583436897e-08,"ASAH1",10,"ENSG00000104763",152.488433089148,93.730591373333,384.738279294862,396.211399504666,123.109512231241,390.474839399764,-1.65730589318479
"ENSG00000244734",1.25588238489589e-11,2.49181019412066e-08,"HBB",4,"ENSG00000244734",135640.708030554,91325.2552218797,11400.2304544865,13585.235376584,113482.981626217,12492.7329155353,3.1832121748051
define gene sets¶
You need to define a dict that contains each gene set as a list
[30]:
gene_sets={}
a=get_gene_list("d11.gene.final.combined.tpm.csv")
b=get_gene_list("d14.gene.final.combined.tpm.csv")
gene_sets['d11_Adult'] = a[0]
gene_sets['d11_Fetal'] = a[1]
gene_sets['d14_Adult'] = b[0]
gene_sets['d14_Fetal'] = b[1]
[ ]:
define your ranked list¶
[19]:
df = pd.read_csv("M1_2_vs_Nontarget.gene.final.combined.tpm.csv")
df = df[df.logFC.abs()>0.25]
df = df.set_index("ext_gene")
df = df[['logFC']]
df = df.sort_values('logFC',ascending=False)
df.head()
[19]:
logFC | |
---|---|
ext_gene | |
HBG2 | 3.108108 |
HBG1 | 2.725993 |
HBZ | 2.345194 |
HBE1 | 2.119513 |
SLC6A19 | 2.028183 |
run GSEA¶
[31]:
pre_res = gp.prerank(rnk=df, # or rnk = rnk,
gene_sets=gene_sets,
min_size=5,
max_size=1000,
permutation_num=100, # reduce number to speed up testing
outdir=None, # don't write to disk
seed=6,
verbose=True, # see what's going on behind the scenes
)
2022-10-11 16:14:41,517 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!
2022-10-11 16:14:41,529 Parsing data files for GSEA.............................
2022-10-11 16:14:41,606 0000 gene_sets have been filtered out when max_size=1000 and min_size=5
2022-10-11 16:14:41,608 0004 gene_sets used for further statistical testing.....
2022-10-11 16:14:41,610 Start to run GSEA...Might take a while..................
2022-10-11 16:14:41,765 Start to generate gseapy reports, and produce figures...
2022-10-11 16:14:41,770 Congratulations. GSEApy runs successfully................
save GSEA stat¶
[32]:
pre_res.res2d.to_csv("CD34.A_vs_F.GSEA.csv")
[33]:
pre_res.res2d
[33]:
es | nes | pval | fdr | geneset_size | matched_size | genes | ledge_genes | |
---|---|---|---|---|---|---|---|---|
Term | ||||||||
d11_Adult | 0.279731 | 1.743089 | 0.0 | 0.0 | 130 | 69 | MMRN1;ITGB3;SELP;HBBP1;TNXB;ITGA2B;RAB6B;PLEK2... | MMRN1;ITGB3;SELP;HBBP1;TNXB;ITGA2B;RAB6B;PLEK2... |
d11_Fetal | -0.362205 | -1.869913 | 0.0 | 0.0 | 269 | 120 | HBG1;HBZ;HBE1;KRT13;ARID3A;TNXB;KSR1;GAS2L1;GA... | SOS1;MEIS2;REXO2;MECOM;ZSCAN23;STX3;SLC39A10;T... |
d14_Adult | 0.276151 | 2.133017 | 0.0 | 0.0 | 142 | 72 | AQP1;TNXB;GNAZ;SLC22A16;PIK3AP1;RAB27B;UBASH3B... | AQP1;TNXB;GNAZ;SLC22A16;PIK3AP1;RAB27B;UBASH3B... |
d14_Fetal | 0.370086 | 2.982766 | 0.0 | 0.0 | 315 | 119 | HBG2;HBG1;HBZ;HBE1;MRC2;KRT13;ARID3A;ACSL6;APL... | HBG2;HBG1;HBZ;HBE1;MRC2;KRT13;ARID3A;ACSL6;APL... |
plot GSEA figure¶
[34]:
from gseapy import gseaplot
res = pre_res.res2d
for i in res.index.tolist():
name = i.replace(" ","_").replace("/","_")
gseaplot(rank_metric=pre_res.ranking, term=i, ofname=f'CD34.{name}.pdf', **pre_res.results[i])
run everything for another ranked gene list¶
[35]:
df = pd.read_csv("KO_vs_WT.gene.final.combined.tpm.csv")
df = df[df.logFC.abs()>0.25]
df = df.set_index("ext_gene")
df = df[['logFC']]
df = df.sort_values('logFC',ascending=False)
pre_res = gp.prerank(rnk=df, # or rnk = rnk,
gene_sets=gene_sets,
min_size=5,
max_size=1000,
permutation_num=100, # reduce number to speed up testing
outdir=None, # don't write to disk
seed=6,
verbose=True, # see what's going on behind the scenes
)
pre_res.res2d.to_csv("H2.A_vs_F.GSEA.csv")
from gseapy import gseaplot
res = pre_res.res2d
for i in res.index.tolist():
name = i.replace(" ","_").replace("/","_")
gseaplot(rank_metric=pre_res.ranking, term=i, ofname=f'H2.{name}.pdf', **pre_res.results[i])
2022-10-11 16:16:31,410 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!
2022-10-11 16:16:31,421 Parsing data files for GSEA.............................
2022-10-11 16:16:31,492 0000 gene_sets have been filtered out when max_size=1000 and min_size=5
2022-10-11 16:16:31,494 0004 gene_sets used for further statistical testing.....
2022-10-11 16:16:31,495 Start to run GSEA...Might take a while..................
2022-10-11 16:16:31,635 Start to generate gseapy reports, and produce figures...
2022-10-11 16:16:31,639 Congratulations. GSEApy runs successfully................
[36]:
pre_res.res2d
[36]:
es | nes | pval | fdr | geneset_size | matched_size | genes | ledge_genes | |
---|---|---|---|---|---|---|---|---|
Term | ||||||||
d14_Adult | 0.403459 | 1.981156 | 0.000000 | 0.000000 | 142 | 56 | AQP1;SLC22A16;PITX1;RIMKLB;SNCA;TNXB;GRINA;VEG... | AQP1;SLC22A16;PITX1;RIMKLB;SNCA;TNXB;GRINA;VEG... |
d14_Fetal | 0.445841 | 2.718864 | 0.000000 | 0.000000 | 315 | 121 | HBG2;HBG1;HBE1;MRC2;SOX6;HBZ;SLC30A10;RUNDC3A;... | HBG2;HBG1;HBE1;MRC2;SOX6;HBZ;SLC30A10;RUNDC3A;... |
d11_Fetal | 0.313873 | 1.819299 | 0.000000 | 0.012945 | 269 | 91 | HBG1;HBE1;GSTP1;HBZ;SLC30A10;C17orf99;KRT13;RI... | HBG1;HBE1;GSTP1;HBZ;SLC30A10;C17orf99;KRT13;RI... |
d11_Adult | 0.269756 | 1.482600 | 0.045455 | 0.055016 | 130 | 57 | ANKH;SLC22A16;HBBP1;IFI27;SNCA;TNXB;VEGFA;MMD;... | ANKH;SLC22A16;HBBP1;IFI27;SNCA;TNXB;VEGFA;MMD;... |
[ ]:
[ ]:
[ ]: