How to run GSEA analysis using user defined gene sets

[1]:
import pandas as pd
import sys
from gseapy.plot import barplot, dotplot
import gseapy as gp
from gseapy.plot import gseaplot
import matplotlib.pyplot as plt
import os
[2]:
!ls *
d11.gene.final.combined.tpm.csv
d14.gene.final.combined.tpm.csv
KO_vs_WT.transcript.final.combined.tpm.csv
M1_2_vs_Nontarget.gene.final.combined.tpm.csv
Untitled.ipynb
[28]:
!grep HBG d11.gene.final.combined.tpm.csv
"ENSG00000213934",0.000691152671456949,0.0380034665756572,"HBG1",1,"ENSG00000213934",4354.02754692137,3504.26940266044,8949.08469785786,7809.74626693033,3929.14847479091,8379.4154823941,-1.09243795506553
"ENSG00000196565",0.0110688093846302,0.221549861480019,"HBG2",4,"ENSG00000196565",7333.30876254706,5746.03084894695,15433.5686716436,49179.3445028147,6539.669805747,32306.4565872291,-2.30435688908612
"ENSG00000132677",0.83369088955457,1,"RHBG",2,"ENSG00000132677",1.86930917339403,2.74454628049257,2.58432620309884,2.50216723733193,2.3069277269433,2.54324672021539,-0.0995804097088474
"ENSG00000129214",NA,NA,"SHBG",NA,"ENSG00000129214",0.0829922009031624,0.134691961318615,0.1889591995574,0,0.108842081110889,0.0944795997787,0.0188088505323646
[29]:
def get_gene_list(f):
    df = pd.read_csv(f,index_col=0)
    df = df[df.qval<=0.05]
    up = df[df.logFC>=1]
    down = df[df.logFC<=1]
    return up.ext_gene.tolist(),down.ext_gene.tolist()
[5]:
!head d14.gene.final.combined.tpm.csv
"target_id","pval","qval","ext_gene","num_aggregated_transcripts","X","SRR5890880_Adult_d14","SRR5890881_Adult_d14","SRR5890884_Fetal_d14","SRR5890885_Fetal_d14","treatment_mean","control_mean","logFC"
"ENSG00000133742",4.22869597156715e-36,7.55118239642747e-32,"CA1",21,"ENSG00000133742",1436.2391530381,2493.70265349881,69.2841083253776,67.8978085617473,1964.97090326845,68.5909584435625,4.82019828235003
"ENSG00000110888",2.08784299695339e-16,1.86413061982983e-12,"CAPRIN2",12,"ENSG00000110888",93.7687859637368,130.544414867218,9.78549492821019,17.6348799679751,112.156600415478,13.7101874480926,2.94343320363569
"ENSG00000140403",1.65792304960105e-15,9.86851063224199e-12,"DNAJA4",12,"ENSG00000140403",279.649455034164,329.749809401072,115.929031669401,97.5746798130713,304.699632217618,106.751855741236,1.50440210040088
"ENSG00000171843",7.69574293636136e-14,3.43557204036512e-10,"MLLT3",7,"ENSG00000171843",126.925018635667,98.2518562296459,457.257886961007,609.000325808402,112.588437432657,533.129106384705,-2.23337251808464
"ENSG00000223609",3.65803591684729e-13,1.30643094734284e-09,"HBD",4,"ENSG00000223609",3535.22606030737,3405.55340231687,17.0862139408139,26.2828909210616,3470.38973131212,21.6845524309377,7.25765933932053
"ENSG00000064201",1.53829811296484e-12,4.57823156720218e-09,"TSPAN32",11,"ENSG00000064201",2.38406260165873,7.00812954463309,57.8596753054163,60.6709483849889,4.69609607314591,59.2653118452026,-3.4032824543055
"ENSG00000104267",3.46532579903955e-12,8.84004611334988e-09,"CA2",5,"ENSG00000104267",392.280621101268,450.438441603829,64.2550447173632,67.1356091660377,421.359531352549,65.6953269417005,2.66281402202505
"ENSG00000104763",1.0722723119758e-11,2.39344583436897e-08,"ASAH1",10,"ENSG00000104763",152.488433089148,93.730591373333,384.738279294862,396.211399504666,123.109512231241,390.474839399764,-1.65730589318479
"ENSG00000244734",1.25588238489589e-11,2.49181019412066e-08,"HBB",4,"ENSG00000244734",135640.708030554,91325.2552218797,11400.2304544865,13585.235376584,113482.981626217,12492.7329155353,3.1832121748051

define gene sets

You need to define a dict that contains each gene set as a list

[30]:
gene_sets={}
a=get_gene_list("d11.gene.final.combined.tpm.csv")
b=get_gene_list("d14.gene.final.combined.tpm.csv")
gene_sets['d11_Adult'] = a[0]
gene_sets['d11_Fetal'] = a[1]
gene_sets['d14_Adult'] = b[0]
gene_sets['d14_Fetal'] = b[1]
[ ]:

define your ranked list

[19]:
df = pd.read_csv("M1_2_vs_Nontarget.gene.final.combined.tpm.csv")
df = df[df.logFC.abs()>0.25]
df = df.set_index("ext_gene")
df = df[['logFC']]
df = df.sort_values('logFC',ascending=False)
df.head()
[19]:
logFC
ext_gene
HBG2 3.108108
HBG1 2.725993
HBZ 2.345194
HBE1 2.119513
SLC6A19 2.028183

run GSEA

[31]:
pre_res = gp.prerank(rnk=df, # or rnk = rnk,
                     gene_sets=gene_sets,
                     min_size=5,
                     max_size=1000,
                     permutation_num=100, # reduce number to speed up testing
                     outdir=None, # don't write to disk
                     seed=6,
                     verbose=True, # see what's going on behind the scenes
                    )
2022-10-11 16:14:41,517 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!
2022-10-11 16:14:41,529 Parsing data files for GSEA.............................
2022-10-11 16:14:41,606 0000 gene_sets have been filtered out when max_size=1000 and min_size=5
2022-10-11 16:14:41,608 0004 gene_sets used for further statistical testing.....
2022-10-11 16:14:41,610 Start to run GSEA...Might take a while..................
2022-10-11 16:14:41,765 Start to generate gseapy reports, and produce figures...
2022-10-11 16:14:41,770 Congratulations. GSEApy runs successfully................

save GSEA stat

[32]:
pre_res.res2d.to_csv("CD34.A_vs_F.GSEA.csv")
[33]:
pre_res.res2d
[33]:
es nes pval fdr geneset_size matched_size genes ledge_genes
Term
d11_Adult 0.279731 1.743089 0.0 0.0 130 69 MMRN1;ITGB3;SELP;HBBP1;TNXB;ITGA2B;RAB6B;PLEK2... MMRN1;ITGB3;SELP;HBBP1;TNXB;ITGA2B;RAB6B;PLEK2...
d11_Fetal -0.362205 -1.869913 0.0 0.0 269 120 HBG1;HBZ;HBE1;KRT13;ARID3A;TNXB;KSR1;GAS2L1;GA... SOS1;MEIS2;REXO2;MECOM;ZSCAN23;STX3;SLC39A10;T...
d14_Adult 0.276151 2.133017 0.0 0.0 142 72 AQP1;TNXB;GNAZ;SLC22A16;PIK3AP1;RAB27B;UBASH3B... AQP1;TNXB;GNAZ;SLC22A16;PIK3AP1;RAB27B;UBASH3B...
d14_Fetal 0.370086 2.982766 0.0 0.0 315 119 HBG2;HBG1;HBZ;HBE1;MRC2;KRT13;ARID3A;ACSL6;APL... HBG2;HBG1;HBZ;HBE1;MRC2;KRT13;ARID3A;ACSL6;APL...

plot GSEA figure

[34]:
from gseapy import gseaplot

res = pre_res.res2d
for i in res.index.tolist():
    name = i.replace(" ","_").replace("/","_")
    gseaplot(rank_metric=pre_res.ranking, term=i, ofname=f'CD34.{name}.pdf', **pre_res.results[i])

run everything for another ranked gene list

[35]:
df = pd.read_csv("KO_vs_WT.gene.final.combined.tpm.csv")
df = df[df.logFC.abs()>0.25]
df = df.set_index("ext_gene")
df = df[['logFC']]
df = df.sort_values('logFC',ascending=False)
pre_res = gp.prerank(rnk=df, # or rnk = rnk,
                     gene_sets=gene_sets,
                     min_size=5,
                     max_size=1000,
                     permutation_num=100, # reduce number to speed up testing
                     outdir=None, # don't write to disk
                     seed=6,
                     verbose=True, # see what's going on behind the scenes
                    )
pre_res.res2d.to_csv("H2.A_vs_F.GSEA.csv")
from gseapy import gseaplot

res = pre_res.res2d
for i in res.index.tolist():
    name = i.replace(" ","_").replace("/","_")
    gseaplot(rank_metric=pre_res.ranking, term=i, ofname=f'H2.{name}.pdf', **pre_res.results[i])
2022-10-11 16:16:31,410 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!
2022-10-11 16:16:31,421 Parsing data files for GSEA.............................
2022-10-11 16:16:31,492 0000 gene_sets have been filtered out when max_size=1000 and min_size=5
2022-10-11 16:16:31,494 0004 gene_sets used for further statistical testing.....
2022-10-11 16:16:31,495 Start to run GSEA...Might take a while..................
2022-10-11 16:16:31,635 Start to generate gseapy reports, and produce figures...
2022-10-11 16:16:31,639 Congratulations. GSEApy runs successfully................

[36]:
pre_res.res2d
[36]:
es nes pval fdr geneset_size matched_size genes ledge_genes
Term
d14_Adult 0.403459 1.981156 0.000000 0.000000 142 56 AQP1;SLC22A16;PITX1;RIMKLB;SNCA;TNXB;GRINA;VEG... AQP1;SLC22A16;PITX1;RIMKLB;SNCA;TNXB;GRINA;VEG...
d14_Fetal 0.445841 2.718864 0.000000 0.000000 315 121 HBG2;HBG1;HBE1;MRC2;SOX6;HBZ;SLC30A10;RUNDC3A;... HBG2;HBG1;HBE1;MRC2;SOX6;HBZ;SLC30A10;RUNDC3A;...
d11_Fetal 0.313873 1.819299 0.000000 0.012945 269 91 HBG1;HBE1;GSTP1;HBZ;SLC30A10;C17orf99;KRT13;RI... HBG1;HBE1;GSTP1;HBZ;SLC30A10;C17orf99;KRT13;RI...
d11_Adult 0.269756 1.482600 0.045455 0.055016 130 57 ANKH;SLC22A16;HBBP1;IFI27;SNCA;TNXB;VEGFA;MMD;... ANKH;SLC22A16;HBBP1;IFI27;SNCA;TNXB;VEGFA;MMD;...
[ ]:

[ ]:

[ ]: