2.1.1_prepare_predictDB_standard_inputs

Author

Saideep Gona

Published

November 21, 2023

Context

I need to run a standard PrediXcan TWAS analysis on the Aracena et al. dataset as a baseline for comparison. To do this, I need to prepare the proper input files for the PrediXcan pipeline. This takes some reformatting, as shown below:
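For orientation, the sections below assemble the four inputs the PredictDB training pipeline works from: a gene expression matrix, a gene annotation, a genotype (dosage) matrix, and a SNP annotation. The dictionary below is a purely illustrative summary of what each section produces; nothing in the pipeline reads it.

Code
# Illustrative overview of the files prepared in this notebook (nothing consumes this dict)
predictdb_inputs = {
    "gene_expression": "genes x samples matrix with a NAME index column",
    "gene_annotation": "parsed GTF with chr, gene_id, gene_name, start, end, gene_type",
    "genotype":        "varID x sample dosage matrix",
    "snp_annotation":  "chr, pos, varID, ref_vcf, alt_vcf, maf, rsid",
}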

Code
import pandas as pd
import numpy as np
import os,sys
Code
# Load the b37-formatted genotype matrix and record its sample IDs
genotype_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt"
genotype_data = pd.read_csv(genotype_file, sep="\t", index_col=0)
genotype_data.head()
genotype_samples = genotype_data.columns
print(genotype_samples)
Index(['AF04', 'AF06', 'AF08', 'AF10', 'AF12', 'AF14', 'AF16', 'AF18', 'AF20',
       'EU22', 'AF24', 'AF26', 'AF28', 'AF30', 'AF34', 'EU36', 'EU38', 'EU03',
       'EU05', 'EU07', 'EU09', 'EU13', 'EU15', 'EU17', 'EU19', 'EU21', 'EU25',
       'EU27', 'EU29', 'EU33', 'EU37', 'EU39', 'EU41', 'EU43', 'EU47'],
      dtype='object')
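The expression columns encode both the individual and the condition (e.g. AF04_Flu), and later cells match them to these genotype IDs via the prefix before the underscore. A quick illustration of that matching (my own addition, not part of the original run):

Code
# Illustrative only: the sample prefix before the underscore is what must match
# the genotype column names printed above.
example_column = "AF04_Flu"
print(example_column.split("_")[0] in genotype_samples)  # expected: True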

Expression files

Code
# load existing expression file
file_tag = "_onta"

all_expression = pd.read_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_expression{file_tag}.txt", sep=" ")
# all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression.txt_NO_LENGTH_NORMALIZATION", sep=" ")

samples = all_expression.columns

# Split samples by condition (flu-infected vs non-infected)
flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

# Keep only samples whose ID prefix also appears in the genotype file
flu_samples = [x for x in flu_samples if x.split("_")[0] in genotype_samples]
ni_samples = [x for x in ni_samples if x.split("_")[0] in genotype_samples]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

# Strip the condition suffix so the column names match the genotype sample IDs
flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}.csv", index=True, index_label="NAME")
ni_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}.csv", index=True, index_label="NAME")
flu_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}.txt", index=True, index_label="NAME", sep="\t")
ni_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}.txt", index=True, index_label="NAME", sep="\t")
# flu_expression.T.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}_transposed.txt", index=True, index_label="NAME", sep="\t")
# ni_expression.T.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}_transposed.txt", index=True, index_label="NAME", sep="\t")
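As a sanity check (my own addition), the renamed expression columns should be unique and should all appear among the genotype sample IDs:

Code
# Sanity check (not part of the original run): after stripping the "_Flu"/"_NI"
# suffixes, every expression column should be a unique genotype sample ID.
assert flu_expression.columns.is_unique and ni_expression.columns.is_unique
print("Flu samples missing genotypes:", set(flu_expression.columns) - set(genotype_samples))
print("NI samples missing genotypes:", set(ni_expression.columns) - set(genotype_samples))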
Code
flu_expression
AF04 AF06 AF08 AF10 AF12 AF14 AF16 AF18 AF20 AF24 ... EU21 EU25 EU27 EU29 EU33 EU37 EU39 EU41 EU43 EU47
ENSG00000000419 4.247264 3.984541 3.808667 4.387522 4.280257 3.301860 4.370497 4.329033 4.142495 4.278582 ... 4.441305 4.248960 3.825667 4.227968 3.236020 4.296582 4.040271 3.995014 4.428678 4.229811
ENSG00000000460 2.084423 2.374117 2.186385 1.552826 1.766373 1.974538 2.709397 2.201944 2.116858 1.837106 ... 2.089717 2.050071 1.986421 2.008548 1.284029 1.715819 1.959209 1.967797 2.100634 2.173422
ENSG00000000938 5.927820 6.311732 6.212348 6.165741 5.387954 5.285123 5.879858 6.431732 6.056096 5.995297 ... 5.931943 6.022909 6.031408 5.808432 6.175322 6.133609 5.839329 5.960366 6.255181 5.403151
ENSG00000000971 4.857289 3.717234 4.786917 5.008106 4.887095 3.714513 4.190340 5.011576 4.841874 4.747534 ... 5.334937 4.257221 4.152555 3.980055 3.840778 4.808895 4.575839 4.377932 5.166898 3.961062
ENSG00000001036 6.455511 6.605715 6.273509 6.295124 6.144346 5.846182 6.566031 6.707117 6.429354 6.130178 ... 6.311400 6.386900 5.959519 6.495524 5.730415 6.500412 6.144727 6.394556 6.655935 6.424312
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ENSG00000273375 1.189708 0.403898 0.789746 0.788083 0.926125 0.095197 0.810983 0.374500 0.717079 1.091195 ... -0.024988 1.203391 0.859122 0.918967 0.647828 0.645034 0.837831 0.641001 0.790858 0.212928
ENSG00000273437 -0.135665 0.085988 0.357227 0.008651 -0.017337 -0.411125 -0.285801 -0.861348 -0.370453 0.242389 ... -0.563449 0.415358 0.300610 0.054347 0.630219 0.192951 -0.379374 0.015349 -0.244158 -0.086177
ENSG00000273448 -0.292841 0.136501 0.425021 0.087109 0.356929 0.672980 0.354483 -0.570697 0.255480 -0.000678 ... -0.244652 0.282939 0.438417 0.653817 0.028750 0.000916 0.198524 0.434090 -0.347806 0.836002
ENSG00000273451 -0.252675 -1.628698 -0.662008 -1.975064 -1.373232 -0.013630 -2.330006 -1.792680 -0.097467 -2.375824 ... -1.679244 -0.421800 0.548356 -1.164319 -0.552637 -0.445711 -2.078597 0.072709 -1.245223 -0.498993
ENSG00000273486 0.832979 0.807239 1.212064 0.146193 0.678311 1.627964 1.017530 0.161301 1.368186 0.436491 ... 0.175049 1.218379 1.033344 1.209306 1.180173 0.762603 1.391422 0.762733 -0.204011 1.239956

14120 rows × 32 columns
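Since both condition-specific matrices were sliced from the same table, their gene indices should agree. A quick check (my addition):

Code
# Both matrices come from the same all_expression table, so the gene index should match.
assert flu_expression.index.equals(ni_expression.index)
print(flu_expression.shape, ni_expression.shape)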

Uncorrected expression

Code
all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression_NO_CORRECTION.txt", sep=" ")

samples = all_expression.columns


flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.transpose().to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_flu_expression_for_predictDB_NO_CORRECTION.csv", index=True, index_label="NAME")
ni_expression.transpose().to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_NI_expression_for_predictDB_NO_CORRECTION.csv", index=True, index_label="NAME")

Uncorrected, quantile-normalized expression

Code
all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression_NO_CORRECTION_QUANTILE.txt", sep=" ")

samples = all_expression.columns

flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_flu_expression_for_predictDB_NO_CORRECTION_QUANTILE.csv", index=True, index_label="NAME")
ni_expression.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_NI_expression_for_predictDB_NO_CORRECTION_QUANTILE.csv", index=True, index_label="NAME")
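The same split/rename/write steps are repeated for each preprocessing variant above. A small helper could consolidate them; this is a sketch under my own naming (split_and_write and its arguments are not part of the original code), with the output prefix passed in explicitly:

Code
def split_and_write(all_expression, genotype_samples, out_prefix, filter_by_genotype=True):
    """Split an expression matrix into Flu/NI sample sets and write PredictDB-style CSVs."""
    out_files = {}
    for condition in ["Flu", "NI"]:
        cols = [c for c in all_expression.columns if condition in c]
        if filter_by_genotype:
            # Keep only samples whose ID prefix also has genotypes
            cols = [c for c in cols if c.split("_")[0] in genotype_samples]
        expr = all_expression[cols].rename(columns={c: c.replace(f"_{condition}", "") for c in cols})
        out_path = f"{out_prefix}_{condition}_expression_for_predictDB.csv"
        expr.to_csv(out_path, index=True, index_label="NAME")
        out_files[condition] = out_path
    return out_files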

Gene annotation file

Code
import os

gtf_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/gencode.v44lift37.annotation.gtf"
parse_script = "/beagle3/haky/users/saideep/github_repos/PredictDB-Tutorial/code/parse_gtf.py"

# Build the command to run the PredictDB tutorial GTF-parsing script
parse_com = [
    "python",
    parse_script,
    gtf_file,
    gtf_file+".parsed.txt"
]

os.system(" ".join(parse_com))

# Load parsed gtf file

parsed_gtf = pd.read_csv(gtf_file+".parsed.txt", sep="\t")
parsed_gtf
chr gene_id gene_name start end gene_type
0 1 ENSG00000290825.1_1 DDX11L2 11869 14409 lncRNA
1 1 ENSG00000223972.6_6 DDX11L1 12010 13670 transcribed_unprocessed_pseudogene
2 1 ENSG00000227232.5_5 WASH7P 14404 29570 unprocessed_pseudogene
3 1 ENSG00000243485.5_11 MIR1302-2HG 29554 31109 lncRNA
4 1 ENSG00000237613.2_6 FAM138A 34554 36081 lncRNA
... ... ... ... ... ... ...
64481 M ENSG00000198695.2_4 MT-ND6 14149 14673 protein_coding
64482 M ENSG00000210194.1 MT-TE 14674 14742 Mt_tRNA
64483 M ENSG00000198727.2_2 MT-CYB 14747 15887 protein_coding
64484 M ENSG00000210195.2 MT-TT 15888 15953 Mt_tRNA
64485 M ENSG00000210196.2 MT-TP 15956 16023 Mt_tRNA

64486 rows × 6 columns
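The GTF parsing above shells out via os.system, which ignores failures silently. A subprocess-based variant (a sketch; it assumes the same positional interface of parse_gtf.py used above) would raise on a non-zero exit code:

Code
import subprocess

# check=True raises CalledProcessError if parse_gtf.py exits non-zero,
# instead of continuing silently as os.system would.
subprocess.run(
    ["python", parse_script, gtf_file, gtf_file + ".parsed.txt"],
    check=True,
)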

Code
# Trim Ensembl version suffixes so gene IDs match those used in the expression matrix
gene_ids_trim = [x.split(".")[0] for x in parsed_gtf["gene_id"].values]
parsed_gtf["gene_id"] = gene_ids_trim
# parsed_gtf["chr"] = "chr" + parsed_gtf["chr"].astype(str)

# Keep a single row per trimmed gene ID
parsed_gtf.drop_duplicates(subset=["gene_id"], inplace=True)

parsed_gtf.to_csv(gtf_file+".parsed.txt", sep="\t", index=False)
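Trimming the Ensembl version suffix can collapse distinct versioned IDs onto the same base ID. A quick count (my own addition, to be run before the drop_duplicates call in the cell above) shows how many rows are affected:

Code
# Run before drop_duplicates: counts rows whose version-trimmed gene_id collides
# with an earlier row and would therefore be dropped.
n_collisions = parsed_gtf["gene_id"].duplicated().sum()
print(f"{n_collisions} rows share a trimmed gene_id with an earlier row")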

Genotype file

Code
genotypes_source_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes.txt"

unformated_genotypes = pd.read_csv(genotypes_source_file, sep="\t")
Code
# Prefix "chr" and append "_b37" so SNP IDs match the PredictDB-style varIDs used below
unformated_genotypes.index = "chr"+unformated_genotypes.index+"_b37"
unformated_genotypes.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt", sep="\t", index=True, index_label="varID")
unformated_genotypes
AF04 AF06 AF08 AF10 AF12 AF14 AF16 AF18 AF20 EU22 ... EU21 EU25 EU27 EU29 EU33 EU37 EU39 EU41 EU43 EU47
chr1_10492_C_T_b37 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
chr1_10583_G_A_b37 0 0 0 0 0 0 1 0 1 1 ... 1 1 1 0 1 1 0 1 0 1
chr1_12783_A_G_b37 1 1 1 2 1 1 1 1 1 0 ... 1 1 1 0 1 1 1 0 0 1
chr1_12807_C_T_b37 1 0 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
chr1_12839_G_C_b37 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
chr22_51241882_C_T_b37 0 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
chr22_51241883_C_A_b37 0 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
chr22_51242271_C_G_b37 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 2 0 0 0
chr22_51242557_A_T_b37 1 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 1 1 0
chr22_51244332_A_C_b37 0 0 2 0 0 0 0 0 2 0 ... 1 0 2 1 0 0 -9 0 0 1

7383243 rows × 35 columns
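The dosage matrix has roughly 7.4 million rows, so an explicit integer dtype on read can cut memory noticeably. A sketch (assuming dosages are 0/1/2 with -9 marking missing calls, as the table above suggests):

Code
# Memory-conscious re-read (illustrative): int8 comfortably holds 0/1/2 plus the -9
# missing code seen above.
dtype_map = {s: "int8" for s in genotype_samples}
unformated_genotypes = pd.read_csv(genotypes_source_file, sep="\t", dtype=dtype_map)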

SNP annotation file

Code
snp_annotation_source_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/rsIDs.for.matrixeqtl.txt"

unformatted_snp_annotation = pd.read_csv(snp_annotation_source_file, sep=" ")
Code
# Append the genome-build suffix so these IDs match the genotype row index
unformatted_snp_annotation["varID"] = unformatted_snp_annotation["snps"]+"_b37"
unformatted_snp_annotation
chr pos rsID A1 A2 snps varID
1 1 10492 rs55998931 C T 1_10492_C_T 1_10492_C_T_b37
2 1 10583 rs58108140 G A 1_10583_G_A 1_10583_G_A_b37
3 1 12783 rs62635284 A G 1_12783_A_G 1_12783_A_G_b37
4 1 12807 rs62635285 C T 1_12807_C_T 1_12807_C_T_b37
5 1 12839 rs747124606 G C 1_12839_G_C 1_12839_G_C_b37
... ... ... ... ... ... ... ...
7383239 22 51241882 . C T 22_51241882_C_T 22_51241882_C_T_b37
7383240 22 51241883 . C A 22_51241883_C_A 22_51241883_C_A_b37
7383241 22 51242271 rs782727945 C G 22_51242271_C_G 22_51242271_C_G_b37
7383242 22 51242557 rs3888401 A T 22_51242557_A_T 22_51242557_A_T_b37
7383243 22 51244332 rs200908937 A C 22_51244332_A_C 22_51244332_A_C_b37

7383243 rows × 7 columns

Code
formatted_snp_annotation_dict = {
    "chr":unformatted_snp_annotation["chr"].astype(str),
    "pos":unformatted_snp_annotation["pos"],
    "varID":"chr"+unformatted_snp_annotation["varID"],
    "ref_vcf":unformatted_snp_annotation["A1"],
    "alt_vcf":unformatted_snp_annotation["A2"],
    "maf":0.01,
    "rsid":"chr"+unformatted_snp_annotation["varID"]
}

formatted_snp_annotation = pd.DataFrame(formatted_snp_annotation_dict)
formatted_snp_annotation
chr pos varID ref_vcf alt_vcf maf rsid
1 1 10492 chr1_10492_C_T_b37 C T 0.01 chr1_10492_C_T_b37
2 1 10583 chr1_10583_G_A_b37 G A 0.01 chr1_10583_G_A_b37
3 1 12783 chr1_12783_A_G_b37 A G 0.01 chr1_12783_A_G_b37
4 1 12807 chr1_12807_C_T_b37 C T 0.01 chr1_12807_C_T_b37
5 1 12839 chr1_12839_G_C_b37 G C 0.01 chr1_12839_G_C_b37
... ... ... ... ... ... ... ...
7383239 22 51241882 chr22_51241882_C_T_b37 C T 0.01 chr22_51241882_C_T_b37
7383240 22 51241883 chr22_51241883_C_A_b37 C A 0.01 chr22_51241883_C_A_b37
7383241 22 51242271 chr22_51242271_C_G_b37 C G 0.01 chr22_51242271_C_G_b37
7383242 22 51242557 chr22_51242557_A_T_b37 A T 0.01 chr22_51242557_A_T_b37
7383243 22 51244332 chr22_51244332_A_C_b37 A C 0.01 chr22_51244332_A_C_b37

7383243 rows × 7 columns

Code
formatted_snp_annotation.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/rsIDs.for.predixcan_b37.txt", sep="\t", index=False)
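The maf column above is a constant 0.01 placeholder. If per-SNP minor-allele frequencies were wanted, they could be derived from the dosage matrix; this is a sketch under my assumption that dosages are 0/1/2 and -9 means missing:

Code
# Hypothetical MAF computation (not part of the original pipeline run)
dosages = unformated_genotypes.replace(-9, np.nan)   # treat -9 as missing
alt_freq = dosages.mean(axis=1) / 2                  # mean dosage / 2 = alt-allele frequency
maf = np.minimum(alt_freq, 1 - alt_freq)             # fold to the minor allele
# Align by varID rather than relying on row order
formatted_snp_annotation["maf"] = formatted_snp_annotation["varID"].map(maf).values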

Check compatibility of files

Code
parsed_gtf
chr gene_id gene_name start end gene_type
0 chr1 ENSG00000290825 DDX11L2 11869 14409 lncRNA
1 chr1 ENSG00000223972 DDX11L1 12010 13670 transcribed_unprocessed_pseudogene
2 chr1 ENSG00000227232 WASH7P 14404 29570 unprocessed_pseudogene
3 chr1 ENSG00000243485 MIR1302-2HG 29554 31109 lncRNA
4 chr1 ENSG00000237613 FAM138A 34554 36081 lncRNA
... ... ... ... ... ... ...
64481 chrM ENSG00000198695 MT-ND6 14149 14673 protein_coding
64482 chrM ENSG00000210194 MT-TE 14674 14742 Mt_tRNA
64483 chrM ENSG00000198727 MT-CYB 14747 15887 protein_coding
64484 chrM ENSG00000210195 MT-TT 15888 15953 Mt_tRNA
64485 chrM ENSG00000210196 MT-TP 15956 16023 Mt_tRNA

64476 rows × 6 columns

Code
# Collect identifiers from each prepared file so they can be cross-checked
expression_inds = flu_expression.columns
expression_genes = flu_expression.index

gene_annotation_genes = list(parsed_gtf["gene_id"].values)

genotype_snpids = unformated_genotypes.index
genotype_inds = unformated_genotypes.columns

snp_annotation_snpids = formatted_snp_annotation["varID"].values

Look for duplicates

Code
print("Num individuals, unique individuals in expression file", len(expression_inds), len(set(expression_inds)))
print("Num genes, unique genes in expression file", len(expression_genes), len(set(expression_genes)))

print("Num genes, unique genes in gene annotation file", len(gene_annotation_genes), len(set(gene_annotation_genes)))

print("Num SNPs, unique SNPs in genotype file", len(genotype_snpids), len(set(genotype_snpids)))
print("Num individuals, unique individuals in genotype file", len(genotype_inds), len(set(genotype_inds)))

print("Num SNPs, unique SNPs in SNP annotation file", len(snp_annotation_snpids), len(set(snp_annotation_snpids)))
Num individuals, unique individuals in expression file 35 35
Num genes, unique genes in expression file 13716 13716
Num genes, unique genes in gene annotation file 64476 64476
Num SNPs, unique SNPs in genotype file 7383243 7383243
Num individuals, unique individuals in genotype file 35 35
Num SNPs, unique SNPs in SNP annotation file 7383243 7383243

Check overlap between different files

Code
print("Genes overlap", len(set(expression_genes).intersection(set(gene_annotation_genes))))

print("SNPs overlap", len(set(genotype_snpids).intersection(set(snp_annotation_snpids))))

print("Individuals overlap", len(set(expression_inds).intersection(set(genotype_inds))))
Genes overlap 13605
SNPs overlap 7383243
Individuals overlap 35
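As a follow-up (my addition), the expression genes without a matching annotation entry (13716 - 13605 = 111) could be listed and reviewed before model training:

Code
# Genes present in the expression matrix but absent from the gene annotation;
# these have no coordinates available and may warrant review or removal upstream.
unannotated_genes = sorted(set(expression_genes) - set(gene_annotation_genes))
print(len(unannotated_genes), unannotated_genes[:10])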