2.1.1_prepare_predictDB_standard_inputs

Author

Saideep Gona

Published

November 21, 2023

Context

I need to run a standard PrediXcan TWAS analysis on the Aracena et al. dataset as a baseline for comparison. To do this, I need to prepare the proper input files for the PrediXcan pipeline. This takes some reformatting, as shown below:
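For orientation, the sections below assemble the four inputs the PredictDB training pipeline works from: a gene expression matrix, a gene annotation, a genotype (dosage) matrix, and a SNP annotation. The dictionary below is a purely illustrative summary of what each section produces; nothing in the pipeline reads it.

Code
# Illustrative overview of the files prepared in this notebook (nothing consumes this dict)
predictdb_inputs = {
    "gene_expression": "genes x samples matrix with a NAME index column",
    "gene_annotation": "parsed GTF with chr, gene_id, gene_name, start, end, gene_type",
    "genotype":        "varID x sample dosage matrix",
    "snp_annotation":  "chr, pos, varID, ref_vcf, alt_vcf, maf, rsid",
}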

Code
import pandas as pd
import numpy as np
import os,sys
Code
# Load the b37-formatted genotype matrix and record its sample IDs
genotype_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt"
genotype_data = pd.read_csv(genotype_file, sep="\t", index_col=0)
genotype_data.head()
genotype_samples = genotype_data.columns
print(genotype_samples)
Index(['AF04', 'AF06', 'AF08', 'AF10', 'AF12', 'AF14', 'AF16', 'AF18', 'AF20',
       'EU22', 'AF24', 'AF26', 'AF28', 'AF30', 'AF34', 'EU36', 'EU38', 'EU03',
       'EU05', 'EU07', 'EU09', 'EU13', 'EU15', 'EU17', 'EU19', 'EU21', 'EU25',
       'EU27', 'EU29', 'EU33', 'EU37', 'EU39', 'EU41', 'EU43', 'EU47'],
      dtype='object')
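The expression columns encode both the individual and the condition (e.g. AF04_Flu), and later cells match them to these genotype IDs via the prefix before the underscore. A quick illustration of that matching (my own addition, not part of the original run):

Code
# Illustrative only: the sample prefix before the underscore is what must match
# the genotype column names printed above.
example_column = "AF04_Flu"
print(example_column.split("_")[0] in genotype_samples)  # expected: True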

Expression files

Code
# load existing expression file
file_tag = "_onta"

all_expression = pd.read_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_expression{file_tag}.txt", sep=" ")
# all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression.txt_NO_LENGTH_NORMALIZATION", sep=" ")

samples = all_expression.columns

# Split samples by condition (flu-infected vs non-infected)
flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

# Keep only samples whose ID prefix also appears in the genotype file
flu_samples = [x for x in flu_samples if x.split("_")[0] in genotype_samples]
ni_samples = [x for x in ni_samples if x.split("_")[0] in genotype_samples]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

# Strip the condition suffix so the column names match the genotype sample IDs
flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}.csv", index=True, index_label="NAME")
ni_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}.csv", index=True, index_label="NAME")
flu_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}.txt", index=True, index_label="NAME", sep="\t")
ni_expression.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}.txt", index=True, index_label="NAME", sep="\t")
# flu_expression.T.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDB{file_tag}_transposed.txt", index=True, index_label="NAME", sep="\t")
# ni_expression.T.to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDB{file_tag}_transposed.txt", index=True, index_label="NAME", sep="\t")
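As a sanity check (my own addition), the renamed expression columns should be unique and should all appear among the genotype sample IDs:

Code
# Sanity check (not part of the original run): after stripping the "_Flu"/"_NI"
# suffixes, every expression column should be a unique genotype sample ID.
assert flu_expression.columns.is_unique and ni_expression.columns.is_unique
print("Flu samples missing genotypes:", set(flu_expression.columns) - set(genotype_samples))
print("NI samples missing genotypes:", set(ni_expression.columns) - set(genotype_samples))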
Code
flu_expression
AF04 AF06 AF08 AF10 AF12 AF14 AF16 AF18 AF20 AF24 ... EU21 EU25 EU27 EU29 EU33 EU37 EU39 EU41 EU43 EU47
ENSG00000000419 4.247264 3.984541 3.808667 4.387522 4.280257 3.301860 4.370497 4.329033 4.142495 4.278582 ... 4.441305 4.248960 3.825667 4.227968 3.236020 4.296582 4.040271 3.995014 4.428678 4.229811
ENSG00000000460 2.084423 2.374117 2.186385 1.552826 1.766373 1.974538 2.709397 2.201944 2.116858 1.837106 ... 2.089717 2.050071 1.986421 2.008548 1.284029 1.715819 1.959209 1.967797 2.100634 2.173422
ENSG00000000938 5.927820 6.311732 6.212348 6.165741 5.387954 5.285123 5.879858 6.431732 6.056096 5.995297 ... 5.931943 6.022909 6.031408 5.808432 6.175322 6.133609 5.839329 5.960366 6.255181 5.403151
ENSG00000000971 4.857289 3.717234 4.786917 5.008106 4.887095 3.714513 4.190340 5.011576 4.841874 4.747534 ... 5.334937 4.257221 4.152555 3.980055 3.840778 4.808895 4.575839 4.377932 5.166898 3.961062
ENSG00000001036 6.455511 6.605715 6.273509 6.295124 6.144346 5.846182 6.566031 6.707117 6.429354 6.130178 ... 6.311400 6.386900 5.959519 6.495524 5.730415 6.500412 6.144727 6.394556 6.655935 6.424312
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ENSG00000273375 1.189708 0.403898 0.789746 0.788083 0.926125 0.095197 0.810983 0.374500 0.717079 1.091195 ... -0.024988 1.203391 0.859122 0.918967 0.647828 0.645034 0.837831 0.641001 0.790858 0.212928
ENSG00000273437 -0.135665 0.085988 0.357227 0.008651 -0.017337 -0.411125 -0.285801 -0.861348 -0.370453 0.242389 ... -0.563449 0.415358 0.300610 0.054347 0.630219 0.192951 -0.379374 0.015349 -0.244158 -0.086177
ENSG00000273448 -0.292841 0.136501 0.425021 0.087109 0.356929 0.672980 0.354483 -0.570697 0.255480 -0.000678 ... -0.244652 0.282939 0.438417 0.653817 0.028750 0.000916 0.198524 0.434090 -0.347806 0.836002
ENSG00000273451 -0.252675 -1.628698 -0.662008 -1.975064 -1.373232 -0.013630 -2.330006 -1.792680 -0.097467 -2.375824 ... -1.679244 -0.421800 0.548356 -1.164319 -0.552637 -0.445711 -2.078597 0.072709 -1.245223 -0.498993
ENSG00000273486 0.832979 0.807239 1.212064 0.146193 0.678311 1.627964 1.017530 0.161301 1.368186 0.436491 ... 0.175049 1.218379 1.033344 1.209306 1.180173 0.762603 1.391422 0.762733 -0.204011 1.239956

14120 rows × 32 columns
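Since both condition-specific matrices were sliced from the same table, their gene indices should agree. A quick check (my addition):

Code
# Both matrices come from the same all_expression table, so the gene index should match.
assert flu_expression.index.equals(ni_expression.index)
print(flu_expression.shape, ni_expression.shape)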

Uncorrected expression

Code
all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression_NO_CORRECTION.txt", sep=" ")

samples = all_expression.columns


flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.transpose().to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_flu_expression_for_predictDB_NO_CORRECTION.csv", index=True, index_label="NAME")
ni_expression.transpose().to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_NI_expression_for_predictDB_NO_CORRECTION.csv", index=True, index_label="NAME")

Uncorrected, quantile-normalized expression

Code
all_expression = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression_NO_CORRECTION_QUANTILE.txt", sep=" ")

samples = all_expression.columns

flu_samples = [x for x in samples if "Flu" in x]
ni_samples = [x for x in samples if "NI" in x]

flu_expression = all_expression[flu_samples]
ni_expression = all_expression[ni_samples]

flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

# Write expression files with a "NAME" index column added

flu_expression.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_flu_expression_for_predictDB_NO_CORRECTION_QUANTILE.csv", index=True, index_label="NAME")
ni_expression.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_NI_expression_for_predictDB_NO_CORRECTION_QUANTILE.csv", index=True, index_label="NAME")
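The same split/rename/write steps are repeated for each preprocessing variant above. A small helper could consolidate them; this is a sketch under my own naming (split_and_write and its arguments are not part of the original code), with the output prefix passed in explicitly:

Code
def split_and_write(all_expression, genotype_samples, out_prefix, filter_by_genotype=True):
    """Split an expression matrix into Flu/NI sample sets and write PredictDB-style CSVs."""
    out_files = {}
    for condition in ["Flu", "NI"]:
        cols = [c for c in all_expression.columns if condition in c]
        if filter_by_genotype:
            # Keep only samples whose ID prefix also has genotypes
            cols = [c for c in cols if c.split("_")[0] in genotype_samples]
        expr = all_expression[cols].rename(columns={c: c.replace(f"_{condition}", "") for c in cols})
        out_path = f"{out_prefix}_{condition}_expression_for_predictDB.csv"
        expr.to_csv(out_path, index=True, index_label="NAME")
        out_files[condition] = out_path
    return out_files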

Gene annotation file

Code
import os

gtf_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/gencode.v44lift37.annotation.gtf"
parse_script = "/beagle3/haky/users/saideep/github_repos/PredictDB-Tutorial/code/parse_gtf.py"

# Build the command to run the PredictDB tutorial GTF-parsing script
parse_com = [
    "python",
    parse_script,
    gtf_file,
    gtf_file+".parsed.txt"
]

os.system(" ".join(parse_com))

# Load parsed gtf file

parsed_gtf = pd.read_csv(gtf_file+".parsed.txt", sep="\t")
parsed_gtf
chr gene_id gene_name start end gene_type
0 1 ENSG00000290825.1_1 DDX11L2 11869 14409 lncRNA
1 1 ENSG00000223972.6_6 DDX11L1 12010 13670 transcribed_unprocessed_pseudogene
2 1 ENSG00000227232.5_5 WASH7P 14404 29570 unprocessed_pseudogene
3 1 ENSG00000243485.5_11 MIR1302-2HG 29554 31109 lncRNA
4 1 ENSG00000237613.2_6 FAM138A 34554 36081 lncRNA
... ... ... ... ... ... ...
64481 M ENSG00000198695.2_4 MT-ND6 14149 14673 protein_coding
64482 M ENSG00000210194.1 MT-TE 14674 14742 Mt_tRNA
64483 M ENSG00000198727.2_2 MT-CYB 14747 15887 protein_coding
64484 M ENSG00000210195.2 MT-TT 15888 15953 Mt_tRNA
64485 M ENSG00000210196.2 MT-TP 15956 16023 Mt_tRNA

64486 rows × 6 columns
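The GTF parsing above shells out via os.system, which ignores failures silently. A subprocess-based variant (a sketch; it assumes the same positional interface of parse_gtf.py used above) would raise on a non-zero exit code:

Code
import subprocess

# check=True raises CalledProcessError if parse_gtf.py exits non-zero,
# instead of continuing silently as os.system would.
subprocess.run(
    ["python", parse_script, gtf_file, gtf_file + ".parsed.txt"],
    check=True,
)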

Code
# Trim Ensembl version suffixes so gene IDs match those used in the expression matrix
gene_ids_trim = [x.split(".")[0] for x in parsed_gtf["gene_id"].values]
parsed_gtf["gene_id"] = gene_ids_trim
# parsed_gtf["chr"] = "chr" + parsed_gtf["chr"].astype(str)

# Keep a single row per trimmed gene ID
parsed_gtf.drop_duplicates(subset=["gene_id"], inplace=True)

parsed_gtf.to_csv(gtf_file+".parsed.txt", sep="\t", index=False)
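Trimming the Ensembl version suffix can collapse distinct versioned IDs onto the same base ID. A quick count (my own addition, to be run before the drop_duplicates call in the cell above) shows how many rows are affected:

Code
# Run before drop_duplicates: counts rows whose version-trimmed gene_id collides
# with an earlier row and would therefore be dropped.
n_collisions = parsed_gtf["gene_id"].duplicated().sum()
print(f"{n_collisions} rows share a trimmed gene_id with an earlier row")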

Genotype file

Code
genotypes_source_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes.txt"

unformated_genotypes = pd.read_csv(genotypes_source_file, sep="\t")
Code
# Prefix "chr" and append "_b37" so SNP IDs match the PredictDB-style varIDs used below
unformated_genotypes.index = "chr"+unformated_genotypes.index+"_b37"
unformated_genotypes.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt", sep="\t", index=True, index_label="varID")
unformated_genotypes
AF04 AF06 AF08 AF10 AF12 AF14 AF16 AF18 AF20 EU22 ... EU21 EU25 EU27 EU29 EU33 EU37 EU39 EU41 EU43 EU47
chr1_10492_C_T_b37 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
chr1_10583_G_A_b37 0 0 0 0 0 0 1 0 1 1 ... 1 1 1 0 1 1 0 1 0 1
chr1_12783_A_G_b37 1 1 1 2 1 1 1 1 1 0 ... 1 1 1 0 1 1 1 0 0 1
chr1_12807_C_T_b37 1 0 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
chr1_12839_G_C_b37 0 0 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
chr22_51241882_C_T_b37 0 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
chr22_51241883_C_A_b37 0 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
chr22_51242271_C_G_b37 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 2 0 0 0
chr22_51242557_A_T_b37 1 0 0 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 1 1 0
chr22_51244332_A_C_b37 0 0 2 0 0 0 0 0 2 0 ... 1 0 2 1 0 0 -9 0 0 1

7383243 rows × 35 columns
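The dosage matrix has roughly 7.4 million rows, so an explicit integer dtype on read can cut memory noticeably. A sketch (assuming dosages are 0/1/2 with -9 marking missing calls, as the table above suggests):

Code
# Memory-conscious re-read (illustrative): int8 comfortably holds 0/1/2 plus the -9
# missing code seen above.
dtype_map = {s: "int8" for s in genotype_samples}
unformated_genotypes = pd.read_csv(genotypes_source_file, sep="\t", dtype=dtype_map)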

SNP annotation file

Code
snp_annotation_source_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/rsIDs.for.matrixeqtl.txt"

unformatted_snp_annotation = pd.read_csv(snp_annotation_source_file, sep=" ")
Code
# Append the genome-build suffix so these IDs match the genotype row index
unformatted_snp_annotation["varID"] = unformatted_snp_annotation["snps"]+"_b37"
unformatted_snp_annotation
chr pos rsID A1 A2 snps varID
1 1 10492 rs55998931 C T 1_10492_C_T 1_10492_C_T_b37
2 1 10583 rs58108140 G A 1_10583_G_A 1_10583_G_A_b37
3 1 12783 rs62635284 A G 1_12783_A_G 1_12783_A_G_b37
4 1 12807 rs62635285 C T 1_12807_C_T 1_12807_C_T_b37
5 1 12839 rs747124606 G C 1_12839_G_C 1_12839_G_C_b37
... ... ... ... ... ... ... ...
7383239 22 51241882 . C T 22_51241882_C_T 22_51241882_C_T_b37
7383240 22 51241883 . C A 22_51241883_C_A 22_51241883_C_A_b37
7383241 22 51242271 rs782727945 C G 22_51242271_C_G 22_51242271_C_G_b37
7383242 22 51242557 rs3888401 A T 22_51242557_A_T 22_51242557_A_T_b37
7383243 22 51244332 rs200908937 A C 22_51244332_A_C 22_51244332_A_C_b37

7383243 rows × 7 columns

Code
formatted_snp_annotation_dict = {
    "chr":unformatted_snp_annotation["chr"].astype(str),
    "pos":unformatted_snp_annotation["pos"],
    "varID":"chr"+unformatted_snp_annotation["varID"],
    "ref_vcf":unformatted_snp_annotation["A1"],
    "alt_vcf":unformatted_snp_annotation["A2"],
    "maf":0.01,
    "rsid":"chr"+unformatted_snp_annotation["varID"]
}

formatted_snp_annotation = pd.DataFrame(formatted_snp_annotation_dict)
formatted_snp_annotation
chr pos varID ref_vcf alt_vcf maf rsid
1 1 10492 chr1_10492_C_T_b37 C T 0.01 chr1_10492_C_T_b37
2 1 10583 chr1_10583_G_A_b37 G A 0.01 chr1_10583_G_A_b37
3 1 12783 chr1_12783_A_G_b37 A G 0.01 chr1_12783_A_G_b37
4 1 12807 chr1_12807_C_T_b37 C T 0.01 chr1_12807_C_T_b37
5 1 12839 chr1_12839_G_C_b37 G C 0.01 chr1_12839_G_C_b37
... ... ... ... ... ... ... ...
7383239 22 51241882 chr22_51241882_C_T_b37 C T 0.01 chr22_51241882_C_T_b37
7383240 22 51241883 chr22_51241883_C_A_b37 C A 0.01 chr22_51241883_C_A_b37
7383241 22 51242271 chr22_51242271_C_G_b37 C G 0.01 chr22_51242271_C_G_b37
7383242 22 51242557 chr22_51242557_A_T_b37 A T 0.01 chr22_51242557_A_T_b37
7383243 22 51244332 chr22_51244332_A_C_b37 A C 0.01 chr22_51244332_A_C_b37

7383243 rows × 7 columns

Code
formatted_snp_annotation.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/rsIDs.for.predixcan_b37.txt", sep="\t", index=False)
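The maf column above is a constant 0.01 placeholder. If per-SNP minor-allele frequencies were wanted, they could be derived from the dosage matrix; this is a sketch under my assumption that dosages are 0/1/2 and -9 means missing:

Code
# Hypothetical MAF computation (not part of the original pipeline run)
dosages = unformated_genotypes.replace(-9, np.nan)   # treat -9 as missing
alt_freq = dosages.mean(axis=1) / 2                  # mean dosage / 2 = alt-allele frequency
maf = np.minimum(alt_freq, 1 - alt_freq)             # fold to the minor allele
# Align by varID rather than relying on row order
formatted_snp_annotation["maf"] = formatted_snp_annotation["varID"].map(maf).values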

Check compatibility of files

Code
parsed_gtf
chr gene_id gene_name start end gene_type
0 chr1 ENSG00000290825 DDX11L2 11869 14409 lncRNA
1 chr1 ENSG00000223972 DDX11L1 12010 13670 transcribed_unprocessed_pseudogene
2 chr1 ENSG00000227232 WASH7P 14404 29570 unprocessed_pseudogene
3 chr1 ENSG00000243485 MIR1302-2HG 29554 31109 lncRNA
4 chr1 ENSG00000237613 FAM138A 34554 36081 lncRNA
... ... ... ... ... ... ...
64481 chrM ENSG00000198695 MT-ND6 14149 14673 protein_coding
64482 chrM ENSG00000210194 MT-TE 14674 14742 Mt_tRNA
64483 chrM ENSG00000198727 MT-CYB 14747 15887 protein_coding
64484 chrM ENSG00000210195 MT-TT 15888 15953 Mt_tRNA
64485 chrM ENSG00000210196 MT-TP 15956 16023 Mt_tRNA

64476 rows × 6 columns

Code
# Collect identifiers from each prepared file so they can be cross-checked
expression_inds = flu_expression.columns
expression_genes = flu_expression.index

gene_annotation_genes = list(parsed_gtf["gene_id"].values)

genotype_snpids = unformated_genotypes.index
genotype_inds = unformated_genotypes.columns

snp_annotation_snpids = formatted_snp_annotation["varID"].values

Look for duplicates

Code
print("Num individuals, unique individuals in expression file", len(expression_inds), len(set(expression_inds)))
print("Num genes, unique genes in expression file", len(expression_genes), len(set(expression_genes)))

print("Num genes, unique genes in gene annotation file", len(gene_annotation_genes), len(set(gene_annotation_genes)))

print("Num SNPs, unique SNPs in genotype file", len(genotype_snpids), len(set(genotype_snpids)))
print("Num individuals, unique individuals in genotype file", len(genotype_inds), len(set(genotype_inds)))

print("Num SNPs, unique SNPs in SNP annotation file", len(snp_annotation_snpids), len(set(snp_annotation_snpids)))
Num individuals, unique individuals in expression file 35 35
Num genes, unique genes in expression file 13716 13716
Num genes, unique genes in gene annotation file 64476 64476
Num SNPs, unique SNPs in genotype file 7383243 7383243
Num individuals, unique individuals in genotype file 35 35
Num SNPs, unique SNPs in SNP annotation file 7383243 7383243

Check overlap between different files

Code
print("Genes overlap", len(set(expression_genes).intersection(set(gene_annotation_genes))))

print("SNPs overlap", len(set(genotype_snpids).intersection(set(snp_annotation_snpids))))

print("Individuals overlap", len(set(expression_inds).intersection(set(genotype_inds))))
Genes overlap 13605
SNPs overlap 7383243
Individuals overlap 35
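As a follow-up (my addition), the expression genes without a matching annotation entry (13716 - 13605 = 111) could be listed and reviewed before model training:

Code
# Genes present in the expression matrix but absent from the gene annotation;
# these have no coordinates available and may warrant review or removal upstream.
unannotated_genes = sorted(set(expression_genes) - set(gene_annotation_genes))
print(len(unannotated_genes), unannotated_genes[:10])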