1.5.2_format_harmonized_GWAS_sumstats_for_SPrediXcan

Author

Saideep Gona

Published

December 14, 2023

Code
import pandas as pd
import numpy as np
Code
# Harmonized, formatted T2D GWAS summary statistics (hg19 per the file name),
# stored as a tab-separated table with a header row.
gwas_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/T2D_harmonized_FORMATTED_hg19.txt"

# read_table uses a tab separator by default.
gwas = pd.read_table(gwas_file)
gwas
chromosome panel_variant_id non_effect_allele effect_allele effect_size pvalue position frequency zscore standard_error sample_size
0 chr1 chr1_758351_A_G_b38 A G -0.0060 0.5806 693731.0 0.112621 -0.552509 0.010860 328697
1 chr1 chr1_770988_A_G_b38 A G -0.0103 0.2501 706368.0 0.516505 -1.150106 0.008956 328697
2 chr1 chr1_785910_G_C_b38 G C 0.0021 0.9179 721290.0 0.045631 0.103079 0.020373 328697
3 chr1 chr1_794299_C_G_b38 C G -0.0040 0.6551 729679.0 0.840777 -0.446688 0.008955 328697
4 chr1 chr1_794707_T_C_b38 T C -0.0016 0.9288 730087.0 0.050485 -0.089355 0.017906 328697
... ... ... ... ... ... ... ... ... ... ... ...
9980951 NaN chr19_39907860_C_T_b38 C T -0.0134 0.7430 NaN 0.001942 -0.327883 0.040868 328697
9980952 NaN chr19_54303139_T_G_b38 T G -0.0011 0.8442 NaN 0.698058 -0.196524 0.005597 328697
9980953 NaN chr21_28872555_G_T_b38 G T -0.0050 0.3521 NaN 0.323301 -0.930524 0.005373 328697
9980954 NaN chr22_22052260_C_T_b38 C T -0.0012 0.8196 NaN 0.598058 -0.228059 0.005262 328697
9980955 NaN chr22_22244312_T_G_b38 T G -0.0072 0.2162 NaN 0.597087 -1.236696 0.005822 328697

9980956 rows × 11 columns

Code
# Report the number of missing values per column before filtering.
# The counts are printed explicitly: a bare expression that is not the last
# statement of a notebook cell is evaluated and then silently discarded.
print(gwas.isna().sum())

# Drop every row that has a missing value in any column.
gwas = gwas.dropna()

# Re-count missing values per column to confirm the filter removed them all.
print(gwas.isnull().sum())
chromosome           0
panel_variant_id     0
non_effect_allele    0
effect_allele        0
effect_size          0
pvalue               0
position             0
frequency            0
zscore               0
standard_error       0
sample_size          0
dtype: int64
Code
# Rebuild the variant identifier as <chromosome>_<position>_<non-effect>_<effect>_b37
# from the table's own columns, replacing the existing panel_variant_id values.
# The position column is cast to int first so the identifier carries no
# trailing ".0", and the same integer column is then stored back.
position_as_int = gwas["position"].astype(int)

gwas["panel_variant_id"] = (
    gwas["chromosome"].astype(str)
    + "_"
    + position_as_int.astype(str)
    + "_"
    + gwas["non_effect_allele"].astype(str)
    + "_"
    + gwas["effect_allele"].astype(str)
    + "_b37"
)
gwas["position"] = position_as_int
Code
# Display the rebuilt identifiers to check the position and "_b37" suffix.
gwas["panel_variant_id"]
0             chr1_693731_A_G_b37
1             chr1_706368_A_G_b37
2             chr1_721290_G_C_b37
3             chr1_729679_C_G_b37
4             chr1_730087_T_C_b37
                    ...          
9980880    chr22_51219387_T_C_b37
9980881    chr22_51219704_G_A_b37
9980882    chr22_51221731_T_C_b37
9980883    chr22_51222100_G_T_b37
9980884    chr22_51229805_T_C_b37
Name: panel_variant_id, Length: 9945081, dtype: object
Code
# Write the filtered table next to the input file as tab-separated text with a
# header row and no index column. The ".chromb37" suffix matches the "_b37"
# tag used in the rebuilt panel_variant_id values (an earlier ".chromb38"
# output name is no longer used).
chromb37_path = f"{gwas_file}.chromb37"
gwas.to_csv(chromb37_path, sep="\t", index=False, header=True)
Code
# Raw, pre-harmonization Asthma/eczema summary statistics (GRCh38 per the file
# name). NOTE(review): the ".head" suffix suggests this is only a truncated
# preview of the full file; confirm before using it for analysis.
gwas_file_raw = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_raw/Asthma_eczema_farreira_GRCh38_preharmonized.txt.head"

# read_table uses a tab separator by default.
gwas_raw = pd.read_table(gwas_file_raw)
gwas_raw
hm_variant_id hm_rsid hm_chrom hm_pos hm_other_allele hm_effect_allele hm_beta hm_odds_ratio hm_ci_lower hm_ci_upper ... effect_allele other_allele beta standard_error p_value variant_id effect_allele_frequency odds_ratio ci_lower ci_upper
0 1_99534456_G_T rs10875231 1 99534456 G T -0.0072 NaN NaN NaN ... t g -0.0072 0.0067 0.2826 rs10875231 0.24490 NaN NaN NaN
1 1_99535271_C_T rs6678176 1 99535271 C T -0.0070 NaN NaN NaN ... t c -0.0070 0.0063 0.2656 rs6678176 0.31970 NaN NaN NaN
2 1_99535287_T_C rs78286437 1 99535287 T C -0.0002 NaN NaN NaN ... t c 0.0002 0.0131 0.9865 rs78286437 0.07483 NaN NaN NaN
3 1_99535433_A_ATC rs146963890 1 99535433 A ATC -0.0078 NaN NaN NaN ... a atc 0.0078 0.0138 0.5728 rs146963890 0.06803 NaN NaN NaN
4 1_99535582_A_G rs144406489 1 99535582 A G 0.0049 NaN NaN NaN ... a g -0.0049 0.0214 0.8196 rs144406489 0.02041 NaN NaN NaN
5 1_99535645_G_T rs76909621 1 99535645 G T -0.0131 NaN NaN NaN ... t g -0.0131 0.0096 0.1728 rs76909621 0.08163 NaN NaN NaN
6 1_99536599_GTTAGT_G rs147727421 1 99536599 GTTAGT G -0.0141 NaN NaN NaN ... g gttagt -0.0141 0.0100 0.1584 rs147727421 0.08163 NaN NaN NaN
7 1_99536934_C_T rs78642210 1 99536934 C T 0.0004 NaN NaN NaN ... t c 0.0004 0.0131 0.9776 rs78642210 0.07653 NaN NaN NaN
8 1_99537157_C_T rs77140576 1 99537157 C T -0.0111 NaN NaN NaN ... t c -0.0111 0.0094 0.2386 rs77140576 0.08333 NaN NaN NaN

9 rows × 24 columns