simulate_summed_phenotype

Author

Saideep Gona

Published

October 10, 2023

Code

import numpy as np
import os,sys

import seaborn as sns
import matplotlib.pyplot as plt

Simulate true phenotype as sum of individual variant effects.

Code

num_variants = 1000

true_effect_sizes = np.random.uniform(0,1,num_variants)

Simulate genotype of individuals, allele freq of variants is equal to (1-effect size) of the variant

Code

num_individuals = 400

genotype_matrix = np.zeros((num_individuals,num_variants))

allele_freq_array = [(1-x) for x in true_effect_sizes]

for individual in range(num_individuals):

    hap1 = np.random.binomial(1,allele_freq_array,num_variants)
    hap2 = np.random.binomial(1,allele_freq_array,num_variants)

    genotype_matrix[individual,:] = hap1 + hap2

print(genotype_matrix)

print(genotype_matrix.shape)

[[2. 1. 0. ... 2. 0. 2.]
 [1. 1. 0. ... 1. 0. 0.]
 [2. 2. 1. ... 1. 1. 0.]
 ...
 [2. 2. 1. ... 1. 0. 1.]
 [2. 0. 0. ... 2. 1. 0.]
 [2. 1. 1. ... 1. 0. 1.]]
(400, 1000)

True phenotype is just the dot product of true genotype and true effect size (genetic component) + environmental component. Here lets just focus on the genetic component for simplicity

Code

true_phenotype_genetic = np.dot(genotype_matrix,true_effect_sizes) 

print(np.mean(true_phenotype_genetic))

343.6973482088499

Now let us imagine a genetic predictor which has learned the true effect size of only a subset of variants. Furthermore, when making personalized predictions, it does so by summing the individual variant effect sizes. We can simulate this by masking the effect size from different variant subsets and repredicting the phenotype.

Code

def mask_effect_sizes(effect_sizes, mask_prob, noise=True):
    mask = np.random.binomial(1,(1-mask_prob),len(effect_sizes))
    if noise:
        return effect_sizes * mask + np.random.normal(0,0.1,len(effect_sizes))
    return effect_sizes * mask+1e-6


p=0.6
masked_effect_sizes = mask_effect_sizes(true_effect_sizes,p)
masked_phenotype_genetic = np.dot(genotype_matrix,masked_effect_sizes)

We can now compute correlations between true and predicted phenotypes across individuals

Code

corr_coef = np.corrcoef(true_phenotype_genetic,masked_phenotype_genetic)[0,1]
corr_coef

0.6121026930359719

OK cool, we can now put everything together. Into a single function to simulate a single gene.

Code

def simulate_single_gene_modeling(num_variants, num_individuals, mask_prob):

    true_effect_sizes = np.random.uniform(0,1,num_variants)
    allele_freq_array = [(1-x) for x in true_effect_sizes]
    genotype_matrix = np.zeros((num_individuals,num_variants))
    for individual in range(num_individuals):
        hap1 = np.random.binomial(1,allele_freq_array,num_variants)
        hap2 = np.random.binomial(1,allele_freq_array,num_variants)
        genotype_matrix[individual,:] = hap1 + hap2
    true_phenotype_genetic = np.dot(genotype_matrix,true_effect_sizes) 
    masked_effect_sizes = mask_effect_sizes(true_effect_sizes,mask_prob)
    masked_phenotype_genetic = np.dot(genotype_matrix,masked_effect_sizes)
    masked_phenotype = masked_phenotype_genetic + np.random.normal(0,1,num_individuals)
    corr_coef = np.corrcoef(true_phenotype_genetic,masked_phenotype_genetic)[0,1]
    
    return corr_coef

We also want to be able to simulate a suite of genes at once, since ultimately this is is the most interesting question

Code

def simulate_multiple_genes(num_variants, num_individuals, mask_prob, num_genes):

    corr_coef_array = np.zeros(num_genes)

    for gene in range(num_genes):
        corr_coef_array[gene] = simulate_single_gene_modeling(num_variants, num_individuals, mask_prob)
    
    return corr_coef_array

And finally, we can test different hyperparameters as a grid

Code

def plot_multigene(corr_coef_array, title, output_dir="/beagle3/haky/users/saideep/projects/simulate_effect"):
    plt.boxplot(corr_coef_array)
    plt.ylabel("Correlation Coefficients")
    plt.title(title)
    plt.savefig(os.path.join(output_dir,title+".png"))
    plt.clf()
    

def simulate_grid(num_variants_array, num_individuals_array, mask_prob_array, num_genes_array):

    # corr_coef_grid = np.zeros((len(num_variants_array),len(num_individuals_array),len(mask_prob_array),num_reps))

    for nv in num_variants_array:
        for ni in num_individuals_array:
            for mp in mask_prob_array:
                for ng in num_genes_array:
                    corr_coef_array = simulate_multiple_genes(nv, ni, mp, ng)
                    # print(corr_coef_array)
                    title = f"nv={nv}, ni={ni}, mp={mp}, ng={ng}"
                    print(title)
                    plot_multigene(corr_coef_array,
                                    title)

Code

simulate_grid(np.arange(10,100,30),[400],np.arange(0.1,0.9,0.2),[1000])

nv=10, ni=400, mp=0.1, ng=1000
nv=10, ni=400, mp=0.30000000000000004, ng=1000
nv=10, ni=400, mp=0.5000000000000001, ng=1000
nv=10, ni=400, mp=0.7000000000000001, ng=1000
nv=40, ni=400, mp=0.1, ng=1000
nv=40, ni=400, mp=0.30000000000000004, ng=1000
nv=40, ni=400, mp=0.5000000000000001, ng=1000
nv=40, ni=400, mp=0.7000000000000001, ng=1000
nv=70, ni=400, mp=0.1, ng=1000
nv=70, ni=400, mp=0.30000000000000004, ng=1000
nv=70, ni=400, mp=0.5000000000000001, ng=1000
nv=70, ni=400, mp=0.7000000000000001, ng=1000

<Figure size 640x480 with 0 Axes>