Code
import numpy as np
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
Saideep Gona
October 10, 2023
Simulate true phenotype as sum of individual variant effects.
Simulate genotype of individuals, allele freq of variants is equal to (1-effect size) of the variant
num_individuals = 400
genotype_matrix = np.zeros((num_individuals,num_variants))
allele_freq_array = [(1-x) for x in true_effect_sizes]
for individual in range(num_individuals):
hap1 = np.random.binomial(1,allele_freq_array,num_variants)
hap2 = np.random.binomial(1,allele_freq_array,num_variants)
genotype_matrix[individual,:] = hap1 + hap2
print(genotype_matrix)
print(genotype_matrix.shape)
[[2. 1. 0. ... 2. 0. 2.]
[1. 1. 0. ... 1. 0. 0.]
[2. 2. 1. ... 1. 1. 0.]
...
[2. 2. 1. ... 1. 0. 1.]
[2. 0. 0. ... 2. 1. 0.]
[2. 1. 1. ... 1. 0. 1.]]
(400, 1000)
True phenotype is just the dot product of true genotype and true effect size (genetic component) + environmental component. Here lets just focus on the genetic component for simplicity
343.6973482088499
Now let us imagine a genetic predictor which has learned the true effect size of only a subset of variants. Furthermore, when making personalized predictions, it does so by summing the individual variant effect sizes. We can simulate this by masking the effect size from different variant subsets and repredicting the phenotype.
def mask_effect_sizes(effect_sizes, mask_prob, noise=True):
mask = np.random.binomial(1,(1-mask_prob),len(effect_sizes))
if noise:
return effect_sizes * mask + np.random.normal(0,0.1,len(effect_sizes))
return effect_sizes * mask+1e-6
p=0.6
masked_effect_sizes = mask_effect_sizes(true_effect_sizes,p)
masked_phenotype_genetic = np.dot(genotype_matrix,masked_effect_sizes)
We can now compute correlations between true and predicted phenotypes across individuals
0.6121026930359719
OK cool, we can now put everything together. Into a single function to simulate a single gene.
def simulate_single_gene_modeling(num_variants, num_individuals, mask_prob):
true_effect_sizes = np.random.uniform(0,1,num_variants)
allele_freq_array = [(1-x) for x in true_effect_sizes]
genotype_matrix = np.zeros((num_individuals,num_variants))
for individual in range(num_individuals):
hap1 = np.random.binomial(1,allele_freq_array,num_variants)
hap2 = np.random.binomial(1,allele_freq_array,num_variants)
genotype_matrix[individual,:] = hap1 + hap2
true_phenotype_genetic = np.dot(genotype_matrix,true_effect_sizes)
masked_effect_sizes = mask_effect_sizes(true_effect_sizes,mask_prob)
masked_phenotype_genetic = np.dot(genotype_matrix,masked_effect_sizes)
masked_phenotype = masked_phenotype_genetic + np.random.normal(0,1,num_individuals)
corr_coef = np.corrcoef(true_phenotype_genetic,masked_phenotype_genetic)[0,1]
return corr_coef
We also want to be able to simulate a suite of genes at once, since ultimately this is is the most interesting question
And finally, we can test different hyperparameters as a grid
def plot_multigene(corr_coef_array, title, output_dir="/beagle3/haky/users/saideep/projects/simulate_effect"):
plt.boxplot(corr_coef_array)
plt.ylabel("Correlation Coefficients")
plt.title(title)
plt.savefig(os.path.join(output_dir,title+".png"))
plt.clf()
def simulate_grid(num_variants_array, num_individuals_array, mask_prob_array, num_genes_array):
# corr_coef_grid = np.zeros((len(num_variants_array),len(num_individuals_array),len(mask_prob_array),num_reps))
for nv in num_variants_array:
for ni in num_individuals_array:
for mp in mask_prob_array:
for ng in num_genes_array:
corr_coef_array = simulate_multiple_genes(nv, ni, mp, ng)
# print(corr_coef_array)
title = f"nv={nv}, ni={ni}, mp={mp}, ng={ng}"
print(title)
plot_multigene(corr_coef_array,
title)
nv=10, ni=400, mp=0.1, ng=1000
nv=10, ni=400, mp=0.30000000000000004, ng=1000
nv=10, ni=400, mp=0.5000000000000001, ng=1000
nv=10, ni=400, mp=0.7000000000000001, ng=1000
nv=40, ni=400, mp=0.1, ng=1000
nv=40, ni=400, mp=0.30000000000000004, ng=1000
nv=40, ni=400, mp=0.5000000000000001, ng=1000
nv=40, ni=400, mp=0.7000000000000001, ng=1000
nv=70, ni=400, mp=0.1, ng=1000
nv=70, ni=400, mp=0.30000000000000004, ng=1000
nv=70, ni=400, mp=0.5000000000000001, ng=1000
nv=70, ni=400, mp=0.7000000000000001, ng=1000
<Figure size 640x480 with 0 Axes>