Code
import os,sys
import numpy as np
import cyvcf2
import pandas as pd
import kipoiseq
import pyfaidx
from kipoiseq import Interval
import liftover
from liftover import ChainFile
import h5py
import pysam
Saideep Gona
October 26, 2023
Goal: for each gene, perturb its "best" (most significant) QTL SNP in the input sequence and measure the resulting effect size on the flu-condition Enformer prediction.
# Load the per-gene best-QTL SNP table and keep only genes whose best
# Flu-condition SNP passes the p-value cutoff.
best_SNPs_table_path = os.path.join(
    "/beagle3/haky/users/saideep/projects/aracena_modeling/QTL_results/RNAseq_bestQTL_SNP.csv"
)
best_SNPs_table = pd.read_csv(best_SNPs_table_path)

# Mirror the gene identifier under the column name used for joining later.
best_SNPs_table["ensembl_gene_id"] = best_SNPs_table["feature"]

pval_threshold = 1e-5
flu_significant = best_SNPs_table["pvalue_Flu"] < pval_threshold
best_SNPs_table = best_SNPs_table.loc[flu_significant]
best_SNPs_table
feature | best_snp_NI | statistic_NI | pvalue_NI | beta_NI | qvalue_NI | best_snp_Flu | statistic_Flu | pvalue_Flu | beta_Flu | qvalue_Flu | lfsr_NI | lfsr_Flu | ensembl_gene_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ENSG00000121716 | 7_99910443_C_G | 14.618995 | 1.022990e-15 | 1.175611 | 0.000278 | 7_99910443_C_G | 10.959885 | 2.298140e-12 | 0.895589 | 0.000862 | 8.783740e-47 | 9.910960e-29 | ENSG00000121716 |
1 | ENSG00000256274 | 12_11187069_A_C | -14.295073 | 1.910380e-15 | -0.811393 | 0.000278 | 12_11229016_C_G | -10.522330 | 6.429120e-12 | -0.838072 | 0.000862 | 0.000000e+00 | 0.000000e+00 | ENSG00000256274 |
2 | ENSG00000166435 | 11_74470292_G_A | 12.571045 | 6.377710e-14 | 1.107514 | 0.000278 | 11_74560117_T_C | 9.892441 | 2.951840e-11 | 0.961354 | 0.000862 | 4.949970e-36 | 6.189170e-21 | ENSG00000166435 |
3 | ENSG00000272221 | 6_31353992_G_C | -12.496661 | 7.474620e-14 | -0.861125 | 0.000278 | 6_31378650_C_A | -8.831113 | 4.328820e-10 | -1.206735 | 0.000862 | 0.000000e+00 | 1.110220e-16 | ENSG00000272221 |
4 | ENSG00000256400 | 12_11240633_T_A | -12.236287 | 1.309310e-13 | -2.093424 | 0.000278 | 12_11219216_T_A | -8.605599 | 7.805670e-10 | -1.316067 | 0.000862 | 0.000000e+00 | 3.367020e-08 | ENSG00000256400 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13267 | ENSG00000116679 | 1_185354466_T_A | 2.043682 | 4.928971e-02 | 0.051783 | 0.510419 | 1_185354466_T_A | 5.346575 | 7.247840e-06 | 0.190662 | 0.026184 | 1.414188e-01 | 1.661950e-04 | ENSG00000116679 |
13302 | ENSG00000131797 | 16_31640495_A_C | 2.020001 | 5.182164e-02 | 0.747627 | 0.511978 | 16_31789892_G_T | 6.406736 | 3.361190e-07 | 0.577612 | 0.004951 | 4.859062e-01 | 2.332100e-07 | ENSG00000131797 |
13328 | ENSG00000182179 | 3_49943200_G_A | -2.006159 | 5.335385e-02 | -0.085472 | 0.512528 | 3_49853073_A_G | -5.651302 | 2.985560e-06 | -0.182053 | 0.016016 | 7.611641e-01 | 4.744480e-06 | ENSG00000182179 |
13446 | ENSG00000230532 | 17_46961493_C_T | -1.921331 | 6.363554e-02 | -0.133152 | 0.516122 | 17_47017477_T_C | -5.312061 | 8.014130e-06 | -0.213421 | 0.027664 | 3.423043e-01 | 1.092190e-04 | ENSG00000230532 |
13623 | ENSG00000257621 | 14_58820568_T_C | -1.661748 | 1.063307e-01 | -0.061140 | 0.522792 | 14_58684121_C_A | -5.333885 | 7.520680e-06 | -0.103046 | 0.026347 | 5.365127e-01 | 3.966030e-05 | ENSG00000257621 |
635 rows × 14 columns
ensembl_gene_id | external_gene_name | chromosome_name | transcript_biotype | transcript_start | transcript_end | transcription_start_site | transcript_is_canonical | transcript_count | target_regions | |
---|---|---|---|---|---|---|---|---|---|---|
0 | ENSG00000160072 | ATAD3B | 1 | protein_coding | 1471765 | 1497848 | 1471765 | 1 | 6 | chr1_1471765_1471767 |
1 | ENSG00000142611 | PRDM16 | 1 | protein_coding | 3069203 | 3438621 | 3069203 | 1 | 10 | chr1_3069203_3069205 |
2 | ENSG00000157911 | PEX10 | 1 | protein_coding | 2403974 | 2412564 | 2412564 | 1 | 9 | chr1_2412564_2412566 |
3 | ENSG00000142655 | PEX14 | 1 | protein_coding | 10474950 | 10630758 | 10474950 | 1 | 4 | chr1_10474950_10474952 |
4 | ENSG00000149527 | PLCH2 | 1 | protein_coding | 2476289 | 2505532 | 2476289 | 1 | 8 | chr1_2476289_2476291 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19662 | ENSG00000233803 | TSPY4 | Y | protein_coding | 9337464 | 9340278 | 9337464 | 1 | 5 | chrY_9337464_9337466 |
19663 | ENSG00000226941 | RBMY1J | Y | protein_coding | 22403410 | 22417881 | 22403410 | 1 | 2 | chrY_22403410_22403412 |
19664 | ENSG00000238074 | TSPY9 | Y | protein_coding | 9487267 | 9490081 | 9487267 | 1 | 1 | chrY_9487267_9487269 |
19665 | ENSG00000242875 | RBMY1B | Y | protein_coding | 21511338 | 21525786 | 21511338 | 1 | 2 | chrY_21511338_21511340 |
19666 | ENSG00000234414 | RBMY1A1 | Y | protein_coding | 21534879 | 21549326 | 21534879 | 1 | 2 | chrY_21534879_21534881 |
19667 rows × 10 columns
# Compare gene coverage between the QTL table and the gene metadata table,
# then restrict both to the genes they share.
best_SNPs_genes = set(best_SNPs_table["ensembl_gene_id"])
gene_metadata_genes = set(gene_metadata["ensembl_gene_id"])

print(len(best_SNPs_table["ensembl_gene_id"]))
print(len(best_SNPs_genes))
print(len(gene_metadata["ensembl_gene_id"]))
print(len(gene_metadata_genes))

intersection = best_SNPs_genes.intersection(gene_metadata_genes)
print(len(intersection))

shared = lambda table: table["ensembl_gene_id"].isin(intersection)
best_SNPs_subset = best_SNPs_table[shared(best_SNPs_table)]
gene_metadata_subset = gene_metadata[shared(gene_metadata)]
635
635
19667
19667
491
def liftover_variant(variant, source="hg19", target="hg38"):
    '''
    Lift a single variant position from one genome build to another.

    Inputs:
        variant: dict with keys "chrom" (e.g. "7", no "chr" prefix) and
            "loc" (int position), as parsed from the "best_snp_*" strings.
        source: source genome build name (default "hg19").
        target: target genome build name (default "hg38").
    Outputs:
        int: position of the variant on the target build.
    Raises:
        ValueError: if the position has no mapping in the chain file.
    '''
    # BUG FIX: the builds were previously hard-coded to hg19->hg38 and the
    # source/target arguments were silently ignored. Defaults are now
    # hg19->hg38 so existing default-style calls behave identically.
    converter = liftover.get_lifter(source, target)
    mapped = converter[variant["chrom"]][variant["loc"]]
    if not mapped:
        raise ValueError(
            f"No {source}->{target} mapping for {variant['chrom']}:{variant['loc']}"
        )
    # get_lifter returns a list of (chrom, pos, strand) tuples; take the
    # position of the first (best) mapping.
    return mapped[0][1]
# Sanity-check the liftover on the first row of the subset table.
row = next(best_SNPs_subset.iterrows())[1]
print(row["feature"], row["best_snp_Flu"])

# SNP ids are encoded as "chrom_pos_ref_alt".
chrom, pos, ref_allele, alt_allele = row["best_snp_Flu"].split("_")
variant = {
    "chrom": chrom,
    "loc": int(pos),
    "ref": ref_allele,
    "alt": alt_allele,
}
converted_variant = liftover_variant(variant, source="hg19", target="hg38")
print(converted_variant)
# @title `variant_centered_sequences`
class FastaStringExtractor:
    """Extracts DNA sequence strings from an indexed FASTA file.

    Intervals that run off either end of a chromosome are padded with 'N'
    so the returned string always has the requested interval length.
    """

    def __init__(self, fasta_file):
        self.fasta = pyfaidx.Fasta(fasta_file)
        self._chromosome_sizes = {name: len(seq) for name, seq in self.fasta.items()}

    def extract(self, interval: Interval, **kwargs) -> str:
        """Return the uppercase sequence for `interval`, N-padded at chromosome edges."""
        chrom_len = self._chromosome_sizes[interval.chrom]
        # Clip the interval to the chromosome so pyfaidx never reads out of range.
        clipped = Interval(interval.chrom,
                           max(interval.start, 0),
                           min(interval.end, chrom_len),
                           )
        # pyfaidx expects 1-based, inclusive coordinates.
        seq = str(self.fasta.get_seq(clipped.chrom,
                                     clipped.start + 1,
                                     clipped.stop).seq).upper()
        # Restore the requested length with N's wherever the interval was clipped.
        left_pad = 'N' * max(-interval.start, 0)
        right_pad = 'N' * max(interval.end - chrom_len, 0)
        return left_pad + seq + right_pad

    def close(self):
        return self.fasta.close()
sequence_length = 393216
fasta_path = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/hg38_ref/Homo_sapiens_assembly38.fasta"
fasta_extractor = FastaStringExtractor(fasta_path)

# Spot-check the first 10 variants: after lifting hg19 -> hg38, the hg38
# reference base at the variant position should match one of the recorded
# ref/alt alleles.
for checked, (_, row) in enumerate(best_SNPs_subset.iterrows(), start=1):
    chrom, pos, ref_allele, alt_allele = row["best_snp_Flu"].split("_")
    variant = {"chrom": chrom, "loc": int(pos), "ref": ref_allele, "alt": alt_allele}

    converted_variant = liftover_variant(variant, source="hg19", target="hg38")
    variant_centered_interval = kipoiseq.Interval(
        "chr" + variant["chrom"], converted_variant - 1, converted_variant + 1
    ).resize(sequence_length)
    fasta_sequence = fasta_extractor.extract(variant_centered_interval)

    # The variant base lands at index (sequence_length // 2) - 1 of the
    # resized, variant-centered window.
    center_base = fasta_sequence[(sequence_length // 2) - 1]
    if center_base not in {variant["ref"], variant["alt"]}:
        print("REF allele not in recorded alleles: ", variant["ref"], variant["alt"])

    if checked == 10:
        break
import tensorflow_hub as hub
import tensorflow as tf
class Enformer:
    """Thin wrapper around an Enformer TF-Hub SavedModel.

    Exposes batched prediction and input-gradient contribution scoring.
    """

    def __init__(self, tfhub_url):
        # hub.load accepts a TF-Hub URL; presumably a local SavedModel
        # directory also works here (a filesystem path is passed below) —
        # TODO confirm.
        self._model = hub.load(tfhub_url).model

    def predict_on_batch(self, inputs):
        # Run the model on a batch of one-hot encoded sequences and convert
        # each output head's tensor to a numpy array.
        # Returns: dict mapping head name (e.g. 'human') -> numpy array.
        predictions = self._model.predict_on_batch(inputs)
        return {k: v.numpy() for k, v in predictions.items()}

    @tf.function
    def contribution_input_grad(self, input_sequence,
                                target_mask, output_head='human'):
        # Gradient-times-input contribution scores for a single sequence.
        # target_mask selects (and weights) the output bins/tracks whose
        # mean prediction is differentiated w.r.t. the input.
        input_sequence = input_sequence[tf.newaxis]
        # Normalizer so the masked prediction is a mean, not a sum.
        target_mask_mass = tf.reduce_sum(target_mask)
        with tf.GradientTape() as tape:
            tape.watch(input_sequence)
            prediction = tf.reduce_sum(
                target_mask[tf.newaxis] *
                self._model.predict_on_batch(input_sequence)[output_head]) / target_mask_mass
        # Gradient x input, then collapse the one-hot channel axis so the
        # result is one score per base position.
        input_grad = tape.gradient(prediction, input_sequence) * input_sequence
        input_grad = tf.squeeze(input_grad, axis=0)
        return tf.reduce_sum(input_grad, axis=-1)
def one_hot_encode(sequence):
    """One-hot encode a DNA string into a float32 array via kipoiseq."""
    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)
def predict_variant_effect(enformer_obj, ref_seq, alt_seq, bins=None):
    """Predict Enformer tracks for ref and alt sequences and summarize the
    signal over the 4 output bins around the sequence center (the variant).

    Inputs:
        enformer_obj: Enformer wrapper exposing predict_on_batch.
        ref_seq, alt_seq: equal-length DNA strings centered on the variant.
        bins: number of prediction bins; inferred from the model output when
            None. (BUG FIX: the old default of 393216 was the *input* length,
            not the output bin count, so the central slice was empty and the
            means were NaN.)
    Outputs:
        dict with "ref" and "alt" arrays of shape (1, n_tracks): the mean
        prediction across the 4 central bins.
    """
    ref_pred = enformer_obj.predict_on_batch(one_hot_encode(ref_seq)[np.newaxis])['human'][0]
    alt_pred = enformer_obj.predict_on_batch(one_hot_encode(alt_seq)[np.newaxis])['human'][0]

    if bins is None:
        # Enformer emits pooled output bins (896 for a 393,216 bp input),
        # not one prediction per input base.
        bins = ref_pred.shape[0]
    center = bins // 2

    ref_quant = np.mean(ref_pred[center - 2:center + 2, :], axis=0, keepdims=True)
    alt_quant = np.mean(alt_pred[center - 2:center + 2, :], axis=0, keepdims=True)
    return {"ref": ref_quant, "alt": alt_quant}
import time

model_path = "/project2/haky/Data/enformer/raw"
enformer_obj = Enformer(model_path)

sequence_length = 393216
fasta_path = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/hg38_ref/Homo_sapiens_assembly38.fasta"
fasta_extractor = FastaStringExtractor(fasta_path)

# Parallel result lists: one entry per successfully processed variant.
ref_preds = []
alt_preds = []
ref_alleles = []
alt_alleles = []
relative_direction = []  # True when hg38 reference base == recorded REF allele

c = 0
for row in best_SNPs_subset.iterrows():
    if c % 100 == 0:
        print(c)
    start = time.time()
    row = row[1]

    # SNP ids are encoded as "chrom_pos_ref_alt" (hg19 coordinates).
    chrom, pos, ref_allele, alt_allele = row["best_snp_Flu"].split("_")
    variant = {"chrom": chrom, "loc": int(pos), "ref": ref_allele, "alt": alt_allele}

    # QTL coordinates are hg19; the FASTA (and Enformer input) are hg38.
    converted_variant = liftover_variant(variant, source="hg19", target="hg38")
    variant_centered_interval = kipoiseq.Interval(
        "chr" + variant["chrom"], converted_variant - 1, converted_variant + 1
    ).resize(sequence_length)
    ref_sequence = fasta_extractor.extract(variant_centered_interval)

    center = (sequence_length // 2) - 1  # index of the variant base
    genome_base = ref_sequence[center]

    if genome_base == variant["ref"]:
        kept_allele, swapped_allele = variant["ref"], variant["alt"]
        same_direction = True
    elif genome_base == variant["alt"]:
        # The hg38 reference already carries the ALT allele; swap in REF.
        kept_allele, swapped_allele = variant["alt"], variant["ref"]
        same_direction = False
    else:
        # BUG FIX: previously this case fell through and paired a stale
        # alt_sequence with this variant's metadata, desynchronizing the
        # result lists. Skip the variant instead.
        print("REF allele not in recorded alleles: ", variant["ref"], variant["alt"])
        c += 1
        continue

    # Build the alternate sequence by substituting the single center base.
    # BUG FIX: the ALT branch previously never joined the char list back
    # into a string.
    alt_sequence_chars = list(ref_sequence)
    alt_sequence_chars[center] = swapped_allele
    alt_sequence = "".join(alt_sequence_chars)

    ref_alleles.append(kept_allele)
    alt_alleles.append(swapped_allele)
    relative_direction.append(same_direction)

    # BUG FIX: the prediction call previously passed the same stale
    # `fasta_sequence` (from an earlier cell) for BOTH alleles.
    preds = predict_variant_effect(enformer_obj, ref_sequence, alt_sequence)
    ref_preds.append(preds["ref"])
    alt_preds.append(preds["alt"])

    end = time.time()
    print(end - start)

    c += 1
    if c == 3:  # debug limit; remove to process the full table
        break
# Stack the per-variant summaries into arrays and persist them, along with
# the allele bookkeeping table.
ref_preds = np.array(ref_preds)
alt_preds = np.array(alt_preds)
print(ref_preds.shape)

hdf5_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5"
for dataset_name, data in (("ref_preds", ref_preds), ("alt_preds", alt_preds)):
    with h5py.File(os.path.join(hdf5_path, dataset_name + ".hdf5"), 'w') as f:
        f.create_dataset(dataset_name, data=data)

meta_table = pd.DataFrame.from_dict({
    "ref_alleles": ref_alleles,
    "alt_alleles": alt_alleles,
    "relative_direction": relative_direction,
})
meta_table.to_csv("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-26-make_aracena_enformer_preds/meta_table.csv")
0
UnknownError: Graph execution error:
Detected at node seqnn/trunk/downres/downres_block_1/conv_block/normalization/cross_replica_batch_norm/batchnorm/Rsqrt defined at (most recent call last):
<stack traces unavailable>
JIT compilation failed.
[[{{node seqnn/trunk/downres/downres_block_1/conv_block/normalization/cross_replica_batch_norm/batchnorm/Rsqrt}}]] [Op:__inference_restored_function_body_8227]
import h5py
import numpy as np
import os

hdf5_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5"
print(hdf5_path)

# Reload the saved predictions, drop the singleton middle axis, and export
# transposed (tracks x variants) CSV copies for downstream analysis.
with h5py.File(os.path.join(hdf5_path, "ref_preds.hdf5"), 'r') as f:
    ref_preds = f["ref_preds"][:].squeeze()
print(ref_preds)
np.savetxt(os.path.join(hdf5_path, "ref_preds.csv"), np.transpose(ref_preds))

with h5py.File(os.path.join(hdf5_path, "alt_preds.hdf5"), 'r') as f:
    alt_preds = f["alt_preds"][:].squeeze()
np.savetxt(os.path.join(hdf5_path, "alt_preds.csv"), np.transpose(alt_preds))

print(ref_preds.shape)
print(alt_preds.shape)
# Quick sanity check that different variants produce different predictions.
print(sum(ref_preds[0, :] - ref_preds[1, :]))
/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5
[[0.01936201 0.02289032 0.03525835 ... 0.00384166 0.01861589 0.01648933]
[0.15239096 0.1552572 0.13544452 ... 0.02660625 0.19597983 0.19740911]
[1.1871796 2.4680097 2.3212388 ... 0.11045121 0.2405752 0.38479745]
...
[0.03934279 0.03601513 0.02608934 ... 0.01133371 0.10566092 0.08299403]
[0.04601159 0.04957747 0.04208577 ... 0.0049286 0.01726986 0.01519084]
[0.02632891 0.02830148 0.02941273 ... 0.00297602 0.00587846 0.00631356]]
(491, 5313)
(491, 5313)
-4866.325176687096