QTL_mutagenesis

Author

Saideep Gona

Published

October 26, 2023

Context

I wish to perturb the “best” SNP for each gene and observe its effect size on the Flu-condition expression prediction.

Code
import os,sys
import numpy as np
import cyvcf2
import pandas as pd

import kipoiseq
import pyfaidx
from kipoiseq import Interval
import liftover
from liftover import ChainFile

import h5py

import pysam

1.) Collect best SNPs

Code
best_SNPs_table_path = os.path.join("/beagle3/haky/users/saideep/projects/aracena_modeling/QTL_results/RNAseq_bestQTL_SNP.csv")

best_SNPs_table = pd.read_csv(best_SNPs_table_path)
best_SNPs_table["ensembl_gene_id"] = best_SNPs_table["feature"]

pval_threshold = 1e-5
best_SNPs_table = best_SNPs_table[best_SNPs_table["pvalue_Flu"] < pval_threshold]

best_SNPs_table
feature best_snp_NI statistic_NI pvalue_NI beta_NI qvalue_NI best_snp_Flu statistic_Flu pvalue_Flu beta_Flu qvalue_Flu lfsr_NI lfsr_Flu ensembl_gene_id
0 ENSG00000121716 7_99910443_C_G 14.618995 1.022990e-15 1.175611 0.000278 7_99910443_C_G 10.959885 2.298140e-12 0.895589 0.000862 8.783740e-47 9.910960e-29 ENSG00000121716
1 ENSG00000256274 12_11187069_A_C -14.295073 1.910380e-15 -0.811393 0.000278 12_11229016_C_G -10.522330 6.429120e-12 -0.838072 0.000862 0.000000e+00 0.000000e+00 ENSG00000256274
2 ENSG00000166435 11_74470292_G_A 12.571045 6.377710e-14 1.107514 0.000278 11_74560117_T_C 9.892441 2.951840e-11 0.961354 0.000862 4.949970e-36 6.189170e-21 ENSG00000166435
3 ENSG00000272221 6_31353992_G_C -12.496661 7.474620e-14 -0.861125 0.000278 6_31378650_C_A -8.831113 4.328820e-10 -1.206735 0.000862 0.000000e+00 1.110220e-16 ENSG00000272221
4 ENSG00000256400 12_11240633_T_A -12.236287 1.309310e-13 -2.093424 0.000278 12_11219216_T_A -8.605599 7.805670e-10 -1.316067 0.000862 0.000000e+00 3.367020e-08 ENSG00000256400
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13267 ENSG00000116679 1_185354466_T_A 2.043682 4.928971e-02 0.051783 0.510419 1_185354466_T_A 5.346575 7.247840e-06 0.190662 0.026184 1.414188e-01 1.661950e-04 ENSG00000116679
13302 ENSG00000131797 16_31640495_A_C 2.020001 5.182164e-02 0.747627 0.511978 16_31789892_G_T 6.406736 3.361190e-07 0.577612 0.004951 4.859062e-01 2.332100e-07 ENSG00000131797
13328 ENSG00000182179 3_49943200_G_A -2.006159 5.335385e-02 -0.085472 0.512528 3_49853073_A_G -5.651302 2.985560e-06 -0.182053 0.016016 7.611641e-01 4.744480e-06 ENSG00000182179
13446 ENSG00000230532 17_46961493_C_T -1.921331 6.363554e-02 -0.133152 0.516122 17_47017477_T_C -5.312061 8.014130e-06 -0.213421 0.027664 3.423043e-01 1.092190e-04 ENSG00000230532
13623 ENSG00000257621 14_58820568_T_C -1.661748 1.063307e-01 -0.061140 0.522792 14_58684121_C_A -5.333885 7.520680e-06 -0.103046 0.026347 5.365127e-01 3.966030e-05 ENSG00000257621

635 rows × 14 columns
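The `best_snp_*` identifiers pack each variant into a `chrom_pos_ref_alt` string in hg19 coordinates (hence the liftover below). A minimal parsing sketch (the `parse_snp_id` helper is mine, for illustration; the loops below inline the same split):

Code
def parse_snp_id(snp_id):
    # e.g. "7_99910443_C_G" -> chrom 7, 1-based position 99910443, ref C, alt G
    chrom, pos, ref, alt = snp_id.split("_")
    return {"chrom": chrom, "loc": int(pos), "ref": ref, "alt": alt}

print(parse_snp_id("7_99910443_C_G"))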

Code
gene_metadata = pd.read_csv("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
gene_metadata
ensembl_gene_id external_gene_name chromosome_name transcript_biotype transcript_start transcript_end transcription_start_site transcript_is_canonical transcript_count target_regions
0 ENSG00000160072 ATAD3B 1 protein_coding 1471765 1497848 1471765 1 6 chr1_1471765_1471767
1 ENSG00000142611 PRDM16 1 protein_coding 3069203 3438621 3069203 1 10 chr1_3069203_3069205
2 ENSG00000157911 PEX10 1 protein_coding 2403974 2412564 2412564 1 9 chr1_2412564_2412566
3 ENSG00000142655 PEX14 1 protein_coding 10474950 10630758 10474950 1 4 chr1_10474950_10474952
4 ENSG00000149527 PLCH2 1 protein_coding 2476289 2505532 2476289 1 8 chr1_2476289_2476291
... ... ... ... ... ... ... ... ... ... ...
19662 ENSG00000233803 TSPY4 Y protein_coding 9337464 9340278 9337464 1 5 chrY_9337464_9337466
19663 ENSG00000226941 RBMY1J Y protein_coding 22403410 22417881 22403410 1 2 chrY_22403410_22403412
19664 ENSG00000238074 TSPY9 Y protein_coding 9487267 9490081 9487267 1 1 chrY_9487267_9487269
19665 ENSG00000242875 RBMY1B Y protein_coding 21511338 21525786 21511338 1 2 chrY_21511338_21511340
19666 ENSG00000234414 RBMY1A1 Y protein_coding 21534879 21549326 21534879 1 2 chrY_21534879_21534881

19667 rows × 10 columns

Code
print(len(list(best_SNPs_table["ensembl_gene_id"])))
print(len(set(best_SNPs_table["ensembl_gene_id"])))
print(len(list(gene_metadata["ensembl_gene_id"])))
print(len(set(gene_metadata["ensembl_gene_id"])))

best_SNPs_genes = set(best_SNPs_table["ensembl_gene_id"])
gene_metadata_genes = set(gene_metadata["ensembl_gene_id"])

intersection = best_SNPs_genes.intersection(gene_metadata_genes)
print(len(intersection))

best_SNPs_subset = best_SNPs_table[best_SNPs_table["ensembl_gene_id"].isin(intersection)]
gene_metadata_subset = gene_metadata[gene_metadata["ensembl_gene_id"].isin(intersection)]
635
635
19667
19667
491
Code
def liftover_variant(variant, source="hg19", target="hg38"):
    '''
    Inputs:
        variant: dict with "chrom", "loc", "ref", "alt" describing the SNP in the source build
        source: source genome build (default hg19, matching the QTL coordinates)
        target: target genome build (default hg38, matching the reference fasta)
    Outputs:
        converted position of the variant in the target build
    '''

    # Initialize converter for the requested builds

    converter = liftover.get_lifter(source, target)
    # converter = ChainFile(f"/project2/haky/saideep/aracena_data_preprocessing/test_conversion/{source}To{target.capitalize()}.over.chain.gz", source, target)

    # liftover returns a list of (chrom, pos, strand) tuples; take the position of the first hit
    converted_coords = converter[variant["chrom"]][variant["loc"]][0][1]

    return converted_coords
Code
# Sanity check: lift over the best Flu SNP for the first gene
for row in best_SNPs_subset.iterrows():
    row = row[1]
    print(row["feature"], row["best_snp_Flu"])
    # SNP IDs encode chrom_pos_ref_alt in hg19 coordinates
    variant = {
        "chrom": row["best_snp_Flu"].split("_")[0],
        "loc": int(row["best_snp_Flu"].split("_")[1]),
        "ref": row["best_snp_Flu"].split("_")[2],
        "alt": row["best_snp_Flu"].split("_")[3]
    }
    converted_variant = liftover_variant(variant, source="hg19", target="hg38")
    print(converted_variant)
    break

Extract and mutate sequence window for Enformer

Code
# Fasta extraction helper (adapted from the Enformer usage notebook)

class FastaStringExtractor:
    
    def __init__(self, fasta_file):
        self.fasta = pyfaidx.Fasta(fasta_file)
        self._chromosome_sizes = {k: len(v) for k, v in self.fasta.items()}

    def extract(self, interval: Interval, **kwargs) -> str:
        # Truncate interval if it extends beyond the chromosome lengths.
        chromosome_length = self._chromosome_sizes[interval.chrom]
        trimmed_interval = Interval(interval.chrom,
                                    max(interval.start, 0),
                                    min(interval.end, chromosome_length),
                                    )
        # pyfaidx wants a 1-based interval
        sequence = str(self.fasta.get_seq(trimmed_interval.chrom,
                                          trimmed_interval.start + 1,
                                          trimmed_interval.stop).seq).upper()
        # Fill truncated values with N's.
        pad_upstream = 'N' * max(-interval.start, 0)
        pad_downstream = 'N' * max(interval.end - chromosome_length, 0)
        return pad_upstream + sequence + pad_downstream

    def close(self):
        return self.fasta.close()
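As a quick sanity check, the extractor can be exercised on a small window; a minimal sketch, assuming the same hg38 fasta used in the cells below:

Code
fasta_path = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/hg38_ref/Homo_sapiens_assembly38.fasta"
extractor = FastaStringExtractor(fasta_path)
# 10-bp window; intervals running past the chromosome ends come back padded with N's
print(extractor.extract(kipoiseq.Interval("chr7", 100000, 100010)))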
Code
sequence_length = 393216

fasta_path = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/hg38_ref/Homo_sapiens_assembly38.fasta"

fasta_extractor = FastaStringExtractor(fasta_path)

c=0
for row in best_SNPs_subset.iterrows():
    row = row[1]
    # print(row["feature"], row["best_snp_Flu"])
    variant = {
        "chrom": row["best_snp_Flu"].split("_")[0],
        "loc": int(row["best_snp_Flu"].split("_")[1]),
        "ref": row["best_snp_Flu"].split("_")[2],
        "alt": row["best_snp_Flu"].split("_")[3]
    }
    converted_variant = liftover_variant(variant, source="hg19", target="hg38")

    variant_centered_interval = kipoiseq.Interval("chr"+variant["chrom"], converted_variant - 1, converted_variant + 1).resize(sequence_length)

    fasta_sequence = fasta_extractor.extract(variant_centered_interval)
    # print(fasta_sequence)

    if fasta_sequence[(sequence_length//2)-1] not in {variant["ref"], variant["alt"]}:
        print("REF allele not in recorded alleles: ", variant["ref"], variant["alt"])

    c+=1 
    if c == 10:
        break
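The mutagenesis in the next section boils down to swapping the base at the window center; a minimal sketch of that swap (the `inject_center_allele` helper name is mine; the loop below inlines the same logic):

Code
def inject_center_allele(sequence, allele, seq_len=393216):
    # Strings are immutable, so round-trip through a list to replace the center base
    bases = list(sequence)
    bases[(seq_len//2)-1] = allele
    return "".join(bases)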

Run Enformer predictions

Code
import tensorflow_hub as hub
import tensorflow as tf

class Enformer:

  def __init__(self, tfhub_url):
    self._model = hub.load(tfhub_url).model

  def predict_on_batch(self, inputs):
    predictions = self._model.predict_on_batch(inputs)
    return {k: v.numpy() for k, v in predictions.items()}

  @tf.function
  def contribution_input_grad(self, input_sequence,
                              target_mask, output_head='human'):
    input_sequence = input_sequence[tf.newaxis]

    target_mask_mass = tf.reduce_sum(target_mask)
    with tf.GradientTape() as tape:
      tape.watch(input_sequence)
      prediction = tf.reduce_sum(
          target_mask[tf.newaxis] *
          self._model.predict_on_batch(input_sequence)[output_head]) / target_mask_mass

    input_grad = tape.gradient(prediction, input_sequence) * input_sequence
    input_grad = tf.squeeze(input_grad, axis=0)
    return tf.reduce_sum(input_grad, axis=-1)


def one_hot_encode(sequence):
  return kipoiseq.transforms.functional.one_hot_dna(sequence).astype(np.float32)


def predict_variant_effect(enformer_obj, ref_seq, alt_seq, bins=896):

    # Enformer's human head returns (896 bins x 5313 tracks); average the four
    # central bins, which cover the variant at the center of the input window
    ref_pred = enformer_obj.predict_on_batch(one_hot_encode(ref_seq)[np.newaxis])['human'][0]
    ref_quant = np.mean(ref_pred[bins//2-2:bins//2+2, :], axis=0, keepdims=True)

    alt_pred = enformer_obj.predict_on_batch(one_hot_encode(alt_seq)[np.newaxis])['human'][0]
    alt_quant = np.mean(alt_pred[bins//2-2:bins//2+2, :], axis=0, keepdims=True)

    return {"ref": ref_quant, "alt": alt_quant}
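For context on the slicing above: the human head emits 5,313 tracks over 896 bins of 128 bp each, so `bins//2-2:bins//2+2` averages the four bins flanking the variant at the window center. A usage sketch, assuming the `enformer_obj` built in the next cell and the ref/alt window strings constructed in the loop below:

Code
preds = predict_variant_effect(enformer_obj, ref_sequence, alt_sequence)
print(preds["ref"].shape, preds["alt"].shape)  # (1, 5313) each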
Code
import time

model_path = "/project2/haky/Data/enformer/raw"
enformer_obj = Enformer(model_path)

sequence_length = 393216

fasta_path = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/hg38_ref/Homo_sapiens_assembly38.fasta"

fasta_extractor = FastaStringExtractor(fasta_path)

ref_preds = []
alt_preds = []
ref_alleles = []
alt_alleles = []
relative_direction = []

c=0
for row in best_SNPs_subset.iterrows():

    if c % 100 ==0:
        print(c)

    start=time.time()

    row = row[1]
    # print(row["feature"], row["best_snp_Flu"])
    variant = {
        "chrom": row["best_snp_Flu"].split("_")[0],
        "loc": int(row["best_snp_Flu"].split("_")[1]),
        "ref": row["best_snp_Flu"].split("_")[2],
        "alt": row["best_snp_Flu"].split("_")[3]
    }
    converted_variant = liftover_variant(variant, source="hg19", target="hg38")

    variant_centered_interval = kipoiseq.Interval("chr"+variant["chrom"], converted_variant - 1, converted_variant + 1).resize(sequence_length)

    ref_sequence = fasta_extractor.extract(variant_centered_interval)
    # print(fasta_sequence)

    if ref_sequence[(sequence_length//2)-1] not in {variant["ref"], variant["alt"]}:
        print("REF allele not in recorded alleles: ", variant["ref"], variant["alt"])

    if ref_sequence[(sequence_length//2)-1] == variant["ref"]:
        alt_sequence = list(ref_sequence[:])
        alt_sequence[(sequence_length//2)-1] = variant["alt"]
        alt_sequence = "".join(alt_sequence)
        ref_alleles.append(variant["ref"])
        alt_alleles.append(variant["alt"])
        relative_direction.append(True)
    elif ref_sequence[(sequence_length//2)-1] == variant["alt"]:
        # The reference fasta carries the alt allele, so swap in the ref allele instead
        alt_sequence = list(ref_sequence[:])
        alt_sequence[(sequence_length//2)-1] = variant["ref"]
        alt_sequence = "".join(alt_sequence)
        ref_alleles.append(variant["alt"])
        alt_alleles.append(variant["ref"])
        relative_direction.append(False)
    else:
        # Center base matches neither recorded allele; skip this variant
        continue

    preds = predict_variant_effect(enformer_obj, ref_sequence, alt_sequence)
    ref_preds.append(preds["ref"])
    alt_preds.append(preds["alt"])

    end=time.time()
    print(end-start)

    c+=1
    # if c == 3:   # debugging limit from an earlier test; the full run covers all 491 genes
    #     break

ref_preds = np.array(ref_preds)
alt_preds = np.array(alt_preds)

print(ref_preds.shape)

hdf5_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5"

with h5py.File(os.path.join(hdf5_path, "ref_preds.hdf5"), 'w') as f:
    f.create_dataset("ref_preds", data=ref_preds)

with h5py.File(os.path.join(hdf5_path, "alt_preds.hdf5"), 'w') as f:
    f.create_dataset("alt_preds", data=alt_preds)

meta_table = pd.DataFrame.from_dict({"ref_alleles": ref_alleles, "alt_alleles": alt_alleles, "relative_direction": relative_direction})
meta_table.to_csv("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-26-make_aracena_enformer_preds/meta_table.csv")
0
UnknownError: Graph execution error:

Detected at node seqnn/trunk/downres/downres_block_1/conv_block/normalization/cross_replica_batch_norm/batchnorm/Rsqrt defined at (most recent call last):
<stack traces unavailable>
JIT compilation failed.
     [[{{node seqnn/trunk/downres/downres_block_1/conv_block/normalization/cross_replica_batch_norm/batchnorm/Rsqrt}}]] [Op:__inference_restored_function_body_8227]
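This JIT compilation failure is most likely environment-specific (e.g. an XLA/CUDA mismatch on the compute node) rather than an issue with the code above; the cells below read back predictions for all 491 genes, presumably from a later successful run.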

Convert predictions to CSV

Code
import h5py
import numpy as np
import os

hdf5_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5"

print(hdf5_path)

with h5py.File(os.path.join(hdf5_path, "ref_preds.hdf5"), 'r') as f:
    ref_preds = f["ref_preds"][:].squeeze()
    print(ref_preds)

np.savetxt(os.path.join(hdf5_path, "ref_preds.csv"),np.transpose(ref_preds))

with h5py.File(os.path.join(hdf5_path, "alt_preds.hdf5"), 'r') as f:
    alt_preds = f["alt_preds"][:].squeeze()

np.savetxt(os.path.join(hdf5_path, "alt_preds.csv"), np.transpose(alt_preds))

print(ref_preds.shape)
print(alt_preds.shape)
print(sum(ref_preds[0,:]-ref_preds[1,:]))
/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5
[[0.01936201 0.02289032 0.03525835 ... 0.00384166 0.01861589 0.01648933]
 [0.15239096 0.1552572  0.13544452 ... 0.02660625 0.19597983 0.19740911]
 [1.1871796  2.4680097  2.3212388  ... 0.11045121 0.2405752  0.38479745]
 ...
 [0.03934279 0.03601513 0.02608934 ... 0.01133371 0.10566092 0.08299403]
 [0.04601159 0.04957747 0.04208577 ... 0.0049286  0.01726986 0.01519084]
 [0.02632891 0.02830148 0.02941273 ... 0.00297602 0.00587846 0.00631356]]
(491, 5313)
(491, 5313)
-4866.325176687096
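With ref and alt predictions for all 491 genes in hand, the effect size of each allele swap is just the per-track difference between the two; a minimal sketch, assuming the CSVs written above (rows are the 5,313 tracks after the transpose):

Code
import os
import numpy as np

hdf5_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/mutagenesis_hdf5"

ref = np.loadtxt(os.path.join(hdf5_path, "ref_preds.csv"))
alt = np.loadtxt(os.path.join(hdf5_path, "alt_preds.csv"))

delta = alt - ref                              # additive effect per track and gene
log2fc = np.log2((alt + 1e-6) / (ref + 1e-6))  # pseudocount avoids division by zero
print(delta.shape)  # (5313, 491)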