Source code for src.seqvar.auto_pm1

"""Implementation of PM1 criteria."""

import os
from typing import Optional, Tuple

import tabix
from loguru import logger

from src.core.config import settings
from src.defs.auto_acmg import (
    PM1,
    AutoACMGCriteria,
    AutoACMGPrediction,
    AutoACMGSeqVarData,
    AutoACMGStrength,
    GenomicStrand,
)
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, InvalidAPIResposeError
from src.defs.genome_builds import GenomeRelease
from src.defs.seqvar import SeqVar
from src.utils import AutoACMGHelper


[docs] class AutoPM1(AutoACMGHelper): """Class for PM1 prediction.""" def __init__(self): super().__init__() #: Prediction result. self.prediction_pm1: Optional[PM1] = None #: comment_pm1 to store the prediction explanation. self.comment_pm1: str = ""
[docs] def _get_affected_exon(self, var_data: AutoACMGSeqVarData, seqvar: SeqVar) -> int: """ Get the affected exon number for the variant. Go through all exons and count them before the variant position. The method also considers the strand of the gene. Args: var_data: AutoACMGData object seqvar: SeqVar object Returns: int: Affected exon number Raises: AlgorithmError: If the strand is invalid. """ exon_number = 0 if var_data.strand == GenomicStrand.Plus: for exon in var_data.exons: if exon.altStartI >= seqvar.pos: return exon_number if exon.altStartI <= seqvar.pos <= exon.altEndI: exon_number += 1 return exon_number if exon.altEndI < seqvar.pos: exon_number += 1 elif var_data.strand == GenomicStrand.Minus: for exon in var_data.exons[::-1]: if exon.altEndI <= seqvar.pos: return exon_number if exon.altStartI <= seqvar.pos <= exon.altEndI: exon_number += 1 return exon_number if exon.altStartI > seqvar.pos: exon_number += 1 else: raise AlgorithmError(f"Invalid strand for {var_data.hgnc_id}: {var_data.strand}") return exon_number
[docs] def _count_vars(self, seqvar: SeqVar, start_pos: int, end_pos: int) -> Tuple[int, int]: """ Counts pathogenic and benign variants in the specified range. The method retrieves variants from the specified range and iterates through the ClinVar data of each variant to count the number of pathogenic and benign SNVs. Note: The method considers "Pathogenic" and "Likely pathogenic" as pathogenic and "Benign" and "Likely benign" as benign. Args: seqvar: The sequence variant being analyzed. start_pos: The start position of the range. end_pos: The end position of the range. Returns: Tuple[int, int]: The number of pathogenic and benign variants. Raises: AlgorithmError: If end position is less than the start position. InvalidAPIResposeError: If the API response is invalid or cannot be processed. """ if end_pos < start_pos: raise AlgorithmError("End position is less than the start position.") response = self.annonars_client.get_variant_from_range(seqvar, start_pos, end_pos) if response and response.clinvar: pathogenic_variants = [ v for v in response.clinvar if v.records and v.records[0].classifications and v.records[0].classifications.germlineClassification and v.records[0].classifications.germlineClassification.description in [ "Pathogenic", "Likely pathogenic", ] and v.records[0].variationType == "VARIATION_TYPE_SNV" ] benign_variants = [ v for v in response.clinvar if v.records and v.records[0].classifications and v.records[0].classifications.germlineClassification and v.records[0].classifications.germlineClassification.description in [ "Benign", "Likely benign", ] and v.records[0].variationType == "VARIATION_TYPE_SNV" ] return len(pathogenic_variants), len(benign_variants) else: raise InvalidAPIResposeError("Failed to get variant from range. No ClinVar data.")
[docs] def _get_uniprot_domain(self, seqvar: SeqVar) -> Optional[Tuple[int, int]]: """ Retrieve the UniProt domain for the variant and return the start and end positions if found or None otherwise. Args: seqvar: The sequence variant being analyzed. Returns: Optional[Tuple[int, int]]: The start and end positions of the UniProt domain if found, None otherwise. Raises: AlgorithmError: If tabix fails to query the UniProt file. """ try: # Find path to the lib file if seqvar.genome_release == GenomeRelease.GRCh37: path = os.path.join( settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz" ) else: path = os.path.join( settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz" ) tb = tabix.open(path) records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos) # Return the first record for record in records: return int(record[1]), int(record[2]) return None except tabix.TabixError as e: raise AlgorithmError("Failed to check if the variant is in a UniProt domain.") from e
[docs] def verify_pm1(self, seqvar: SeqVar, var_data: AutoACMGSeqVarData) -> Tuple[Optional[PM1], str]: """ Predict PM1 criteria. The method verifies the PM1 criteria for the variant. It first checks if the variant is in the mitochondrial genome. If so, it returns PM1 as not met. Otherwise, it counts the number of pathogenic and benign variants in the range of 25 bases before and after the variant position. If the number of pathogenic variants is greater than or equal to the threshold, it returns PM1 as met. Otherwise, it retrieves the UniProt domain for the variant and counts the number of pathogenic and benign variants in the domain. If the number of pathogenic variants is greater than or equal to 1/4 of the domain length, it returns PM1 as met. Otherwise, it returns PM1 as not met. Args: seqvar: The sequence variant being analyzed. var_data: AutoACMGData object Returns: Tuple[Optional[PM1], str]: The prediction result and the explanation """ self.prediction_pm1 = PM1() self.comment_pm1 = "" if seqvar.chrom == "MT": # skipped according to McCormick et al. (2020). self.comment_pm1 = "The variant is in the mitochondrial genome. PM1 is not met." self.prediction_pm1.PM1 = False else: try: pathogenic_count, benign_count = self._count_vars( seqvar, seqvar.pos - 25, seqvar.pos + 25 ) if pathogenic_count >= var_data.thresholds.pm1_pathogenic: self.comment_pm1 += ( f"Found {var_data.thresholds.pm1_pathogenic} or more pathogenic variants. " "PM1 is met." ) self.prediction_pm1.PM1 = True return self.prediction_pm1, self.comment_pm1 else: self.comment_pm1 += ( f"Found less than {var_data.thresholds.pm1_pathogenic} pathogenic " "variants. " ) uniprot_domain = self._get_uniprot_domain(seqvar) if not uniprot_domain: self.comment_pm1 += "Variant is not in a UniProt domain. PM1 is not met." self.prediction_pm1.PM1 = False return self.prediction_pm1, self.comment_pm1 start_pos, end_pos = uniprot_domain pathogenic_count, benign_count = self._count_vars(seqvar, start_pos, end_pos) self.comment_pm1 += ( f"Found {pathogenic_count} Pathogenic variants " f"and {benign_count} Benign variants " f"in Uniprot Domain: {start_pos}-{end_pos}. " ) if pathogenic_count >= (end_pos - start_pos) / 4: self.comment_pm1 += "PM1 is met." self.prediction_pm1.PM1 = True return self.prediction_pm1, self.comment_pm1 else: self.comment_pm1 += "PM1 is not met." self.prediction_pm1.PM1 = False except AutoAcmgBaseException as e: self.comment_pm1 = f"Error occurred during PM1 prediction. Error: {e}" self.prediction_pm1 = None return self.prediction_pm1, self.comment_pm1
[docs] def predict_pm1(self, seqvar: SeqVar, var_data: AutoACMGSeqVarData) -> AutoACMGCriteria: """Predict PM1 criteria.""" logger.info("Predict PM1") pred, comment = self.verify_pm1(seqvar, var_data) if pred: pm1_pred = ( AutoACMGPrediction.Applicable if pred.PM1 else ( AutoACMGPrediction.NotApplicable if pred.PM1 is False else AutoACMGPrediction.Failed ) ) pm1_strength = pred.PM1_strength else: pm1_pred = AutoACMGPrediction.Failed pm1_strength = AutoACMGStrength.PathogenicModerate return AutoACMGCriteria( name="PM1", prediction=pm1_pred, strength=pm1_strength, summary=comment, )