"""Implementation of PM1 criteria."""
import os
from typing import Optional, Tuple
import tabix
from loguru import logger
from src.core.config import settings
from src.defs.auto_acmg import (
PM1,
AutoACMGCriteria,
AutoACMGPrediction,
AutoACMGSeqVarData,
AutoACMGStrength,
GenomicStrand,
)
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, InvalidAPIResposeError
from src.defs.genome_builds import GenomeRelease
from src.defs.seqvar import SeqVar
from src.utils import AutoACMGHelper
[docs]
class AutoPM1(AutoACMGHelper):
"""Class for PM1 prediction."""
def __init__(self):
super().__init__()
#: Prediction result.
self.prediction_pm1: Optional[PM1] = None
#: comment_pm1 to store the prediction explanation.
self.comment_pm1: str = ""
[docs]
def _get_affected_exon(self, var_data: AutoACMGSeqVarData, seqvar: SeqVar) -> int:
"""
Get the affected exon number for the variant.
Go through all exons and count them before the variant position. The method also considers
the strand of the gene.
Args:
var_data: AutoACMGData object
seqvar: SeqVar object
Returns:
int: Affected exon number
Raises:
AlgorithmError: If the strand is invalid.
"""
exon_number = 0
if var_data.strand == GenomicStrand.Plus:
for exon in var_data.exons:
if exon.altStartI >= seqvar.pos:
return exon_number
if exon.altStartI <= seqvar.pos <= exon.altEndI:
exon_number += 1
return exon_number
if exon.altEndI < seqvar.pos:
exon_number += 1
elif var_data.strand == GenomicStrand.Minus:
for exon in var_data.exons[::-1]:
if exon.altEndI <= seqvar.pos:
return exon_number
if exon.altStartI <= seqvar.pos <= exon.altEndI:
exon_number += 1
return exon_number
if exon.altStartI > seqvar.pos:
exon_number += 1
else:
raise AlgorithmError(f"Invalid strand for {var_data.hgnc_id}: {var_data.strand}")
return exon_number
[docs]
def _count_vars(self, seqvar: SeqVar, start_pos: int, end_pos: int) -> Tuple[int, int]:
"""
Counts pathogenic and benign variants in the specified range.
The method retrieves variants from the specified range and iterates through the ClinVar data
of each variant to count the number of pathogenic and benign SNVs.
Note:
The method considers "Pathogenic" and "Likely pathogenic" as pathogenic and "Benign"
and "Likely benign" as benign.
Args:
seqvar: The sequence variant being analyzed.
start_pos: The start position of the range.
end_pos: The end position of the range.
Returns:
Tuple[int, int]: The number of pathogenic and benign variants.
Raises:
AlgorithmError: If end position is less than the start position.
InvalidAPIResposeError: If the API response is invalid or cannot be processed.
"""
if end_pos < start_pos:
raise AlgorithmError("End position is less than the start position.")
response = self.annonars_client.get_variant_from_range(seqvar, start_pos, end_pos)
if response and response.clinvar:
pathogenic_variants = [
v
for v in response.clinvar
if v.records
and v.records[0].classifications
and v.records[0].classifications.germlineClassification
and v.records[0].classifications.germlineClassification.description
in [
"Pathogenic",
"Likely pathogenic",
]
and v.records[0].variationType == "VARIATION_TYPE_SNV"
]
benign_variants = [
v
for v in response.clinvar
if v.records
and v.records[0].classifications
and v.records[0].classifications.germlineClassification
and v.records[0].classifications.germlineClassification.description
in [
"Benign",
"Likely benign",
]
and v.records[0].variationType == "VARIATION_TYPE_SNV"
]
return len(pathogenic_variants), len(benign_variants)
else:
raise InvalidAPIResposeError("Failed to get variant from range. No ClinVar data.")
[docs]
def _get_uniprot_domain(self, seqvar: SeqVar) -> Optional[Tuple[int, int]]:
"""
Retrieve the UniProt domain for the variant and return the start and end positions if found
or None otherwise.
Args:
seqvar: The sequence variant being analyzed.
Returns:
Optional[Tuple[int, int]]: The start and end positions of the UniProt domain if found,
None otherwise.
Raises:
AlgorithmError: If tabix fails to query the UniProt file.
"""
try:
# Find path to the lib file
if seqvar.genome_release == GenomeRelease.GRCh37:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz"
)
else:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz"
)
tb = tabix.open(path)
records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos)
# Return the first record
for record in records:
return int(record[1]), int(record[2])
return None
except tabix.TabixError as e:
raise AlgorithmError("Failed to check if the variant is in a UniProt domain.") from e
[docs]
def verify_pm1(self, seqvar: SeqVar, var_data: AutoACMGSeqVarData) -> Tuple[Optional[PM1], str]:
"""
Predict PM1 criteria.
The method verifies the PM1 criteria for the variant. It first checks if the variant is in
the mitochondrial genome. If so, it returns PM1 as not met. Otherwise, it counts the number
of pathogenic and benign variants in the range of 25 bases before and after the variant
position. If the number of pathogenic variants is greater than or equal to the threshold,
it returns PM1 as met. Otherwise, it retrieves the UniProt domain for the variant and
counts the number of pathogenic and benign variants in the domain. If the number of
pathogenic variants is greater than or equal to 1/4 of the domain length, it returns PM1 as
met. Otherwise, it returns PM1 as not met.
Args:
seqvar: The sequence variant being analyzed.
var_data: AutoACMGData object
Returns:
Tuple[Optional[PM1], str]: The prediction result and the explanation
"""
self.prediction_pm1 = PM1()
self.comment_pm1 = ""
if seqvar.chrom == "MT":
# skipped according to McCormick et al. (2020).
self.comment_pm1 = "The variant is in the mitochondrial genome. PM1 is not met."
self.prediction_pm1.PM1 = False
else:
try:
pathogenic_count, benign_count = self._count_vars(
seqvar, seqvar.pos - 25, seqvar.pos + 25
)
if pathogenic_count >= var_data.thresholds.pm1_pathogenic:
self.comment_pm1 += (
f"Found {var_data.thresholds.pm1_pathogenic} or more pathogenic variants. "
"PM1 is met."
)
self.prediction_pm1.PM1 = True
return self.prediction_pm1, self.comment_pm1
else:
self.comment_pm1 += (
f"Found less than {var_data.thresholds.pm1_pathogenic} pathogenic "
"variants. "
)
uniprot_domain = self._get_uniprot_domain(seqvar)
if not uniprot_domain:
self.comment_pm1 += "Variant is not in a UniProt domain. PM1 is not met."
self.prediction_pm1.PM1 = False
return self.prediction_pm1, self.comment_pm1
start_pos, end_pos = uniprot_domain
pathogenic_count, benign_count = self._count_vars(seqvar, start_pos, end_pos)
self.comment_pm1 += (
f"Found {pathogenic_count} Pathogenic variants "
f"and {benign_count} Benign variants "
f"in Uniprot Domain: {start_pos}-{end_pos}. "
)
if pathogenic_count >= (end_pos - start_pos) / 4:
self.comment_pm1 += "PM1 is met."
self.prediction_pm1.PM1 = True
return self.prediction_pm1, self.comment_pm1
else:
self.comment_pm1 += "PM1 is not met."
self.prediction_pm1.PM1 = False
except AutoAcmgBaseException as e:
self.comment_pm1 = f"Error occurred during PM1 prediction. Error: {e}"
self.prediction_pm1 = None
return self.prediction_pm1, self.comment_pm1
[docs]
def predict_pm1(self, seqvar: SeqVar, var_data: AutoACMGSeqVarData) -> AutoACMGCriteria:
"""Predict PM1 criteria."""
logger.info("Predict PM1")
pred, comment = self.verify_pm1(seqvar, var_data)
if pred:
pm1_pred = (
AutoACMGPrediction.Applicable
if pred.PM1
else (
AutoACMGPrediction.NotApplicable
if pred.PM1 is False
else AutoACMGPrediction.Failed
)
)
pm1_strength = pred.PM1_strength
else:
pm1_pred = AutoACMGPrediction.Failed
pm1_strength = AutoACMGStrength.PathogenicModerate
return AutoACMGCriteria(
name="PM1",
prediction=pm1_pred,
strength=pm1_strength,
summary=comment,
)