Source code for harbor.pli.plip_analysis_schema

from pydantic import BaseModel, Field, model_validator
from enum import Enum, auto
from plip.structure.preparation import PDBComplex, PLInteraction
import pandas as pd
from pathlib import Path
import json


[docs] class InteractionType(Enum): """ Protein-Ligand Interaction Types. Names refer to the role of the Ligand """ HydrogenBondDonor = "HydrogenBondDonor" HydrogenBondAcceptor = "HydrogenBondAcceptor" HydrophobicInteraction = "HydrophobicInteraction" PiStacking = "PiStacking" HalogenBond = "HalogenBond" SaltBridge = "SaltBridge" def __str__(self): return self.value
[docs] class AtomType(str, Enum): NONE = "none" H = "H" C = "C" N = "N" O = "O" S = "S" P = "P" F = "F" Cl = "Cl" Br = "Br" def __str__(self): return self.value
[docs] class ProteinResidueType(str, Enum): ALA = "ALA" ARG = "ARG" ASN = "ASN" ASP = "ASP" CYS = "CYS" GLN = "GLN" GLU = "GLU" GLY = "GLY" HIS = "HIS" ILE = "ILE" LEU = "LEU" LYS = "LYS" MET = "MET" PHE = "PHE" PRO = "PRO" SER = "SER" THR = "THR" TRP = "TRP" TYR = "TYR" VAL = "VAL" def __str__(self): return self.value
[docs] class FormalCharge(str, Enum): Positive = "positive" Negative = "negative" Neutral = "neutral" def __str__(self): return self.name
[docs] class ProteinLigandInteraction(BaseModel): interaction_type: InteractionType = Field( ..., description="Type of interaction between protein and ligand." ) protein_residue_number: int = Field( ..., description="Residue number of the protein involved in the interaction." ) protein_residue_type: ProteinResidueType = Field( ..., description="Amino acid identify of the residue involved in the interaction.", ) to_sidechain: bool = Field( ..., description="Indicates if the interaction is to the sidechain of the protein residue (if False, it is to the backbone).", ) count: int = Field( 1, description="Count of the number of interactions of this type." ) ligand_atom_type: AtomType = Field( "none", description="Type of the ligand atom involved in the interaction." ) protein_atom_type: AtomType = Field( "none", description="Type of the protein atom involved in the interaction." ) ligand_charge: FormalCharge = Field( "neutral", description="Formal charge of the ligand involved in the interaction.", ) protein_charge: FormalCharge = Field( "neutral", description="Formal charge of the protein involved in the interaction.", ) protein_atom_number: int = Field( -1, description="Atom number of the protein involved in the interaction. Optional, can be -1 if not applicable.", ) ligand_atom_number: int = Field( -1, description="Atom number of the ligand involved in the interaction. Optional, can be -1 if not applicable.", ) def to_json_dict(self) -> dict: return { k: v.value if k in [self.interaction_type, self.protein_residue_type] else v for k, v in self.dict().items() }
[docs] def plip_constructor(plip: PLInteraction) -> list[ProteinLigandInteraction]: """ Function to get a tidy dataframe of interactions from a plip fingerprint Parameters ---------- plip : PLInteraction The PLIP interaction object containing all interaction types for a given binding site. Returns ------- list[ProteinLigandInteraction] """ interactions = [] for interaction in plip.all_itypes: interaction_type = type(interaction).__name__ protein_atom_type = AtomType.NONE ligand_atom_type = AtomType.NONE ligand_charge = FormalCharge.Neutral protein_charge = FormalCharge.Neutral protein_atom_number = -1 ligand_atom_number = -1 if interaction_type == "hbond": output_interaction_type = ( InteractionType.HydrogenBondAcceptor if interaction.protisdon else InteractionType.HydrogenBondDonor ) protein_atom_type = AtomType( interaction.dtype[0] if interaction.protisdon else interaction.atype[0] ) ligand_atom_type = AtomType( interaction.atype[0] if interaction.protisdon else interaction.dtype[0] ) protein_atom_number = ( interaction.a_orig_idx if interaction.protisdon else interaction.d_orig_idx ) ligand_atom_number = ( interaction.d_orig_idx if interaction.protisdon else interaction.a_orig_idx ) elif interaction_type == "halogenbond": # We assume that halogen bonds are always ligand-donor protein-acceptor output_interaction_type = InteractionType.HalogenBond protein_atom_type = AtomType(interaction.acctype[0]) ligand_atom_type = AtomType(interaction.donortype[0]) protein_atom_number = interaction.acc_orig_idx ligand_atom_number = interaction.don_orig_idx elif interaction_type == "saltbridge": output_interaction_type = InteractionType.SaltBridge ligand_charge = ( FormalCharge.Negative if interaction.protispos else FormalCharge.Positive ) protein_charge = ( FormalCharge.Positive if interaction.protispos else FormalCharge.Negative ) elif interaction_type == "hydroph_interaction": output_interaction_type = InteractionType.HydrophobicInteraction elif interaction_type == "pication": output_interaction_type = InteractionType.PiStacking protein_charge = ( FormalCharge.Positive if interaction.protcharged else FormalCharge.Negative ) elif interaction_type == "pistack": output_interaction_type = InteractionType.PiStacking else: raise NotImplementedError( f"Interaction type not implemented: {interaction_type} for interaction: {interaction}" ) protein_residue_number = interaction.resnr protein_residue_type = interaction.restype # These interactions must be to the sidechain, and therefore PLIP does not provide the sidechain boolean if interaction_type in [ "hydroph_interaction", "pistack", "saltbridge", "pication", ]: to_sidechain = True else: to_sidechain = interaction.sidechain interactions.append( ProteinLigandInteraction( interaction_type=output_interaction_type, protein_residue_number=protein_residue_number, protein_residue_type=protein_residue_type, to_sidechain=to_sidechain, ligand_atom_type=ligand_atom_type, protein_atom_type=protein_atom_type, ligand_charge=ligand_charge, protein_charge=protein_charge, protein_atom_number=protein_atom_number, ligand_atom_number=ligand_atom_number, ) ) return interactions
[docs] def collect_duplicates( plints: list[ProteinLigandInteraction], ) -> list[ProteinLigandInteraction]: """ This function takes a list of ProteinLigandInteractions and returns a list of unique interactions with the count updated if any duplicates are found. Parameters ---------- plints : list[ProteinLigandInteraction] A list of ProteinLigandInteraction objects, potentially with duplicates. Returns ------- list[ProteinLigandInteraction] """ obj_dict = {} for obj in plints: if obj.json() in obj_dict: obj_dict[obj.json()].count += 1 else: obj_dict[obj.json()] = obj return list(obj_dict.values())
[docs] class PLIntReport(BaseModel): """ Class to store a report of PLIP interactions Parameters ---------- structure : str The path to the PDB structure file. interactions : list[ProteinLigandInteraction] A list of ProteinLigandInteraction objects representing the interactions found in the structure. """ report_id: str = Field( ..., description="Identifier for the structure, typically the input structure filename without extension.", ) structure: str = Field(..., description="Path to the PDB structure file.") interactions: list[ProteinLigandInteraction] = Field( ..., description="List of Protein-Ligand Interaction objects found in the structure.", )
[docs] @classmethod def from_complex_path( cls, complex_path: str | Path, ligand_id="UNK" ) -> "PLIntReport": """ Create a PLIntReport from a PDBComplex object loaded from a file path. Parameters ---------- complex_path : str | Path The path to the PDB file containing the protein-ligand complex. ligand_id : str, optional The identifier of the ligand in the complex (default is "UNK"). Returns ------- PLIntReport An instance of PLIntReport containing the structure path and a list of ProteinLigandInteraction objects. """ my_mol = PDBComplex() my_mol.load_pdb(str(complex_path)) my_mol.analyze() binding_site = None for k, v in my_mol.interaction_sets.items(): if ligand_id in k: binding_site = k break # Convert the PLIP interaction analysis of the binding site to a list of ProteinLigandInteractions and then # Collect any duplicate interactions if binding_site: raw_plip_report = my_mol.interaction_sets[binding_site] interactions = collect_duplicates(plip_constructor(raw_plip_report)) else: interactions = [] return cls( report_id=complex_path.stem, structure=str(complex_path), interactions=interactions, )
[docs] def to_csv(self, path: str | Path): """ Save the interactions to a CSV file. Parameters ---------- path : str | Path The path where the CSV file will be saved. """ df = pd.DataFrame.from_records( [json.loads(interaction.json()) for interaction in self.interactions] ) df["report_id"] = self.report_id # Add report_id to each row for reference df["structure"] = self.structure # Add structure path to each row for reference df.to_csv(path, index=False)
[docs] @classmethod def from_csv(cls, plint_report: str | Path) -> "PLIntReport": """ Create a PLIntReport from a CSV file. Parameters ---------- plint_report : str | Path The path to the CSV file containing the interactions. Returns ------- PLIntReport An instance of PLIntReport containing the structure path and a list of ProteinLigandInteraction objects. """ df = pd.read_csv(plint_report) report_id = ( df["report_id"].iloc[0] if "report_id" in df.columns else Path(plint_report).stem ) pdb_file = ( df["structure"].iloc[0] if "structure" in df.columns else str(plint_report) ) interactions = [ProteinLigandInteraction(**row) for _, row in df.iterrows()] return PLIntReport( report_id=report_id, structure=pdb_file, interactions=interactions )
[docs] class FingerprintLevel(Enum): """ Enum to specify the level of detail for the fingerprint """ ByTotalInteractions = "ByTotalInteractions" ByInteractionType = "ByInteractionType" ByInteractionTypeAndResidueType = "ByInteractionTypeAndResidueType" ByInteractionTypeAndAtomTypes = "ByInteractionTypeAndAtomTypes" ByInteractionTypeAndResidueTypeAndBBorSC = ( "ByInteractionTypeAndResidueTypeAndBBorSC" ) ByInteractionTypeAndResidueTypeAndNumber = ( "ByInteractionTypeAndResidueTypeAndNumber" ) ByEverything = "ByEverything" def __str__(self): return self.value
[docs] def calculate_fingerprint(plint_report: PLIntReport, level: FingerprintLevel) -> dict: """ Calculate a fingerprint of the protein-ligand interactions based on the specified level of detail. Parameters ---------- plint_report : PLIntReport A report containing a list of protein-ligand interactions. level : FingerprintLevel The level of detail for the fingerprint. Options include: - ByTotalInteractions: Returns total count of interactions - ByInteractionType: Groups by type (hydrogen bond, salt bridge, etc.) - ByInteractionTypeAndResidueType: Groups by type and residue (e.g., HBond_ALA) - ByInteractionTypeAndAtomTypes: Groups by type and atom types involved - ByInteractionTypeAndResidueTypeAndBBorSC: Groups by type, residue and backbone/sidechain - ByInteractionTypeAndResidueTypeAndNumber: Groups by type, residue and residue number - ByEverything: Creates unique key for each unique interaction property combination Returns ------- dict A dictionary where keys are interaction descriptors based on the specified level and values are the counts of those interactions. For example: - ByTotalInteractions: {'TotalInteractions': 5} - ByInteractionType: {'HydrogenBond': 2, 'SaltBridge': 3} - ByInteractionTypeAndResidueType: {'HydrogenBond_ALA': 1, 'HydrogenBond_SER': 1} Raises ------ ValueError If an invalid FingerprintLevel is provided. """ fingerprint_dict = {} if level == FingerprintLevel.ByTotalInteractions: fingerprint_dict["TotalInteractions"] = len(plint_report.interactions) else: for interaction in plint_report.interactions: if level == FingerprintLevel.ByInteractionType: key = interaction.interaction_type.value elif level == FingerprintLevel.ByInteractionTypeAndResidueType: key = f"{interaction.interaction_type.value}_{interaction.protein_residue_type}" elif level == FingerprintLevel.ByInteractionTypeAndAtomTypes: key = f"{interaction.interaction_type.value}_Protein_{interaction.protein_atom_type}_Ligand_{interaction.ligand_atom_type}" elif level == FingerprintLevel.ByInteractionTypeAndResidueTypeAndBBorSC: key = f"{interaction.interaction_type.value}_{interaction.protein_residue_type}_{'SC' if interaction.to_sidechain else 'BB'}" elif level == FingerprintLevel.ByInteractionTypeAndResidueTypeAndNumber: key = f"{interaction.interaction_type.value}_{interaction.protein_residue_type}{interaction.protein_residue_number}" elif level == FingerprintLevel.ByEverything: key = "_".join([f"{k}_{str(v)}" for k, v in interaction.dict().items()]) else: raise ValueError("Invalid Fingerprint Level") original_count = fingerprint_dict.get(key, 0) fingerprint_dict[key] = original_count + interaction.count return fingerprint_dict
[docs] class SimilarityScore(BaseModel): """ Class to store a similarity score between two fingerprints with some record of how it was calculated. """ provenance: str = Field( ..., description="Method used to calculate the similarity score." ) score: float = Field( ..., description="Similarity score between the two fingerprints, ranging from 0 to 1.", ) number_of_interactions_in_reference: int = Field( ..., description="Total number of interactions in the reference fingerprint." ) number_of_interactions_in_query: int = Field( ..., description="Total number of interactions in the query fingerprint." ) number_of_interactions_in_intersection: int = Field( ..., description="Number of interactions common to both fingerprints." ) number_of_interactions_in_union: int = Field( ..., description="Total number of unique interactions across both fingerprints." )
[docs] @model_validator(mode="before") def check_score_range(cls, values): """ Validate that the score is between 0 and 1. """ score = values.get("score") if not (0 <= score <= 1): raise ValueError("Score must be between 0 and 1.") return values
[docs] @model_validator(mode="before") def check_interaction_counts(cls, values): """ Validate that the interaction counts are non-negative and make sense. """ num_ref = values.get("number_of_interactions_in_reference") num_query = values.get("number_of_interactions_in_query") num_intersection = values.get("number_of_interactions_in_intersection") num_union = values.get("number_of_interactions_in_union") if num_ref < 0 or num_query < 0 or num_intersection < 0 or num_union < 0: raise ValueError("Interaction counts must be non-negative.") if not (num_intersection <= num_ref and num_intersection <= num_query): raise ValueError( f"Number of interactions in intersection ({num_intersection}) cannot exceed those in reference ({num_ref}) or query ({num_query})." ) if not (num_union >= max(num_ref, num_query)): raise ValueError( f"Number of interactions in union ({num_union}) must be at least as large as the larger of the two fingerprints (ref {num_ref}, query {num_query})." ) if not num_ref + num_query - num_intersection == num_union: raise ValueError( f"Union {num_union} must equal the sum of both fingerprints (ref {num_ref}, query {num_query}) minus the intersection ({num_intersection})." ) return values
[docs] def calculate_tversky( fingerprint1: dict, fingerprint2: dict, alpha: float = 1, beta: float = 0 ) -> SimilarityScore: """ Calculate the Tversky Index between two fingerprints. The Tversky Index is an asymmetric similarity measure that generalizes several other similarity coefficients: Tversky Index = |A ∩ B| / (|A ∩ B| + α|A - B| + β|B - A|) - For Recall: α=1, β=0 - For Tanimoto Coefficient: α=β=1 - For Dice Coefficient: α=β=0.5 Parameters ---------- fingerprint1 : dict Reference fingerprint dictionary with features as keys and counts as values. fingerprint2 : dict Query fingerprint dictionary with features as keys and counts as values. alpha : float, optional Weight for features unique to fingerprint1 (default=1). beta : float, optional Weight for features unique to fingerprint2 (default=0). Returns ------- SimilarityScore Object containing the similarity score and additional metrics about the comparison. """ # First get the union of the keys fp_types = set(fingerprint1.keys()).union(set(fingerprint2.keys())) # Calculate the number of interactions that are in both fingerprints matched = sum( [min(fingerprint1.get(a, 0), fingerprint2.get(a, 0)) for a in fp_types] ) # Calculate the number of interactions in the union union = sum(fingerprint1.values()) + sum(fingerprint2.values()) - matched union_v2 = sum( [max(fingerprint1.get(a, 0), fingerprint2.get(a, 0)) for a in fp_types] ) if not union == union_v2: raise ValueError( f"Union calculation mismatch: {union} != {union_v2}. Please check the input fingerprints." ) # Calculate the Tversky Index score = matched / ( matched + alpha * (sum(fingerprint1.values()) - matched) + beta * (sum(fingerprint2.values()) - matched) ) return SimilarityScore( provenance=f"Tversky_alpha{alpha:.2f}_beta{beta:.2f}", score=score, number_of_interactions_in_reference=sum(fingerprint1.values()), number_of_interactions_in_query=sum(fingerprint2.values()), number_of_interactions_in_intersection=matched, number_of_interactions_in_union=union, )
[docs] class InteractionScore(BaseModel): """ Class to store the results of comparing two PLIntReports. Automatically calculates the Tanimoto Coefficient (alpha=beta=1) and the Tversky Index (alpha=1, beta=0) """ provenance: str = Field( ..., description="Level of detail used for the fingerprint comparison (e.g., ByInteractionType).", ) number_of_interactions_in_query: int = Field( ..., description="Total number of interactions in the query structure." ) number_of_interactions_in_reference: int = Field( ..., description="Total number of interactions in the reference structure." ) number_of_interactions_in_intersection: int = Field( ..., description="Number of interactions common to both structures." ) number_of_interactions_in_union: int = Field( ..., description="Total number of unique interactions across both structures." ) tanimoto_coefficient: float = Field( ..., description="Tanimoto coefficient (α=β=1) measuring similarity between the structures, ranges from 0 to 1.", ) tversky_index: float = Field( ..., description="Tversky index (α=1, β=0) measuring how well the query matches the reference, ranges from 0 to 1.", ) reference_fingerprint: dict = Field( ..., description="Dictionary containing the interaction fingerprint of the reference structure.", ) query_fingerprint: dict = Field( ..., description="Dictionary containing the interaction fingerprint of the query structure.", ) @classmethod def from_fingerprints( cls, reference: PLIntReport, query: PLIntReport, level: FingerprintLevel ) -> "InteractionScore": # Calculate the fingerprints according to the specified level of detail reference = calculate_fingerprint(reference, level) query = calculate_fingerprint(query, level) # Calculate the Tanimoto Coefficient and Tversky Index tanimoto = calculate_tversky(reference, query, alpha=1, beta=1) tversky_index = calculate_tversky(reference, query) return cls( provenance=level.value, number_of_interactions_in_query=tanimoto.number_of_interactions_in_query, number_of_interactions_in_reference=tanimoto.number_of_interactions_in_reference, number_of_interactions_in_intersection=tanimoto.number_of_interactions_in_intersection, number_of_interactions_in_union=tanimoto.number_of_interactions_in_union, tanimoto_coefficient=tanimoto.score, tversky_index=tversky_index.score, reference_fingerprint=reference, query_fingerprint=query, )