Source code for harbor.schema.data

from enum import Enum
from typing import Optional

import pandas as pd, numpy as np
from pydantic import BaseModel, Field, model_validator

from harbor.schema.measurement_types import (
    ExperimentalMeasurementType,
    PredictedMeasurementType,
    IsActive,
)


class Molecule(BaseModel):
    """
    A molecule identified by an ID, with an optional SMILES string.

    Attributes
    ----------
    id
        Required identifier of the molecule.
    smiles
        SMILES string of the molecule; defaults to ``None`` when not given.
    """

    id: str = Field(..., description="ID")
    # The default is None, so the annotation must admit None as well.
    # Otherwise an explicit ``smiles=None`` (e.g. when re-creating a molecule
    # from ``model_dump()``) is rejected by validation.
    smiles: Optional[str] = Field(None, description="SMILES string")
class MoleculePropertyType(Enum):
    """
    Closed set of molecule property names.

    Each member's value is the same lower-case string as its name, so a
    member can be looked up from its string form, e.g.
    ``MoleculePropertyType("pka")``.
    """

    pka = "pka"
    logp = "logp"
    solubility = "solubility"
    melting_point = "melting_point"
class Prediction(BaseModel):
    """
    A single predicted value for one molecule.

    Attributes
    ----------
    molecule
        The molecule the prediction refers to.
    type
        The kind of predicted measurement.
    value
        The predicted number.
    """

    # All three fields are required (``...`` marks "no default").
    molecule: Molecule = Field(
        ...,
        description="Molecule",
    )
    type: PredictedMeasurementType = Field(
        ...,
        description="Prediction type",
    )
    value: float = Field(
        ...,
        description="Prediction",
    )
class Experiment(BaseModel):
    """
    A single experimental measurement for one molecule.

    Attributes
    ----------
    molecule
        The molecule that was measured.
    type
        The kind of experimental measurement.
    value
        The measured number.
    """

    molecule: Molecule = Field(..., description="Molecule")
    type: ExperimentalMeasurementType = Field(..., description="Experiment type")
    value: float = Field(..., description="Experimental value")

    def to_active_inactive(self, threshold: float) -> "Experiment":
        """
        Binarize this measurement against ``threshold``.

        The direction of the comparison follows ``self.type.higher_is_better``:
        when it is truthy the value must be at least ``threshold`` to count as
        active, otherwise it must be at most ``threshold``.

        Parameters
        ----------
        threshold
            Cut-off separating active from inactive; the boundary itself
            counts as active in both directions.

        Returns
        -------
        Experiment
            A new experiment for the same molecule with ``type=IsActive`` and
            a value of 1 (active) or 0 (inactive).
        """
        if self.type.higher_is_better:
            passes_cutoff = self.value >= threshold
        else:
            passes_cutoff = self.value <= threshold

        label = 1 if passes_cutoff else 0
        return Experiment(molecule=self.molecule, type=IsActive, value=label)
class Dataset(BaseModel):
    """
    Paired predictions and experimental measurements for a list of molecules.

    The three lists are expected to be parallel: validation enforces that
    they have equal length and that every prediction / experiment carries the
    dataset-level ``prediction_type`` / ``experiment_type``.

    Attributes
    ----------
    molecules
        The molecules in the dataset.
    predictions
        One prediction per molecule.
    experiments
        One experimental measurement per molecule.
    prediction_type
        The measurement type shared by all predictions.
    experiment_type
        The measurement type shared by all experiments.
    """

    molecules: list[Molecule] = Field(..., description="Molecules")
    predictions: list[Prediction] = Field(..., description="Predictions")
    experiments: list[Experiment] = Field(..., description="Experiment")
    prediction_type: PredictedMeasurementType = Field(
        ..., description="Prediction type"
    )
    experiment_type: ExperimentalMeasurementType = Field(
        ..., description="Experiment type"
    )

    @property
    def predicted_values(self) -> np.ndarray:
        """Predicted values as an array, in the order of ``predictions``."""
        return np.array([p.value for p in self.predictions])

    def get_higher_is_better_values(self) -> np.ndarray:
        """
        Return predicted values oriented so that larger means better.

        Values are returned unchanged when
        ``prediction_type.higher_is_better`` is truthy and negated otherwise.
        """
        if self.prediction_type.higher_is_better:
            return self.predicted_values
        return -self.predicted_values

    @property
    def experimental_values(self) -> np.ndarray:
        """Experimental values as an array, in the order of ``experiments``."""
        return np.array([e.value for e in self.experiments])

    @property
    def molecule_ids(self) -> np.ndarray:
        """Molecule IDs as an array, taken from the predictions' molecules."""
        return np.array([p.molecule.id for p in self.predictions])

    def to_active_inactive(self, threshold: float) -> "ActiveInactiveDataset":
        """
        Binarize every experiment against ``threshold``.

        Parameters
        ----------
        threshold
            Cut-off forwarded to ``Experiment.to_active_inactive``.

        Returns
        -------
        ActiveInactiveDataset
            A dataset with the same molecules and predictions, whose
            experiments are the binarized ones and whose ``experiment_type``
            is ``IsActive``.
        """
        experiments = [e.to_active_inactive(threshold) for e in self.experiments]
        return ActiveInactiveDataset(
            molecules=self.molecules,
            predictions=self.predictions,
            experiments=experiments,
            prediction_type=self.prediction_type,
            experiment_type=IsActive,
        )

    @model_validator(mode="after")
    def check_consistency(self) -> "Dataset":
        """
        Validate that the lists agree with each other and with the types.

        Raises
        ------
        ValueError
            If a prediction or experiment has a type different from the
            dataset-level one, or if the three lists differ in length.
        """
        if not all(p.type == self.prediction_type for p in self.predictions):
            raise ValueError("Inconsistent prediction types")
        if not all(e.type == self.experiment_type for e in self.experiments):
            raise ValueError("Inconsistent experiment types")
        if not len(self.molecules) == len(self.predictions) == len(self.experiments):
            raise ValueError(
                f"Inconsistent number of "
                f"molecules({len(self.molecules)}), "
                f"predictions({len(self.predictions)}), and "
                f"experiments({len(self.experiments)})"
            )
        # An "after" model validator must hand the validated instance back;
        # falling off the end would return None instead of the model.
        return self

    @classmethod
    def from_csv(
        cls,
        filename: str,
        id_column: str,
        experimental_data_column: str,
        prediction_column: str,
        prediction_type: PredictedMeasurementType,
        experiment_type: ExperimentalMeasurementType,
        smiles_column: Optional[str] = None,
    ) -> "Dataset":
        """
        Build a dataset from a CSV file.

        The file is read with ``pandas.read_csv`` and the resulting frame is
        passed to ``from_dataframe`` with the same column arguments.

        Parameters
        ----------
        filename
            Path of the CSV file to read.
        id_column
            Column holding the molecule IDs.
        experimental_data_column
            Column holding the experimental values.
        prediction_column
            Column holding the predicted values.
        prediction_type
            Measurement type assigned to every prediction.
        experiment_type
            Measurement type assigned to every experiment.
        smiles_column
            Optional column holding SMILES strings.
        """
        df = pd.read_csv(filename)
        return cls.from_dataframe(
            df,
            id_column,
            experimental_data_column,
            prediction_column,
            prediction_type,
            experiment_type,
            smiles_column,
        )

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        id_column: str,
        experimental_data_column: str,
        prediction_column: str,
        prediction_type: PredictedMeasurementType,
        experiment_type: ExperimentalMeasurementType,
        smiles_column: Optional[str] = None,
    ) -> "Dataset":
        """
        Build a dataset from a data frame, one molecule per row.

        Parameters
        ----------
        df
            Frame with one row per molecule.
        id_column
            Column holding the molecule IDs.
        experimental_data_column
            Column holding the experimental values.
        prediction_column
            Column holding the predicted values.
        prediction_type
            Measurement type assigned to every prediction.
        experiment_type
            Measurement type assigned to every experiment.
        smiles_column
            Optional column holding SMILES strings; when omitted, molecules
            are created without SMILES.

        Raises
        ------
        ValueError
            If a row has a missing ID, a missing experimental value, or (when
            ``smiles_column`` is given) a missing SMILES string.
        """
        molecules = []
        predictions = []
        experiments = []
        for index, row in df.iterrows():
            # pandas represents missing cells as NaN/NA rather than None, so
            # an ``is None`` test never fires on real data; pd.isna covers
            # None, NaN and NA alike.
            if pd.isna(row[id_column]):
                raise ValueError(f"ID is missing in {(index, row)}")
            if pd.isna(row[experimental_data_column]):
                raise ValueError(f"Experimental data is missing in {(index, row)}")
            if smiles_column and pd.isna(row[smiles_column]):
                raise ValueError(f"SMILES is missing in {(index, row)}")

            molecule = (
                Molecule(id=row[id_column], smiles=row[smiles_column])
                if smiles_column
                else Molecule(id=row[id_column])
            )
            molecules.append(molecule)
            predictions.append(
                Prediction(
                    molecule=molecule,
                    type=prediction_type,
                    value=row[prediction_column],
                )
            )
            experiments.append(
                Experiment(
                    molecule=molecule,
                    type=experiment_type,
                    value=row[experimental_data_column],
                )
            )
        return cls(
            molecules=molecules,
            predictions=predictions,
            experiments=experiments,
            prediction_type=prediction_type,
            experiment_type=experiment_type,
        )
class ActiveInactiveDataset(Dataset):
    """
    A ``Dataset`` whose experiment type defaults to ``IsActive``.

    The ``experiment_type`` field is declared frozen. The counting
    properties below presumably expect experimental values that are 0/1
    labels, as produced by ``Dataset.to_active_inactive`` -- confirm against
    callers that build this class directly.
    """

    experiment_type: ExperimentalMeasurementType = Field(
        IsActive,
        frozen=True,
        description="Experiment type",
    )

    @property
    def n_actives(self) -> int:
        """Sum of the experimental values, converted to ``int``."""
        total = np.sum(self.experimental_values)
        return int(total)

    @property
    def n_inactives(self) -> int:
        """Number of experimental values minus ``n_actives``."""
        n_total = len(self.experimental_values)
        return int(n_total - self.n_actives)