Source code for lomas_client.libraries.diffprivlib

from typing import List, Optional, Type

from diffprivlib_logger import serialise_pipeline
from lomas_core.models.requests import (
    DiffPrivLibDummyQueryModel,
    DiffPrivLibQueryModel,
    DiffPrivLibRequestModel,
)
from lomas_core.models.responses import CostResponse, QueryResponse
from sklearn.pipeline import Pipeline

from lomas_client.constants import (
    DUMMY_NB_ROWS,
    DUMMY_SEED,
)
from lomas_client.http_client import LomasHttpClient
from lomas_client.utils import validate_model_response


[docs] class DiffPrivLibClient: """A client for executing and estimating the cost of DiffPrivLib queries.""" def __init__(self, http_client: LomasHttpClient): self.http_client = http_client
[docs] def cost( self, pipeline: Pipeline, feature_columns: List[str] = [""], target_columns: List[str] = [""], test_size: float = 0.2, test_train_split_seed: int = 1, imputer_strategy: str = "drop", ) -> Optional[CostResponse]: """This function estimates the cost of executing a DiffPrivLib query. Args: pipeline (sklearn.pipeline): DiffPrivLib pipeline with three conditions: - The pipeline MUST start with a `models.StandardScaler`. Otherwise a PrivacyLeakWarning is raised by DiffPrivLib library and is treated as an error in lomas server. - `random_state` fields can only be int (`RandomState` will not work). - `accountant` fields must be None. Note: as in DiffPrivLib, avoid any DiffprivlibCompatibilityWarning to ensure that the pipeline does what is intended. feature_columns (list[str]): the list of feature column to train target_columns (list[str], optional): the list of target column to predict \ May be None for certain models. test_size (float, optional): proportion of the test set \ Defaults to 0.2. test_train_split_seed (int, optional): seed for random train test split \ Defaults to 1. imputer_strategy (str, optional): imputation strategy. Defaults to "drop". "drop": will drop all rows with missing values "mean": will replace values by the mean of the column values "median": will replace values by the median of the column values "most_frequent": : will replace values by the most frequent values Returns: Optional[dict[str, float]]: A dictionary containing the estimated cost. """ body_dict = { "dataset_name": self.http_client.dataset_name, "diffprivlib_json": serialise_pipeline(pipeline), "feature_columns": feature_columns, "target_columns": target_columns, "test_size": test_size, "test_train_split_seed": test_train_split_seed, "imputer_strategy": imputer_strategy, } body = DiffPrivLibRequestModel.model_validate(body_dict) res = self.http_client.post("estimate_diffprivlib_cost", body) return validate_model_response(res, CostResponse)
[docs] def query( self, pipeline: Pipeline, feature_columns: List[str], target_columns: Optional[List[str]] = None, test_size: float = 0.2, test_train_split_seed: int = 1, imputer_strategy: str = "drop", dummy: bool = False, nb_rows: int = DUMMY_NB_ROWS, seed: int = DUMMY_SEED, ) -> Optional[QueryResponse]: """Trains a DiffPrivLib pipeline and return a trained Pipeline. Args: pipeline (sklearn.pipeline): DiffPrivLib pipeline with three conditions: - The pipeline MUST start with a `models.StandardScaler`. Otherwise a PrivacyLeakWarning is raised by DiffPrivLib library and is treated as an error in lomas server. - `random_state` fields can only be int (`RandomState` will not work). - `accountant` fields must be None. Note: as in DiffPrivLib, avoid any DiffprivlibCompatibilityWarning to ensure that the pipeline does what is intended. feature_columns (list[str]): the list of feature column to train target_columns (list[str], optional): the list of target column to predict \ May be None for certain models. test_size (float, optional): proportion of the test set \ Defaults to 0.2. test_train_split_seed (int, optional): seed for random train test split \ Defaults to 1. imputer_strategy (str, optional): imputation strategy. Defaults to "drop". "drop": will drop all rows with missing values "mean": will replace values by the mean of the column values "median": will replace values by the median of the column values "most_frequent": : will replace values by the most frequent values dummy (bool, optional): Whether to use a dummy dataset. Defaults to False. nb_rows (int, optional): The number of rows in the dummy dataset.\ Defaults to DUMMY_NB_ROWS. seed (int, optional): The random seed for generating the dummy dataset.\ Defaults to DUMMY_SEED. Returns: Optional[Pipeline]: A trained DiffPrivLip pipeline """ body_dict = { "dataset_name": self.http_client.dataset_name, "diffprivlib_json": serialise_pipeline(pipeline), "feature_columns": feature_columns, "target_columns": target_columns, "test_size": test_size, "test_train_split_seed": test_train_split_seed, "imputer_strategy": imputer_strategy, } request_model: Type[DiffPrivLibRequestModel] if dummy: endpoint = "dummy_diffprivlib_query" body_dict["dummy_nb_rows"] = nb_rows body_dict["dummy_seed"] = seed request_model = DiffPrivLibDummyQueryModel else: endpoint = "diffprivlib_query" request_model = DiffPrivLibQueryModel body = request_model.model_validate(body_dict) res = self.http_client.post(endpoint, body) return validate_model_response(res, QueryResponse)