import pickle
from base64 import b64encode
from typing import Any
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from lomas_server.constants import NUMERICAL_DTYPES
from lomas_server.utils.error_handler import InvalidQueryException
def handle_missing_data(
    df: pd.DataFrame, imputer_strategy: str
) -> pd.DataFrame:
"""Impute missing data based on given imputation strategy for NaNs
Args:
df (pd.DataFrame): dataframe with the data
imputer_strategy (str): string to indicate imputatation for NaNs
"drop": will drop all rows with missing values
"mean": will replace values by the mean of the column values
"median": will replace values by the median of the column values
"most_frequent": : will replace values by the most frequent values
Raises:
InvalidQueryException: If the "imputer_strategy" does not exist
Returns:
df (pd.DataFrame): dataframe with the imputed data
"""
    dtypes = df.dtypes
    if imputer_strategy == "drop":
        df = df.dropna()
    elif imputer_strategy in ["mean", "median"]:
        numerical_cols = df.select_dtypes(
            include=NUMERICAL_DTYPES
        ).columns.tolist()
        categorical_cols = [
            col for col in df.columns if col not in numerical_cols
        ]

        # Impute numerical features using given strategy
        imp_mean = SimpleImputer(strategy=imputer_strategy)
        df_num_imputed = imp_mean.fit_transform(df[numerical_cols])

        # Impute categorical features with most frequent value
        imp_most_frequent = SimpleImputer(strategy="most_frequent")
        df[categorical_cols] = df[categorical_cols].astype("object")
        df[categorical_cols] = df[categorical_cols].replace({pd.NA: np.nan})
        df_cat_imputed = imp_most_frequent.fit_transform(df[categorical_cols])

        # Combine imputed dataframes
        df = pd.concat(
            [
                pd.DataFrame(df_num_imputed, columns=numerical_cols),
                pd.DataFrame(df_cat_imputed, columns=categorical_cols),
            ],
            axis=1,
        )
    elif imputer_strategy == "most_frequent":
        # Impute all features with most frequent value
        imp_most_frequent = SimpleImputer(strategy=imputer_strategy)
        df[df.columns] = df[df.columns].astype("object")
        df[df.columns] = df[df.columns].replace({pd.NA: np.nan})
        df = pd.DataFrame(
            imp_most_frequent.fit_transform(df), columns=df.columns
        )
    else:
        raise InvalidQueryException(
            f"Imputation strategy {imputer_strategy} not supported."
        )
    df = df.astype(dtype=dtypes)
    return df
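
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how handle_missing_data could be called, assuming
# NUMERICAL_DTYPES covers float64 so that "age" is treated as numerical and
# "city" (object dtype) as categorical. The data below is hypothetical.
def _example_handle_missing_data() -> pd.DataFrame:
    raw = pd.DataFrame(
        {
            "age": [25.0, np.nan, 40.0],
            "city": ["Geneva", "Geneva", np.nan],
        }
    )
    # "median" fills the numerical column with its median (32.5); the
    # categorical column falls back to its most frequent value ("Geneva").
    return handle_missing_data(raw, imputer_strategy="median")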
def serialise_model(model: Any) -> str:
"""
Serialise a python object (fitted Smartnoise Synth synthesizer of
fitted DiffPrivLib pipeline) into an utf-8 string
Args:
model (Any): An object to serialise
Returns:
str: string of serialised model
"""
serialised = b64encode(pickle.dumps(model))
return serialised.decode("utf-8")