Source code for lomas_server.dp_queries.dummy_dataset

import datetime
import random

import numpy as np
import pandas as pd

from admin_database.admin_database import AdminDatabase
from constants import (
    DEFAULT_NUMERICAL_MAX,
    DEFAULT_NUMERICAL_MIN,
    DUMMY_NB_ROWS,
    DUMMY_SEED,
    NB_RANDOM_NONE,
    RANDOM_DATE_RANGE,
    RANDOM_DATE_START,
    RANDOM_STRINGS,
)
from private_dataset.in_memory_dataset import InMemoryDataset
from utils.error_handler import InternalServerException
from utils.input_models import GetDummyDataset


def make_dummy_dataset(
    metadata: dict,
    nb_rows: int = DUMMY_NB_ROWS,
    seed: int = DUMMY_SEED,
) -> pd.DataFrame:
    """Create a dummy dataset based on a metadata dictionary.

    Args:
        metadata (dict): dictionary of the metadata of the real dataset
        nb_rows (int, optional): number of rows in the dummy dataset.
            Defaults to DUMMY_NB_ROWS.
        seed (int, optional): random seed for reproducibility.
            Defaults to DUMMY_SEED.

    Raises:
        InternalServerException: If any unknown column type occurs.

    Returns:
        pd.DataFrame: dummy dataframe based on metadata
    """
    # Setting seed
    random.seed(seed)
    np.random.seed(seed)

    # Create dataframe
    df = pd.DataFrame()
    for col_name, data in metadata["columns"].items():
        # Create a random series based on the data type
        match data["type"]:
            case "string":
                if "cardinality" in data.keys():
                    cardinality = data["cardinality"]
                    if "categories" in data.keys():
                        categories = data["categories"]
                        serie = pd.Series(
                            random.choices(categories, k=nb_rows)
                        )
                    else:
                        serie = pd.Series(
                            random.choices(
                                RANDOM_STRINGS[:cardinality], k=nb_rows
                            )
                        )
                else:
                    serie = pd.Series(
                        random.choices(RANDOM_STRINGS, k=nb_rows)
                    )
            case "boolean":
                # dtype "boolean" (instead of bool) allows null values
                serie = pd.Series(
                    random.choices([True, False], k=nb_rows),
                    dtype="boolean",
                )
            case "int" | "float":
                column_min = (
                    data["lower"]
                    if "lower" in data.keys()
                    else DEFAULT_NUMERICAL_MIN
                )
                column_max = (
                    data["upper"]
                    if "upper" in data.keys()
                    else DEFAULT_NUMERICAL_MAX
                )
                if data["type"] == "int":
                    # pd.Series to ensure consistency between different types
                    serie = pd.Series(
                        np.random.randint(
                            column_min, column_max, size=nb_rows
                        )
                    )
                else:
                    serie = pd.Series(
                        np.random.uniform(
                            column_min, column_max, size=nb_rows
                        )
                    )
            case "datetime":
                # Start date plus a random offset within a range
                start = datetime.datetime.strptime(
                    RANDOM_DATE_START, "%m/%d/%Y"
                )
                serie = pd.Series(
                    [
                        start
                        + datetime.timedelta(
                            seconds=random.randrange(RANDOM_DATE_RANGE)
                        )
                        for _ in range(nb_rows)
                    ]
                )
            case "unknown":
                # Unknown columns are ignored by smartnoise-sql
                continue
            case _:
                raise InternalServerException(
                    f"unknown column type in metadata: "
                    f"{data['type']} in column {col_name}"
                )

        # Add None values if the column is nullable
        nullable = data["nullable"] if "nullable" in data.keys() else False
        if nullable:
            # Get the indexes of 'serie'
            indexes = serie.index.tolist()
            for _ in range(0, NB_RANDOM_NONE):
                index_to_insert = random.choice(indexes)
                serie.at[index_to_insert] = None

        # Add the randomly generated data as a new column of the dataframe
        df[col_name] = serie

    return df
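
# Example usage of make_dummy_dataset (a minimal sketch: the metadata
# dictionary below is hypothetical and only illustrates the expected
# structure; real metadata comes from the admin database):
#
#     metadata = {
#         "columns": {
#             "species": {
#                 "type": "string",
#                 "cardinality": 3,
#                 "categories": ["Adelie", "Chinstrap", "Gentoo"],
#             },
#             "bill_length_mm": {"type": "float", "lower": 30.0, "upper": 65.0},
#             "flipper_length_mm": {"type": "int", "lower": 150, "upper": 250},
#             "tagged_on": {"type": "datetime", "nullable": True},
#         }
#     }
#     df = make_dummy_dataset(metadata, nb_rows=100, seed=42)
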
def get_dummy_dataset_for_query(
    admin_database: AdminDatabase, query_json: GetDummyDataset
) -> InMemoryDataset:
    """Get a dummy dataset for a given query.

    Args:
        admin_database (AdminDatabase): An initialized instance
            of AdminDatabase.
        query_json (GetDummyDataset): The JSON request object for the query.

    Returns:
        InMemoryDataset: An in-memory dummy dataset instance.
    """
    # Create dummy dataset based on seed and number of rows
    ds_metadata = admin_database.get_dataset_metadata(query_json.dataset_name)
    ds_df = make_dummy_dataset(
        ds_metadata, query_json.dummy_nb_rows, query_json.dummy_seed
    )
    ds_private_dataset = InMemoryDataset(ds_metadata, ds_df)

    return ds_private_dataset
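
# Example usage of get_dummy_dataset_for_query (a minimal sketch; the
# dataset name is hypothetical and `admin_database` is assumed to be an
# already-initialized AdminDatabase holding metadata for that dataset):
#
#     query_json = GetDummyDataset(
#         dataset_name="PENGUIN", dummy_nb_rows=100, dummy_seed=42
#     )
#     dummy_ds = get_dummy_dataset_for_query(admin_database, query_json)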