# Source code for lomas_core.models.requests_examples

from typing import Dict

from pydantic import JsonValue

from lomas_core.constants import SSynthGanSynthesizer
from lomas_core.models.constants import (
    DIFFPRIVLIB_VERSION,
    DUMMY_NB_ROWS,
    DUMMY_SEED,
    OPENDP_VERSION,
)

# Query constants
# -----------------------------------------------------------------------------
# Shared building blocks for the example request payloads defined further down.

# Value of the "dataset_name" field in every example payload.
PENGUIN_DATASET: str = "PENGUIN"

# Values of the "epsilon" and "delta" ("fixed_delta" for OpenDP) fields.
QUERY_EPSILON: float = 0.1
QUERY_DELTA: float = 1e-5

# Smartnoise-SQL examples: "query_str" and "mechanisms" fields.
SQL_QUERY: str = "SELECT COUNT(*) AS NB_ROW FROM df"
DP_MECHANISM: JsonValue = {"count": "gaussian"}

# DiffPrivLib example: "feature_columns" and "target_columns" fields.
FEATURE_COLUMNS: JsonValue = [
    "bill_length_mm", "bill_depth_mm",
    "flipper_length_mm", "body_mass_g",
]
TARGET_COLUMNS: JsonValue = ["species"]

# DiffPrivLib example: "test_train_split_seed", "test_size" and
# "imputer_strategy" fields.
SPLIT_SEED: int = 4
TEST_SIZE: float = 0.2
IMPUTER_STRATEGY: str = "drop"

# Smartnoise-Synth query example: "nb_samples" field.
SNSYNTH_NB_SAMPLES: int = 200


def make_dummy(example_query: Dict[str, JsonValue]) -> Dict[str, JsonValue]:
    """Build the dummy variant of an example query.

    Args:
        example_query: Example request payload to start from. It is left
            untouched; only a shallow copy is extended.

    Returns:
        A new dict holding every entry of ``example_query`` plus the
        ``"dummy_nb_rows"`` and ``"dummy_seed"`` entries, set to
        ``DUMMY_NB_ROWS`` and ``DUMMY_SEED`` respectively.
    """
    # Later keys win in a dict display, so the dummy settings override any
    # same-named entries already present in the input.
    return {
        **example_query,
        "dummy_nb_rows": DUMMY_NB_ROWS,
        "dummy_seed": DUMMY_SEED,
    }
# Lomas logic
# -----------------------------------------------------------------------------
example_get_admin_db_data: Dict[str, JsonValue] = {"dataset_name": PENGUIN_DATASET}

# Same payload with the dummy row-count and seed entries appended.
example_get_dummy_dataset: Dict[str, JsonValue] = make_dummy(example_get_admin_db_data)


# Smartnoise-SQL
# -----------------------------------------------------------------------------
example_smartnoise_sql_cost: Dict[str, JsonValue] = {
    "query_str": SQL_QUERY,
    "dataset_name": PENGUIN_DATASET,
    "epsilon": QUERY_EPSILON,
    "delta": QUERY_DELTA,
    "mechanisms": DP_MECHANISM,
}

# The query example is the cost example plus the "postprocess" flag.
example_smartnoise_sql: Dict[str, JsonValue] = {
    **example_smartnoise_sql_cost,
    "postprocess": True,
}
example_dummy_smartnoise_sql: Dict[str, JsonValue] = make_dummy(example_smartnoise_sql)


# Smartnoise-Synth
# -----------------------------------------------------------------------------
example_smartnoise_synth_cost: Dict[str, JsonValue] = {
    "dataset_name": PENGUIN_DATASET,
    "synth_name": SSynthGanSynthesizer.DP_CTGAN,
    "epsilon": QUERY_EPSILON,
    "delta": QUERY_DELTA,
    "select_cols": [],
    "synth_params": {
        "embedding_dim": 128,
        "batch_size": 50,
        "epochs": 5,
    },
    "nullable": True,
    "constraints": "",
}

# The query example extends the cost example with the sampling options.
# The copy is shallow: nested values are shared with the cost example.
example_smartnoise_synth_query: Dict[str, JsonValue] = {
    **example_smartnoise_synth_cost,
    "return_model": True,
    "condition": "",
    "nb_samples": SNSYNTH_NB_SAMPLES,
}
example_dummy_smartnoise_synth_query: Dict[str, JsonValue] = make_dummy(
    example_smartnoise_synth_query
)


# OpenDP
# -----------------------------------------------------------------------------
# JSON text assembled from adjacent string literals. Only the first fragment is
# an f-string (hence its doubled brace); the others are plain literals.
OPENDP_PIPELINE: str = (
    f'{{"version": "{OPENDP_VERSION}", '
    '"ast": {'
    # Five nested "partial_chain" nodes; each "rhs" entry below closes one.
    '"_type": "partial_chain", "lhs": {'
    '"_type": "partial_chain", "lhs": {'
    '"_type": "partial_chain", "lhs": {'
    '"_type": "partial_chain", "lhs": {'
    '"_type": "partial_chain", "lhs": {'
    # Innermost "lhs": the make_chain_tt constructor and its two arguments.
    '"_type": "constructor", "func": "make_chain_tt", "module": "combinators", '
    '"args": [{"_type": "constructor", "func": "make_select_column", '
    '"module": "transformations", '
    '"kwargs": {"key": "bill_length_mm", "TOA": "String"}}, '
    '{"_type": "constructor", "func": "make_split_dataframe", '
    '"module": "transformations", '
    '"kwargs": {"separator": ",", "col_names": {"_type": "list", "_items": ['
    '"species", "island", "bill_length_mm", "bill_depth_mm", '
    '"flipper_length_mm", "body_mass_g", "sex"]}}'
    "}]}, "
    # "rhs" entries, innermost chain first.
    '"rhs": {"_type": "constructor", "func": "then_cast_default", '
    '"module": "transformations", "kwargs": {"TOA": "f64"}}}, '
    '"rhs": {"_type": "constructor", "func": "then_clamp", '
    '"module": "transformations", "kwargs": {"bounds": [30.0, 65.0]}}}, '
    '"rhs": {"_type": "constructor", "func": "then_resize", '
    '"module": "transformations", "kwargs": {"size": 346, "constant": 43.61}}}, '
    '"rhs": {"_type": "constructor", "func": "then_variance", '
    '"module": "transformations"}}, '
    '"rhs": {"_type": "constructor", "func": "then_laplace", '
    '"module": "measurements", "kwargs": {"scale": 5.0}}}}'
)

example_opendp: Dict[str, JsonValue] = {
    "dataset_name": PENGUIN_DATASET,
    "opendp_json": OPENDP_PIPELINE,
    "fixed_delta": QUERY_DELTA,
}
example_dummy_opendp: Dict[str, JsonValue] = make_dummy(example_opendp)


# DiffPrivLib
# -----------------------------------------------------------------------------
# JSON text assembled from adjacent string literals. Only the "version" fragment
# is an f-string.
DIFFPRIVLIB_PIPELINE: str = (
    '{"module": "diffprivlib", '
    f'"version": "{DIFFPRIVLIB_VERSION}", '
    '"pipeline": ['
    # First pipeline entry: "scaler".
    '{"type": "_dpl_type:StandardScaler", "name": "scaler", "params": {'
    '"with_mean": true, "with_std": true, "copy": true, "epsilon": 0.5, '
    '"bounds": {"_tuple": true, '
    '"_items": [[30.0, 13.0, 150.0, 2000.0], [65.0, 23.0, 250.0, 7000.0]]}, '
    '"random_state": null, "accountant": "_dpl_instance:BudgetAccountant"}}, '
    # Second pipeline entry: "classifier".
    '{"type": "_dpl_type:LogisticRegression", "name": "classifier", "params": {'
    '"tol": 0.0001, "C": 1.0, "fit_intercept": true, "random_state": null, '
    '"max_iter": 100, "verbose": 0, "warm_start": false, "n_jobs": null, '
    '"epsilon": 1.0, "data_norm": 83.69469642643347, '
    '"accountant": "_dpl_instance:BudgetAccountant"}}'
    "]}"
)

example_diffprivlib: Dict[str, JsonValue] = {
    "dataset_name": PENGUIN_DATASET,
    "diffprivlib_json": DIFFPRIVLIB_PIPELINE,
    "feature_columns": FEATURE_COLUMNS,
    "target_columns": TARGET_COLUMNS,
    "test_size": TEST_SIZE,
    "test_train_split_seed": SPLIT_SEED,
    "imputer_strategy": IMPUTER_STRATEGY,
}
example_dummy_diffprivlib: Dict[str, JsonValue] = make_dummy(example_diffprivlib)