import unittest
from typing import Any
from lomas_server.dp_queries.dummy_dataset import make_dummy_dataset
from lomas_server.utils.collection_models import Metadata
from lomas_server.utils.query_examples import DUMMY_NB_ROWS, DUMMY_SEED
[docs]
class TestMakeDummyDataset(unittest.TestCase):
"""
Tests for the generation of dummy datasets.
"""
metadata: dict[str, Any] = {
"max_ids": 1,
"rows": 100,
"row_privacy": True,
"columns": {},
}
[docs]
def test_categorical_column(self) -> None:
"""test_categorical_column"""
self.metadata["columns"] = {
"col_card_cat": { # cardinality + categories
"type": "string",
"cardinality": 3,
"categories": ["x", "y", "z"],
}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Test shape
self.assertEqual(df.shape[0], DUMMY_NB_ROWS)
self.assertEqual(df.shape[1], 1)
# Test cardinality type and categories
self.assertIn("col_card_cat", df.columns)
self.assertEqual(df["col_card_cat"].nunique(), 3)
self.assertEqual(set(df["col_card_cat"].values), {"x", "y", "z"})
assert isinstance(df["col_card_cat"], object)
[docs]
def test_boolean_column(self) -> None:
"""test_boolean_column"""
# Test a boolean column
self.metadata["columns"] = {
"col_bool": {"type": "boolean", "nullable": True}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Test length
self.assertEqual(len(df), DUMMY_NB_ROWS)
# Test col generated is boolean
self.assertIn("col_bool", df.columns)
self.assertEqual(df.col_bool.dtypes.name, "boolean")
[docs]
def test_float_column(self) -> None:
"""test_float_column"""
lower_bound = 10.0
upper_bound = 20.0
self.metadata["columns"] = {
"col_float": {
"type": "float",
"precision": 32,
"upper": upper_bound,
"lower": lower_bound,
}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Test col generated is of type float
self.assertEqual(df.col_float.dtypes.name, "float32")
# Test within bounds
self.assertTrue((df["col_float"] >= lower_bound).all())
self.assertTrue((df["col_float"] <= upper_bound).all())
[docs]
def test_int_column(self) -> None:
"""test_int_column"""
lower_bound = 100
upper_bound = 120
self.metadata["columns"] = {
"col_int": {
"type": "int",
"precision": 64,
"upper": upper_bound,
"lower": lower_bound,
}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Test col generated is of type int
self.assertIn(df.col_int.dtypes.name, ["int64"])
# Test within bounds
self.assertTrue((df["col_int"] >= lower_bound).all())
self.assertTrue((df["col_int"] <= upper_bound).all())
[docs]
def test_datetime_column(self) -> None:
"""test_datetime_column"""
self.metadata["columns"] = {
"col_datetime": {
"type": "datetime",
"lower": "2000-01-01",
"upper": "2010-01-01",
}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Test col generated is of type datetime
self.assertEqual(df.col_datetime.dtypes.name, "datetime64[ns]")
# Should not have any null values
self.assertFalse(df.col_datetime.isnull().values.any())
[docs]
def test_nullable_column(self) -> None:
"""test_nullable_column"""
self.metadata["columns"] = {
"col_nullable": {
"type": "datetime",
"nullable": True,
"lower": "2000-01-01",
"upper": "2010-01-01",
}
}
metadata = Metadata.model_validate(self.metadata)
df = make_dummy_dataset(metadata)
# Should have null values
self.assertTrue(df.col_nullable.isnull().values.any())
[docs]
def test_seed(self) -> None:
"""test_seed"""
# Test the behavior with different seeds
self.metadata["columns"] = {
"col_int": {
"type": "int",
"nullable": True,
"precision": 32,
"lower": 0,
"upper": 100,
}
}
metadata = Metadata.model_validate(self.metadata)
seed1 = DUMMY_SEED
seed2 = DUMMY_SEED + 1
df1 = make_dummy_dataset(metadata, seed=seed1)
df2 = make_dummy_dataset(metadata, seed=seed2)
# Check if datasets generated with different seeds are different
self.assertFalse(df1.equals(df2))
# Check if datasets generated with the same seed are identical
df1_copy = make_dummy_dataset(metadata, seed=seed1)
self.assertTrue(df1.equals(df1_copy))
# TODO maybe remove this, see issue #335
# def test_unknown_column(self) -> None:
# """test_unknown_column"""
# metadata = {
# "columns": {
# "col_bool": {"type": "boolean", "nullable": True},
# "col_unknown": {"type": "unknown"},
# }
# }
# df = make_dummy_dataset(metadata)
# # Test col generated
# self.assertIn("col_bool", df.columns)
# # Test col not generated
# self.assertNotIn("col_unknown", df.columns)