{ "cells": [ { "cell_type": "markdown", "id": "3f18d338", "metadata": {}, "source": [ "# S3 example" ] }, { "cell_type": "markdown", "id": "01ae30d2", "metadata": {}, "source": [ "## Step 1: Install the library\n", "To interact with the secure server on which the data is stored, one first needs to install the library `lomas-client` on her local developping environment. \n", "\n", "It can be installed via the pip command:" ] }, { "cell_type": "code", "execution_count": 1, "id": "28fbdd79-8c15-49a9-bcf9-fcdeac09d2b5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: lomas-client in /usr/local/lib/python3.12/site-packages (0.3.3)\n", "Requirement already satisfied: diffprivlib>=0.6.4 in /usr/local/lib/python3.12/site-packages (from lomas-client) (0.6.4)\n", "Requirement already satisfied: diffprivlib-logger>=0.0.3 in /usr/local/lib/python3.12/site-packages (from lomas-client) (0.0.3)\n", "Requirement already satisfied: numpy>=1.26.2 in /usr/local/lib/python3.12/site-packages (from lomas-client) (1.26.2)\n", "Requirement already satisfied: opendp==0.10.0 in /usr/local/lib/python3.12/site-packages (from lomas-client) (0.10.0)\n", "Requirement already satisfied: opendp-logger==0.3.0 in /usr/local/lib/python3.12/site-packages (from lomas-client) (0.3.0)\n", "Requirement already satisfied: pandas>=2.2.2 in /usr/local/lib/python3.12/site-packages (from lomas-client) (2.2.2)\n", "Requirement already satisfied: requests>=2.32.0 in /usr/local/lib/python3.12/site-packages (from lomas-client) (2.32.0)\n", "Requirement already satisfied: scikit-learn==1.4.0 in /usr/local/lib/python3.12/site-packages (from lomas-client) (1.4.0)\n", "Requirement already satisfied: smartnoise-synth==1.0.4 in /usr/local/lib/python3.12/site-packages (from lomas-client) (1.0.4)\n", "Requirement already satisfied: smartnoise-synth-logger==0.0.3 in /usr/local/lib/python3.12/site-packages (from lomas-client) (0.0.3)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/site-packages (from scikit-learn==1.4.0->lomas-client) (1.14.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/site-packages (from scikit-learn==1.4.0->lomas-client) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.12/site-packages (from scikit-learn==1.4.0->lomas-client) (3.5.0)\n", "Requirement already satisfied: Faker>=17.0.0 in /usr/local/lib/python3.12/site-packages (from smartnoise-synth==1.0.4->lomas-client) (30.1.0)\n", "Requirement already satisfied: opacus<0.15.0,>=0.14.0 in /usr/local/lib/python3.12/site-packages (from smartnoise-synth==1.0.4->lomas-client) (0.14.0)\n", "Requirement already satisfied: pac-synth<0.0.9,>=0.0.8 in /usr/local/lib/python3.12/site-packages (from smartnoise-synth==1.0.4->lomas-client) (0.0.8)\n", "Requirement already satisfied: smartnoise-sql<2.0.0,>=1.0.4 in /usr/local/lib/python3.12/site-packages (from smartnoise-synth==1.0.4->lomas-client) (1.0.4)\n", "Requirement already satisfied: torch>=2.2.0 in /usr/local/lib/python3.12/site-packages (from smartnoise-synth==1.0.4->lomas-client) (2.4.1)\n", "Requirement already satisfied: setuptools>=49.0.0 in /usr/local/lib/python3.12/site-packages (from diffprivlib>=0.6.4->lomas-client) (75.1.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/site-packages (from pandas>=2.2.2->lomas-client) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/site-packages (from pandas>=2.2.2->lomas-client) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/site-packages (from pandas>=2.2.2->lomas-client) (2024.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.12/site-packages (from requests>=2.32.0->lomas-client) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/site-packages (from requests>=2.32.0->lomas-client) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/site-packages (from requests>=2.32.0->lomas-client) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/site-packages (from requests>=2.32.0->lomas-client) (2024.8.30)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/site-packages (from Faker>=17.0.0->smartnoise-synth==1.0.4->lomas-client) (4.12.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=2.2.2->lomas-client) (1.16.0)\n", "Requirement already satisfied: PyYAML<7.0.0,>=6.0.1 in /usr/local/lib/python3.12/site-packages (from smartnoise-sql<2.0.0,>=1.0.4->smartnoise-synth==1.0.4->lomas-client) (6.0.2)\n", "Requirement already satisfied: antlr4-python3-runtime==4.9.3 in /usr/local/lib/python3.12/site-packages (from smartnoise-sql<2.0.0,>=1.0.4->smartnoise-synth==1.0.4->lomas-client) (4.9.3)\n", "Requirement already satisfied: graphviz<0.18,>=0.17 in /usr/local/lib/python3.12/site-packages (from smartnoise-sql<2.0.0,>=1.0.4->smartnoise-synth==1.0.4->lomas-client) (0.17)\n", "Requirement already satisfied: sqlalchemy<3.0.0,>=2.0.0 in /usr/local/lib/python3.12/site-packages (from smartnoise-sql<2.0.0,>=1.0.4->smartnoise-synth==1.0.4->lomas-client) (2.0.35)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (3.16.1)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (3.1.4)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (2024.9.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (9.1.0.70)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.1.105)\n", "Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.12/site-packages (from torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (3.0.0)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.12/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (12.6.77)\n", "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.12/site-packages (from sqlalchemy<3.0.0,>=2.0.0->smartnoise-sql<2.0.0,>=1.0.4->smartnoise-synth==1.0.4->lomas-client) (3.1.1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/site-packages (from jinja2->torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/site-packages (from sympy->torch>=2.2.0->smartnoise-synth==1.0.4->lomas-client) (1.3.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install lomas-client" ] }, { "cell_type": "code", "execution_count": 1, "id": "6fb569fc", "metadata": {}, "outputs": [], "source": [ "from lomas_client.client import Client\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "9c63718b", "metadata": {}, "source": [ "## Step 2: Initialise the client\n", "\n", "Once the library is installed, a Client object must be created. It is responsible for sending sending requests to the server and processing responses in the local environment. It enables a seamless interaction with the server. \n", "\n", "To create the client, one needs to give it a few parameters:\n", "- a url: the root application endpoint to the remote secure server.\n", "- user_name: her name as registered in the database (Jack)\n", "- dataset_name: the name of the dataset that we want to query (TITANIC)" ] }, { "cell_type": "code", "execution_count": null, "id": "4ec2f52f", "metadata": {}, "outputs": [], "source": [ "DATASET_NAME = \"TITANIC\"" ] }, { "cell_type": "code", "execution_count": null, "id": "74fdc85e", "metadata": {}, "outputs": [], "source": [ "# The following would usually be set in the environment by a system administrator\n", "# and be tranparent to lomas users.\n", "APP_URL = \"http://localhost:48080\" # For local devenv setup\n", "# APP_URL = \"http://lomas_server:48080\" # For local docker compose setup\n", "# APP_URL = \"http://lomas-server.example.com:80\" # For Kubernetes deployment\n", "USER_NAME = \"Jack\"\n", "\n", "import os\n", "os.environ[\"LOMAS_CLIENT_ID\"] = USER_NAME\n", "os.environ[\"LOMAS_CLIENT_SECRET\"] = USER_NAME.lower()\n", "os.environ[\"LOMAS_KEYCLOAK_ADDRESS\"] = \"localhost\" # For local devenv setup\n", "# os.environ[\"LOMAS_KEYCLOAK_ADDRESS\"] = \"keycloak\" # For local docker compose setup\n", "# os.environ[\"LOMAS_KEYCLOAK_ADDRESS\"] = \"lomas-keycloak.example.com\" # For Kubernetes deployment \n", "os.environ[\"LOMAS_KEYCLOAK_PORT\"] = \"80\" # For local deployments\n", "# os.environ[\"LOMAS_KEYCLOAK_PORT\"] = \"443\" # For Kubernetes deployment\n", "os.environ[\"LOMAS_KEYCLOAK_USE_TLS\"] = \"0\" # For local deployments\n", "# os.environ[\"LOMAS_KEYCLOAK_USE_TLS\"] = \"1\" # For Kubernetes deployments\n", "os.environ[\"LOMAS_REALM\"] = \"lomas\"" ] }, { "cell_type": "code", "execution_count": null, "id": "e629463f", "metadata": {}, "outputs": [], "source": [ "client = Client(url=APP_URL, dataset_name=DATASET_NAME)" ] }, { "cell_type": "markdown", "id": "9b9a5f13", "metadata": {}, "source": [ "## Step 3: Understand the functionnalities of the library" ] }, { "cell_type": "markdown", "id": "c7cb5531", "metadata": {}, "source": [ "### Getting dataset metadata" ] }, { "cell_type": "code", "execution_count": 3, "id": "d15cbe39", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'max_ids': 1,\n", " 'rows': 887,\n", " 'row_privacy': True,\n", " 'censor_dims': False,\n", " 'columns': {'Pclass': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 1,\n", " 'upper': 3},\n", " 'Name': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Sex': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string',\n", " 'cardinality': 2,\n", " 'categories': ['male', 'female']},\n", " 'Age': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'float',\n", " 'precision': 64,\n", " 'lower': 0.1,\n", " 'upper': 100.0},\n", " 'SibSp': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 0,\n", " 'upper': 10},\n", " 'Parch': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 0,\n", " 'upper': 10},\n", " 'Ticket': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Fare': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'float',\n", " 'precision': 64,\n", " 'lower': 0.0,\n", " 'upper': 1000.0},\n", " 'Cabin': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Embarked': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string',\n", " 'cardinality': 3,\n", " 'categories': ['C', 'Q', 'S']},\n", " 'Survived': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'boolean'}}}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_metadata = client.get_dataset_metadata()\n", "titanic_metadata" ] }, { "cell_type": "markdown", "id": "5a3c899d", "metadata": {}, "source": [ "### Get a dummy dataset" ] }, { "cell_type": "code", "execution_count": 4, "id": "01f4365a", "metadata": {}, "outputs": [], "source": [ "NB_ROWS = 200\n", "SEED = 0" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f553b29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(200, 11)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassNameSexAgeSibSpParchTicketFareCabinEmbarkedSurvived
03ofemale89.690443662858.435326USTrue
12Dmale58.37367300Z620.908898aCTrue
22ufemale4.11780024h193.917948GSTrue
31omale71.17753497a687.914521ZQTrue
413male56.9456834101758.999002WSTrue
\n", "
" ], "text/plain": [ " Pclass Name Sex Age SibSp Parch Ticket Fare Cabin \\\n", "0 3 o female 89.690443 6 6 2 858.435326 U \n", "1 2 D male 58.373673 0 0 Z 620.908898 a \n", "2 2 u female 4.117800 2 4 h 193.917948 G \n", "3 1 o male 71.177534 9 7 a 687.914521 Z \n", "4 1 3 male 56.945683 4 10 1 758.999002 W \n", "\n", " Embarked Survived \n", "0 S True \n", "1 C True \n", "2 S True \n", "3 Q True \n", "4 S True " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dummy = client.get_dummy_dataset(\n", " nb_rows = NB_ROWS, \n", " seed = SEED\n", ")\n", "\n", "print(df_dummy.shape)\n", "df_dummy.head()" ] }, { "cell_type": "markdown", "id": "98e6fda2-dde7-4f8b-a787-c9a1e3571ebe", "metadata": {}, "source": [ "### Query on dummy dataset" ] }, { "cell_type": "markdown", "id": "243c73e3-daec-45d6-a3c8-ae1d60439ec4", "metadata": {}, "source": [ "#### Average and number of rows with smartnoise-sql library on remote dummy" ] }, { "cell_type": "code", "execution_count": 6, "id": "3946425d", "metadata": {}, "outputs": [], "source": [ "# Average Age\n", "QUERY = \"SELECT COUNT(*) AS nb_passengers, \\\n", " AVG(Age) AS avg_age \\\n", " FROM df\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "90cf2a6d", "metadata": {}, "outputs": [], "source": [ "# On the remote server dummy dataframe\n", "dummy_res = client.smartnoise_sql.query(\n", " query = QUERY, \n", " epsilon = 100.0, # make sure to select high values of epsilon and delta to have small differences\n", " delta = 2.0, # make sure to select high values of epsilon and delta to have small differences\n", " dummy = True, \n", " nb_rows = NB_ROWS,\n", " seed = SEED\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "a30f277e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average age in remote dummy: 51.7 years old.\n", "Number of rows in remote dummy: 199.\n" ] } ], "source": [ "print(f\"Average age in remote dummy: {np.round(dummy_res.result.df['avg_age'][0], 2)} years old.\")\n", "print(f\"Number of rows in remote dummy: {np.round(dummy_res.result.df['nb_passengers'][0], 2)}.\")" ] }, { "cell_type": "markdown", "id": "324454ed", "metadata": {}, "source": [ "### Get current budget" ] }, { "cell_type": "code", "execution_count": 9, "id": "61a467f3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "InitialBudgetResponse(initial_epsilon=45.0, initial_delta=0.2)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_initial_budget()" ] }, { "cell_type": "code", "execution_count": 10, "id": "afd22f84", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SpentBudgetResponse(total_spent_epsilon=0.0, total_spent_delta=0.0)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_total_spent_budget()" ] }, { "cell_type": "markdown", "id": "05daf5a4", "metadata": {}, "source": [ "It will also be useful to know what the remaining budget is. Therefore, we call the function `get_remaining_budget`. It just substarcts the total spent budget from the initial budget." ] }, { "cell_type": "code", "execution_count": 11, "id": "6260cf54", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RemainingBudgetResponse(remaining_epsilon=45.0, remaining_delta=0.2)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_remaining_budget()" ] }, { "cell_type": "markdown", "id": "20298e00", "metadata": {}, "source": [ "As expected, for now the remaining budget is equal to the inital budget." ] }, { "cell_type": "markdown", "id": "b746374c", "metadata": {}, "source": [ "### Estimate cost of a query\n", "Another safeguard is the functionnality to estimate the cost of a query. As in OpenDP and SmartnoiseSQL, the budget that will by used by a query might be slightly different than what is asked by the user. The `estimate cost` function returns the estimated real cost of any query.\n", "\n", "Again, of course, this will not impact the user's budget." ] }, { "cell_type": "code", "execution_count": 12, "id": "fd5ed08a", "metadata": {}, "outputs": [], "source": [ "EPSILON = 0.5\n", "DELTA = 1e-4" ] }, { "cell_type": "code", "execution_count": 13, "id": "133020c6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CostResponse(epsilon=1.5, delta=0.00014999500000001387)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.smartnoise_sql.cost(\n", " query = QUERY, \n", " epsilon = EPSILON, \n", " delta = DELTA\n", ")" ] }, { "cell_type": "markdown", "id": "e5379edf", "metadata": {}, "source": [ "### Query on real private dataset with smartnoise-sql." ] }, { "cell_type": "code", "execution_count": 14, "id": "19e60263", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RemainingBudgetResponse(remaining_epsilon=45.0, remaining_delta=0.2)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_remaining_budget()" ] }, { "cell_type": "code", "execution_count": 15, "id": "69767fac", "metadata": {}, "outputs": [], "source": [ "response = client.smartnoise_sql.query(\n", " query = QUERY, \n", " epsilon = EPSILON, \n", " delta = DELTA,\n", " dummy = False # Optionnal\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "a7a4e2d3-2922-4f95-bdc9-a35c160f157c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 16, "id": "6dbbdf93", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of passengers in real data: 887.\n", "Average age in real data: 29.34.\n" ] } ], "source": [ "nb_passengers = response.result.df['nb_passengers'].iloc[0]\n", "print(f\"Number of passengers in real data: {nb_passengers}.\")\n", "\n", "avg_age = np.round(response.result.df['avg_age'].iloc[0], 2)\n", "print(f\"Average age in real data: {avg_age}.\")" ] }, { "cell_type": "markdown", "id": "b2767e65", "metadata": {}, "source": [ "After each query on the real dataset, the budget informations are also returned to the researcher. It is possible possible to check the remaining budget again afterwards:" ] }, { "cell_type": "code", "execution_count": 17, "id": "39701fe5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RemainingBudgetResponse(remaining_epsilon=43.5, remaining_delta=0.199850005)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_remaining_budget()" ] }, { "cell_type": "markdown", "id": "e37c587f", "metadata": {}, "source": [ "As can be seen in `get_total_spent_budget()`, it is the budget estimated with `estimate_smartnoise_sql_cost()` that was spent." ] }, { "cell_type": "code", "execution_count": 18, "id": "487f835f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SpentBudgetResponse(total_spent_epsilon=1.5, total_spent_delta=0.00014999500000001387)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.get_total_spent_budget()" ] }, { "cell_type": "markdown", "id": "04929993", "metadata": {}, "source": [ "## Step 4: Titanic statistics with opendp" ] }, { "cell_type": "code", "execution_count": 19, "id": "b9685226", "metadata": {}, "outputs": [], "source": [ "import opendp as dp\n", "import opendp.transformations as trans\n", "import opendp.measurements as meas" ] }, { "cell_type": "markdown", "id": "bbbca191", "metadata": {}, "source": [ "### Confidence intervals for age over the whole population" ] }, { "cell_type": "code", "execution_count": 20, "id": "4331d86f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'max_ids': 1,\n", " 'rows': 887,\n", " 'row_privacy': True,\n", " 'censor_dims': False,\n", " 'columns': {'Pclass': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 1,\n", " 'upper': 3},\n", " 'Name': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Sex': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string',\n", " 'cardinality': 2,\n", " 'categories': ['male', 'female']},\n", " 'Age': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'float',\n", " 'precision': 64,\n", " 'lower': 0.1,\n", " 'upper': 100.0},\n", " 'SibSp': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 0,\n", " 'upper': 10},\n", " 'Parch': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'int',\n", " 'precision': 32,\n", " 'lower': 0,\n", " 'upper': 10},\n", " 'Ticket': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Fare': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'float',\n", " 'precision': 64,\n", " 'lower': 0.0,\n", " 'upper': 1000.0},\n", " 'Cabin': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string'},\n", " 'Embarked': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'string',\n", " 'cardinality': 3,\n", " 'categories': ['C', 'Q', 'S']},\n", " 'Survived': {'private_id': False,\n", " 'nullable': False,\n", " 'max_partition_length': None,\n", " 'max_influenced_partitions': None,\n", " 'max_partition_contributions': None,\n", " 'type': 'boolean'}}}" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titanic_metadata" ] }, { "cell_type": "code", "execution_count": 21, "id": "ff8cb7b6", "metadata": {}, "outputs": [], "source": [ "columns = [\"PassengerId\", \"Pclass\", \"Name\", \"Sex\", \"Age\", \"SibSp\", \"Parch\"]" ] }, { "cell_type": "code", "execution_count": 22, "id": "70b2bdb1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.1, 100.0)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age_min = titanic_metadata['columns']['Age']['lower']\n", "age_max = titanic_metadata['columns']['Age']['upper']\n", "age_min, age_max" ] }, { "cell_type": "code", "execution_count": 23, "id": "75e4933b", "metadata": {}, "outputs": [], "source": [ "age_transformation_pipeline = (\n", " trans.make_split_dataframe(separator=\",\", col_names=columns) >>\n", " trans.make_select_column(key=\"Age\", TOA=str) >>\n", " trans.then_cast_default(TOA=float) >>\n", " trans.then_clamp(bounds=(age_min, age_max)) >>\n", " trans.then_resize(size=nb_passengers.tolist(), constant=avg_age) >>\n", " trans.then_variance()\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "id": "8041a647", "metadata": {}, "outputs": [ { "ename": "InvalidQueryException", "evalue": "The pipeline provided is not a measurement. It cannot be processed in this server.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mInvalidQueryException\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[24], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Expect to fail !!!\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopendp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mopendp_pipeline\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mage_transformation_pipeline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mdummy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\n\u001b[1;32m 5\u001b[0m \u001b[43m)\u001b[49m\n", "File \u001b[0;32m/code/lomas_client/libraries/opendp.py:105\u001b[0m, in \u001b[0;36mOpenDPClient.query\u001b[0;34m(self, opendp_pipeline, fixed_delta, dummy, nb_rows, seed)\u001b[0m\n\u001b[1;32m 102\u001b[0m body \u001b[38;5;241m=\u001b[39m request_model\u001b[38;5;241m.\u001b[39mmodel_validate(body_dict)\n\u001b[1;32m 103\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhttp_client\u001b[38;5;241m.\u001b[39mpost(endpoint, body)\n\u001b[0;32m--> 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvalidate_model_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mQueryResponse\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/code/lomas_client/utils.py:83\u001b[0m, in \u001b[0;36mvalidate_model_response\u001b[0;34m(response, response_model)\u001b[0m\n\u001b[1;32m 80\u001b[0m r_model \u001b[38;5;241m=\u001b[39m response_model\u001b[38;5;241m.\u001b[39mmodel_validate_json(data)\n\u001b[1;32m 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r_model\n\u001b[0;32m---> 83\u001b[0m \u001b[43mraise_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", "File \u001b[0;32m/code/lomas_client/utils.py:26\u001b[0m, in \u001b[0;36mraise_error\u001b[0;34m(response)\u001b[0m\n\u001b[1;32m 24\u001b[0m error_message \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m status\u001b[38;5;241m.\u001b[39mHTTP_400_BAD_REQUEST:\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidQueryException(error_message[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalidQueryException\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m status\u001b[38;5;241m.\u001b[39mHTTP_422_UNPROCESSABLE_ENTITY:\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ExternalLibraryException(\n\u001b[1;32m 29\u001b[0m error_message[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlibrary\u001b[39m\u001b[38;5;124m\"\u001b[39m], error_message[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExternalLibraryException\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 30\u001b[0m )\n", "\u001b[0;31mInvalidQueryException\u001b[0m: The pipeline provided is not a measurement. It cannot be processed in this server." ] } ], "source": [ "# Expect to fail !!!\n", "client.opendp.query(\n", " opendp_pipeline = age_transformation_pipeline,\n", " dummy=True\n", ")" ] }, { "cell_type": "markdown", "id": "d06c59dc", "metadata": {}, "source": [ "This is because the server will only allow measurement pipeline with differentially private results. We add Laplacian noise to the pipeline and should be able to instantiate the pipeline." ] }, { "cell_type": "code", "execution_count": 26, "id": "b8162859", "metadata": {}, "outputs": [], "source": [ "var_age_transformation_pipeline = (\n", " age_transformation_pipeline >>\n", " meas.then_laplace(scale=5.0)\n", ")" ] }, { "cell_type": "markdown", "id": "fc7e0ecd", "metadata": {}, "source": [ "Now that there is a measurement, one is able to apply the pipeline on the dummy dataset of the server." ] }, { "cell_type": "code", "execution_count": 27, "id": "df61bce0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dummy result for variance: 59.75\n" ] } ], "source": [ "dummy_var_res = client.opendp.query(\n", " opendp_pipeline = var_age_transformation_pipeline, \n", " dummy=True\n", ")\n", "print(f\"Dummy result for variance: {np.round(dummy_var_res.result.value, 2)}\")" ] }, { "cell_type": "markdown", "id": "ded11ac4", "metadata": {}, "source": [ "With opendp, the function `estimate_opendp_cost` is particularly useful to estimate the used `epsilon` and `delta` based on the `scale` value." ] }, { "cell_type": "code", "execution_count": 28, "id": "7ae7f735", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CostResponse(epsilon=2.2502841037292076, delta=0.0)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cost_res = client.opendp.cost(\n", " opendp_pipeline = var_age_transformation_pipeline\n", ")\n", "cost_res" ] }, { "cell_type": "markdown", "id": "1c791d36", "metadata": {}, "source": [ "One can now execute the query on the real dataset." ] }, { "cell_type": "code", "execution_count": 29, "id": "085555a5", "metadata": {}, "outputs": [], "source": [ "var_res = client.opendp.query(\n", " opendp_pipeline = var_age_transformation_pipeline, \n", ")" ] }, { "cell_type": "code", "execution_count": 30, "id": "674332e7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of passengers: 887 (from previous smartnoise-sql query).\n", "Average age: 29.34 (from previous smartnoise-sql query).\n", "Variance of age: 182.132 (from opendp query).\n" ] } ], "source": [ "print(f\"Number of passengers: {nb_passengers} (from previous smartnoise-sql query).\")\n", "\n", "print(f\"Average age: {np.round(avg_age, 2)} (from previous smartnoise-sql query).\")\n", "\n", "var_age = var_res.result.value\n", "print(f\"Variance of age: {np.round(var_age, 3)} (from opendp query).\")" ] }, { "cell_type": "code", "execution_count": 31, "id": "f72b19d0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Standard error of age: 0.45.\n" ] } ], "source": [ "# Get standard error\n", "standard_error = np.sqrt(var_age/nb_passengers)\n", "print(f\"Standard error of age: {np.round(standard_error, 2)}.\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "62630a03", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The 95% confidence interval of the age of all passengers is [28.45, 30.23].\n" ] } ], "source": [ " # Compute the 95% confidence interval\n", "ZSCORE = 1.96\n", "lower_bound = np.round(avg_age - ZSCORE*standard_error, 2)\n", "upper_bound = np.round(avg_age + ZSCORE*standard_error, 2)\n", "print(f\"The 95% confidence interval of the age of all passengers is [{lower_bound}, {upper_bound}].\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3bcb4225-59c3-4e58-9be5-b9fae115b99d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 }