Skip to content

OpenDP Integration

Utilities for integrating CSVW-EO metadata with OpenDP.


OpenDP Context

csvw_eo.csvw_to_opendp_context

Create an OpenDP Context from CSVW-EO metadata and a dataset.

This module:

- Converts CSVW-EO metadata into OpenDP margins
- Builds an OpenDP Context using a provided dataset
- Supports epsilon-based (Laplace) and rho-based (Gaussian) DP
- Exposes both a Python API and a CLI

The resulting context can be used for differentially private queries.

csvw_to_opendp_context(csvw_meta: dict[str, Any], data: pl.LazyFrame, epsilon: float | None = None, rho: float | None = None, delta: float | None = None, split_evenly_over: int | None = None, split_by_weights: list[float] | None = None, distance: str = 'contributions') -> dp.Context

Create an OpenDP Context from CSVW-EO metadata and a dataset.

Parameters:

Name Type Description Default
csvw_meta Dict[str, Any]

CSVW-EO metadata dictionary. Must include csvw-eo.dp.maxContributions.

required
data LazyFrame

Input dataset (recommended as LazyFrame).

required
epsilon float

Privacy budget epsilon (for Laplace DP).

None
rho float

Privacy budget rho (for Gaussian / zCDP).

None
delta float

Privacy budget delta (if using approximate DP).

None
split_evenly_over int

Number of queries to split privacy budget across.

None
split_by_weights list[float] | None

List of privacy budget weights, one per query.

None
distance str

Distance metric for privacy unit.

'contributions'

Returns:

Type Description
Context

OpenDP Context object ready for queries.

Raises:

Type Description
ValueError

If required metadata (max_contributions) is missing. If neither epsilon nor rho is provided.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_opendp_context.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def csvw_to_opendp_context(  # noqa: PLR0913
    csvw_meta: dict[str, Any],
    data: pl.LazyFrame,
    epsilon: float | None = None,
    rho: float | None = None,
    delta: float | None = None,
    split_evenly_over: int | None = None,
    split_by_weights: list[float] | None = None,
    distance: str = "contributions",
) -> dp.Context:
    """
    Build an OpenDP Context for a dataset described by CSVW-EO metadata.

    The privacy unit, the privacy loss and the margins are derived with
    `get_privacy_unit`, `get_privacy_loss` and `csvw_to_opendp_margins`
    and handed, together with the data, to `dp.Context.compositor`.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-EO metadata dictionary.
        Must include `csvw-eo.dp.maxContributions`.
    data : pl.LazyFrame
        Input dataset (recommended as LazyFrame).
    epsilon : float, optional
        Privacy budget epsilon (for Laplace DP).
    rho : float, optional
        Privacy budget rho (for Gaussian / zCDP).
    delta : float, optional
        Privacy budget delta (if using approximate DP).
    split_evenly_over : int, optional
        Number of queries to split the privacy budget across.
        Forwarded as-is (possibly ``None``) when `split_by_weights`
        is not given.
    split_by_weights : list[float], optional
        Privacy budget weights, one per query. Mutually exclusive with
        `split_evenly_over`.
    distance : str, default='contributions'
        Distance metric for the privacy unit.

    Returns
    -------
    Context
        OpenDP Context object ready for queries.

    Raises
    ------
    ValueError
        If both `split_evenly_over` and `split_by_weights` are given.
        Errors raised while building the privacy unit or the privacy
        loss (missing max contributions, unsupported distance, invalid
        epsilon / rho combination) propagate unchanged.

    """
    both_splits_given = not (split_evenly_over is None or split_by_weights is None)
    if both_splits_given:
        raise ValueError("Specify only one of split_evenly_over or split_by_weights")

    # Derive the pieces in a fixed order so that the first failing
    # helper determines which error the caller sees.
    privacy_unit = get_privacy_unit(csvw_meta, distance)
    privacy_loss = get_privacy_loss(epsilon, rho, delta)
    margins = csvw_to_opendp_margins(csvw_meta)

    # Exactly one of the two budget-splitting keywords is forwarded.
    if split_by_weights is None:
        budget_split: dict[str, Any] = {"split_evenly_over": split_evenly_over}
    else:
        budget_split = {"split_by_weights": split_by_weights}

    return dp.Context.compositor(
        data=data,
        privacy_unit=privacy_unit,
        privacy_loss=privacy_loss,
        margins=margins,
        **budget_split,
    )

get_privacy_loss(epsilon: float | None = None, rho: float | None = None, delta: float | None = None) -> tuple[Measure, Any]

Create an opendp privacy loss object.

Parameters:

Name Type Description Default
epsilon float

Privacy budget epsilon (for Laplace DP).

None
rho float

Privacy budget rho (for Gaussian / zCDP).

None
delta float

Privacy budget delta (if using approximate DP).

None

Returns:

Type Description
privacy_loss

opendp privacy loss object

Raises:

Type Description
ValueError

If neither epsilon nor rho is provided.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_opendp_context.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_privacy_loss(
    epsilon: float | None = None,
    rho: float | None = None,
    delta: float | None = None,
) -> tuple[Measure, Any]:
    """
    Build an OpenDP privacy loss from a single budget parameter.

    Exactly one of `epsilon` and `rho` must be supplied; `delta` is
    always forwarded alongside it to `dp.loss_of`.

    Parameters
    ----------
    epsilon : float, optional
        Privacy budget epsilon (for Laplace DP).
    rho : float, optional
        Privacy budget rho (for Gaussian / zCDP).
    delta : float, optional
        Privacy budget delta (if using approximate DP).

    Returns
    -------
    privacy_loss
        Value returned by `dp.loss_of` for the chosen budget.

    Raises
    ------
    ValueError
        If neither epsilon nor rho is provided.
        If both epsilon and rho are provided.

    """
    has_epsilon = epsilon is not None
    has_rho = rho is not None

    if not (has_epsilon or has_rho):
        raise ValueError("Either epsilon or rho must be provided")
    if has_epsilon and has_rho:
        raise ValueError("Specify only one of epsilon or rho")

    # Only the budget that was actually supplied is passed on.
    budget = {"epsilon": epsilon} if has_epsilon else {"rho": rho}
    return dp.loss_of(delta=delta, **budget)

get_privacy_unit(csvw_meta: dict[str, Any], distance: str) -> tuple[Metric, Union[float, Sequence[Bound]]]

Construct an OpenDP privacy unit from CSVW-EO metadata.

Parameters:

Name Type Description Default
csvw_meta Dict[str, Any]

CSVW-EO metadata dictionary.

required
distance str

Type of privacy distance metric to use (e.g. "contributions", "changes").

required

Returns:

Type Description
privacy_unit

OpenDP privacy unit descriptor.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_opendp_context.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def get_privacy_unit(
    csvw_meta: dict[str, Any], distance: str
) -> tuple[Metric, Union[float, Sequence[Bound]]]:
    """
    Build an OpenDP privacy unit from the max-contributions metadata.

    The value stored under the `MAX_CONTRIB` key is passed to
    `dp.unit_of` under the keyword named by `distance`.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-EO metadata dictionary. Must contain the `MAX_CONTRIB` key.
    distance : str
        Type of privacy distance metric to use. Only "contributions"
        and "changes" are accepted.

    Returns
    -------
    privacy_unit
        OpenDP privacy unit descriptor, as returned by `dp.unit_of`.

    Raises
    ------
    ValueError
        If the max-contributions entry is missing from the metadata.
        If `distance` is not one of the supported names.

    """
    # The metadata check comes first, so a missing bound is reported
    # even when the distance name is also invalid.
    if MAX_CONTRIB not in csvw_meta:
        raise ValueError("Missing max_contributions in metadata")
    bound = csvw_meta[MAX_CONTRIB]

    # NOTE: other distance kinds ("absolute", "l1", "l2") and an
    # identifier-based privacy unit are not wired up here.
    for supported in ("contributions", "changes"):
        if distance == supported:
            return dp.unit_of(**{supported: bound})

    raise ValueError(f"Unsupported distance type: {distance}")

OpenDP Margins

csvw_eo.csvw_to_opendp_margins

Convert CSVW-EO JSON metadata into OpenDP margin descriptors.

This module provides:

- A function to translate CSVW-EO differential privacy metadata into OpenDP dp.polars.Margin objects.
- A CLI for generating margin specifications from a JSON metadata file.

The resulting margins can be used in an OpenDP context, for example:

dp.Context.compositor(
    data=...,
    privacy_unit=dp.unit_of(contributions=...),
    privacy_loss=dp.loss_of(epsilon=...),
    margins=[...],
)

csvw_to_opendp_margins(csvw_meta: dict[str, Any]) -> list[Margin]

Convert CSVW-EO metadata to a list of OpenDP Margin objects.

Parameters:

Name Type Description Default
csvw_meta Dict[str, Any]

CSVW-EO metadata dictionary.

required

Returns:

Type Description
List[Margin]

List of OpenDP margin descriptors.

Raises:

Type Description
ValueError

If required metadata (e.g., max_contributions) is missing.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_opendp_margins.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def csvw_to_opendp_margins(csvw_meta: dict[str, Any]) -> list["Margin"]:
    """
    Translate CSVW-EO metadata into OpenDP Margin objects.

    Margins are produced in three stages, in this order:

    1. at most one table-level margin (no ``by``), emitted only when the
       table metadata yields a max length and/or a public length;
    2. one margin per column of the table schema, grouped by that column;
    3. one margin per entry of the optional multi-column group list,
       grouped by the columns named in that entry.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-EO metadata dictionary. The table schema and its column
        list are read by direct key access and must be present.

    Returns
    -------
    List["Margin"]
        List of OpenDP margin descriptors.

    Raises
    ------
    KeyError
        If the table schema, its column list, a column name, or the
        column list of a multi-column group entry is missing.

    """
    # Table-level margin: applies to non-groupby queries.
    table_kwargs: dict[str, Any] = {}
    table_max_length = csvw_meta.get(MAX_LENGTH)
    if table_max_length:
        table_kwargs["max_length"] = table_max_length
    if csvw_meta.get(PUBLIC_LENGTH):
        # A public table length is expressed as an invariant on lengths.
        table_kwargs["invariant"] = "lengths"

    margins: list[Margin] = [Margin(**table_kwargs)] if table_kwargs else []

    # Single-column margins: groupby queries on one column.
    margins.extend(
        Margin(**get_margins(column, by=[column[COL_NAME]]))
        for column in csvw_meta[TABLE_SCHEMA][COL_LIST]
    )

    # Multi-column margins: groupby queries on several columns at once.
    margins.extend(
        Margin(**get_margins(group, by=group[COLUMNS_IN_GROUP]))
        for group in csvw_meta.get(ADD_INFO, [])
    )

    return margins

get_margins(col_meta: dict[str, Any], by: list[str]) -> dict[str, Any]

Build margin keyword arguments for a given column or column group.

Parameters:

Name Type Description Default
col_meta Dict[str, Any]

Metadata describing a column or group of columns, including differential privacy constraints (e.g., max_length, max_groups).

required
by List[str]

Column name(s) to group by when defining the margin.

required

Returns:

Type Description
Dict[str, Any]

Dictionary of keyword arguments suitable for constructing an OpenDP Margin object.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_opendp_margins.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_margins(col_meta: dict[str, Any], by: list[str]) -> dict[str, Any]:
    """
    Assemble the keyword arguments of a Margin for a column (group).

    Parameters
    ----------
    col_meta : Dict[str, Any]
        Metadata describing a column or group of columns, including
        differential privacy constraints (e.g., max_length, max_groups).
    by : List[str]
        Column name(s) to group by when defining the margin.

    Returns
    -------
    Dict[str, Any]
        Keyword arguments for an OpenDP Margin. Always contains ``by``;
        ``max_length``, ``max_groups`` and ``invariant`` are added only
        when the matching metadata is present.

    """
    descriptor: dict[str, Any] = {"by": by}

    if MAX_LENGTH in col_meta:
        descriptor["max_length"] = col_meta[MAX_LENGTH]

    # The first key present wins: an explicit max-groups entry takes
    # precedence over the max-number-of-partitions entry.
    for bound_key in (MAX_GROUPS, MAX_NUM_PARTITIONS):
        if bound_key in col_meta:
            descriptor["max_groups"] = col_meta[bound_key]
            break

    # A public length takes precedence over public keys when both are set.
    if col_meta.get(PUBLIC_LENGTH):
        descriptor["invariant"] = "lengths"
    elif col_meta.get(INVARIANT_PUBLIC_KEYS):
        descriptor["invariant"] = "keys"

    return descriptor