Skip to content

SmartNoise SQL Integration

Convert CSVW-EO metadata into SmartNoise SQL configuration.


csvw_eo.csvw_to_smartnoise_sql

Convert CSVW-EO JSON metadata to SmartNoise SQL metadata format.

See smarntoise-sql documentation: https://docs.smartnoise.org/sql/metadata.html

csvw_to_smartnoise_sql(csvw_meta: dict[str, Any], schema_name: str = '', table_name: str = 'df', sample_max_ids: bool | None = None, censor_dims: bool | None = None, clamp_counts: bool | None = None, clamp_columns: bool | None = None, use_dpsu: bool | None = None) -> dict[str, Any]

Convert a CSVW-EO table metadata dictionary to SmartNoise SQL metadata.

Parameters:

Name Type Description Default
csvw_meta Dict[str, Any]

The CSVW-EO metadata dictionary for a single table. Must include "columns" list and "max_contributions" (used as max_ids).

required
schema_name str

Name of the SmartNoise schema (top-level namespace) for the table.

""
table_name str

Name of the table in SmartNoise metadata.

"df"
sample_max_ids bool

If True, skips reservoir sampling when users appear at most max_ids times.

True
censor_dims bool

If True, drops GROUP BY rows that might reveal rare individuals.

True
clamp_counts bool

If True, clamps negative differentially private counts to zero.

False
clamp_columns bool

If True, clamps input data to column lower/upper bounds.

True
use_dpsu bool

If True, enables Differential Private Set Union for censoring rare dimensions.

False

Returns:

Type Description
Dict[str, Any]

SmartNoise SQL metadata as a nested dictionary suitable for YAML serialization. Structure: { "": { schema_name: { table_name: { "max_ids": ..., "sample_max_ids": ..., "censor_dims": ..., "clamp_counts": ..., "clamp_columns": ..., "use_dpsu": ..., "": { "name": ..., "type": ..., "nullable": ..., "lower": ..., "upper": ..., "private_id": ... }, ... } } } }

Raises:

Type Description
ValueError

If "max_contributions" is missing from the CSVW metadata, since it is required as max_ids.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_smartnoise_sql.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def csvw_to_smartnoise_sql(  # noqa: PLR0913
    csvw_meta: dict[str, Any],
    schema_name: str = "",
    table_name: str = "df",
    sample_max_ids: bool | None = None,
    censor_dims: bool | None = None,
    clamp_counts: bool | None = None,
    clamp_columns: bool | None = None,
    use_dpsu: bool | None = None,
) -> dict[str, Any]:
    """
    Convert a CSVW-EO table metadata dictionary to SmartNoise SQL metadata.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        The CSVW-EO metadata dictionary for a single table.
        Must include "columns" list and "max_contributions" (used as max_ids).
    schema_name : str, default=""
        Name of the SmartNoise schema (top-level namespace) for the table.
    table_name : str, default="df"
        Name of the table in SmartNoise metadata.
    sample_max_ids : bool, default=True
        If True, skips reservoir sampling when users appear at most max_ids times.
    censor_dims : bool, default=True
        If True, drops GROUP BY rows that might reveal rare individuals.
    clamp_counts : bool, default=False
        If True, clamps negative differentially private counts to zero.
    clamp_columns : bool, default=True
        If True, clamps input data to column lower/upper bounds.
    use_dpsu : bool, default=False
        If True, enables Differential Private Set Union for censoring rare dimensions.

    Returns
    -------
    Dict[str, Any]
        SmartNoise SQL metadata as a nested dictionary suitable for YAML serialization.
        Structure:
        {
            "": {
                schema_name: {
                    table_name: {
                        "max_ids": ...,
                        "sample_max_ids": ...,
                        "censor_dims": ...,
                        "clamp_counts": ...,
                        "clamp_columns": ...,
                        "use_dpsu": ...,
                        "<column_name>": {
                            "name": ...,
                            "type": ...,
                            "nullable": ...,
                            "lower": ...,
                            "upper": ...,
                            "private_id": ...
                        },
                        ...
                    }
                }
            }
        }

    Raises
    ------
    ValueError
        If "max_contributions" is missing from the CSVW metadata, since it is required
        as `max_ids`.

    """
    # Validate max_ids
    if MAX_CONTRIB not in csvw_meta:
        raise ValueError(f"CSVW metadata must include '{MAX_CONTRIB}' (max_ids for SNSQL)")
    max_ids = csvw_meta[MAX_CONTRIB]

    # Required fields only
    table_meta: dict[str, Any] = {"max_ids": max_ids}

    if csvw_meta.get(PUBLIC_LENGTH, False):
        table_meta["rows"] = csvw_meta[PUBLIC_LENGTH]

    # Optional fields (only include if explicitly provided)
    optional_fields = {
        "sample_max_ids": sample_max_ids,
        "censor_dims": censor_dims,
        "clamp_counts": clamp_counts,
        "clamp_columns": clamp_columns,
        "use_dpsu": use_dpsu,
    }
    for key, value in optional_fields.items():
        if value is not None:
            table_meta[key] = value

    # Convert columns
    has_private_id = False
    for col_meta in csvw_meta[TABLE_SCHEMA][COL_LIST]:
        col_dict = csvw_to_snsql_column(col_meta)
        table_meta[col_meta[COL_NAME]] = col_dict

        if col_dict.get("private_id", False):
            has_private_id = True

    table_meta["row_privacy"] = not has_private_id  # TODO: verify and document

    # Wrap into schema/table hierarchy
    return {"": {schema_name: {table_name: table_meta}}}

csvw_to_snsql_column(col_meta: dict[str, Any]) -> dict[str, Any]

Convert a single CSVW column metadata to SmartNoise SQL column metadata.

Parameters:

Name Type Description Default
col_meta dict

Dictionary representing CSVW column metadata. Expected keys: "name", "datatype".

required

Returns:

Type Description
dict

Dictionary representing the column metadata in SmartNoise SQL format.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_smartnoise_sql.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def csvw_to_snsql_column(col_meta: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a single CSVW column metadata to SmartNoise SQL column metadata.

    Parameters
    ----------
    col_meta : dict
        Dictionary representing CSVW column metadata.
        Expected keys: "name", "datatype".

    Returns
    -------
    dict
        Dictionary representing the column metadata in SmartNoise SQL format.

    """
    if DATATYPE not in col_meta:
        raise ValueError(f"Column '{col_meta.get('name', '<unknown>')}' is missing 'datatype'")

    # Basic column information
    nullable = not col_meta[REQUIRED] if REQUIRED in col_meta else col_meta.get(NULL_PROP, 1.0) > 0.0
    xsd_datatype = col_meta[DATATYPE]
    col_dict: dict[str, Any] = {
        "name": col_meta[COL_NAME],
        "type": to_snsql_datatype(xsd_datatype),
        "nullable": nullable,
    }

    # Mark privacy unit
    # Mark privacy unit
    if col_meta.get(PRIVACY_ID, False):
        col_dict["private_id"] = True

    if XSD_GROUP_MAP[xsd_datatype] != DataTypesGroups.DATETIME:
        if col_meta.get(PRIVACY_ID):
            pass
        else:
            if MINIMUM in col_meta:
                col_dict["lower"] = col_meta[MINIMUM]
            if MAXIMUM in col_meta:
                col_dict["upper"] = col_meta[MAXIMUM]

    return col_dict

main() -> None

CLI for converting CSVW-EO JSON metadata to SmartNoise SQL YAML metadata.

This function reads a CSVW-EO JSON metadata file and converts it into SmartNoise SQL YAML metadata. All table-level options are configurable via CLI arguments, except max_ids, which must be present in the CSVW metadata (as 'max_contributions'). Defaults and meaning are taken directly from https://docs.smartnoise.org/sql/metadata.html.

Command-line arguments

--input : str (required) Path to input CSVW-EO JSON metadata file.

--output : str (required) Path to output SmartNoise YAML metadata file.

--schema : str (default="MySchema") SmartNoise schema name.

--table : str (default="MyTable") SmartNoise table name.

--sample_max_ids : bool (default=True) Skip reservoir sampling if users appear at most max_ids times.

--censor_dims : bool (default=True) Drop GROUP BY output rows that might reveal rare individuals.

--clamp_counts : bool (default=False) Clamp negative DP counts to zero.

--clamp_columns : bool (default=True) Clamp all input data to the column lower/upper bounds.

--use_dpsu : bool (default=False) Use Differential Private Set Union for rare dimensions.

Source code in csvw-eo-library/src/csvw_eo/csvw_to_smartnoise_sql.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
def main() -> None:
    """
    CLI for converting CSVW-EO JSON metadata to SmartNoise SQL YAML metadata.

    This function reads a CSVW-EO JSON metadata file and converts it into SmartNoise SQL
    YAML metadata.
    All table-level options are configurable via CLI arguments, except `max_ids`, which must
    be present in the CSVW metadata (as 'max_contributions').
    Defaults and meaning are taken directly from https://docs.smartnoise.org/sql/metadata.html.

    Command-line arguments
    ----------------------
    --input : str (required)
        Path to input CSVW-EO JSON metadata file.

    --output : str (required)
        Path to output SmartNoise YAML metadata file.

    --schema : str (default="MySchema")
        SmartNoise schema name.

    --table : str (default="MyTable")
        SmartNoise table name.

    --sample_max_ids : bool (default=True)
        Skip reservoir sampling if users appear at most max_ids times.

    --censor_dims : bool (default=True)
        Drop GROUP BY output rows that might reveal rare individuals.

    --clamp_counts : bool (default=False)
        Clamp negative DP counts to zero.

    --clamp_columns : bool (default=True)
        Clamp all input data to the column lower/upper bounds.

    --use_dpsu : bool (default=False)
        Use Differential Private Set Union for rare dimensions.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSVW-EO JSON metadata to SmartNoise SQL YAML metadata."
    )
    parser.add_argument("--input", required=True, help="Input CSVW-EO JSON metadata file")
    parser.add_argument("--output", required=True, help="Output SmartNoise YAML metadata file")
    parser.add_argument("--schema", default="MySchema", help="SmartNoise SQL schema name")
    parser.add_argument("--table", default="MyTable", help="SmartNoise SQL table name")
    parser.add_argument(
        "--sample_max_ids",
        type=bool,
        default=None,
        help="Skip sampling if max_ids enforced",
    )
    parser.add_argument(
        "--censor_dims",
        type=bool,
        default=None,
        help="Drop GROUP BY rows revealing individuals",
    )
    parser.add_argument("--clamp_counts", type=bool, default=None, help="Clamp negative counts to zero")
    parser.add_argument("--clamp_columns", type=bool, default=None, help="Clamp columns to bounds")
    parser.add_argument("--use_dpsu", type=bool, default=None, help="Use Differential Private Set Union")

    args = parser.parse_args()

    # Load CSVW metadata
    with open(args.input, encoding="utf-8") as f:
        csvw_meta = json.load(f)

    # Call conversion function
    snsql_meta = csvw_to_smartnoise_sql(
        csvw_meta=csvw_meta,
        schema_name=args.schema,
        table_name=args.table,
        sample_max_ids=args.sample_max_ids,
        censor_dims=args.censor_dims,
        clamp_counts=args.clamp_counts,
        clamp_columns=args.clamp_columns,
        use_dpsu=args.use_dpsu,
    )

    # Write YAML
    with open(args.output, "w", encoding="utf-8") as f:
        yaml.safe_dump(snsql_meta, f)

    print(f"SmartNoise SQL metadata written to {args.output}")  # noqa: T201