Skip to content

Validation

Validation utilities for CSVW-EO metadata and generated datasets.


Metadata Validation

csvw_eo.validate_metadata

Validate metadata file format.

main() -> None

Command-line interface for validating CSVW-EO metadata against the pydantic model.

This function parses a command-line argument specifying the metadata JSON file, loads the file, and passes the resulting dictionary to validate_metadata.

If the metadata file does not exist, a FileNotFoundError is raised. Nothing is printed on success; errors raised while loading or validating the metadata propagate to the caller.

Source code in csvw-eo-library/src/csvw_eo/validate_metadata.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def main() -> None:
    """
    Command-line interface for validating CSVW-EO metadata.

    This function parses a single command-line argument, the path to the
    metadata JSON file, loads that file and passes the resulting
    dictionary to ``validate_metadata``.

    Nothing is printed on success. Exceptions raised while reading,
    decoding or validating the metadata are not caught here and
    propagate to the caller.

    Raises
    ------
    FileNotFoundError
        If the metadata file does not exist.

    """
    # This CLI runs the pydantic-model validation, not the SHACL one, so the
    # help text must not advertise SHACL.
    parser = argparse.ArgumentParser(
        description="Pydantic model validation for CSVW-EO metadata"
    )
    parser.add_argument("metadata_file", type=str, help="CSVW-EO metadata JSON file")
    args = parser.parse_args()

    metadata_path = Path(args.metadata_file)
    if not metadata_path.exists():
        raise FileNotFoundError(f"Metadata file not found: {metadata_path}")

    with metadata_path.open("r", encoding="utf-8") as f:
        metadata = json.load(f)

    validate_metadata(metadata)

validate_metadata(metadata: dict[str, Any]) -> TableMetadata

Validate CSVW-EO metadata against the pydantic model.

Parameters:

Name Type Description Default
metadata dict

CSVW-EO metadata structure.

required
Source code in csvw-eo-library/src/csvw_eo/validate_metadata.py
11
12
13
14
15
16
17
18
19
20
21
def validate_metadata(metadata: dict[str, Any]) -> TableMetadata:
    """
    Build a ``TableMetadata`` model from a CSVW-EO metadata dictionary.

    Validation against the pydantic model is delegated entirely to
    ``TableMetadata.from_dict``.

    Parameters
    ----------
    metadata : dict
        CSVW-EO metadata structure.

    Returns
    -------
    TableMetadata
        The object returned by ``TableMetadata.from_dict``.

    """
    table_metadata = TableMetadata.from_dict(metadata)
    return table_metadata

SHACL Validation

csvw_eo.validate_metadata_shacl

SHACL validation for CSVW-EO metadata.

This module validates CSVW-EO metadata files against a SHACL schema using the pySHACL engine. The metadata is expected to be in JSON-LD format and the SHACL shapes in Turtle format.

Requires

pyshacl rdflib

main() -> None

Command-line interface for SHACL validation of CSVW-EO metadata.

This function parses command-line arguments specifying the metadata JSON-LD file and the SHACL shapes file, then runs SHACL validation.

If validation succeeds, a success message is printed. If validation fails, the validation report is printed and the program exits with a non-zero status code.

Source code in csvw-eo-library/src/csvw_eo/validate_metadata_shacl.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def main() -> None:
    """
    Run SHACL validation of a CSVW-EO metadata file from the command line.

    Two positional arguments are read: the metadata JSON-LD file and the
    SHACL shapes file. Both paths must exist; otherwise a message is
    printed and the program exits with status 1.

    When the metadata conforms, a success message is printed. When it
    does not, a failure message and the validation report are printed
    and the program exits with status 1. An ``ImportError`` raised by the
    validation call is reported the same way, with an installation hint.
    """
    cli = argparse.ArgumentParser(description="SHACL validation for CSVW-EO metadata")
    cli.add_argument("metadata_file", type=str)
    cli.add_argument("shacl_file", type=str, help="SHACL TTL file")
    options = cli.parse_args()

    metadata_path = Path(options.metadata_file)
    shacl_path = Path(options.shacl_file)

    # Check the inputs in the order they were given on the command line.
    for label, candidate in (("Metadata", metadata_path), ("SHACL", shacl_path)):
        if not candidate.exists():
            print(f"{label} file not found: {candidate}")  # noqa: T201
            sys.exit(1)

    try:
        conforms, results_text = validate_metadata_shacl(metadata_path, shacl_path)
    except ImportError:
        print("pySHACL not installed. Please install it with `pip install pyshacl`")  # noqa: T201
        sys.exit(1)

    if not conforms:
        print("SHACL validation FAILED")  # noqa: T201
        print(results_text)  # noqa: T201
        sys.exit(1)
    else:
        print("SHACL validation SUCCESSFUL")  # noqa: T201

validate_metadata_shacl(metadata_file: Path, shacl_file: Path) -> tuple[bool, str]

Validate CSVW-EO metadata against a SHACL schema.

Parameters:

Name Type Description Default
metadata_file Path

Path to the metadata file in JSON-LD format.

required
shacl_file Path

Path to the SHACL shapes file in Turtle format.

required

Returns:

Type Description
Tuple[bool, str]

A tuple containing:

- bool : Whether the metadata conforms to the SHACL schema.
- str : Textual validation report produced by pySHACL.

Source code in csvw-eo-library/src/csvw_eo/validate_metadata_shacl.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def validate_metadata_shacl(metadata_file: Path, shacl_file: Path) -> tuple[bool, str]:
    """
    Check a CSVW-EO metadata file against a set of SHACL shapes.

    The metadata file is parsed as JSON-LD and the shapes file as Turtle,
    each into its own graph, and both are handed to pySHACL.

    Parameters
    ----------
    metadata_file : Path
        Path to the metadata file in JSON-LD format.
    shacl_file : Path
        Path to the SHACL shapes file in Turtle format.

    Returns
    -------
    Tuple[bool, str]
        A tuple containing:
        - bool : Whether the metadata conforms to the SHACL schema.
        - str : Textual validation report produced by pySHACL.

    """
    metadata_graph = Graph()
    metadata_graph.parse(metadata_file, format="json-ld")

    shapes_graph = Graph()
    shapes_graph.parse(shacl_file, format="turtle")

    # Options forwarded unchanged to the pySHACL validation call.
    validation_options = {
        "shacl_graph": shapes_graph,
        "inference": "rdfs",
        "abort_on_first": False,
        "meta_shacl": False,
        "debug": False,
    }
    # The middle element of the result is not part of this function's return value.
    conforms, _unused, report_text = shacl_validate(metadata_graph, **validation_options)

    return conforms, report_text

Structural Validation

csvw_eo.assert_same_structure

Utility script to verify that a generated dummy CSV preserves the structural properties of an original CSV dataset.

The script checks:

- column names and order
- inferred CSVW-EO datatypes
- nullability (required vs optional columns)
- optional categorical value compatibility

It does NOT check statistical similarity, only structural compatibility.

assert_same_structure(df1: pd.DataFrame, df2: pd.DataFrame, check_categories: bool = True) -> None

Verify that two CSV files share the same structural schema.

The function checks column names/order, inferred datatypes, nullability constraints, and optionally categorical value sets.

Parameters:

Name Type Description Default
df1 DataFrame

Original dataframe.

required
df2 DataFrame

Dummy dataframe.

required
check_categories bool

Whether to verify that categorical values in the dummy data are subsets of those in the original data.

True

Raises:

Type Description
AssertionError

If any structural mismatch is detected.

Source code in csvw-eo-library/src/csvw_eo/assert_same_structure.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def assert_same_structure(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    check_categories: bool = True,
) -> None:
    """
    Assert that a dummy dataframe has the same structure as the original.

    Checks run in a fixed order and the first mismatch raises:

    1. column names and their order,
    2. inferred datatype of every column (two integer-group types are
       treated as compatible even when the exact types differ),
    3. nullability of every column (a column is "required" when it has
       no missing values),
    4. optionally, that the values of each categorical column in the
       dummy data are a subset of the values in the original data.

    Parameters
    ----------
    df1 : pd.DataFrame
        Original dataframe.
    df2 : pd.DataFrame
        Dummy dataframe.
    check_categories : bool, default=True
        Whether to verify that categorical values in the dummy data
        are subsets of those in the original data.

    Raises
    ------
    AssertionError
        If any structural mismatch is detected.

    """
    original_columns = list(df1.columns)
    dummy_columns = list(df2.columns)

    # 1. Same column names, in the same order.
    if original_columns != dummy_columns:
        raise AssertionError(
            f"Column names/order differ:\nOriginal: {original_columns}\nDummy:{dummy_columns}"
        )

    # 2. Same inferred datatype per column.
    for name in original_columns:
        original_type = infer_xmlschema_datatype(df1[name])
        dummy_type = infer_xmlschema_datatype(df2[name])

        original_group = XSD_GROUP_MAP.get(original_type)
        dummy_group = XSD_GROUP_MAP.get(dummy_type)
        both_integer = (
            original_group == DataTypesGroups.INTEGER
            and dummy_group == DataTypesGroups.INTEGER
        )

        # Integer subtypes may differ; anything else must match exactly.
        if not both_integer and original_type != dummy_type:
            raise AssertionError(
                f"Column '{name}' dtype mismatch: "
                f"original={original_type}, dummy={dummy_type}"
            )

    # 3. Same nullability per column.
    for name in original_columns:
        original_required: bool = df1[name].notna().all()
        dummy_required: bool = df2[name].notna().all()

        if original_required == dummy_required:
            continue

        raise AssertionError(
            f"Column '{name}' nullability mismatch: original required={original_required}, "
            f"dummy required={dummy_required}"
        )

    # 4. Dummy categorical values must already occur in the original data.
    if not check_categories:
        return

    categorical_columns = [name for name in original_columns if is_categorical(df1[name])]
    for name in categorical_columns:
        allowed = set(df1[name].dropna().unique())
        observed = set(df2[name].dropna().unique())

        if observed <= allowed:
            continue

        raise AssertionError(
            f"Column '{name}' dummy values {observed} are not subset of original {allowed}"
        )

main() -> None

Command-line entry point for the CSV structure validator.

Source code in csvw-eo-library/src/csvw_eo/assert_same_structure.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def main() -> None:
    """
    Command-line entry point for the CSV structure validator.

    Reads the original and the dummy CSV files named on the command line
    and compares them with ``assert_same_structure``. Passing
    ``--no-categories`` skips the categorical subset check.

    Exit status:

    - 1 if a structural mismatch is detected,
    - 2 if a CSV file cannot be read or parsed, or if any other error
      occurs during the comparison.

    Nothing is printed when the two files match.
    """
    parser = argparse.ArgumentParser(
        description="Assert that two CSV files match CSVW-EO structural properties"
    )
    parser.add_argument("original_csv", type=str, help="Original CSV file")
    parser.add_argument("dummy_csv", type=str, help="Dummy CSV file")
    parser.add_argument(
        "--no-categories",
        action="store_true",
        help="Skip categorical subset validation",
    )

    args = parser.parse_args()

    # Read inside the error handling so that a missing or malformed file is
    # reported like every other error instead of escaping as a traceback.
    # NOTE(review): per the pandas docs, ``parse_dates=True`` only attempts to
    # parse the index; confirm that is the intended behaviour here.
    try:
        df1 = pd.read_csv(Path(args.original_csv), parse_dates=True)
        df2 = pd.read_csv(Path(args.dummy_csv), parse_dates=True)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas parser, empty-data
        # and decoding errors are all ValueError subclasses.
        print(f"ERROR: {e}")  # noqa: T201
        sys.exit(2)

    try:
        assert_same_structure(
            df1,
            df2,
            check_categories=not args.no_categories,
        )
    except AssertionError as e:
        print(f"Structure mismatch: {e}")  # noqa: T201
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: {e}")  # noqa: T201
        sys.exit(2)