Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/concepts/function-modifiers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ The ``@check_output`` function modifiers are applied on the **node output / func

In the future, validation capabilities may be added to ``@schema``. For now, it only adds metadata.

@check_output
~~~~~~~~~~~~~
@check_output*
~~~~~~~~~~~~~~

The ``@check_output`` decorator implements many data checks for Python objects and DataFrame/Series, including data type, min/max/between, count, fraction of null/nan values, and allow null/nan. Failed checks are either logged (``importance="warn"``) or make the dataflow fail (``importance="fail"``).

Expand All @@ -162,6 +162,7 @@ The next snippet checks if the returned Series is of type ``np.int32``, which is

- To see all available validators, go to the file ``hamilton/data_quality/default_validators.py`` and view the variable ``AVAILABLE_DEFAULT_VALIDATORS``.
- The function modifier ``@check_output_custom`` allows you to define your own validator. Validators inherit the ``base.BaseDefaultValidator`` class and are essentially standardized Hamilton node definitions (instead of functions). See ``hamilton/data_quality/default_validators.py`` or reach out on `Slack <https://join.slack.com/t/hamilton-opensource/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg>`_ for help!
- Note: ``@check_output_custom`` decorators cannot be stacked, but they instead can take multiple validators.

.. note::

Expand Down
13 changes: 13 additions & 0 deletions hamilton/function_modifiers/validation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
from collections import defaultdict
from typing import Any, Callable, Collection, Dict, List, Type

from hamilton import node
Expand Down Expand Up @@ -38,13 +39,21 @@ def transform_node(
validators = self.get_validators(node_)
validator_nodes = []
validator_name_map = {}
validator_name_count = defaultdict(int)
for validator in validators:

def validation_function(validator_to_call: dq_base.DataValidator = validator, **kwargs):
result = list(kwargs.values())[0] # This should just have one kwarg
return validator_to_call.validate(result)

validator_node_name = node_.name + "_" + validator.name()
validator_name_count[validator_node_name] = (
validator_name_count[validator_node_name] + 1
)
if validator_name_count[validator_node_name] > 1:
validator_node_name = (
validator_node_name + "_" + str(validator_name_count[validator_node_name] - 1)
)
validator_node = node.Node(
name=validator_node_name, # TODO -- determine a good approach towards naming this
typ=dq_base.ValidationResult,
Expand Down Expand Up @@ -125,6 +134,10 @@ def __init__(self, *validators: dq_base.DataValidator, target_: base.TargetType
4. **Collection[str]**: This will check all nodes specified in the list.

In all likelihood, you *don't* want ``...``, but the others are useful.

Note: you cannot stack `@check_output_custom` decorators. If you want to use multiple custom validators, \
you should pass them all in as arguments to a single `@check_output_custom` decorator.

"""
super(check_output_custom, self).__init__(target=target_)
self.validators = list(validators)
Expand Down
52 changes: 52 additions & 0 deletions tests/function_modifiers/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,58 @@ def fn(input: pd.Series) -> pd.Series:
)


def test_check_output_custom_node_transform_duplicate():
    """You should be able to pass in the same validator twice; IRL it would be different args."""
    decorator = check_output_custom(
        SampleDataValidator2(dataset_length=1, importance="warn"),
        SampleDataValidator2(dataset_length=1, importance="warn"),
    )

    def fn(input: pd.Series) -> pd.Series:
        return input

    base_node = node.Node.from_fn(fn)
    transformed = decorator.transform_node(base_node, config={}, fn=fn)
    # raw node + final node + one validator node per validator
    assert len(transformed) == 4
    nodes_by_name = {n.name: n for n in transformed}
    # The second (duplicate-named) validator node gets a numeric suffix.
    # TODO -- change when we change the naming scheme
    assert sorted(nodes_by_name) == [
        "fn",
        "fn_dummy_data_validator_2",
        "fn_dummy_data_validator_2_1",
        "fn_raw",
    ]
    assert nodes_by_name["fn_raw"].input_types["input"][1] == DependencyType.REQUIRED
    # Three dependencies -- the two with DQ + the original
    assert len(nodes_by_name["fn"].input_types) == 3
    dq_nodes = [
        candidate
        for candidate in nodes_by_name.values()
        if candidate.tags.get("hamilton.data_quality.contains_dq_results", False)
    ]
    assert len(dq_nodes) == 2  # One for each validator
    validator_node = dq_nodes[0]
    # Validates that all the required tags are included
    assert validator_node.tags.get(IS_DATA_VALIDATOR_TAG) is True
    assert validator_node.tags.get(DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG) == "fn"

    # The final function should take in everything but only use the raw results
    final_result = nodes_by_name["fn"].callable(
        fn_raw="test",
        fn_dummy_data_validator_2=ValidationResult(True, "", {}),
        fn_dummy_data_validator_2_1=ValidationResult(True, "", {}),
    )
    assert final_result == "test"


def test_check_output_custom_node_transform_raises_exception_with_failure():
decorator = check_output_custom(
SampleDataValidator2(dataset_length=1, importance="fail"),
Expand Down