Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Fixes
* `IDColumnsDataCheck` now works with Unknown data type :pr:`4203`
* Changes
* Documentation Changes
* Testing Changes
Expand Down
10 changes: 7 additions & 3 deletions evalml/data_checks/id_columns_data_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Data check that checks if any of the features are likely to be ID columns."""

from evalml.data_checks import (
DataCheck,
DataCheckActionCode,
Expand Down Expand Up @@ -180,16 +181,19 @@ def validate(self, X, y=None):
] # columns whose name is "id"
id_cols = {col: 0.95 for col in cols_named_id}

for dtypes in [["Double"], ["Integer", "IntegerNullable", "Categorical"]]:
X_temp = X.ww.select(include=dtypes)
for types in [
["Double"],
["Integer", "IntegerNullable", "Categorical", "Unknown"],
]:
X_temp = X.ww.select(include=types)
check_all_unique = X_temp.nunique() == len(X_temp)
cols_with_all_unique = check_all_unique[
check_all_unique
].index.tolist() # columns whose values are all unique

# Temporary solution for downstream instances of integers being mapped to doubles.
# Will be removed when resolved.
if dtypes == ["Double"]:
if types == ["Double"]:
cols_with_all_unique = [
col
for col in cols_with_all_unique
Expand Down
24 changes: 20 additions & 4 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def graphviz():
def get_test_data_with_or_without_primary_key():
def _get_test_data_with_primary_key(input_type, has_primary_key):
X = None
if input_type == "integer":
if input_type == "Integer":
X_dict = {
"col_1_id": [0, 1, 2, 3],
"col_2": [2, 3, 4, 5],
Expand All @@ -117,7 +117,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = [1, 1, 2, 3]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "integer_nullable":
elif input_type == "IntegerNullable":
X_dict = {
"col_1_id": pd.Series([0, 1, 2, 3], dtype="Int64"),
"col_2": pd.Series([2, 3, 4, 5], dtype="Int64"),
Expand All @@ -128,7 +128,7 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = pd.Series([1, 1, 2, 3], dtype="Int64")
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "double":
elif input_type == "Double":
X_dict = {
"col_1_id": [0.0, 1.0, 2.0, 3.0],
"col_2": [2, 3, 4, 5],
Expand All @@ -139,7 +139,23 @@ def _get_test_data_with_primary_key(input_type, has_primary_key):
X_dict["col_1_id"] = [1.0, 1.0, 2.0, 3.0]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "string":
elif input_type == "Unknown":
X_dict = {
"col_1_id": ["a", "b", "c", "d"],
"col_2": ["w", "x", "y", "z"],
"col_3_id": [
"123456789012345",
"234567890123456",
"3456789012345678",
"45678901234567",
],
"col_5": ["0", "0", "1", "2"],
}
if not has_primary_key:
X_dict["col_1_id"] = ["b", "b", "c", "d"]
X = pd.DataFrame.from_dict(X_dict)

elif input_type == "Categorical":
X_dict = {
"col_1_id": ["a", "b", "c", "d"],
"col_2": ["w", "x", "y", "z"],
Expand Down
16 changes: 9 additions & 7 deletions evalml/tests/data_checks_tests/test_id_columns_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def test_id_cols_data_check_input_formats(logical_type):

@pytest.mark.parametrize(
"input_type",
["integer", "integer_nullable", "string", "double"],
["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
)
def test_identified_first_col_primary_key(
input_type,
Expand Down Expand Up @@ -300,7 +300,7 @@ def test_identified_first_col_primary_key(

@pytest.mark.parametrize(
"input_type",
["integer", "integer_nullable", "string", "double"],
["Integer", "IntegerNullable", "Unknown", "Double", "Categorical"],
)
def test_unidentified_first_col_primary_key(
input_type,
Expand All @@ -312,7 +312,7 @@ def test_unidentified_first_col_primary_key(
)

id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
if input_type == "string":
if input_type in ["Unknown", "Categorical"]:
order = ["col_2", "col_3_id", "col_1_id"]
else:
order = ["col_2", "col_1_id", "col_3_id"]
Expand All @@ -335,13 +335,15 @@ def test_unidentified_first_col_primary_key(
]

X = X.rename(columns={"col_1_id": "col_1"})
if input_type == "integer":
if input_type == "Integer":
X.at[0, "col_1"] = 0
elif input_type == "integer_nullable":
elif input_type == "IntegerNullable":
X.at[0, "col_1"] = 0
elif input_type == "double":
elif input_type == "Double":
X.at[0, "col_1"] = 0.0
elif input_type == "string":
elif input_type == "Unknown":
X.at[0, "col_1"] = "a"
elif input_type == "Categorical":
X["col_1"] = X["col_1"].cat.add_categories("a")
X.at[0, "col_1"] = "a"

Expand Down