Merge pull request #430 from lnccbrown/427-add-choices-argument-to-hssm

digicosmos86 · web-flow · commit 19f786d9abec · 2024-05-16T16:07:26.000-04:00
Allow users to specify the number of choices
diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
@@ -49,8 +49,11 @@ jobs:
       - name: Linting
         run: ruff check src/hssm
 
-      - name: Run tests
-        run: pytest -n auto -s
+      - name: Run fast tests
+        run: pytest -n auto -s --ignore=tests/slow
+
+      - name: Run slow tests
+        run: pytest -n auto -s tests/slow
 
   publish:
     name: Build wheel and publish to test-PyPI, and then PyPI, and publish docs
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -47,8 +47,11 @@ jobs:
       - name: Linting
         run: ruff check src/hssm
 
-      - name: Run tests
-        run: pytest -n auto -s
+      - name: Run fast tests
+        run: pytest -n auto -s --ignore=tests/slow
+
+      - name: Run slow tests
+        run: pytest -n auto -s tests/slow
 
       - name: build docs
         run: mkdocs build
diff --git a/src/hssm/hssm.py b/src/hssm/hssm.py
@@ -73,6 +73,13 @@ class HSSM:
         "ddm_seq2_no_bias". If any other string is passed, the model will be considered
         custom, in which case all `model_config`, `loglik`, and `loglik_kind` have to be
         provided by the user.
+    choices : optional
+        Either an `int` giving the number of choices participants can make, or a `list`
+        of the choices themselves. An `int` of `2` means the choices are [-1, 1]; an
+        `int` greater than `2` means [0, 1, ..., n_choices - 1]. Defaults to `2`. If
+        any value outside these choices is found in the "response" column of the data,
+        an error is raised; if some of the choices never appear in the data, a warning
+        is logged instead.
     include : optional
         A list of dictionaries specifying parameter specifications to include in the
         model. If left unspecified, defaults will be used for all parameter
@@ -225,6 +232,7 @@ def __init__(
         self,
         data: pd.DataFrame,
         model: SupportedModels | str = "ddm",
+        choices: int | list[int] = 2,
         include: list[dict | Param] | None = None,
         model_config: ModelConfig | dict | None = None,
         loglik: (
@@ -282,8 +290,20 @@ def __init__(
         self.loglik_kind = self.model_config.loglik_kind
         self.extra_fields = self.model_config.extra_fields
 
-        self.choices = self.data["response"].unique().astype(int)
-        self.n_choices = len(self.choices)
+        if isinstance(choices, int):
+            if choices == 2:
+                self.n_choices = 2
+                self.choices = [-1, 1]
+            elif choices > 2:
+                self.n_choices = choices
+                self.choices = list(range(choices))
+            else:
+                raise ValueError("choices must be greater than 1.")
+        elif isinstance(choices, list):
+            self.n_choices = len(choices)
+            self.choices = choices
+        else:
+            raise ValueError("choices must be an integer or a list of integers.")
 
         self._pre_check_data_sanity()
 
@@ -1393,13 +1413,6 @@ def _pre_check_data_sanity(self):
                     + "`participant_id` is not found in your dataset."
                 )
 
-        if self.n_choices == 2:
-            if -1 not in self.choices or 1 not in self.choices:
-                raise ValueError(
-                    "The response column must contain only -1 and 1 when there are "
-                    + "two responses."
-                )
-
     def _post_check_data_sanity(self):
         """Check if the data is clean enough for the model."""
         if self.deadline or self.missing_data:
@@ -1425,6 +1438,24 @@ def _post_check_data_sanity(self):
                 + "which is not allowed."
             )
 
+        valid_responses = self.data.loc[self.data["rt"] != -999.0, "response"]
+        unique_responses = valid_responses.unique().astype(int)
+
+        if np.any(~np.isin(unique_responses, self.choices)):
+            invalid_responses = sorted(
+                unique_responses[~np.isin(unique_responses, self.choices)]
+            )
+            raise ValueError(
+                f"Invalid responses found in your dataset: {invalid_responses}"
+            )
+
+        if len(unique_responses) != self.n_choices:
+            missing_responses = sorted(np.setdiff1d(self.choices, unique_responses))
+            _logger.warning(
+                f"You set choices to be {self.choices}, but {missing_responses} are "
+                + "missing from your dataset."
+            )
+
     def _postprocess_initvals_deterministic(
         self, initval_settings: dict = INITVAL_SETTINGS
     ) -> None:
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -37,8 +37,8 @@ def data_angle():
 def data_ddm_reg():
     # Generate some fake simulation data
     intercept = 1.5
-    x = np.random.uniform(-5.0, 5.0, size=1000)
-    y = np.random.uniform(-5.0, 5.0, size=1000)
+    x = np.random.uniform(-0.5, 0.5, size=1000)
+    y = np.random.uniform(-0.5, 0.5, size=1000)
 
     v = intercept + 0.8 * x + 0.3 * y
     true_values = np.column_stack(
@@ -57,6 +57,35 @@ def data_ddm_reg():
     return dataset_reg_v
 
 
+@pytest.fixture(scope="module")
+def data_ddm_reg_va():
+    # Simulate DDM data from 100 parameter sets: v linear in x, y; a linear in m, n
+    intercept = 1.5
+    intercept_a = 1.0
+    x = np.random.uniform(-0.5, 0.5, size=100)
+    y = np.random.uniform(-0.5, 0.5, size=100)
+
+    m = np.random.uniform(-0.5, 0.5, size=100)
+    n = np.random.uniform(-0.5, 0.5, size=100)
+
+    v = intercept + 0.8 * x + 0.3 * y
+    a = intercept_a + 0.1 * m + 0.1 * n
+    true_values = np.column_stack([v, a, np.repeat([[0.5, 0.5]], axis=0, repeats=100)])
+
+    dataset_reg_va = hssm.simulate_data(
+        model="ddm",
+        theta=true_values,
+        size=1,  # Generate one data point for each of the 100 sets of true values
+    )
+
+    dataset_reg_va["x"] = x
+    dataset_reg_va["y"] = y
+    dataset_reg_va["m"] = m
+    dataset_reg_va["n"] = n
+
+    return dataset_reg_va
+
+
 @pytest.fixture
 def cav_idata():
     return az.from_netcdf("tests/fixtures/cavanagh_idata.nc")
diff --git a/tests/slow/test_mcmc.py b/tests/slow/test_mcmc.py
@@ -184,7 +184,7 @@ def test_reg_models(data_ddm_reg, loglik_kind, backend, sampler, step, expected)
 
 
 @pytest.mark.parametrize(parameter_names, parameter_grid)
-def test_reg_models_v_a(data_ddm_reg, loglik_kind, backend, sampler, step, expected):
+def test_reg_models_v_a(data_ddm_reg_va, loglik_kind, backend, sampler, step, expected):
     print("PYMC VERSION: ")
     print(pm.__version__)
     print("TEST INPUTS WERE: ")
@@ -199,27 +199,27 @@ def test_reg_models_v_a(data_ddm_reg, loglik_kind, backend, sampler, step, expec
         },
     )
     param_reg_a = dict(
-        formula="a ~ 1 + x + y",
+        formula="a ~ 1 + m + n",
         prior={
             "Intercept": {
-                "name": "Uniform",
-                "lower": 0.5,
-                "upper": 3.0,
-                "initval": 1.0,
+                "name": "Normal",
+                "mu": 1.0,
+                "sigma": 0.5,
             },
-            "x": {"name": "Uniform", "lower": -0.50, "upper": 0.50, "initval": 0.0},
-            "y": {"name": "Uniform", "lower": -0.50, "upper": 0.50, "initval": 0.0},
+            "m": {"name": "Uniform", "lower": 0.0, "upper": 0.2},
+            "n": {"name": "Uniform", "lower": 0.0, "upper": 0.2},
         },
         link="identity",
     )
 
     model = hssm.HSSM(
-        data_ddm_reg,
+        data_ddm_reg_va,
         loglik_kind=loglik_kind,
         model_config={"backend": backend},
         v=param_reg_v,
         a=param_reg_a,
     )
+    print(model.params["a"])
     run_sample(model, sampler, step, expected)
 
     # Only runs once
diff --git a/tests/test_data_sanity.py b/tests/test_data_sanity.py
@@ -41,10 +41,10 @@ def test_data_sanity_check(data_ddm, cpn, caplog):
     ):
         hssm.HSSM(data=data_ddm, model="ddm", hierarchical=True)
 
-    # Case 5: raise error if there are missing fields in data
+    # Case 5: raise error if there are invalid responses in data
     with pytest.raises(
         ValueError,
-        match="The response column must contain only -1 and 1 when there are two responses.",
+        match=r"Invalid responses found in your dataset: \[0\]",
     ):
         data_ddm_miscoded = data_ddm.copy()
         data_ddm_miscoded["response"] = data_ddm_miscoded["response"].replace(
@@ -53,7 +53,27 @@ def test_data_sanity_check(data_ddm, cpn, caplog):
 
         hssm.HSSM(data=data_ddm_miscoded, model="ddm")
 
-    # Case 6: if deadline or missing_data is True, data should contain missing values
+    with pytest.raises(
+        ValueError,
+        match=r"Invalid responses found in your dataset: \[0\]",
+    ):
+        data_ddm_miscoded = data_ddm.copy()
+        data_ddm_miscoded["response"] = np.random.choice([0, 1, 2], data_ddm.shape[0])
+
+        hssm.HSSM(data=data_ddm_miscoded, model="ddm", choices=[1, 2, 3])
+
+    # Case 6: raise warning if there are missing responses in data
+    data_ddm_miscoded = data_ddm.copy()
+    data_ddm_miscoded["response"] = np.random.choice([1, 2], data_ddm.shape[0])
+
+    hssm.HSSM(data=data_ddm_miscoded, model="ddm", choices=[1, 2, 3])
+
+    assert (
+        caplog.records[-1].msg
+        == "You set choices to be [1, 2, 3], but [3] are missing from your dataset."
+    )
+
+    # Case 7: if deadline or missing_data is True, data should contain missing values
     with pytest.raises(
         ValueError,
         match="You have no missing data in your dataset, "