ImperialCollegeLondon · davidorme · Feb 17, 2023 · Feb 9, 2023 · Feb 10, 2023 · Feb 10, 2023
diff --git a/docs/source/development/defining_new_models.md b/docs/source/development/defining_new_models.md
@@ -193,47 +193,73 @@ def cleanup(self) -> None:
     """Placeholder function for freshwater model cleanup."""
 ```
 
-## Including a configuration schema
+## Writing a schema for the module configuration
 
-A detailed description of the configuration system can be found
-[here](../virtual_rainforest/core/config.md). The key thing to note is that a `JSON`
-schema file should be saved within your model folder. This file should have a name of
-the format "{MODEL_NAME}_schema.json". In order for this schema to be generally
-accessible, it needs to be registered in the model's `__init__.py` (i.e. the
-`__init__.py` in the model folder). This means that when the model is imported, it's
-schema is automatically added to the schema registry.
+The root module directory **must** also contain a [JSONSchema](https://json-schema.org/)
+document that defines the configuration options for the model.  A detailed description
+of the configuration system works can be found
+[here](../virtual_rainforest/core/config.md) but the schema definition is used to
+validate configuration files for a Virtual Rainforest simulation that uses your model.
 
-```python
-from virtual_rainforest.core.config import register_schema
+So the example model used here would need to provide the file:
+`virtual_rainforest/models/freshwater/freshwater_schema.json`
 
-@register_schema("freshwater")
-def schema() -> dict:
-    """Defines the schema that the freshwater module configuration should conform to."""
+Writing JSONSchema documents can be very tedious. The following tools may be of use:
 
-    schema_file = Path(__file__).parent.resolve() / "freshwater_schema.json"
+* [https://www.jsonschema.net/app](https://www.jsonschema.net/app): this is a web
+  application that takes a data document - which is what the configuration file - and
+  automatically generates a JSON schema to validate it. You will need to then edit it
+  but you'll be starting with a valid schema!
+* [https://jsonschemalint.com/](https://jsonschemalint.com/) works the other way. It
+  takes a data document and a schema and checks whether the data is compliant. This can
+  be useful for checking errors.
 
-    with schema_file.open() as f:
-        config_schema = json.load(f)
+Both of those tools take data documents formatted as JSON as inputs, where we use TOML
+configuration files, but there are lots of web tools to convert TOML to JSON and back.
 
-    return config_schema
-```
+## Setting up the model `__init__.py` file
+
+All model directories need to include an `__init__.py` file. The simple presence of the
+`__init__.py` file tells Python that the directory content should be treated as module,
+but then the file needs to contain code to do two things:
+
+1. The `__init__.py` file needs to register the JSONSchema file for the module. The
+    {meth}`~virtual_rainforest.core.config.register_schema` function takes a module name
+    and the path to the schema file and then, after checking the file can be loaded and is
+    valid, adds the schema to the schema registry
+    {data}`~virtual_rainforest.core.config.SCHEMA_REGISTRY`.
 
-## Ensuring that schema and models are always added to the registry
+1. It also needs to import the main BaseModel subclass. So for example, it should import
+    `FreshwaterModel` from the `virtual_rainforest.models.freshwater.freshwater_model`
+    module. This gives a shorter reference for a commonly used object
+    (`virtual_rainforest.models.freshwater.FreshwaterModel`) but it also means
+    that the BaseModel class is always imported when the model module
+    (`virtual_rainforest.models.freshwater`) is imported. This is used when the package
+    is loaded to automatically discover all the BaseModel classes and register them.
 
-At the moment, a configuration schema only gets added to the schema registry when the
-model it belongs to is imported, and a `Model` class only gets added to the registry
-when the class itself is imported. This is a problem because the script that runs the
-main Virtual Rainforest simulation does not import these things directly. To circumvent
-this these imports needed to be placed in the top level `__init__.py` file (the one in
-the same folder as `main.py`). This won't pass the `pre-commit` checks unless `flake8`
-checks are turned off for the relevant lines. It's only strictly necessary to import the
-`Model` class, as this implicitly entails importing the specific model as a whole.
-However, for the sake of clarity we currently include both imports.
+    The class is not going to actually be used within the file, so needs `#noqa: 401`
+    to suppress a `flake8` error.
+
+The resulting `__init__.py` file should then look something like this:
 
 ```python
-# Import all module schema here to ensure that they are added to the registry
-from virtual_rainforest.models.freshwater import schema  # noqa
+"""This is the freshwater model module. The module level docstring should contain a 
+short description of the overall model design and purpose.
+"""  # noqa: D204, D415
+
+from importlib import resources
 
-# Import models here so that they also end up in the registry
-from virtual_rainforest.models.freshwater.model import FreshWaterModel  # noqa
+from virtual_rainforest.core.config import register_schema
+from virtual_rainforest.models.freshwater.freshwater_model import (
+    FreshWaterModel  
+) # noqa: 401
+
+with resources.path(
+    "virtual_rainforest.models.freshwater", "freshwater_schema.json"
+) as schema_file_path:
+    register_schema(module_name="freshwater", schema_file_path=schema_file_path)
 ```
+
+When the `virtual_rainforest` package is loaded, it will automatically import
+`virtual_rainforest.models.freshwater`. That will cause both the model and the schema to
+be loaded and registered.
diff --git a/tests/core/test_config.py b/tests/core/test_config.py
@@ -6,10 +6,12 @@
 to date.
 """
 
+import json
 from contextlib import nullcontext as does_not_raise
 from logging import CRITICAL, ERROR, INFO
 from pathlib import Path
 
+import jsonschema
 import pytest
 
 import virtual_rainforest.core.config as config
@@ -290,72 +292,64 @@ def test_final_validation_log(caplog, shared_datadir, file_path, expected_log_en
     [
         (
             "core",
-            {},
+            "",
             ValueError,
             (
                 (
                     CRITICAL,
-                    "The module schema core is used multiple times, this shouldn't"
-                    " be the case!",
+                    "The module schema for core is already registered",
                 ),
             ),
         ),
         (
             "test",
             "najsnjasnda",
-            OSError,
-            ((CRITICAL, "Module schema test not valid JSON!"),),
+            json.JSONDecodeError,
+            (
+                (ERROR, "JSON error in schema file"),
+                (CRITICAL, "Schema registration for test failed: check log"),
+            ),
         ),
         (
             "bad_module_1",
-            {"type": "object", "propertie": {"bad_module_1": {}}},
-            KeyError,
+            '{"type": "hobbit", "properties": {"bad_module_1": {}}}',
+            jsonschema.SchemaError,
             (
-                (
-                    CRITICAL,
-                    "Schema for bad_module_1 module incorrectly structured, "
-                    "'properties' key missing!",
-                ),
+                (ERROR, "Module schema invalid in: "),
+                (CRITICAL, "Schema registration for bad_module_1 failed: check log"),
             ),
         ),
         (
             "bad_module_2",
-            {"type": "object", "properties": {"bad_module_1": {}}},
-            KeyError,
+            '{"type": "object", "properties": {"bad_module_1": {}}}',
+            ValueError,
             (
-                (
-                    CRITICAL,
-                    "Schema for bad_module_2 module incorrectly structured, "
-                    "'bad_module_2' key missing!",
-                ),
+                (ERROR, "Missing key in module schema bad_module_2:"),
+                (CRITICAL, "Schema registration for bad_module_2 failed: check log"),
             ),
         ),
         (
             "bad_module_3",
-            {"type": "object", "properties": {"bad_module_3": {}}},
-            KeyError,
+            '{"type": "object", "properties": {"bad_module_3": {}}}',
+            ValueError,
             (
-                (
-                    CRITICAL,
-                    "Schema for bad_module_3 module incorrectly structured, 'required'"
-                    " key missing!",
-                ),
+                (ERROR, "Missing key in module schema bad_module_3"),
+                (CRITICAL, "Schema registration for bad_module_3 failed: check log"),
             ),
         ),
     ],
 )
 def test_register_schema_errors(
-    caplog, schema_name, schema, expected_exception, expected_log_entries
+    caplog, mocker, schema_name, schema, expected_exception, expected_log_entries
 ):
     """Test that the schema registering decorator throws the correct errors."""
-    # Check that construct_combined_schema fails as expected
-    with pytest.raises(expected_exception):
 
-        @register_schema(schema_name)
-        def to_be_decorated() -> dict:
-            return schema
+    data = mocker.mock_open(read_data=schema)
+    mocker.patch("builtins.open", data)
 
-        to_be_decorated()
+    # Check that construct_combined_schema fails as expected
+    with pytest.raises(expected_exception):
+        register_schema(schema_name, "file_path")
 
     # Then check that the correct (critical error) log messages are emitted
     log_check(caplog, expected_log_entries)

diff --git a/virtual_rainforest/__init__.py b/virtual_rainforest/__init__.py
@@ -1,13 +1,18 @@
 import importlib.metadata
+import pkgutil
+from importlib import import_module
 
-# Import all module schema here to ensure that they are added to the registry
-from virtual_rainforest.core import schema  # noqa
-from virtual_rainforest.models.abiotic import schema  # noqa
-from virtual_rainforest.models.abiotic.abiotic_model import AbioticModel  # noqa
-from virtual_rainforest.models.plants import schema  # noqa
-from virtual_rainforest.models.soil import schema  # noqa
+import virtual_rainforest.models as vfm
 
-# Import models here so that they also end up in the registry
-from virtual_rainforest.models.soil.soil_model import SoilModel  # noqa
+# Import the core module schema to register the schema
+import_module("virtual_rainforest.core")
+
+# Autodiscover models in the models module to add their schema and BaseModel subclass
+# All modules within virtual_rainforest.model are expected to:
+# - import their BaseModel subclass to the module root, for example:
+#   from virtual_rainforest.models.soil.model import SoilModel  # noqa: F401
+# - register their configuration schema using core.config.register_schema
+for module_info in pkgutil.iter_modules(vfm.__path__):
+    import_module(f"virtual_rainforest.models.{module_info.name}")
 
 __version__ = importlib.metadata.version("virtual_rainforest")
diff --git a/virtual_rainforest/core/__init__.py b/virtual_rainforest/core/__init__.py
@@ -27,19 +27,9 @@
 configuration schema for the core submodules.
 """  # noqa: D205, D415
 
-import json
-from pathlib import Path
+from importlib import resources
 
 from virtual_rainforest.core.config import register_schema
 
-
-@register_schema("core")
-def schema() -> dict:
-    """Defines the schema that the core module configuration should conform to."""
-
-    schema_file = Path(__file__).parent.resolve() / "core_schema.json"
-
-    with schema_file.open() as f:
-        config_schema = json.load(f)
-
-    return config_schema
+with resources.path("virtual_rainforest.core", "core_schema.json") as schema_file_path:
+    register_schema(module_name="core", schema_file_path=schema_file_path)