Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install Poetry
uses: abatilo/actions-poetry@v4.0.0
uses: abatilo/actions-poetry@v3.0.2
with:
poetry-version: 1.5.1

Expand Down Expand Up @@ -71,7 +71,7 @@ jobs:
python-version: "3.10"

- name: Install Poetry
uses: abatilo/actions-poetry@v4.0.0
uses: abatilo/actions-poetry@v3.0.2
with:
poetry-version: 1.5.1

Expand Down
2 changes: 2 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ class MyReferenceStyle(AuthorYearReferenceStyle):
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.__repr__"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.count"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.index"),
("py:exc", "ParserError"),
("py:exc", "BadZipFile"),
]
intersphinx_mapping = {
"numpy": ("https://numpy.org/doc/stable/", None),
Expand Down
165 changes: 95 additions & 70 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ tomli = {version = "^2.0.1", python = "<3.11"}
tomli-w = "^1.0.0"
tqdm = "^4.66.2"
xarray = ">=2024.6,<2026.0"
openpyxl = "^3.1.5"

[tool.poetry.group.types.dependencies]
types-dataclasses = "^0.6.6"
Expand Down
2 changes: 2 additions & 0 deletions tests/core/data/garbage.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
1,2,"3
Binary file added tests/core/data/garbage.xlsx
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/core/data/reader_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
var1,var2
1,4
2,5
3,6
Binary file added tests/core/data/reader_test.xlsx
Binary file not shown.
100 changes: 99 additions & 1 deletion tests/core/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

from contextlib import nullcontext as does_not_raise
from logging import CRITICAL, DEBUG, INFO
from zipfile import BadZipFile

import pytest
from pandas.errors import ParserError
from xarray import DataArray

from tests.conftest import log_check
Expand Down Expand Up @@ -93,7 +95,11 @@ def test_func():
],
)
def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_log):
"""Test the netdcf variable loader."""
"""Test the netdcf variable loader.

The tests here are dependent on the test_file_format_loader, so cannot be run
individually.
"""

from virtual_ecosystem.core.readers import load_netcdf

Expand All @@ -105,6 +111,98 @@ def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_l
log_check(caplog, expected_log)


@pytest.mark.parametrize(
    argnames=["file", "file_var", "exp_err", "expected_log"],
    argvalues=[
        (
            "not_there.csv",
            "irrelevant",
            pytest.raises(FileNotFoundError),
            ((CRITICAL, "Data file not found"),),
        ),
        (
            "garbage.csv",
            "irrelevant",
            pytest.raises(ParserError),
            ((CRITICAL, "Could not load data from"),),
        ),
        (
            "reader_test.csv",
            "missing",
            pytest.raises(KeyError),
            ((CRITICAL, "Variable missing not found in"),),
        ),
        (
            "reader_test.csv",
            "var1",
            does_not_raise(),
            (),
        ),
    ],
)
def test_load_csv(shared_datadir, caplog, file, file_var, exp_err, expected_log):
    """Test the CSV variable loader.

    Checks that load_csv raises the expected exceptions and emits the expected
    critical log messages for a missing file, unparseable content and a missing
    variable, and that it returns a DataArray on success.
    """

    from virtual_ecosystem.core.readers import load_csv

    with exp_err:
        darray = load_csv(shared_datadir / file, file_var)
        assert isinstance(darray, DataArray)

    # Check the error reports
    log_check(caplog, expected_log)


@pytest.mark.parametrize(
    argnames=["file", "file_var", "exp_err", "expected_log"],
    argvalues=[
        (
            "not_there.xlsx",
            "irrelevant",
            pytest.raises(FileNotFoundError),
            ((CRITICAL, "Data file not found"),),
        ),
        (
            "garbage.xlsx",
            "irrelevant",
            pytest.raises(BadZipFile),
            ((CRITICAL, "Could not load data from"),),
        ),
        (
            "reader_test.xlsx",
            "missing",
            pytest.raises(KeyError),
            ((CRITICAL, "Variable missing not found in"),),
        ),
        (
            "reader_test.xlsx",
            "var1",
            does_not_raise(),
            (),
        ),
    ],
)
def test_load_excel(shared_datadir, caplog, file, file_var, exp_err, expected_log):
    """Test the Excel variable loader.

    Checks that load_excel raises the expected exceptions and emits the expected
    critical log messages for a missing file, a corrupted workbook and a missing
    variable, and that it returns a DataArray on success.
    """

    from virtual_ecosystem.core.readers import load_excel

    with exp_err:
        darray = load_excel(shared_datadir / file, file_var)
        assert isinstance(darray, DataArray)

    # Check the error reports
    log_check(caplog, expected_log)


@pytest.mark.parametrize(
argnames=[
"filename",
Expand Down
87 changes: 86 additions & 1 deletion virtual_ecosystem/core/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def new_function_to_load_tif_data(...):

from collections.abc import Callable
from pathlib import Path
from zipfile import BadZipFile

from pandas import read_csv, read_excel
from pandas.errors import ParserError
from xarray import DataArray, load_dataset

from virtual_ecosystem.core.logger import LOGGER
Expand All @@ -55,7 +58,7 @@ def new_function_to_load_tif_data(...):
"""


def register_file_format_loader(file_types: tuple[str]) -> Callable:
def register_file_format_loader(file_types: tuple[str, ...]) -> Callable:
"""Adds a data loader function to the data loader registry.

This decorator is used to register a function that loads data from a given file type
Expand Down Expand Up @@ -134,6 +137,88 @@ def load_netcdf(file: Path, var_name: str) -> DataArray:
return dataset[var_name]


@register_file_format_loader(file_types=(".csv",))
def load_csv(file: Path, var_name: str) -> DataArray:
    """Loads a DataArray from a csv file.

    Args:
        file: A Path for a csv file containing the variable to load.
        var_name: A string providing the name of the variable in this file.

    Returns:
        A DataArray containing the values of the named column.

    Raises:
        FileNotFoundError: with bad file path names.
        ParserError: if the csv data is not readable.
        KeyError: if the named variable is not a column in the file.
    """

    to_raise: Exception

    # Try to load file
    try:
        dataset = read_csv(file)
    except FileNotFoundError:
        to_raise = FileNotFoundError(f"Data file not found: {file}")
        LOGGER.critical(to_raise)
        raise to_raise
    except ParserError as err:
        to_raise = ParserError(f"Could not load data from {file}: {err}.")
        LOGGER.critical(to_raise)
        raise to_raise

    # Check if file var is in the dataset
    if var_name not in dataset.columns:
        to_raise = KeyError(f"Variable {var_name} not found in {file}")
        LOGGER.critical(to_raise)
        raise to_raise

    return dataset[var_name].to_xarray()


@register_file_format_loader(file_types=(".xlsx",))
def load_excel(file: Path, var_name: str) -> DataArray:
    """Loads a DataArray from an excel file.

    Args:
        file: A Path for an excel file containing the variable to load.
        var_name: A string providing the name of the variable in this file.

    Returns:
        A DataArray containing the values of the named column.

    Raises:
        FileNotFoundError: with bad file path names.
        BadZipFile: if the excel file is corrupted.
        KeyError: if the named variable is not a column in the file.
        Exception: catches other exceptions from openpyxl.

    Note: BadZipFile is the most common error thrown by openpyxl for corrupted
    excel files, which is based on their internal processing files as zips. The
    general exception is included to cover other possible issues from openpyxl,
    as it has various other potential failure modes.
    """

    to_raise: Exception

    # Determine dataframe file type & load file
    try:
        dataset = read_excel(file, engine="openpyxl")
    except FileNotFoundError:
        to_raise = FileNotFoundError(f"Data file not found: {file}")
        LOGGER.critical(to_raise)
        raise to_raise
    except BadZipFile as err:
        to_raise = BadZipFile(f"Could not load data from {file}: {err}.")
        LOGGER.critical(to_raise)
        raise to_raise
    except Exception as err:
        # openpyxl has assorted failure modes for corrupt or non-excel inputs;
        # wrap anything unexpected rather than trying to enumerate them all.
        to_raise = Exception(f"Unidentified exception opening {file}: {err}")
        LOGGER.critical(to_raise)
        raise to_raise

    # Check if file var is in the dataset
    if var_name not in dataset.columns:
        to_raise = KeyError(f"Variable {var_name} not found in {file}")
        LOGGER.critical(to_raise)
        raise to_raise

    return dataset[var_name].to_xarray()


def load_to_dataarray(
file: Path,
var_name: str,
Expand Down