Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install Poetry
uses: abatilo/actions-poetry@v4.0.0
uses: abatilo/actions-poetry@v3.0.2
with:
poetry-version: 1.5.1

Expand Down Expand Up @@ -71,7 +71,7 @@ jobs:
python-version: "3.10"

- name: Install Poetry
uses: abatilo/actions-poetry@v4.0.0
uses: abatilo/actions-poetry@v3.0.2
with:
poetry-version: 1.5.1

Expand Down
2 changes: 2 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ class MyReferenceStyle(AuthorYearReferenceStyle):
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.__repr__"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.count"),
("py:obj", "virtual_ecosystem.core.grid.GRID_STRUCTURE_SIG.index"),
("py:exc", "ParserError"),
("py:exc", "BadZipFile"),
]
intersphinx_mapping = {
"numpy": ("https://numpy.org/doc/stable/", None),
Expand Down
165 changes: 95 additions & 70 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ tomli = {version = "^2.0.1", python = "<3.11"}
tomli-w = "^1.0.0"
tqdm = "^4.66.2"
xarray = ">=2024.6,<2026.0"
openpyxl = "^3.1.5"

[tool.poetry.group.types.dependencies]
types-dataclasses = "^0.6.6"
Expand Down
2 changes: 2 additions & 0 deletions tests/core/data/garbage.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
1,2,"3
Binary file added tests/core/data/garbage.xlsx
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/core/data/reader_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
var1,var2
1,4
2,5
3,6
Binary file added tests/core/data/reader_test.xlsx
Binary file not shown.
100 changes: 99 additions & 1 deletion tests/core/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

from contextlib import nullcontext as does_not_raise
from logging import CRITICAL, DEBUG, INFO
from zipfile import BadZipFile

import pytest
from pandas.errors import ParserError
from xarray import DataArray

from tests.conftest import log_check
Expand Down Expand Up @@ -93,7 +95,11 @@ def test_func():
],
)
def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_log):
"""Test the netdcf variable loader."""
"""Test the netdcf variable loader.

The tests here are dependent on the test_file_format_loader, so cannot be run
individually.
"""

from virtual_ecosystem.core.readers import load_netcdf

Expand All @@ -105,6 +111,98 @@ def test_load_netcdf(shared_datadir, caplog, file, file_var, exp_err, expected_l
log_check(caplog, expected_log)


@pytest.mark.parametrize(
    argnames=["file", "file_var", "exp_err", "expected_log"],
    argvalues=[
        (
            "not_there.csv",
            "irrelevant",
            pytest.raises(FileNotFoundError),
            ((CRITICAL, "Data file not found"),),
        ),
        (
            "garbage.csv",
            "irrelevant",
            pytest.raises(ParserError),
            ((CRITICAL, "Could not load data from"),),
        ),
        (
            "reader_test.csv",
            "missing",
            pytest.raises(KeyError),
            ((CRITICAL, "Variable missing not found in"),),
        ),
        (
            "reader_test.csv",
            "var1",
            does_not_raise(),
            (),
        ),
    ],
)
def test_load_csv(shared_datadir, caplog, file, file_var, exp_err, expected_log):
    """Test the CSV variable loader.

    Checks that load_csv raises the expected exceptions and emits the expected
    critical log messages for a missing file, unparseable content and a missing
    variable, and that it returns a DataArray on success.
    """

    from virtual_ecosystem.core.readers import load_csv

    with exp_err:
        darray = load_csv(shared_datadir / file, file_var)
        assert isinstance(darray, DataArray)

    # Check the error reports
    log_check(caplog, expected_log)


@pytest.mark.parametrize(
    argnames=["file", "file_var", "exp_err", "expected_log"],
    argvalues=[
        (
            "not_there.xlsx",
            "irrelevant",
            pytest.raises(FileNotFoundError),
            ((CRITICAL, "Data file not found"),),
        ),
        (
            "garbage.xlsx",
            "irrelevant",
            pytest.raises(BadZipFile),
            ((CRITICAL, "Could not load data from"),),
        ),
        (
            "reader_test.xlsx",
            "missing",
            pytest.raises(KeyError),
            ((CRITICAL, "Variable missing not found in"),),
        ),
        (
            "reader_test.xlsx",
            "var1",
            does_not_raise(),
            (),
        ),
    ],
)
def test_load_excel(shared_datadir, caplog, file, file_var, exp_err, expected_log):
    """Test the Excel variable loader.

    Checks that load_excel raises the expected exceptions and emits the expected
    critical log messages for a missing file, a corrupted workbook and a missing
    variable, and that it returns a DataArray on success.
    """

    from virtual_ecosystem.core.readers import load_excel

    with exp_err:
        darray = load_excel(shared_datadir / file, file_var)
        assert isinstance(darray, DataArray)

    # Check the error reports
    log_check(caplog, expected_log)


@pytest.mark.parametrize(
argnames=[
"filename",
Expand Down
87 changes: 86 additions & 1 deletion virtual_ecosystem/core/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def new_function_to_load_tif_data(...):

from collections.abc import Callable
from pathlib import Path
from zipfile import BadZipFile

from pandas import read_csv, read_excel
from pandas.errors import ParserError
from xarray import DataArray, load_dataset

from virtual_ecosystem.core.logger import LOGGER
Expand All @@ -55,7 +58,7 @@ def new_function_to_load_tif_data(...):
"""


def register_file_format_loader(file_types: tuple[str]) -> Callable:
def register_file_format_loader(file_types: tuple[str, ...]) -> Callable:
"""Adds a data loader function to the data loader registry.

This decorator is used to register a function that loads data from a given file type
Expand Down Expand Up @@ -134,6 +137,88 @@ def load_netcdf(file: Path, var_name: str) -> DataArray:
return dataset[var_name]


@register_file_format_loader(file_types=(".csv",))
def load_csv(file: Path, var_name: str) -> DataArray:
    """Loads a DataArray from a csv file.

    Args:
        file: A Path for a csv file containing the variable to load.
        var_name: A string providing the name of the variable in this file.

    Returns:
        A DataArray containing the values of the named column.

    Raises:
        FileNotFoundError: with bad file path names.
        ParserError: if the csv data is not readable.
        KeyError: if the named variable is not a column in the file.
    """

    to_raise: Exception

    # Try to load file
    try:
        dataset = read_csv(file)
    except FileNotFoundError:
        to_raise = FileNotFoundError(f"Data file not found: {file}")
        LOGGER.critical(to_raise)
        raise to_raise
    except ParserError as err:
        to_raise = ParserError(f"Could not load data from {file}: {err}.")
        LOGGER.critical(to_raise)
        raise to_raise

    # Check if file var is in the dataset
    if var_name not in dataset.columns:
        to_raise = KeyError(f"Variable {var_name} not found in {file}")
        LOGGER.critical(to_raise)
        raise to_raise

    return dataset[var_name].to_xarray()


@register_file_format_loader(file_types=(".xlsx",))
def load_excel(file: Path, var_name: str) -> DataArray:
    """Loads a DataArray from an excel file.

    Args:
        file: A Path for an excel file containing the variable to load.
        var_name: A string providing the name of the variable in this file.

    Returns:
        A DataArray containing the values of the named column.

    Raises:
        FileNotFoundError: with bad file path names.
        BadZipFile: if the excel file is corrupted.
        KeyError: if the named variable is not a column in the file.
        Exception: catches other exceptions from openpyxl.

    Note: BadZipFile is the most common error thrown by openpyxl for corrupted
    excel files, which is based on their internal processing files as zips. The
    general exception is included to cover other possible issues from openpyxl,
    as it has various other potential failure modes.
    """

    to_raise: Exception

    # Determine dataframe file type & load file
    try:
        dataset = read_excel(file, engine="openpyxl")
    except FileNotFoundError:
        to_raise = FileNotFoundError(f"Data file not found: {file}")
        LOGGER.critical(to_raise)
        raise to_raise
    except BadZipFile as err:
        to_raise = BadZipFile(f"Could not load data from {file}: {err}.")
        LOGGER.critical(to_raise)
        raise to_raise
    except Exception as err:
        # openpyxl has assorted failure modes for corrupt or non-excel inputs;
        # wrap anything unexpected rather than trying to enumerate them all.
        to_raise = Exception(f"Unidentified exception opening {file}: {err}")
        LOGGER.critical(to_raise)
        raise to_raise

    # Check if file var is in the dataset
    if var_name not in dataset.columns:
        to_raise = KeyError(f"Variable {var_name} not found in {file}")
        LOGGER.critical(to_raise)
        raise to_raise

    return dataset[var_name].to_xarray()


def load_to_dataarray(
file: Path,
var_name: str,
Expand Down