Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
329ca5f
move compare() and from_matched() to new file
jsmariegaard Sep 8, 2023
fd161dd
prepare for obs as xr
jsmariegaard Sep 8, 2023
0cb0527
temporary handle both Dataframe and xr Dataset
jsmariegaard Sep 8, 2023
f825d99
add xr.Dataset
jsmariegaard Sep 8, 2023
e717b19
update type
jsmariegaard Sep 8, 2023
ff719b1
accept pd.Series also
jsmariegaard Sep 8, 2023
4fe4a86
pd.DataFrame | pd.Series | xr.Dataset
jsmariegaard Sep 12, 2023
4526a15
use xr.Dataset; wip
jsmariegaard Sep 12, 2023
361d250
parsing data
jsmariegaard Sep 12, 2023
30f5a8e
Merge branch 'main' into use-xarray-dataset
jsmariegaard Sep 22, 2023
f043b62
Delete match.py
jsmariegaard Sep 22, 2023
4badd4f
Merge branch 'main' into use-xarray-dataset
jsmariegaard Sep 22, 2023
136ae83
temporary convert to DataFrame
jsmariegaard Sep 22, 2023
bad35ea
temporary convert to DataFrame
jsmariegaard Sep 22, 2023
7c74ca9
_values_as_series
jsmariegaard Sep 22, 2023
fa0d2dd
major refactoring of observation, changing data container to xarray a…
jsmariegaard Sep 22, 2023
c23fe6d
.sel(time=slice
jsmariegaard Sep 22, 2023
e2c0eeb
not a series
jsmariegaard Sep 22, 2023
c6aefe6
IndexError
jsmariegaard Sep 22, 2023
d7cfc1a
.time.to_index(), new error messages, and NaN or not-NaN
jsmariegaard Sep 22, 2023
00c9ff9
dropna or not? default items?
jsmariegaard Sep 22, 2023
94a221a
from __future__ import annotations
jsmariegaard Sep 22, 2023
05bd64e
removed wrong type-hints
jsmariegaard Sep 23, 2023
ef0114e
remove unused
jsmariegaard Sep 23, 2023
b2ed1d1
clean-up, docstrings
jsmariegaard Sep 23, 2023
a195490
Allow also xr.DataArray
jsmariegaard Sep 23, 2023
5afe195
use PointType, TrackType
jsmariegaard Sep 23, 2023
fab778c
simplify make_unique_index
jsmariegaard Sep 23, 2023
55639f7
Change to xr.Dataset
jsmariegaard Sep 23, 2023
bff7561
temporary handle both dataframe and xr.dataset
jsmariegaard Sep 24, 2023
122ef85
n_points for all timeseries
jsmariegaard Sep 24, 2023
509f3c0
don't test for dataframe specific properties
jsmariegaard Sep 24, 2023
27406b2
temporary support both pandas and xarray
jsmariegaard Sep 24, 2023
a7c7f9f
re-implement PointModelResult init
jsmariegaard Sep 24, 2023
2136d97
add support for mikeio.Dfs0 and rename
jsmariegaard Sep 24, 2023
12d42d8
remove pandas specific testing (now xarray)
jsmariegaard Sep 24, 2023
ee693a8
values property
jsmariegaard Sep 24, 2023
8da2132
# type: ignore
jsmariegaard Sep 24, 2023
8d25b11
docstrings and remove pandas stuff
jsmariegaard Sep 24, 2023
92e6a52
use _values_as_series
jsmariegaard Sep 24, 2023
204aff3
move time-matching code from _comparison to matching
jsmariegaard Sep 24, 2023
f8adb62
_get_model_start_end and minor fixes after extracting functionality f…
jsmariegaard Sep 24, 2023
24ded45
use parse_modeldata_list and match_data_in_time
jsmariegaard Sep 24, 2023
a0d8042
disable test due to change in Comparer
jsmariegaard Sep 25, 2023
79d3cd5
TODO: fix this line as it is no longer possible to create Comparer th…
jsmariegaard Sep 25, 2023
044af66
handle empty data
jsmariegaard Sep 27, 2023
9f6165e
use match_data_in_time
jsmariegaard Sep 27, 2023
355d85c
type hint - thanks mypy
jsmariegaard Sep 27, 2023
14aedd0
remove unused
jsmariegaard Sep 27, 2023
a3d9541
type hints
jsmariegaard Sep 27, 2023
907e665
rename match_data_in_time to match_time; docstring
jsmariegaard Sep 27, 2023
e602b3b
Quantity to_dict
jsmariegaard Sep 27, 2023
afb367e
TimeSeries, only one attribute data and validate data
jsmariegaard Sep 27, 2023
f3fe90e
mv observation tests to sub folder
jsmariegaard Sep 27, 2023
9464304
use name
jsmariegaard Sep 27, 2023
46f3239
name, quantity, color
jsmariegaard Sep 27, 2023
eeaf1fb
use xr.Dataset and updated timeseries class
jsmariegaard Sep 27, 2023
800bc79
use xr.Dataset and updated timeseries class
jsmariegaard Sep 27, 2023
4b73d1b
type hints and remove unused
jsmariegaard Sep 27, 2023
2b42d74
allow setting x, y, z
jsmariegaard Sep 27, 2023
861bf89
use ds.coords for point x, y, z
jsmariegaard Sep 27, 2023
3e9656e
moved again - pytest requires unique file name
jsmariegaard Sep 27, 2023
fa162f7
weight property
jsmariegaard Sep 27, 2023
7c55c60
to_dataframe
jsmariegaard Sep 27, 2023
b7c30a9
to_dataframe temporary
jsmariegaard Sep 27, 2023
2359221
to_dataframe and gtype
jsmariegaard Sep 27, 2023
d2cddb4
relax validation, fix to_dataframe
jsmariegaard Sep 27, 2023
37d2026
x, y already defined
jsmariegaard Sep 27, 2023
8c9029b
insist on str type
jsmariegaard Sep 27, 2023
d627126
type hint after extraction (to keep mypy happy)
jsmariegaard Sep 27, 2023
8a6e6eb
Test that the dataset can be persisted as NetCDF
ecomodeller Sep 28, 2023
444210a
Try again
ecomodeller Sep 28, 2023
5bb5396
Period
ecomodeller Sep 28, 2023
6837b12
Unused imports
ecomodeller Sep 28, 2023
7c9b537
Track coordinates
ecomodeller Sep 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 37 additions & 210 deletions modelskill/comparison/_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
from datetime import datetime
from copy import deepcopy

from .. import metrics as mtr
from .. import Quantity
from .. import __version__

# from .. import __version__
from ..observation import Observation, PointObservation, TrackObservation

from ._comparer_plotter import ComparerPlotter
Expand Down Expand Up @@ -115,7 +116,6 @@ def _validate_metrics(metrics) -> None:
)


TimeDeltaTypes = Union[float, int, np.timedelta64, pd.Timedelta, timedelta]
TimeTypes = Union[str, np.datetime64, pd.Timestamp, datetime]
IdOrNameTypes = Union[int, str, List[int], List[str]]

Expand All @@ -138,60 +138,6 @@ def all(self) -> Sequence[str]:
return [self.obs] + list(self.model) + list(self.aux)


def _interp_time(df: pd.DataFrame, new_time: pd.DatetimeIndex) -> pd.DataFrame:
"""Interpolate time series to new time index"""
new_df = (
df.reindex(df.index.union(new_time))
.interpolate(method="time", limit_area="inside")
.reindex(new_time)
)
return new_df


def _time_delta_to_pd_timedelta(time_delta: TimeDeltaTypes) -> pd.Timedelta:
if isinstance(time_delta, (timedelta, np.timedelta64)):
time_delta = pd.Timedelta(time_delta)
elif np.isscalar(time_delta):
# assume seconds
time_delta = pd.Timedelta(time_delta, "s") # type: ignore
assert isinstance(time_delta, pd.Timedelta)
return time_delta


def _remove_model_gaps(
df: pd.DataFrame,
mod_index: pd.DatetimeIndex,
max_gap: TimeDeltaTypes,
) -> pd.DataFrame:
"""Remove model gaps longer than max_gap from dataframe"""
max_gap = _time_delta_to_pd_timedelta(max_gap)
valid_time = _get_valid_query_time(mod_index, df.index, max_gap)
return df.loc[valid_time]


def _get_valid_query_time(
mod_index: pd.DatetimeIndex, obs_index: pd.DatetimeIndex, max_gap: pd.Timedelta
):
"""Used only by _remove_model_gaps"""
# init dataframe of available timesteps and their index
df = pd.DataFrame(index=mod_index)
df["idx"] = range(len(df))

# for query times get available left and right index of source times
df = _interp_time(df, obs_index).dropna()
df["idxa"] = np.floor(df.idx).astype(int)
df["idxb"] = np.ceil(df.idx).astype(int)

# time of left and right source times and time delta
df["ta"] = mod_index[df.idxa]
df["tb"] = mod_index[df.idxb]
df["dt"] = df.tb - df.ta

# valid query times where time delta is less than max_gap
valid_idx = df.dt <= max_gap
return valid_idx


def _parse_metric(metric, default_metrics, return_list=False):
if metric is None:
metric = default_metrics
Expand Down Expand Up @@ -429,18 +375,13 @@ class Comparer:
"""
Comparer class for comparing model and observation data.

Typically, the Comparer is part of a ComparerCollection, initialized using the `compare` function.
Typically, the Comparer is part of a ComparerCollection,
created with the `compare` function.

Parameters
----------
observation : Observation
Observation data
modeldata : list of pd.DataFrame or list of mikeio.Dataset or list of xr.DataArray or list of xr.Dataset
Model data
max_model_gap : float or int or np.timedelta64 or pd.Timedelta or timedelta, optional
Maximum time gap to interpolate model data to observation time. If None, no interpolation is done.
matched_data : xr.Dataset, optional
Matched data. If None, observation and modeldata must be provided.
matched_data : xr.Dataset
Matched data
raw_mod_data : dict of pd.DataFrame, optional
Raw model data. If None, observation and modeldata must be provided.

Expand All @@ -462,34 +403,23 @@ class Comparer:

def __init__(
self,
observation=None,
modeldata=None,
max_model_gap: Optional[TimeDeltaTypes] = None,
matched_data: Optional[xr.Dataset] = None,
matched_data: xr.Dataset,
raw_mod_data: Optional[Dict[str, pd.DataFrame]] = None,
):
self.plot = Comparer.plotter(self)

if matched_data is not None:
self.data = self._parse_matched_data(matched_data)
self.raw_mod_data = (
raw_mod_data
if raw_mod_data is not None
else {
key: value.to_dataframe()
for key, value in matched_data.data_vars.items()
if value.attrs["kind"] == "model"
}
)
# TODO get quantity from matched_data object
# self.quantity: Quantity = Quantity.undefined()
else:
self.raw_mod_data = (
self._parse_modeldata_list(modeldata) if modeldata is not None else {}
)

self.data = self._initialise_comparer(observation, max_model_gap)
# self.quantity: Quantity = observation.quantity # TODO: make property
self.data = self._parse_matched_data(matched_data)
self.raw_mod_data = (
raw_mod_data
if raw_mod_data is not None
else {
key: value.to_dataframe()
for key, value in matched_data.data_vars.items()
if value.attrs["kind"] == "model"
}
)
# TODO get quantity from matched_data object
# self.quantity: Quantity = Quantity.undefined()

def _parse_matched_data(self, matched_data):
if not isinstance(matched_data, xr.Dataset):
Expand Down Expand Up @@ -525,103 +455,6 @@ def _parse_matched_data(self, matched_data):

return matched_data

def _mask_model_outside_observation_track(self, name, df_mod, df_obs) -> None:
if len(df_mod) == 0:
return
if len(df_mod) != len(df_obs):
raise ValueError("model and observation data must have same length")

mod_xy = df_mod[["x", "y"]]
obs_xy = df_obs[["x", "y"]]
d_xy = np.sqrt(np.sum((obs_xy - mod_xy) ** 2, axis=1))
# TODO why not use a fixed tolerance?
tol_xy = self._minimal_accepted_distance(obs_xy)
mask = d_xy > tol_xy
df_mod.loc[mask, name] = np.nan
if all(mask):
warnings.warn("no (spatial) overlap between model and observation points")

def _initialise_comparer(self, observation, max_model_gap) -> xr.Dataset:
assert isinstance(observation, (PointObservation, TrackObservation))
gtype = "point" if isinstance(observation, PointObservation) else "track"
observation = deepcopy(observation)
observation.trim(self._mod_start, self._mod_end)

first = True
for name, mdata in self.raw_mod_data.items():
df = self._model2obs_interp(observation, mdata, max_model_gap)
if gtype == "track":
# TODO why is it necessary to do mask here? Isn't it an error if the model data is outside the observation track?
self._mask_model_outside_observation_track(name, df, observation.data)

if first:
data = df
else:
data[name] = df[name]

first = False

data.index.name = "time"
data = data.dropna()
data = data.to_xarray()
data.attrs["gtype"] = gtype

if gtype == "point":
data["x"] = observation.x
data["y"] = observation.y
data["z"] = observation.z # type: ignore

data.attrs["name"] = observation.name
data.attrs["quantity_name"] = observation.quantity.name
data["x"].attrs["kind"] = "position"
data["y"].attrs["kind"] = "position"
data[self._obs_name].attrs["kind"] = "observation"
data[self._obs_name].attrs["unit"] = observation.quantity.unit
data[self._obs_name].attrs["color"] = observation.color
data[self._obs_name].attrs["weight"] = observation.weight
for n in self.mod_names:
data[n].attrs["kind"] = "model"

data.attrs["modelskill_version"] = __version__

return data

@staticmethod
def _minimal_accepted_distance(obs_xy):
# all consequtive distances
vec = np.sqrt(np.sum(np.diff(obs_xy, axis=0), axis=1) ** 2)
# fraction of small quantile
return 0.5 * np.quantile(vec, 0.1)

def _parse_modeldata_list(self, modeldata) -> Dict[str, pd.DataFrame]:
"""Convert to dict of dataframes"""
if not isinstance(modeldata, Sequence):
modeldata = [modeldata]

mod_dfs = [self._parse_single_modeldata(m) for m in modeldata]
return {m.columns[-1]: m for m in mod_dfs if m is not None}

@staticmethod
def _parse_single_modeldata(modeldata) -> pd.DataFrame:
"""Convert to dataframe and set index to pd.DatetimeIndex"""
if hasattr(modeldata, "to_dataframe"):
mod_df = modeldata.to_dataframe()
elif isinstance(modeldata, pd.DataFrame):
mod_df = modeldata
else:
raise ValueError(
f"Unknown modeldata type '{type(modeldata)}' (mikeio.Dataset, xr.DataArray, xr.Dataset or pd.DataFrame)"
)

if not isinstance(mod_df.index, pd.DatetimeIndex):
raise ValueError(
"Modeldata index must be datetime-like (pd.DatetimeIndex, pd.to_datetime)"
)

time = mod_df.index.round(freq="100us") # 0.0001s accuracy
mod_df.index = pd.DatetimeIndex(time, freq="infer")
return mod_df

@classmethod
def from_matched_data(
cls,
Expand Down Expand Up @@ -706,14 +539,22 @@ def time(self) -> pd.DatetimeIndex:
def _mod_start(self) -> pd.Timestamp:
mod_starts = [pd.Timestamp.max]
for m in self.raw_mod_data.values():
mod_starts.append(m.index[0])
time = (
m.index if isinstance(m, pd.DataFrame) else m.time
) # TODO: xr.Dataset
if len(time) > 0:
mod_starts.append(time[0])
return min(mod_starts)

@property
def _mod_end(self) -> pd.Timestamp:
mod_ends = [pd.Timestamp.min]
for m in self.raw_mod_data.values():
mod_ends.append(m.index[-1])
time = (
m.index if isinstance(m, pd.DataFrame) else m.time
) # TODO: xr.Dataset
if len(time) > 0:
mod_ends.append(time[-1])
return max(mod_ends)

@property
Expand Down Expand Up @@ -955,6 +796,7 @@ def __add__(
self, other: Union["Comparer", "ComparerCollection"]
) -> "ComparerCollection":
from ._collection import ComparerCollection
from ..matching import match_time

if not isinstance(other, (Comparer, ComparerCollection)):
raise TypeError(f"Cannot add {type(other)} to {type(self)}")
Expand All @@ -973,15 +815,12 @@ def __add__(
cmp.data = cmp.data.isel(time=index)

else:
cols = ["x", "y"]
mod_data = [self.data[cols + [m]] for m in self.mod_names]
for m in other.mod_names:
mod_data.append(other.data[cols + [m]])

cls = self.__class__
cmp = cls.__new__(cls)
cmp.__init__(self._to_observation(), mod_data)
# TODO cmp = cls.clone()
raw_mod_data = self.raw_mod_data.copy()
raw_mod_data.update(other.raw_mod_data)
matched = match_time(
observation=self._to_observation(), raw_mod_data=raw_mod_data
)
cmp = self.__class__(matched_data=matched, raw_mod_data=raw_mod_data)

return cmp
else:
Expand All @@ -990,18 +829,6 @@ def __add__(
cc.add_comparer(other)
return cc

def _model2obs_interp(
self, obs, mod_df: pd.DataFrame, max_model_gap: Optional[TimeDeltaTypes]
):
"""interpolate model to measurement time"""
df = _interp_time(mod_df.dropna(), obs.time)
df[self._obs_name] = obs.values

if max_model_gap is not None:
df = _remove_model_gaps(df, mod_df.dropna().index, max_model_gap)

return df

def sel(
self,
model: Optional[IdOrNameTypes] = None,
Expand Down
13 changes: 10 additions & 3 deletions modelskill/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import mikeio

from modelskill import ModelResult
from .matching import match_time, parse_modeldata_list
from .observation import Observation, PointObservation, TrackObservation
from .utils import is_iterable_not_str
from . import plotting
Expand Down Expand Up @@ -198,7 +199,7 @@ def extract(self, max_model_gap: Optional[float] = None) -> Optional[PointCompar
if hasattr(mr, "extract"):
mr = mr.extract(self.obs)

df = mr.data
df = mr.to_dataframe() # TODO: xr.Dataset
if (df is not None) and (len(df) > 0):
df_model.append(df)
else:
Expand All @@ -212,7 +213,10 @@ def extract(self, max_model_gap: Optional[float] = None) -> Optional[PointCompar
)
return None

comparer = PointComparer(self.obs, df_model, max_model_gap=max_model_gap)
raw_mod_data = parse_modeldata_list(df_model)
matched_data = match_time(self.obs, raw_mod_data, max_model_gap)

comparer = PointComparer(matched_data=matched_data, raw_mod_data=raw_mod_data)
return self._comparer_or_None(comparer)


Expand Down Expand Up @@ -263,7 +267,10 @@ def extract(self, max_model_gap: Optional[float] = None) -> Optional[TrackCompar
)
return None

comparer = TrackComparer(self.obs, df_model, max_model_gap=max_model_gap)
raw_mod_data = parse_modeldata_list(df_model)
matched_data = match_time(self.obs, raw_mod_data, max_model_gap)

comparer = TrackComparer(matched_data=matched_data, raw_mod_data=raw_mod_data)
return self._comparer_or_None(comparer)


Expand Down
Loading