Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
13fa0bb
Refactor long_dataframe
ecomodeller Jan 3, 2024
dee2220
Test should run irrespective of branch
ecomodeller Jan 3, 2024
46cac79
Undo
ecomodeller Jan 4, 2024
1807f3a
TODO
ecomodeller Jan 4, 2024
aa71ad6
sk for skill, gs for gridded_skill
jsmariegaard Jan 4, 2024
c1eafad
groupby sort=False, xy nan if non-unique
jsmariegaard Jan 4, 2024
b9c8131
temporary disable nan for non-unique xy
jsmariegaard Jan 4, 2024
cbf452e
rename local
jsmariegaard Jan 4, 2024
2d3345e
Old bug fixed - due to previous different ordering in skill this lead…
jsmariegaard Jan 5, 2024
f9edfc9
fix tests
jsmariegaard Jan 5, 2024
340f05b
to_dataframe
jsmariegaard Jan 5, 2024
23b07ea
test to_dataframe methods
jsmariegaard Jan 5, 2024
5b90d16
docstrings and sort_values
jsmariegaard Jan 5, 2024
20c0a09
include aux data in to_dataframe
jsmariegaard Jan 5, 2024
8b7c307
docstrings
jsmariegaard Jan 5, 2024
3ada90b
_append_xy_to_res; fixes wrong x y for track
jsmariegaard Jan 5, 2024
8c405a6
insert note in docstring for matching multi models
jsmariegaard Jan 5, 2024
8e6e981
n_points_removed
jsmariegaard Jan 5, 2024
99dc10b
Merge pull request #382 from DHI/refactor_long_dataframe
jsmariegaard Jan 7, 2024
a13ce4c
catch new warning
jsmariegaard Jan 7, 2024
df8b196
Merge branch 'groupby-sort-false' of https://github.com/DHI/modelskil…
jsmariegaard Jan 7, 2024
9355c3a
test_xy_in_skill
jsmariegaard Jan 7, 2024
d78762e
test_xy_in_skill_no_obs
jsmariegaard Jan 7, 2024
5aecd56
test_skill_mm_swaplevel, test_skill_mm_sort_index
jsmariegaard Jan 7, 2024
0e79b07
test_skill_mm_sort_values
jsmariegaard Jan 7, 2024
2a2ae4b
Use .data instead of ._df when creating new SkillTable class
jsmariegaard Jan 7, 2024
17b0363
test x, y on skill() from Comparer; and that x, y are kept under oper…
jsmariegaard Jan 7, 2024
364971e
also sel
jsmariegaard Jan 7, 2024
a542443
More tweaks to long_dataframe
ecomodeller Jan 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/full_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ on:
push:
branches: [ main]
pull_request:
branches: [ main ]

jobs:
build:
Expand Down
105 changes: 64 additions & 41 deletions modelskill/comparison/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Iterable,
overload,
Hashable,
Tuple,
)
import warnings
import zipfile
Expand Down Expand Up @@ -82,17 +83,23 @@ def _all_df_template(n_quantities: int = 1):
class ComparerCollection(Mapping, Scoreable):
"""
Collection of comparers, constructed by calling the `modelskill.match`
method. Can be indexed by name or index.
method or by initializing with a list of comparers.

NOTE: In case of multiple model results with different time coverage,
only the _overlapping_ time period will be used! (intersection)

Examples
--------
>>> import modelskill as ms
>>> mr = ms.DfsuModelResult("Oresund2D.dfsu", item=0)
>>> o1 = ms.PointObservation("klagshamn.dfs0", item=0, x=366844, y=6154291, name="Klagshamn")
>>> o2 = ms.PointObservation("drogden.dfs0", item=0, x=355568.0, y=6156863.0)
>>> cc = ms.match(obs=[o1,o2], mod=mr)
>>> sk = cc.skill()
>>> cc["Klagshamn"].plot.timeseries()
>>> cmp1 = ms.match(o1, mr) # Comparer
>>> cmp2 = ms.match(o2, mr) # Comparer
>>> ccA = ms.ComparerCollection([cmp1, cmp2])
>>> ccB = ms.match(obs=[o1, o2], mod=mr)
>>> sk = ccB.skill()
>>> ccB["Klagshamn"].plot.timeseries()
"""

plotter = ComparerCollectionPlotter
Expand Down Expand Up @@ -562,65 +569,80 @@ def skill(
res = _groupby_df(df, by, pmetrics)
mtr_cols = [m.__name__ for m in pmetrics] # type: ignore
res = res.dropna(subset=mtr_cols, how="all") # TODO: ok to remove empty?
res["x"] = df.groupby(by=by, observed=False).x.first()
res["y"] = df.groupby(by=by, observed=False).y.first()
# TODO: set x,y to NaN if TrackObservation, x.nunique() > 1
res = self._append_xy_to_res(res, cc)
res = cc._add_as_col_if_not_in_index(df, skilldf=res)
return SkillTable(res)

def _to_long_dataframe(self, attrs_keys=None, observed=False) -> pd.DataFrame:
def _to_long_dataframe(
self, attrs_keys: Iterable[str] | None = None, observed: bool = False
) -> pd.DataFrame:
"""Return a copy of the data as a long-format pandas DataFrame (for groupby operations)"""
# TODO delegate to each comparer
attrs_keys = attrs_keys or []
res = _all_df_template(self.n_quantities)
frames = []
cols = list(res.keys()) + attrs_keys

for cmp in self._comparers.values():
for j in range(cmp.n_models):
mod_name = cmp.mod_names[j]
attrs = {key: cmp.data.attrs.get(key, False) for key in attrs_keys}
for mod_name in cmp.mod_names:
# drop "x", "y", ?
df = cmp.data.drop_vars(["z"])[[mod_name]].to_dataframe().copy()
df = df.rename(columns={mod_name: "mod_val"})
df["model"] = mod_name
df["observation"] = cmp.name
df = (
cmp.data[[mod_name]]
.to_dataframe()
.copy()
.rename(columns={mod_name: "mod_val"})
.assign(model=mod_name, observation=cmp.name, x=cmp.x, y=cmp.y)
.assign(obs_val=cmp.data["Observation"].values)
.assign(**attrs)
)
if self.n_quantities > 1:
df["quantity"] = cmp.quantity.name
df["x"] = cmp.x
df["y"] = cmp.y
df["obs_val"] = cmp.data["Observation"].values
for key in attrs_keys:
if key in cmp.data.attrs:
df[key] = cmp.data.attrs[key]
else:
df[key] = False
frames.append(df[cols])
if len(frames) > 0:
res = pd.concat(frames)

frames.append(df)
res = pd.concat(frames)
cat_cols = res.select_dtypes(include=["object"]).columns
res[cat_cols] = res[cat_cols].astype("category")

if observed:
res = res.loc[~(res == False).any(axis=1)] # noqa
res.index.name = "time"
# TODO change index to regular column, it is not unique
return res

@staticmethod
def _attrs_keys_in_by(by):
"""Check if 'attrs:' is in by and return attrs_keys"""
def _attrs_keys_in_by(by: str | List[str]) -> Tuple[List[str], List[str]]:
"""skill() helper: Check if 'attrs:' is in by and return attrs_keys"""
attrs_keys = []
by = [by] if isinstance(by, str) else by
for j, b in enumerate(by):
if isinstance(b, str) and b.startswith("attrs:"):
key = b.split(":")[1]
attrs_keys.append(key)
by[j] = key # remove 'attrs:' prefix
attrs_keys = None if len(attrs_keys) == 0 else attrs_keys
return by, attrs_keys

@staticmethod
def _append_xy_to_res(res: pd.DataFrame, cc: ComparerCollection) -> pd.DataFrame:
    """skill() helper: attach x and y columns to the skill table where possible.

    Point observations get their fixed coordinates; all other rows
    (e.g. track observations) are left as NaN.
    """
    # seed both coordinate columns with "unknown" (mutates res in place,
    # matching how the caller uses the returned frame)
    res["x"] = np.nan
    res["y"] = np.nan

    if "observation" not in res.index.names:
        # no observation level in the index -> nothing to match against
        return res

    # temporarily flatten the (Multi)Index so rows can be
    # selected by observation name
    level_names = res.index.names
    flat = res.reset_index()
    for cmp in cc:
        if cmp.gtype != "point":
            continue
        mask = flat.observation == cmp.name
        flat.loc[mask, "x"] = cmp.x
        flat.loc[mask, "y"] = cmp.y
    return flat.set_index(level_names)

def _add_as_col_if_not_in_index(
self, df, skilldf, fields=["model", "observation", "quantity"]
):
"""Add a field to skilldf if unique in df"""
"""skill() helper: Add a field to skilldf if unique in df"""
for field in reversed(fields):
if (field == "model") and (self.n_models <= 1):
continue
Expand Down Expand Up @@ -681,7 +703,8 @@ def gridded_skill(
--------
>>> import modelskill as ms
>>> cc = ms.match([HKNA,EPL,c2], mr) # with satellite track measurements
>>> cc.gridded_skill(metrics='bias')
>>> gs = cc.gridded_skill(metrics='bias')
>>> gs.data
<xarray.Dataset>
Dimensions: (x: 5, y: 5)
Coordinates:
Expand All @@ -692,8 +715,8 @@ def gridded_skill(
n (x, y) int32 3 0 0 14 37 17 50 36 72 ... 0 0 15 20 0 0 0 28 76
bias (x, y) float64 -0.02626 nan nan ... nan 0.06785 -0.1143

>>> ds = cc.gridded_skill(binsize=0.5)
>>> ds.coords
>>> gs = cc.gridded_skill(binsize=0.5)
>>> gs.data.coords
Coordinates:
observation 'alti'
* x (x) float64 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5
Expand Down Expand Up @@ -788,9 +811,9 @@ def mean_skill(
>>> cc.mean_skill().round(2)
n bias rmse urmse mae cc si r2
HKZN_local 564 -0.09 0.31 0.28 0.24 0.97 0.09 0.99
>>> s = cc.mean_skill(weights="equal")
>>> s = cc.mean_skill(weights="points")
>>> s = cc.mean_skill(weights={"EPL": 2.0}) # more weight on EPL, others=1.0
>>> sk = cc.mean_skill(weights="equal")
>>> sk = cc.mean_skill(weights="points")
>>> sk = cc.mean_skill(weights={"EPL": 2.0}) # more weight on EPL, others=1.0
"""

# TODO remove in v1.1
Expand Down Expand Up @@ -819,13 +842,13 @@ def mean_skill(

# skill assessment
pmetrics = _parse_metric(metrics)
s = cc.skill(metrics=pmetrics)
if s is None:
sk = cc.skill(metrics=pmetrics)
if sk is None:
return None
skilldf = s.to_dataframe()
skilldf = sk.to_dataframe()

# weights
weights = cc._parse_weights(weights, s.obs_names)
weights = cc._parse_weights(weights, sk.obs_names)
skilldf["weights"] = (
skilldf.n if weights is None else np.tile(weights, len(mod_names)) # type: ignore
)
Expand Down Expand Up @@ -946,7 +969,7 @@ def _parse_weights(self, weights, observations):
weights = np.ones(n_obs) # equal weight to all
elif isinstance(weights, dict):
w_dict = weights
weights = [w_dict.get(name, 1.0) for name in (self.obs_names)]
weights = [w_dict.get(name, 1.0) for name in observations]

elif isinstance(weights, str):
if weights.lower() == "equal":
Expand Down
6 changes: 3 additions & 3 deletions modelskill/comparison/_collection_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,13 +497,13 @@ def taylor(

metrics = [mtr._std_obs, mtr._std_mod, mtr.cc]
skill_func = self.cc.mean_skill if aggregate_observations else self.cc.skill
s = skill_func(
sk = skill_func(
metrics=metrics, # type: ignore
)
if s is None:
if sk is None:
return

df = s.to_dataframe()
df = sk.to_dataframe()
ref_std = 1.0 if normalize_std else df.iloc[0]["_std_obs"]

if isinstance(df.index, pd.MultiIndex):
Expand Down
31 changes: 26 additions & 5 deletions modelskill/comparison/_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,9 +973,8 @@ def skill(

df = cmp._to_long_dataframe()
res = _groupby_df(df, by, metrics)
res["x"] = df.groupby(by=by, observed=False).x.first()
res["y"] = df.groupby(by=by, observed=False).y.first()
# TODO: set x,y to NaN if TrackObservation
res["x"] = np.nan if self.gtype == "track" else cmp.x
res["y"] = np.nan if self.gtype == "track" else cmp.y
res = self._add_as_col_if_not_in_index(df, skilldf=res)
return SkillTable(res)

Expand Down Expand Up @@ -1107,8 +1106,8 @@ def gridded_skill(
n (x, y) int32 3 0 0 14 37 17 50 36 72 ... 0 0 15 20 0 0 0 28 76
bias (x, y) float64 -0.02626 nan nan ... nan 0.06785 -0.1143

>>> ds = cc.gridded_skill(binsize=0.5)
>>> ds.coords
>>> gs = cc.gridded_skill(binsize=0.5)
>>> gs.data.coords
Coordinates:
observation 'alti'
* x (x) float64 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5
Expand Down Expand Up @@ -1184,6 +1183,28 @@ def remove_bias(self, correct="Model") -> Comparer:
)
return cmp

def to_dataframe(self) -> pd.DataFrame:
    """Convert matched data to pandas DataFrame

    Include x, y coordinates only if gtype=track

    Returns
    -------
    pd.DataFrame
        data as a pandas DataFrame
    """
    if self.gtype == str(GeometryType.POINT):
        # scalar coordinate variables would otherwise appear as
        # (constant) columns in the dataframe, so drop them
        return self.data.drop_vars(["x", "y", "z"]).to_dataframe()
    if self.gtype == str(GeometryType.TRACK):
        df = self.data.drop_vars(["z"]).to_dataframe()
        # reorder columns so that x, y come first
        leading = ["x", "y"]
        trailing = [c for c in df.columns if c not in leading]
        return df[leading + trailing]
    raise NotImplementedError(f"Unknown gtype: {self.gtype}")

def save(self, filename: Union[str, Path]) -> None:
"""Save to netcdf file

Expand Down
5 changes: 1 addition & 4 deletions modelskill/comparison/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,16 @@ def _groupby_df(
) -> pd.DataFrame:
def calc_metrics(group):
row = {}
# row["x"] = group.x.first() if group.x.nunique() == 1 else np.nan
# row["y"] = group.y.first() if group.y.nunique() == 1 else np.nan
row["n"] = len(group)
for metric in metrics:
row[metric.__name__] = metric(group.obs_val, group.mod_val)
return pd.Series(row)

# .drop(columns=["x", "y"])
if _dt_in_by(by):
df, by = _add_dt_to_df(df, by)

# sort=False to avoid re-ordering compared to original cc (also for performance)
res = df.groupby(by=by, observed=False).apply(calc_metrics)
res = df.groupby(by=by, observed=False, sort=False).apply(calc_metrics)

if n_min:
# nan for all cols but n
Expand Down
14 changes: 13 additions & 1 deletion modelskill/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,14 @@ def match(
gtype=None,
max_model_gap=None,
):
"""Compare observations and model results
"""Match observation and model result data in space and time

NOTE: In case of multiple model results with different time coverage,
only the _overlapping_ time period will be used! (intersection)

NOTE: In case of multiple observations, multiple models can _only_
be matched if they are _all_ of SpatialField type, e.g. DfsuModelResult
or GridModelResult.

Parameters
----------
Expand Down Expand Up @@ -468,9 +475,14 @@ def _select_overlapping_trackdata_with_tolerance(
df = mod_df.join(obs_df, how="inner", lsuffix="_mod", rsuffix="_obs")

# 2. remove model points outside observation track
n_points = len(df)
keep_x = np.abs((df.x_mod - df.x_obs)) < spatial_tolerance
keep_y = np.abs((df.y_mod - df.y_obs)) < spatial_tolerance
df = df[keep_x & keep_y]
if n_points_removed := n_points - len(df):
warnings.warn(
f"Removed {n_points_removed} model points outside observation track (spatial_tolerance={spatial_tolerance})"
)
return mri.data.sel(time=df.index)


Expand Down
Loading