-
Notifications
You must be signed in to change notification settings - Fork 1k
Introduce classes for collecting source statistics #19276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a7295f9
962b0df
ce1f735
7249b02
3bb6cc6
c307f21
115ba93
4756216
0d0d04f
79d9d61
77e0836
30c061a
8376f94
08c95ee
e82e499
e027966
06a7274
585aa87
47bf2dc
b3f5449
36de941
acc987f
9158e18
8903708
6072ed1
da23a23
d6594fe
4cc694a
bd784d3
b356dd0
ecb9c8c
185322d
593d79d
7fcda5d
e84e8b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,7 +4,8 @@ | |
|
|
||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING, Any | ||
| import dataclasses | ||
| from typing import TYPE_CHECKING, Any, Generic, TypeVar | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Generator, Iterator | ||
|
|
@@ -44,3 +45,109 @@ def __rich_repr__(self) -> Generator[Any, None, None]: | |
| def get_key_name(node: Node) -> str: | ||
| """Generate the key name for a Node.""" | ||
| return f"{type(node).__name__.lower()}-{hash(node)}" | ||
|
|
||
|
|
||
| T = TypeVar("T") | ||
|
|
||
|
|
||
| @dataclasses.dataclass | ||
| class ColumnStat(Generic[T]): | ||
| """ | ||
| Generic column-statistic. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| value | ||
| Statistics value. Value will be None | ||
| if the statistics is unknown. | ||
| exact | ||
| Whether the statistics is known exactly. | ||
| """ | ||
|
|
||
| value: T | None = None | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| exact: bool = False | ||
|
|
||
|
|
||
| @dataclasses.dataclass | ||
| class UniqueStats: | ||
| """ | ||
| Unique-value statistics. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| count | ||
| Unique-value count. | ||
| fraction | ||
| Unique-value fraction. This corresponds to the total | ||
| number of unique values (count) divided by the total | ||
| number of rows. | ||
| """ | ||
|
|
||
| count: ColumnStat[int] = dataclasses.field(default_factory=ColumnStat[int]) | ||
| fraction: ColumnStat[float] = dataclasses.field(default_factory=ColumnStat[float]) | ||
|
|
||
|
|
||
| class DataSourceInfo: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we will want to record/track We define Parquet- and DataFrame-specific sub-classes for
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You will get a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh right, makes sense.
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """ | ||
| Datasource information. | ||
|
|
||
| Notes | ||
| ----- | ||
| This class should be sub-classed for specific | ||
| datasource types (e.g. Parquet, DataFrame, etc.). | ||
| The required properties/methods enable lazy | ||
| sampling of the underlying datasource. | ||
| """ | ||
|
|
||
| @property | ||
| def row_count(self) -> ColumnStat[int]: | ||
| """Data source row-count estimate.""" | ||
| return ColumnStat[int]() # pragma: no cover | ||
|
|
||
| def unique_stats(self, column: str) -> UniqueStats: | ||
| """Return unique-value statistics for a column.""" | ||
| return UniqueStats() # pragma: no cover | ||
|
|
||
| def storage_size(self, column: str) -> ColumnStat[int]: | ||
| """Return the average column size for a single file.""" | ||
| return ColumnStat[int]() | ||
|
|
||
| def add_unique_stats_column(self, column: str) -> None: | ||
| """Add a column needing unique-value information.""" | ||
|
|
||
|
|
||
| class ColumnStats: | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """ | ||
| Column statistics. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| name | ||
| Column name. | ||
| source | ||
| Datasource information. | ||
| source_name | ||
| Source-column name. | ||
| unique_stats | ||
| Unique-value statistics. | ||
| """ | ||
|
|
||
| __slots__ = ("name", "source_info", "source_name", "unique_stats") | ||
|
|
||
| name: str | ||
| source_info: DataSourceInfo | ||
| source_name: str | ||
| unique_stats: UniqueStats | ||
|
|
||
| def __init__( | ||
| self, | ||
| name: str, | ||
| *, | ||
| source_info: DataSourceInfo | None = None, | ||
| source_name: str | None = None, | ||
| unique_stats: UniqueStats | None = None, | ||
| ) -> None: | ||
| self.name = name | ||
| self.source_info = source_info or DataSourceInfo() | ||
| self.source_name = source_name or name | ||
| self.unique_stats = unique_stats or UniqueStats() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that I am adding the "base" stats/info classes to the
`base` module, because these classes should not have any type dependencies (and should be available to use in other modules without any circular-dependency worries). We may want to add a dedicated `statistics` module to implement the IR-specific logic for populating/propagating this logic, but I'll leave that decision for a follow-up.