Skip to content

Commit ff292c2

Browse files
swapdewalkar authored and skrawcz committed
Add Pandas reader for spss
1 parent 2cfd6e6 commit ff292c2

3 files changed

Lines changed: 63 additions & 0 deletions

File tree

hamilton/plugins/pandas_extensions.py

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1533,6 +1533,43 @@ def name(cls) -> str:
15331533
return "excel"
15341534

15351535

1536+
@dataclasses.dataclass
class PandasSPSSReader(DataLoader):
    """Class for loading/reading spss files with Pandas.
    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_spss.html

    Every field except ``path`` is forwarded to ``pandas.read_spss`` as a
    keyword argument; ``path`` is passed as the first positional argument.
    """

    # Location of the SPSS file to read.
    path: Union[str, Path]
    # kwargs
    # NOTE(review): pandas documents ``usecols`` for read_spss as list-like
    # only -- confirm a callable is accepted before relying on that part of
    # the annotation.
    usecols: Optional[Union[List[Hashable], Callable[[str], bool]]] = None
    convert_categoricals: bool = True
    # Only forwarded when the installed pandas supports it (see
    # ``_get_loading_kwargs``).
    dtype_backend: Literal["pyarrow", "numpy_nullable"] = "numpy_nullable"

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """Return the types this loader can produce (the dataframe type only)."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Build the keyword arguments passed to ``pandas.read_spss``.

        :return: The dataclass fields as a dict, without ``path`` and without
            any option the installed pandas version does not accept.
        """
        # Local import keeps this block self-contained; only needed here.
        import inspect

        # Puts kwargs in a dict
        kwargs = dataclasses.asdict(self)

        # path corresponds to the first argument of pandas.read_spss,
        # but we send it separately
        del kwargs["path"]

        # ``dtype_backend`` was added to read_spss in pandas 2.0. Because this
        # class always carries a value for it, forwarding it unconditionally
        # makes every load fail with a TypeError on older pandas, so it is
        # dropped when the installed reader does not know the keyword.
        if "dtype_backend" not in inspect.signature(pd.read_spss).parameters:
            del kwargs["dtype_backend"]

        return kwargs

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read the SPSS file at ``path`` and collect metadata about it.

        :param type_: Requested type; not used by this loader.
        :return: The loaded dataframe and the metadata produced by
            ``utils.get_file_and_dataframe_metadata`` for the path and dataframe.
        """
        # Loads the data and returns the df and metadata of the spss file
        df = pd.read_spss(self.path, **self._get_loading_kwargs())
        metadata = utils.get_file_and_dataframe_metadata(self.path, df)
        return df, metadata

    @classmethod
    def name(cls) -> str:
        """Return the key this loader is registered under."""
        return "spss"
1571+
1572+
15361573
def register_data_loaders():
15371574
"""Function to register the data loaders for this extension."""
15381575
for loader in [
@@ -1558,6 +1595,7 @@ def register_data_loaders():
15581595
PandasORCReader,
15591596
PandasExcelWriter,
15601597
PandasExcelReader,
1598+
PandasSPSSReader,
15611599
]:
15621600
registry.register_adapter(loader)
15631601

requirements-test.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ pandera
1515
plotly
1616
polars
1717
pyarrow
18+
pyreadstat # for SPSS data loader
1819
pytest
1920
pytest-cov
2021
scikit-learn

tests/plugins/test_pandas_extensions.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
PandasParquetWriter,
2525
PandasPickleReader,
2626
PandasPickleWriter,
27+
PandasSPSSReader,
2728
PandasSqlReader,
2829
PandasSqlWriter,
2930
PandasStataReader,
@@ -274,3 +275,26 @@ def test_pandas_excel_reader(tmp_path: pathlib.Path) -> None:
274275
"department",
275276
"email",
276277
]
278+
279+
280+
def test_pandas_spss_reader(tmp_path: pathlib.Path) -> None:
    """Write the Excel fixture out as a .sav file and read it back via PandasSPSSReader."""
    import pyreadstat

    # Build the SPSS input on the fly from the existing Excel test data,
    # so no binary .sav fixture has to live in the repository.
    sav_file = tmp_path / "test.sav"
    excel_reader = PandasExcelReader(path="tests/resources/data/test_load_from_data.xlsx")
    source_df, _ = excel_reader.load_data(pd.DataFrame)
    pyreadstat.write_sav(source_df, sav_file)

    # Read the freshly written file back through the loader under test.
    spss_reader = PandasSPSSReader(path=sav_file)
    df, metadata = spss_reader.load_data(pd.DataFrame)

    expected_columns = [
        "firstName",
        "lastName",
        "age",
        "department",
        "email",
    ]
    assert PandasSPSSReader.applicable_types() == [pd.DataFrame]
    assert df.loc[0, "firstName"] == "John"
    assert df.shape == (3, 5)
    assert metadata["dataframe_metadata"]["column_names"] == expected_columns

0 commit comments

Comments
 (0)