Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/hats/catalog/dataset/collection_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,6 @@ def read_from_dir(cls, catalog_dir: str | Path | UPath) -> Self:
new object from the contents of a ``collection.properties`` file in the directory.
"""
file_path = file_io.get_upath(catalog_dir) / "collection.properties"
if not file_io.does_file_or_directory_exist(file_path):
raise FileNotFoundError(f"No properties file found where expected: {str(file_path)}")
p = Properties()
with file_path.open("rb") as f:
p.load(f, "utf-8")
Expand Down
4 changes: 2 additions & 2 deletions src/hats/catalog/dataset/table_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,9 @@ def read_from_dir(cls, catalog_dir: str | Path | UPath) -> Self:
"""
catalog_path = file_io.get_upath(catalog_dir)
file_path = catalog_path / "hats.properties"
if not file_io.does_file_or_directory_exist(file_path):
if not file_path.exists():
file_path = catalog_path / "properties"
if not file_io.does_file_or_directory_exist(file_path):
if not file_path.exists():
raise FileNotFoundError(f"No properties file found where expected: {str(file_path)}")
p = Properties()
with file_path.open("rb") as f:
Expand Down
8 changes: 3 additions & 5 deletions src/hats/catalog/partition_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ def read_from_dir(cls, catalog_base_dir: str | Path | UPath | None) -> Partition
A `PartitionInfo` object with the data from the file
"""
partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
if file_io.does_file_or_directory_exist(partition_info_file):
try:
pixel_list = PartitionInfo._read_from_csv(partition_info_file)
else:
except FileNotFoundError:
warnings.warn("Computing partitions from catalog parquet files. This may be slow.")

# Read the dataset dir to get the list of files.
Expand Down Expand Up @@ -200,9 +200,7 @@ def _read_from_csv(cls, partition_info_file: str | Path | UPath) -> PartitionInf
PartitionInfo
A `PartitionInfo` object with the data from the file
"""
if not file_io.does_file_or_directory_exist(partition_info_file):
raise FileNotFoundError(f"No partition info found where expected: {str(partition_info_file)}")

partition_info_file = file_io.get_upath(partition_info_file)
data_frame = file_io.load_csv_to_pandas(partition_info_file)

return [
Expand Down
4 changes: 0 additions & 4 deletions src/hats/io/file_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@
write_string_to_file,
)
from .file_pointer import (
append_paths_to_pointer,
directory_has_contents,
does_file_or_directory_exist,
find_files_matching_path,
get_upath,
get_upath_for_protocol,
is_regular_file,
)
2 changes: 0 additions & 2 deletions src/hats/io/file_io/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,6 @@ def read_parquet_metadata(file_pointer: str | Path | UPath, **kwargs) -> pq.File
parquet file metadata (includes schema)
"""
file_pointer = get_upath(file_pointer)
if file_pointer is None or not file_pointer.exists():
raise FileNotFoundError("Parquet file does not exist")
if _parquet_precache_all_bytes(file_pointer): # pragma: no cover
return pq.read_metadata(BytesIO(file_pointer.read_bytes()), **kwargs)

Expand Down
87 changes: 0 additions & 87 deletions src/hats/io/file_io/file_pointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,93 +61,6 @@ def get_upath_for_protocol(path: str | Path) -> UPath:
return upath


def append_paths_to_pointer(pointer: str | Path | UPath, *paths: str) -> UPath:
    """Join one or more path components onto a file pointer.

    Parameters
    ----------
    pointer : str | Path | UPath
        base `FilePointer` object to extend
    *paths: str
        any number of directory names, optionally ending with a file name,
        appended to the pointer in order

    Returns
    -------
    UPath
        New file pointer formed by joining the base pointer with the given parts
    """
    return get_upath(pointer).joinpath(*paths)


def does_file_or_directory_exist(pointer: str | Path | UPath) -> bool:
    """Determine whether anything (a file or a directory) exists at the given location.

    Parameters
    ----------
    pointer : str | Path | UPath
        File Pointer to check for an existing file or directory

    Returns
    -------
    bool
        True if a file or directory exists at `pointer`, False otherwise
    """
    return get_upath(pointer).exists()


def is_regular_file(pointer: str | Path | UPath) -> bool:
    """Determine whether the given location is a regular file (NOT a directory).

    Parameters
    ----------
    pointer : str | Path | UPath
        File Pointer to test

    Returns
    -------
    bool
        True if a regular file exists at `pointer`; False if nothing exists
        there or the path is a directory
    """
    return get_upath(pointer).is_file()


def find_files_matching_path(pointer: str | Path | UPath, *paths: str) -> list[UPath]:
    """Find files or directories matching the provided path parts.

    Parameters
    ----------
    pointer : str | Path | UPath
        base File Pointer in which to find contents
    *paths: str
        any number of directory names optionally followed by a file name.
        directory or file names may be replaced with `*` as a matcher.

    Returns
    -------
    list[UPath]
        New file pointers to files found matching the path, in sorted order.
        Empty list if nothing matches; `[pointer]` itself if no parts given.
    """
    pointer = get_upath(pointer)

    if not paths:
        return [pointer]

    # Join parts with the filesystem's own separator so each `*` matches one segment.
    matcher = pointer.fs.sep.join(paths)
    # sorted() handles the empty-match case naturally (returns []), replacing the
    # original manual append loop, explicit empty check, and in-place sort.
    return sorted(pointer.rglob(matcher))


def directory_has_contents(pointer: str | Path | UPath) -> bool:
"""Checks if a directory already has some contents (any files or subdirectories)

Expand Down
10 changes: 5 additions & 5 deletions src/hats/io/parquet_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,12 +528,12 @@ def pick_metadata_schema_file(catalog_base_dir: str | Path | UPath) -> UPath | N
path to a parquet file containing metadata schema.
"""
common_metadata_file = paths.get_common_metadata_pointer(catalog_base_dir)
common_metadata_exists = file_io.does_file_or_directory_exist(common_metadata_file)
if common_metadata_file.exists():
return common_metadata_file
metadata_file = paths.get_parquet_metadata_pointer(catalog_base_dir)
metadata_exists = file_io.does_file_or_directory_exist(metadata_file)
if not (common_metadata_exists or metadata_exists):
return None
return common_metadata_file if common_metadata_exists else metadata_file
if metadata_file.exists():
return metadata_file
return None


# pylint: disable=protected-access
Expand Down
17 changes: 5 additions & 12 deletions src/hats/io/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
from hats.catalog.margin_cache.margin_catalog import MarginCatalog
from hats.catalog.partition_info import PartitionInfo
from hats.io import get_common_metadata_pointer, get_parquet_metadata_pointer, get_partition_info_pointer
from hats.io.file_io import does_file_or_directory_exist, get_upath
from hats.io.file_io.file_pointer import is_regular_file
from hats.io.file_io import get_upath
from hats.io.paths import get_healpix_from_path
from hats.loaders import read_hats
from hats.pixel_math.healpix_pixel import INVALID_PIXEL
Expand Down Expand Up @@ -242,7 +241,7 @@ def _is_valid_catalog_strict(pointer, handle_error, verbose):
parquet_path_pixels = []
for hats_file in dataset.files:
hats_fp = UPath(hats_file, protocol=metadata_file.protocol, **metadata_file.storage_options)
if not does_file_or_directory_exist(hats_fp):
if not hats_fp.exists():
handle_error(f"Pixel partition is missing: {hats_fp}")
is_valid = False
healpix_pixel = get_healpix_from_path(hats_file)
Expand Down Expand Up @@ -291,20 +290,14 @@ def is_collection_info_valid(pointer: str | Path | UPath) -> bool:

def _is_partition_info_valid(pointer: UPath) -> bool:
"""Checks if partition_info is valid for a given base catalog pointer"""
partition_info_pointer = get_partition_info_pointer(pointer)
partition_info_exists = is_regular_file(partition_info_pointer)
return partition_info_exists
return get_partition_info_pointer(pointer).exists()


def _is_metadata_valid(pointer: UPath) -> bool:
"""Checks if _metadata is valid for a given base catalog pointer"""
metadata_file = get_parquet_metadata_pointer(pointer)
metadata_file_exists = is_regular_file(metadata_file)
return metadata_file_exists
return get_parquet_metadata_pointer(pointer).exists()


def _is_common_metadata_valid(pointer: UPath) -> bool:
"""Checks if _common_metadata is valid for a given base catalog pointer"""
metadata_file = get_common_metadata_pointer(pointer)
metadata_file_exists = is_regular_file(metadata_file)
return metadata_file_exists
return get_common_metadata_pointer(pointer).exists()
33 changes: 25 additions & 8 deletions src/hats/loaders/read_hats.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,23 @@
}


def read_hats(catalog_path: str | Path | UPath) -> CatalogCollection | Dataset:
def read_hats(
catalog_path: str | Path | UPath, *, single_catalog: bool | None = None, read_moc: bool = True
) -> CatalogCollection | Dataset:
"""Reads a HATS Catalog from a HATS directory

Parameters
----------
catalog_path : str | Path | UPath
path to the root directory of the catalog
single_catalog: bool
If you happen to already know that the `catalog_path` points to a
single catalog, instead of a catalog collection, this flag can
save a few file read operations.
read_moc: bool
If you happen to know that your catalog does not have a MOC (or if
you know that your use case will not utilize a MOC), then you can
skip the file read and memory load of the MOC.

Returns
-------
Expand All @@ -50,20 +60,26 @@ def read_hats(catalog_path: str | Path | UPath) -> CatalogCollection | Dataset:
catalog = hats.read_hats(UPath(..., anon=True))
"""
path = file_io.get_upath(catalog_path)
if single_catalog is not None:
if single_catalog:
return _load_catalog(path, read_moc=read_moc)
return _load_collection(path, read_moc=read_moc)
if (path / "hats.properties").exists() or (path / "properties").exists():
return _load_catalog(path)
return _load_catalog(path, read_moc=read_moc)
if (path / "collection.properties").exists():
return _load_collection(path)
return _load_collection(path, read_moc=read_moc)
raise FileNotFoundError(f"Failed to read HATS at location {catalog_path}")


def _load_collection(collection_path: UPath) -> CatalogCollection:
def _load_collection(collection_path: UPath, read_moc: bool = True) -> CatalogCollection:
collection_properties = CollectionProperties.read_from_dir(collection_path)
main_catalog = _load_catalog(collection_path / collection_properties.hats_primary_table_url)
main_catalog = _load_catalog(
collection_path / collection_properties.hats_primary_table_url, read_moc=read_moc
)
return CatalogCollection(collection_path, collection_properties, main_catalog)


def _load_catalog(catalog_path: UPath) -> Dataset:
def _load_catalog(catalog_path: UPath, read_moc: bool = True) -> Dataset:
properties = TableProperties.read_from_dir(catalog_path)
dataset_type = properties.catalog_type
if dataset_type not in DATASET_TYPE_TO_CLASS:
Expand All @@ -78,7 +94,8 @@ def _load_catalog(catalog_path: UPath) -> Dataset:
}
if _is_healpix_dataset(dataset_type):
kwargs["pixels"] = PartitionInfo.read_from_dir(catalog_path)
kwargs["moc"] = _read_moc_from_point_map(catalog_path)
if read_moc:
kwargs["moc"] = _read_moc_from_point_map(catalog_path)
return loader(**kwargs)


Expand All @@ -95,7 +112,7 @@ def _is_healpix_dataset(dataset_type):
def _read_moc_from_point_map(catalog_base_dir: str | Path | UPath) -> MOC | None:
"""Reads a MOC object from the `point_map.fits` file if it exists in the catalog directory"""
point_map_path = paths.get_point_map_file_pointer(catalog_base_dir)
if not file_io.does_file_or_directory_exist(point_map_path):
if not point_map_path.exists():
return None
fits_image = file_io.read_fits_image(point_map_path)
order = hp.npix2order(len(fits_image))
Expand Down
6 changes: 4 additions & 2 deletions tests/hats/catalog/loaders/test_read_hats.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,20 @@ def test_read_hats_collection_info_only(collection_path):

def test_read_hats_branches(
small_sky_dir,
small_sky_collection_dir,
small_sky_order1_dir,
association_catalog_path,
small_sky_source_object_index_dir,
margin_catalog_path,
small_sky_source_dir,
test_data_dir,
):
read_hats(small_sky_dir)
read_hats(small_sky_dir, single_catalog=True)
read_hats(small_sky_collection_dir, single_catalog=False)
read_hats(small_sky_order1_dir)
read_hats(association_catalog_path)
read_hats(small_sky_source_object_index_dir)
read_hats(margin_catalog_path)
read_hats(margin_catalog_path, read_moc=False)
read_hats(small_sky_source_dir)
read_hats(test_data_dir / "square_map")
read_hats(test_data_dir / "small_sky_healpix13")
Expand Down
5 changes: 2 additions & 3 deletions tests/hats/io/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from hats.io import file_io
from hats.io.file_io.file_io import load_text_file
from hats.io.file_io.file_pointer import does_file_or_directory_exist

# pylint: disable=missing-function-docstring, redefined-outer-name

Expand All @@ -31,7 +30,7 @@ def assert_text_file_matches(expected_lines, file_name):
expected_lines(:obj:`string array`) list of strings, formatted as regular expressions.
file_name (str): fully-specified path of the file to read
"""
assert does_file_or_directory_exist(file_name), f"file not found [{file_name}]"
assert file_name.exists(), f"file not found [{file_name}]"
contents = load_text_file(file_name)

assert len(expected_lines) == len(
Expand All @@ -49,7 +48,7 @@ def assert_text_file_matches(expected_lines, file_name):
def check_parquet_schema():
def check_parquet_schema(file_name, expected_schema, expected_num_row_groups=1):
"""Check parquet schema against expectations"""
assert file_io.does_file_or_directory_exist(file_name), f"file not found [{file_name}]"
assert file_name.exists(), f"file not found [{file_name}]"

single_metadata = file_io.read_parquet_metadata(file_name)
schema = single_metadata.schema.to_arrow_schema()
Expand Down
Loading