# Every target below is a command name, not a file. Declaring them phony keeps
# make from skipping a target when a same-named path exists (e.g. the `build/`
# directory that `clean` removes would otherwise make `build` look up to date).
.PHONY: help clean test lint check lint-check format format-dir install-dev build

# Define default Python and pip executables
PYTHON ?= python
PIP ?= pip
PYTEST ?= pytest
BLACK ?= black
BLACK_OPTS ?= --line-length 119 --target-version py35

# Source directories
SRC_DIRS ?= py/bintensors tests

help:
	@echo "Available make targets:"
	@echo "  help        - Show this help message"
	@echo "  clean       - Remove build artifacts and cache files"
	@echo "  test        - Run all tests"
	@echo "  lint        - Format all source files with Black (same as format)"
	@echo "  check       - Run Black lint check without modifying files"
	@echo "  format      - Format all source files with Black"
	@echo "  format-dir DIR=path/to/dir - Format files in specific directory"
	@echo "  install-dev - Install the package in editable mode with dev extras"
	@echo "  build       - Build and install the package"

clean:
	rm -rf build/ dist/ *.egg-info/ .pytest_cache/ .coverage htmlcov/ .eggs/
	find . -type d -name __pycache__ -exec rm -rf {} +
	find . -type f -name "*.pyc" -delete

test:
	$(PYTEST) -sv tests/

# NOTE: despite the name, this rewrites files in place; use `check` for a
# read-only verification.
lint:
	$(BLACK) $(BLACK_OPTS) $(SRC_DIRS)

check:
	$(BLACK) $(BLACK_OPTS) --check $(SRC_DIRS)

# Alias so the `lint-check` name advertised in .PHONY resolves to a real rule.
lint-check: check

format: lint
-d "$(DIR)" ]; then \ + echo "Error: Directory '$(DIR)' does not exist"; \ + exit 1; \ + fi + $(BLACK) $(BLACK_OPTS) "$(DIR)" + +# Install development dependencies +install-dev: + $(PIP) install -e ".[dev]" + +# Build the package +build: + $(PIP) install . \ No newline at end of file diff --git a/binding/python/py/bintensors/numpy.py b/binding/python/py/bintensors/numpy.py index baf5c1c..387bf4b 100644 --- a/binding/python/py/bintensors/numpy.py +++ b/binding/python/py/bintensors/numpy.py @@ -178,9 +178,6 @@ def load_file(filename: Union[str, os.PathLike]) -> Dict[str, np.ndarray]: # np.float8 formats require 2.1; we do not support these dtypes on earlier versions -_float8_e4m3fn = getattr(np, "float8_e4m3fn", None) -_float8_e5m2 = getattr(np, "float8_e5m2", None) - _TYPES = { "F64": np.float64, "F32": np.float32, @@ -193,9 +190,7 @@ def load_file(filename: Union[str, os.PathLike]) -> Dict[str, np.ndarray]: "U16": np.uint16, "I8": np.int8, "U8": np.uint8, - "BOOL": np.bool, - "F8_E4M3": _float8_e4m3fn, - "F8_E5M2": _float8_e5m2, + "BOOL": bool, } diff --git a/binding/python/pyproject.toml b/binding/python/pyproject.toml index a96044d..6937762 100644 --- a/binding/python/pyproject.toml +++ b/binding/python/pyproject.toml @@ -41,11 +41,10 @@ quality = [ ] testing = [ "bintensors[numpy]", - "h5py>=3.7.0", + "bintensors[torch]", "setuptools_rust>=1.5.2", "pytest>=7.2.0", "pytest-benchmark>=4.0.0", - # "python-afl>=0.7.3", "hypothesis>=6.70.2", ] all = [ diff --git a/binding/python/tests/test_buffer.py b/binding/python/tests/test_buffer.py new file mode 100644 index 0000000..c5ad20a --- /dev/null +++ b/binding/python/tests/test_buffer.py @@ -0,0 +1,234 @@ +import pytest +import struct + +import torch +from typing import List, Dict, Tuple +from itertools import chain + +from bintensors import BintensorError +from bintensors.torch import save, load + +_DTYPE = { + "BOL": 0, + "U8": 1, + "I8": 2, + "F8_E5M2": 3, + "F8_E4M3": 4, + "I16": 5, + "U16": 6, + "F16": 7, + "BF16": 
8, + "I32": 9, + "U32": 10, + "F32": 11, + "F64": 12, + "I64": 13, + "F64": 14, +} + + +def encode_unsigned_variant_encoding(number: int) -> bytes: + """Encodes an unsigned integer into a variable-length format.""" + if number > 0xFFFFFFFF: + return b"\xfd" + number.to_bytes(8, "little") + elif number > 0xFFFF: + return b"\xfc" + number.to_bytes(4, "little") + elif number > 0xFA: + return b"\xfb" + number.to_bytes(2, "little") + else: + return number.to_bytes(1, "little") + + +def encode_tensor_info(dtype: str, shape: Tuple[int, ...], offset: Tuple[int, int]) -> List[bytes]: + """Encodes the struct TensorInfo into byte buffer""" + if dtype not in _DTYPE: + raise ValueError(f"Unsupported dtype: {dtype}") + + # flatten out the tensor info + layout = chain([_DTYPE[dtype], len(shape)], shape, offset) + return b"".join(list(map(encode_unsigned_variant_encoding, layout))) + + +def encode_hash_map(index_map: Dict[str, int]) -> List[bytes]: + """Encodes a dictionary of string keys and integer values.""" + length = encode_unsigned_variant_encoding(len(index_map)) + + hash_map_layout = chain.from_iterable( + ( + encode_unsigned_variant_encoding(len(k)), + k.encode("utf-8"), + encode_unsigned_variant_encoding(v), + ) + for k, v in index_map.items() + ) + + return b"".join(chain([length], hash_map_layout)) + + +def test_empty_file(): + "bintensors allows empty dictonary" + tensor_dict = {} + buffer = save(tensor_dict) + # decouple first 8 bytes part of the buffer unsinged long long + header_size = struct.unpack(" bool: + return np.array_equal(lhs, rhs) + + +def _ASSERT_SHAPE(value: Tuple[int, ...], expected: Tuple[int, ...]) -> bool: + assert value == expected, f"assert mismatch of shapes of a tensor, found {value}, expected {expected}" + + +def create_gpt2_numpy_dict(n_layers: int) -> Dict[str, np.ndarray]: + tensors = {} + tensors["wte"] = np.zeros((50257, 768)) + tensors["wpe"] = np.zeros((1024, 768)) + for i in range(n_layers): + tensors[f"h.{i}.ln_1.weight"] = 
def test_tensor_dtype_roundtrip():
    """Values and dtypes of float32/float64/int32/int64 arrays survive save -> load."""
    named_dtypes = (
        ("float32", np.float32),
        ("float64", np.float64),
        ("int32", np.int32),
        ("int64", np.int64),
    )
    tensor_dict = {name: np.zeros((10, 10), dtype=dtype) for name, dtype in named_dtypes}

    loaded_dict = load(save(tensor_dict))

    for key, value in tensor_dict.items():
        restored = loaded_dict[key]
        assert _compare_np_array(restored, value)
        assert restored.dtype == value.dtype
def test_save_file_and_load_file_consistency():
    """Tensors written with save_file are returned unchanged by load_file."""
    import os  # local import: needed only here, for temp-file cleanup

    tensor_dict = create_gpt2_numpy_dict(1)
    # Only the path is needed; the handle is closed before anything writes to it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        filename = tmp.name
    try:
        save_file(tensor_dict, filename)
        loaded_dict = load_file(filename)
        # Compare on the success path: asserting inside `finally` would run even
        # after a failed save/load and replace the real error with a KeyError.
        for key, value in tensor_dict.items():
            assert _compare_np_array(loaded_dict[key], value)
    finally:
        # delete=False leaves the file on disk, so remove it explicitly.
        try:
            os.remove(filename)
        except OSError:
            pass  # best-effort cleanup; must not mask the test outcome
class ToyRegressionModel(torch.nn.Module):
    """Minimal regressor: Linear(n_in, n_out) -> sigmoid -> Linear(n_out, 1).

    Args:
        n_in: width of the input features.
        n_out: width of the hidden layer produced by ``ln_1``.
    """

    def __init__(self, n_in: int, n_out: int):
        super().__init__()
        self.ln_1 = torch.nn.Linear(n_in, n_out)
        # The head consumes ln_1's output, so its input width is n_out. Using
        # n_in here only works when n_in == n_out.
        self.output = torch.nn.Linear(n_out, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a ``(..., n_in)`` input to a ``(..., 1)`` prediction."""
        hidden = torch.sigmoid(self.ln_1(x))
        return self.output(hidden)
def test_pt_safe_open_access_and_metadata():
    """A file saved without metadata exposes its tensors via safe_open and reports no metadata."""
    tensor_dict = create_gpt2_tensors_dict(1)
    # Only the path is needed; the handle is closed before save_file writes to it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        filename = tmp.name
    try:
        save_file(tensor_dict, filename)
        with safe_open(filename, "pt") as model:
            assert model.get_tensor("h.0.ln_1.weight") is not None
            assert model.get_tensor("h.0.ln_1.bias") is not None
            assert model.metadata() is None
    finally:
        # delete=False leaves the file on disk, so remove it explicitly.
        try:
            os.remove(filename)
        except OSError:
            pass  # best-effort cleanup; must not mask the test outcome