Skip to content

Commit d431e46

Browse files
committed
[rayorch] integrate RayOrch for parallel data processing with new operators and storage classes
- Added `RayAcceleratedOperator` for transparent data-parallel execution of DataFlow operators.
- Introduced `InMemoryStorage` for efficient in-memory data handling within Ray actors.
- Created dummy operators for testing and validation of the Ray integration.
- Updated `requirements.txt` to include the `rayorch` dependency.
- Added comprehensive tests for both serial and parallel execution scenarios.

This implementation enhances the DataFlow framework by enabling efficient multi-GPU processing without modifying existing operators.
1 parent e015bc4 commit d431e46

File tree

12 files changed

+1823
-0
lines changed

12 files changed

+1823
-0
lines changed

dataflow/rayorch/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""RayOrch integration for DataFlow — transparent data-parallel acceleration.

Usage::

    from dataflow.rayorch import RayAcceleratedOperator

    scorer = RayAcceleratedOperator(
        FineWebEduSampleEvaluator,
        replicas=4,
        num_gpus_per_replica=0.25,
    ).op_cls_init(device="cuda")
    scorer.run(storage, input_key="text", output_key="edu_score")
"""

from .accelerated_op import RayAcceleratedOperator
# Private alias: keeps InMemoryStorage reachable from this package without
# exposing it as public API (it is deliberately absent from __all__).
# NOTE(review): nothing in this module uses the alias directly — confirm the
# import is needed (e.g. for side effects or test access) before removing.
from .memory_storage import InMemoryStorage as _InMemoryStorage

# Public API: only the wrapper class; the in-memory storage stays internal.
__all__ = [
    "RayAcceleratedOperator",
]

dataflow/rayorch/_test_ops.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Dummy CPU-only operators for testing RayAcceleratedOperator.
2+
3+
These are intentionally trivial, deterministic, and row-independent so
4+
they can be used in CI without GPU resources. Prefixed with underscore
5+
to signal internal/test-only usage.
6+
"""
7+
from __future__ import annotations
8+
9+
from dataflow.core.operator import OperatorABC
10+
from dataflow.utils.storage import DataFlowStorage
11+
12+
13+
class DummyDoubleOp(OperatorABC):
    """Test operator that writes ``input_key * 2`` into ``output_key``.

    Deterministic and row-independent, so it is safe to shard across
    Ray replicas in CI without GPU resources.
    """

    def __init__(self):
        super().__init__()

    def run(
        self,
        storage: DataFlowStorage,
        input_key: str = "value",
        output_key: str = "doubled",
    ):
        # Read the chunk, derive the doubled column, write the frame back.
        frame = storage.read("dataframe")
        frame[output_key] = frame[input_key] * 2
        storage.write(frame)
28+
29+
30+
class DummyIncrementOp(OperatorABC):
    """Test operator that writes ``input_key + 1`` into ``output_key``.

    Deterministic and row-independent; default keys let it chain directly
    after :class:`DummyDoubleOp` in CI pipelines without GPU resources.
    """

    def __init__(self):
        super().__init__()

    def run(
        self,
        storage: DataFlowStorage,
        input_key: str = "doubled",
        output_key: str = "incremented",
    ):
        # Read the chunk, add one to the source column, write the frame back.
        frame = storage.read("dataframe")
        frame[output_key] = frame[input_key] + 1
        storage.write(frame)

dataflow/rayorch/accelerated_op.py

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
from __future__ import annotations
2+
3+
import inspect
4+
from typing import Any, Generic, Optional, Protocol, Type, ParamSpec
5+
6+
import pandas as pd
7+
8+
from dataflow.core.operator import OperatorABC
9+
from dataflow.utils.storage import DataFlowStorage
10+
11+
from .memory_storage import InMemoryStorage
12+
13+
14+
_INITP = ParamSpec("_INITP")
15+
_RUNP = ParamSpec("_RUNP")
16+
17+
18+
class _OperatorProto(Protocol[_INITP, _RUNP]):
19+
"""Structural type that captures both ``__init__`` and ``run`` signatures.
20+
21+
Pyright / Pylance infers ``_INITP`` and ``_RUNP`` from the concrete
22+
operator so that :meth:`op_cls_init` and :meth:`run` expose the
23+
original parameter lists for IDE auto-complete.
24+
"""
25+
26+
def __init__(self, *args: _INITP.args, **kwargs: _INITP.kwargs) -> None: ...
27+
28+
def run(
29+
self,
30+
storage: DataFlowStorage,
31+
*args: _RUNP.args,
32+
**kwargs: _RUNP.kwargs,
33+
) -> Any: ...
34+
35+
36+
class _OpRunner:
    """Per-replica worker that lives inside a Ray actor.

    Holds one private instance of the wrapped DataFlow operator. ``run``
    accepts a shard of rows as ``list[dict]``, exposes it to the operator
    through :class:`InMemoryStorage`, and hands the mutated rows back as
    ``list[dict]``.
    """

    def __init__(self, op_cls: type, op_init_args: tuple, op_init_kwargs: dict):
        # Each replica constructs its own operator (and thus its own model).
        self.op = op_cls(*op_init_args, **op_init_kwargs)

    def run(self, records: list[dict], run_params: dict) -> list[dict]:
        # Empty shards short-circuit without touching pandas at all.
        if not records:
            return []
        extra_args = run_params.get("args", ())
        extra_kwargs = run_params.get("kwargs", {})
        storage = InMemoryStorage(pd.DataFrame(records))
        self.op.run(storage, *extra_args, **extra_kwargs)
        return storage.result.to_dict("records")
54+
55+
56+
class RayAcceleratedOperator(OperatorABC, Generic[_INITP, _RUNP]):
    """DataFlow operator backed by RayOrch for transparent data-parallel execution.

    From the pipeline's perspective this is a normal :class:`OperatorABC`:
    it reads from and writes to :class:`DataFlowStorage` sequentially.
    Internally it fans the DataFrame out to *replicas* Ray actors,
    each holding an independent copy of the wrapped operator (and its model).

    Actors are created **lazily** on the first ``run()`` call so that
    pipeline ``compile()`` does not trigger heavyweight model loading.

    Only suitable for **row-independent (map-style)** operators. Operators
    that need cross-row global state (e.g. semantic dedup with a full
    similarity matrix) should *not* use this wrapper.

    Both ``op_cls_init`` and ``run`` have their signatures inferred from
    ``op_cls`` via ``ParamSpec``, giving full IDE auto-complete.

    Parameters
    ----------
    op_cls:
        The DataFlow operator class to parallelize.
    replicas:
        Number of parallel actor replicas.
    num_gpus_per_replica:
        Fractional GPU allocation per replica (e.g. ``0.25`` to share one
        GPU across four replicas).
    env:
        Optional RayOrch ``EnvRegistry`` key for a custom ``runtime_env``.

    Example
    -------
    ::

        from dataflow.rayorch import RayAcceleratedOperator
        from dataflow.operators.text_pt.eval import FineWebEduSampleEvaluator

        scorer = RayAcceleratedOperator(
            FineWebEduSampleEvaluator,
            replicas=4,
            num_gpus_per_replica=0.25,
        ).op_cls_init(device="cuda")  # ← IDE shows __init__ params

        scorer.run(storage, input_key="text")  # ← IDE shows run params
    """

    def __init__(
        self,
        op_cls: Type[_OperatorProto[_INITP, _RUNP]],
        *,
        replicas: int = 1,
        num_gpus_per_replica: float = 0.0,
        env: Optional[str] = None,
    ):
        super().__init__()
        self._op_cls = op_cls
        # Construction arguments for the wrapped operator; populated later
        # by op_cls_init() and forwarded to each actor in _ensure_initialized().
        self._op_init_args: tuple = ()
        self._op_init_kwargs: dict = {}
        self._replicas = replicas
        self._num_gpus_per_replica = num_gpus_per_replica
        self._env = env
        self._module = None  # created lazily

        # PipelineABC.compile() compatibility:
        # compile() → AutoOP uses inspect.signature(operator.run) to bind()
        # call arguments. Our class-level run(storage, *args, **kwargs) would
        # cause bind() to dump extra params into *args, which later gets
        # serialised as an "args" key and leaks into the inner operator on
        # _compiled_forward replay. Installing the inner operator's named
        # signature on the instance avoids this entirely.
        self._install_inner_run_signature(op_cls)

    def op_cls_init(
        self,
        *args: _INITP.args,
        **kwargs: _INITP.kwargs,
    ) -> RayAcceleratedOperator[_INITP, _RUNP]:
        """Configure how the wrapped operator is constructed inside each actor.

        Parameters match ``op_cls.__init__``, so IDE auto-complete works.
        May be omitted if the operator's defaults are sufficient.
        Returns ``self`` so the call can be chained after construction.
        """
        self._op_init_args = args
        self._op_init_kwargs = kwargs
        return self

    def _ensure_initialized(self) -> None:
        """Create the RayModule (and its actors) on first use; no-op afterwards."""
        if self._module is not None:
            return
        # Deferred import: rayorch is only loaded when run() is actually
        # called, so constructing / compiling the wrapper never requires it.
        from rayorch import Dispatch, RayModule

        self._module = RayModule(
            _OpRunner,
            replicas=self._replicas,
            num_gpus_per_replica=self._num_gpus_per_replica,
            dispatch_mode=Dispatch.SHARD_CONTIGUOUS,
            env=self._env,
        )
        self._module.pre_init(
            op_cls=self._op_cls,
            op_init_args=self._op_init_args,
            op_init_kwargs=self._op_init_kwargs,
        )

    # --- inner signature propagation ---

    def _install_inner_run_signature(self, op_cls: type) -> None:
        """Replace ``self.run`` with a thin proxy carrying ``op_cls.run``'s
        ``__signature__``.

        Why: ``PipelineABC.compile()`` → ``AutoOP`` uses
        ``inspect.signature(operator.run)`` to ``bind()`` the call arguments.
        If the signature is the generic ``(storage, *args, **kwargs)`` from
        this wrapper, positional-overflow values land in ``*args`` and get
        serialised as an ``"args"`` key in the kwargs dict. On replay via
        ``_compiled_forward(**kwargs)``, that ``"args"`` key leaks into the
        inner operator as an unexpected keyword argument.

        By exposing the inner operator's **named** parameters here,
        ``bind()`` resolves every argument to a keyword — no ``*args``
        residue, no downstream pollution. Only this file changes; DataFlow
        core is untouched.
        """
        inner_sig = inspect.signature(op_cls.run)
        # Drop "self": the proxy is already bound to this wrapper instance.
        params = [p for p in inner_sig.parameters.values() if p.name != "self"]

        # Capture the bound method in a local so the closure does not go
        # through attribute lookup on self at call time.
        impl = self._run_impl

        def run(*args: Any, **kwargs: Any) -> None:
            return impl(*args, **kwargs)

        run.__signature__ = inspect.Signature(params)  # type: ignore[attr-defined]
        run.__doc__ = getattr(op_cls.run, "__doc__", None)
        run.__name__ = "run"
        run.__qualname__ = f"{type(self).__qualname__}.run"
        self.run = run  # type: ignore[assignment]

    # --- DataFlow OperatorABC interface ---
    # Two-level design for compile() compatibility:
    # 1. Class-level `run` — satisfies OperatorABC's abstract method so the
    #    class can be instantiated. Delegates to `_run_impl`.
    # 2. Instance-level `run` (proxy) — installed by
    #    `_install_inner_run_signature` in __init__, carries the inner
    #    operator's __signature__ so AutoOP.bind() resolves args to keywords.
    # Python attribute lookup checks instance __dict__ before the class,
    # so the proxy always wins at runtime.

    def run(  # type: ignore[override]
        self,
        storage: DataFlowStorage,
        *args: _RUNP.args,
        **kwargs: _RUNP.kwargs,
    ) -> None:
        return self._run_impl(storage, *args, **kwargs)

    def _run_impl(
        self,
        storage: DataFlowStorage,
        *args: _RUNP.args,
        **kwargs: _RUNP.kwargs,
    ) -> None:
        """Read the DataFrame, dispatch rows to the actors, write results back."""
        self._ensure_initialized()
        df = storage.read("dataframe")
        records: list[dict] = df.to_dict("records")
        # args/kwargs are packed into a plain dict so each _OpRunner replica
        # can unpack them when calling the inner operator's run().
        run_params: dict = {"args": args, "kwargs": kwargs}
        result_records = self._module(records, run_params)
        storage.write(pd.DataFrame(result_records))

    # --- lifecycle helpers ---

    def shutdown(self) -> None:
        """Terminate all Ray actors held by this operator."""
        if self._module is None:
            return
        # Deferred import mirrors _ensure_initialized: ray is only needed
        # once actors actually exist.
        import ray

        for actor in self._module.actors:
            ray.kill(actor)
        self._module = None

    def __repr__(self) -> str:
        state = "initialized" if self._module is not None else "lazy"
        return (
            f"RayAcceleratedOperator({self._op_cls.__name__}, "
            f"replicas={self._replicas}, state={state})"
        )

dataflow/rayorch/memory_storage.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from __future__ import annotations
2+
3+
from typing import Any, Literal
4+
5+
import pandas as pd
6+
7+
from dataflow.utils.storage import DataFlowStorage
8+
9+
10+
class InMemoryStorage(DataFlowStorage):
    """In-memory ``DataFlowStorage`` for use inside Ray actor replicas.

    Holds exactly one DataFrame and performs no filesystem I/O or
    step-file bookkeeping, so every replica can read its chunk and write
    results back independently.

    Typical lifecycle inside ``_OpRunner.run``::

        storage = InMemoryStorage(df_chunk)
        some_dataflow_op.run(storage, input_key="text", output_key="score")
        result = storage.result  # DataFrame written by the operator

    This storage never participates in ``PipelineABC.compile()``, so
    ``step()`` simply returns ``self`` (no copy needed). The single
    ``_df`` is replaced by ``write()`` and handed out by ``read()``.
    """

    def __init__(self, df: pd.DataFrame):
        # NOTE(review): DataFlowStorage.__init__ is intentionally not called
        # here (matches the original behavior) — confirm the base class has
        # no required initialization.
        self._df = df
        self.operator_step = 0

    # --- DataFlowStorage ABC ---

    def read(self, output_type: Literal["dataframe", "dict"] = "dataframe") -> Any:
        """Return the held frame, either as-is or as a list of row dicts."""
        if output_type not in ("dataframe", "dict"):
            raise ValueError(f"Unsupported output_type: {output_type}")
        if output_type == "dict":
            return self._df.to_dict("records")
        return self._df

    def write(self, data: Any) -> Any:
        """Replace the held frame with *data* (DataFrame or list of dicts)."""
        if isinstance(data, pd.DataFrame):
            self._df = data
            return None
        if isinstance(data, list):
            self._df = pd.DataFrame(data)
            return None
        raise ValueError(f"Unsupported data type for write: {type(data)}")

    def get_keys_from_dataframe(self) -> list[str]:
        """Return the current column labels."""
        return self._df.columns.tolist()

    def step(self):
        """Advance the step counter and return this same storage object."""
        self.operator_step += 1
        return self

    # --- helpers ---

    @property
    def result(self) -> pd.DataFrame:
        """The current DataFrame (after any writes)."""
        return self._df

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,7 @@ db-dtypes
8888
google-cloud-bigquery-storage
8989

9090
distflow
91+
92+
# RayOrch for parallelization
93+
# https://github.com/OpenDCAI/RayOrch
94+
rayorch==0.0.1

test/rayorch/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
bench_results.json

0 commit comments

Comments
 (0)