|
1 | 1 | import json |
2 | 2 | from abc import ABC, abstractmethod |
| 3 | +from io import BytesIO |
| 4 | + |
3 | 5 | import boto3 |
4 | 6 | import mimetypes |
5 | 7 | from typing import Any, Optional, Union, Tuple, Callable, TYPE_CHECKING |
@@ -288,6 +290,46 @@ def execute(self, **kwargs) -> "Dataset": |
288 | 290 | doc_extractor = self._doc_extractor if self._doc_extractor else self._to_document |
289 | 291 | return json_dataset.flat_map(doc_extractor, **self.resource_args) |
290 | 292 |
|
def local_source(self, **kwargs) -> list[Document]:
    """Read the configured JSON path(s) eagerly (without a distributed Dataset)
    and return the extracted Documents.

    Accepts ``self._paths`` as a single path string or a list of paths; each
    path may be a file or a directory (directories are walked recursively).
    Each JSON record is tagged with the path of the file it came from and then
    run through ``self._doc_extractor`` (falling back to ``self._to_document``).

    Returns:
        list[Document]: one list accumulated across all paths, in traversal order.
    """
    if isinstance(self._paths, str):
        paths = [self._paths]
    else:
        paths = self._paths

    documents: list[Document] = []

    def process_file(info):
        # FileSelector results include directory entries; only parse real files.
        if not info.is_file:
            return

        # NOTE(review): uses self._filesystem (set from the first inferred
        # filesystem below) rather than the loop-local `filesystem`; assumed
        # equivalent for a homogeneous path list — confirm if mixed
        # filesystems are ever passed.
        with self._filesystem.open_input_file(info.path) as file:
            import pyarrow.json as pyjson
            import pyarrow

            buffer: pyarrow.lib.Buffer = file.read_buffer()
            table = pyjson.read_json(BytesIO(buffer))
            rows = table.to_pylist()
            for row in rows:
                # BUG FIX: tag each row with the file it was read from.
                # Previously this assigned the enclosing loop variable `path`,
                # which is the *directory* path during recursive traversal,
                # so all rows in a tree got the same (wrong) path. For a
                # single-file path, info.path == path, so behavior is
                # unchanged in that case.
                row["path"] = info.path
            doc_extractor = self._doc_extractor if self._doc_extractor else self._to_document
            # Each extractor call yields a list; keep only the first Document,
            # matching the original behavior.
            docs = [doc_extractor(row)[0] for row in rows]
            documents.extend(docs)

    for orig_path in paths:
        from sycamore.utils.pyarrow import cross_check_infer_fs

        (filesystem, path) = cross_check_infer_fs(self._filesystem, orig_path)
        if self._filesystem is None:
            self._filesystem = filesystem

        path_info = filesystem.get_file_info(path)
        if path_info.is_file:
            process_file(path_info)
        else:
            # Directory: visit every entry beneath it recursively.
            for info in filesystem.get_file_info(FileSelector(path, recursive=True)):
                process_file(info)
    return documents
| 332 | + |
def format(self):
    """Name of the file format handled by this scan."""
    return "json"
293 | 335 |
|
@@ -321,5 +363,42 @@ def execute(self, **kwargs) -> "Dataset": |
321 | 363 | ) |
322 | 364 | return ds.flat_map(self.json_as_document, **self.resource_args) |
323 | 365 |
|
def local_source(self, **kwargs) -> list[Document]:
    """Eagerly read the configured JSONL path(s) and return one Document per record.

    ``self._paths`` may be a single path string or a list; directories are
    traversed recursively. Each line-delimited JSON record becomes a
    ``Document`` constructed directly from the parsed row dict.

    Returns:
        list[Document]: records from all paths, in traversal order.
    """
    paths = [self._paths] if isinstance(self._paths, str) else self._paths
    documents: list[Document] = []

    def read_one(info):
        # Selector output includes directories; only real files carry records.
        if not info.is_file:
            return

        import pyarrow
        import pyarrow.json as pyjson

        with self._filesystem.open_input_file(info.path) as handle:
            raw: pyarrow.lib.Buffer = handle.read_buffer()
            parsed = pyjson.read_json(BytesIO(raw))
            documents.extend(Document(record) for record in parsed.to_pylist())

    for orig_path in paths:
        from sycamore.utils.pyarrow import cross_check_infer_fs

        filesystem, path = cross_check_infer_fs(self._filesystem, orig_path)
        if self._filesystem is None:
            self._filesystem = filesystem

        top = filesystem.get_file_info(path)
        if top.is_file:
            read_one(top)
        else:
            # Walk the directory tree and parse every file found.
            for entry in filesystem.get_file_info(FileSelector(path, recursive=True)):
                read_one(entry)
    return documents
| 402 | + |
def format(self):
    """Name of the file format handled by this scan."""
    return "jsonl"
0 commit comments