@@ -288,6 +288,26 @@ impl storage::bin::GetAdjacencyList for SimpleNeighborProviderAsync<u32> {
Ok(list)
}

/// Optimized version that reuses a pre-allocated buffer to avoid per-call allocation.
///
/// This directly copies the adjacency list into the provided buffer, avoiding the
/// intermediate `AdjacencyList` allocation that `get_adjacency_list` requires.
fn get_adjacency_list_into(&self, i: usize, buffer: &mut Vec<u32>) -> ANNResult<()> {
#[cfg(test)]
self.num_get_calls.increment();

// Lint: We don't have a good way of recovering from lock poisoning anyways.
#[allow(clippy::unwrap_used)]
let _guard = self.locks[i].read().unwrap();

// SAFETY: We are holding the read lock for `i`.
let list = unsafe { self.get_slice(i) };

// Reuse buffer: clear and copy data directly
buffer.clear();
buffer.extend_from_slice(list);
Ok(())
}

fn total(&self) -> usize {
self.locks.len()
}
@@ -346,6 +366,32 @@ impl storage::bin::GetAdjacencyList for DiskAdaptor<'_> {
Ok(list)
}

/// Optimized version that reuses a pre-allocated buffer to avoid per-call allocation.
///
/// This directly reads neighbors into the buffer and performs the start point remapping
/// in-place, avoiding the intermediate `AdjacencyList` allocation.
fn get_adjacency_list_into(&self, i: usize, buffer: &mut Vec<u32>) -> ANNResult<()> {
// Lint: We don't have a good way of recovering from lock poisoning anyways.
#[allow(clippy::unwrap_used)]
let _guard = self.provider.locks[i].read().unwrap();

// SAFETY: We are holding the read lock for `i`.
let list = unsafe { self.provider.get_slice(i) };
SkyInTheSea marked this conversation as resolved.

// Reuse buffer: clear and copy data directly
buffer.clear();
buffer.extend_from_slice(list);

// Remap the in-memory start point to the actual start point
for id in buffer.iter_mut() {
if *id == self.inmem_start_point {
*id = self.actual_start_point;
}
}

Ok(())
}

fn total(&self) -> usize {
// Don't include any start points at the end.
self.provider.locks.len() - self.provider.num_start_points
43 changes: 30 additions & 13 deletions diskann-providers/src/storage/bin.rs
@@ -100,6 +100,23 @@ pub(crate) trait GetAdjacencyList {
/// Retrieve the data stored at index `i`.
fn get_adjacency_list(&self, i: usize) -> ANNResult<Self::Item<'_>>;

/// Retrieve the data stored at index `i` into a pre-allocated buffer.
///
/// This method allows callers to reuse a buffer across multiple calls,
/// avoiding per-call memory allocation overhead. The buffer is cleared
/// before being populated with the adjacency list data.
///
/// Default implementation falls back to `get_adjacency_list` and copies.
fn get_adjacency_list_into(&self, i: usize, buffer: &mut Vec<Self::Element>) -> ANNResult<()>
Contributor:

You can make this even more efficient by not adding get_adjacency_list_into and instead changing the Item to something like this:

struct Guarded<'a, T> {
    guard: RwLockReadGuard<'a, ()>,
    slice: &'a [T],
}

impl<T> Deref for Guarded<'_, T> {
    type Target = [T];
    fn deref(&self) -> &[T] {
        self.slice
    }
}

This bypasses the copy into the temporary buffer and lets you copy directly from the underlying provider into the destination.

This doesn't work for the DiskAdaptor, but presumably you aren't using that? Another option would be to change get_adjacency_list to take &mut self, so that DiskAdaptor can allocate a local buffer itself.
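To make the suggestion concrete, here is a minimal caller-side sketch of the Guarded idea (a standalone toy, not the crate's actual types): the guard keeps the read lock alive for exactly as long as the caller borrows the slice, so the caller can copy straight from the underlying store with no temporary buffer.

```rust
use std::ops::Deref;
use std::sync::{RwLock, RwLockReadGuard};

// Hypothetical guard tying a borrowed slice to the lifetime of a read lock.
struct Guarded<'a, T> {
    _guard: RwLockReadGuard<'a, ()>,
    slice: &'a [T],
}

impl<T> Deref for Guarded<'_, T> {
    type Target = [T];
    fn deref(&self) -> &[T] {
        self.slice
    }
}

// Caller-side demo: copy straight from the guarded slice, no temp buffer.
fn demo() -> Vec<u32> {
    let lock = RwLock::new(());
    let data = vec![1u32, 2, 3];
    let guarded = Guarded {
        _guard: lock.read().unwrap(),
        slice: &data,
    };
    let mut out = Vec::new();
    out.extend_from_slice(&guarded); // deref coercion: &Guarded -> &[u32]
    out
}

fn main() {
    assert_eq!(demo(), vec![1, 2, 3]);
}
```

The lock is released when `guarded` is dropped, which is what makes the lock-hold duration equal to however long the caller keeps the value, the crux of the discussion below.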

Author:

I don't quite catch your point here. The optimization is to avoid a per-node memory allocation when flattening each neighbor list into the index file; with a global fixed-size buffer we save that allocation time. If we don't want a new interface that passes a reference to the global buffer, we could allocate it beforehand and make it referenceable from get_adjacency_list.

Contributor:

Right. With your API, you first copy the data into the externally allocated buffer, then copy it out of that buffer into the final stream. With the change I showed, using Item<'a> = Guarded<'a, u32>, you can avoid ever copying into the allocated buffer and instead use the Deref implementation of Guarded to copy straight from the underlying store. This does not allocate and removes the extra copy step.

Author:

I think I see your meaning now: you want to expose the vertex's neighbor array directly to the final BufWriter for copying. That saves one memcpy, but the copy into the BufWriter then happens under the read lock. The BufWriter is a black box: we don't know when a flush will happen (part of the copy could fill its buffer and trigger one), and a flush writes to disk, a blocking operation that can hold the read lock much longer than a single memcpy and block other operations, such as updates to the node.

A memcpy into a small external buffer is very cheap, so I generally prefer the copy, which guarantees the read lock is held only briefly.
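The trade-off described above (copy briefly under the lock, write outside it) can be sketched as follows; Provider, save_vertex, and the per-vertex lock layout are illustrative assumptions, not the crate's real types:

```rust
use std::io::Write;
use std::sync::RwLock;

// Hypothetical provider: one lock per vertex guarding its neighbor list.
struct Provider {
    lists: Vec<RwLock<Vec<u32>>>,
}

// Copy under the read lock, then write to the (possibly blocking) sink only
// after the lock is released, so a buffered writer's flush cannot extend
// the lock hold.
fn save_vertex<W: Write>(
    p: &Provider,
    i: usize,
    buf: &mut Vec<u32>,
    out: &mut W,
) -> std::io::Result<()> {
    {
        let guard = p.lists[i].read().unwrap();
        buf.clear();
        buf.extend_from_slice(&guard); // cheap memcpy; lock held only here
    } // read lock released before any I/O happens
    for n in buf.iter() {
        out.write_all(&n.to_le_bytes())?;
    }
    Ok(())
}

fn demo() -> Vec<u8> {
    let p = Provider {
        lists: vec![RwLock::new(vec![7u32, 8])],
    };
    let mut buf = Vec::new();
    let mut out = Vec::new();
    save_vertex(&p, 0, &mut buf, &mut out).unwrap();
    out
}

fn main() {
    assert_eq!(demo(), vec![7, 0, 0, 0, 8, 0, 0, 0]);
}
```

Because the lock scope ends before the write loop, a slow flush in `out` can no longer block concurrent updates to vertex `i`.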

Contributor:

I sympathize with the desire to minimize how long a lock is held, but this leads me to a bigger concern: why do we need to minimize the lock duration in this API at all? The only reason I can think of is that holding it longer would block concurrent writes to the graph. But if writes are happening concurrently with a save using this API, isn't the save going to be incorrect, i.e. not a faithful snapshot? And if writes are not happening concurrently, then it doesn't matter if we hold onto the lock.

Author:

Yes, we need to consider that concurrent write traffic may update a vertex's neighbors and be blocked by the stream write while it holds the read lock. And your concern is a real issue: if a save takes 5 minutes (our prod case for a large DiskANN index), the neighbors dumped to disk are, strictly speaking, not a complete graph but a graph mid-update, and the impact is hard to evaluate. That means a single ANN object cannot handle the save-during-write case by itself; it has to be resolved by the layer above.

Our ANN service, for example, manages multiple indices to avoid exactly this snapshot issue: it hosts more than one index and aggregates search results across them, with one read/write index and the rest read-only; writes are applied only to the read/write index. When a snapshot or save command arrives, the current read/write index is atomically sealed as read-only and a new index is created to serve writes. Deletes are applied both to the new read/write index and to a delete list that is replayed against the read-only indices, and a background thread keeps merging the read-only indices with the deletion list.
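The multi-index scheme described above could be modeled roughly like this. This is a toy sketch under invented names (IndexShard, AnnService, and the in-memory delete list are all assumptions), not the actual service code:

```rust
use std::collections::HashSet;

// Toy model of the seal-and-swap snapshot scheme described above.
struct IndexShard {
    ids: Vec<u64>,
    sealed: bool, // read-only once sealed
}

struct AnnService {
    readonly: Vec<IndexShard>,
    rw: IndexShard,
    delete_list: HashSet<u64>, // deletes replayed against sealed shards
}

impl AnnService {
    fn insert(&mut self, id: u64) {
        self.rw.ids.push(id); // writes go only to the read/write shard
    }

    fn delete(&mut self, id: u64) {
        self.rw.ids.retain(|&x| x != id);
        self.delete_list.insert(id); // masks the id in sealed shards
    }

    // On a snapshot/save command: seal the current RW shard atomically
    // and open a fresh one to keep serving writes.
    fn snapshot(&mut self) {
        let mut sealed = std::mem::replace(
            &mut self.rw,
            IndexShard { ids: Vec::new(), sealed: false },
        );
        sealed.sealed = true;
        self.readonly.push(sealed);
    }

    // Search aggregates across all shards, applying the delete list
    // to the sealed read-only ones.
    fn search_all(&self) -> Vec<u64> {
        let mut out: Vec<u64> = self
            .readonly
            .iter()
            .filter(|s| s.sealed)
            .flat_map(|s| s.ids.iter().copied())
            .filter(|id| !self.delete_list.contains(id))
            .collect();
        out.extend(self.rw.ids.iter().copied());
        out
    }
}

fn demo() -> Vec<u64> {
    let mut svc = AnnService {
        readonly: Vec::new(),
        rw: IndexShard { ids: Vec::new(), sealed: false },
        delete_list: HashSet::new(),
    };
    svc.insert(1);
    svc.insert(2);
    svc.snapshot(); // ids 1 and 2 now live in a sealed read-only shard
    svc.insert(3);
    svc.delete(2); // applied via the delete list against the sealed shard
    let mut results = svc.search_all();
    results.sort();
    results
}

fn main() {
    assert_eq!(demo(), vec![1, 3]);
}
```

The sealed shard never changes after `snapshot`, so it can be saved to disk without holding any per-vertex lock for long, which is the point of the design.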

where
Self::Element: Clone,
{
buffer.clear();
let list = self.get_adjacency_list(i)?;
buffer.extend_from_slice(&list);
Ok(())
}

/// Return the total number of elements contained in `self`.
fn total(&self) -> usize;

Expand Down Expand Up @@ -344,31 +361,31 @@ where
let mut observed_max_degree: u32 = 0;

out.write_all(&index_size.to_le_bytes())?;
-out.write_all(&observed_max_degree.to_le_bytes())?; // Will be updated later with correct max_degree
+out.write_all(&observed_max_degree.to_le_bytes())?;
out.write_all(&start_point.to_le_bytes())?;

out.write_all(&graph.additional_points().to_le_bytes())?;

let total = graph.total();

// Pre-allocate a reusable buffer for adjacency lists
let initial_capacity = graph.max_degree().map(|d| d as usize).unwrap_or(128);
let mut neighbor_buffer: Vec<u32> = Vec::with_capacity(initial_capacity);

for i in 0..total {
-let binding = graph.get_adjacency_list(i)?;
-let neighbors: &[u32] = &binding;
-let num_neighbors: u32 = neighbors.len() as u32;
+// Reuse buffer to avoid per-vertex allocation overhead
+graph.get_adjacency_list_into(i, &mut neighbor_buffer)?;
+let num_neighbors: u32 = neighbor_buffer.len() as u32;

// Write the number of neighbors as a `u32`.
out.write_all(&num_neighbors.to_le_bytes())?;

// Write all the neighbors, applying transformation if provided.
-neighbors
-    .iter()
-    .copied()
-    .try_for_each(|n| out.write_all(&n.to_le_bytes()))?;
+// Bulk write using bytemuck for zero-copy conversion
+let neighbor_bytes: &[u8] = bytemuck::must_cast_slice(&neighbor_buffer);
+out.write_all(neighbor_bytes)?;
Copilot AI (Apr 13, 2026), commenting on lines 379 to +383:

save_graph writes neighbor IDs by casting &[u32] to bytes and writing them directly. This relies on the host being little-endian, but the file format docs (and load_graph, which uses ReadBytesExt::<LittleEndian>) require the neighbor IDs be encoded as little-endian u32. On big-endian targets this will silently produce corrupt graph files. Consider bulk-encoding to little-endian (e.g., via a reusable scratch [u8] and byteorder little-endian writes, or by converting u32 values to LE before writing) while still keeping the per-vertex allocation avoidance.
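One way to keep the bulk write while staying little-endian on any host, along the lines suggested above, is to encode into a reusable byte scratch buffer and issue a single write per vertex. encode_le is a hypothetical helper for illustration, not part of the crate:

```rust
// Encode u32 neighbor IDs as little-endian into a reusable scratch buffer,
// then the caller can issue one bulk write_all; this stays correct on
// big-endian hosts, unlike casting &[u32] to bytes directly.
fn encode_le(neighbors: &[u32], scratch: &mut Vec<u8>) {
    scratch.clear();
    scratch.reserve(neighbors.len() * std::mem::size_of::<u32>());
    for n in neighbors {
        scratch.extend_from_slice(&n.to_le_bytes());
    }
}

fn main() {
    let mut scratch = Vec::new();
    encode_le(&[1u32, 256], &mut scratch);
    assert_eq!(scratch, vec![1, 0, 0, 0, 0, 1, 0, 0]);
}
```

This keeps the per-vertex allocation avoidance (the scratch buffer is reused across vertices) while matching the little-endian encoding that load_graph expects.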


observed_max_degree = observed_max_degree.max(num_neighbors);
-index_size += (std::mem::size_of::<u32>() * (1 + neighbors.len())) as u64;
+index_size += (std::mem::size_of::<u32>() * (1 + neighbor_buffer.len())) as u64;
}

// Use configured max degree if provided, otherwise use observed
let max_degree = graph.max_degree().unwrap_or(observed_max_degree);

// Finish up by writing the observed index size and max degree.