27 commits
0117d89
feat: vector reducer functions and performance improvements
AlexFilipImproving Mar 11, 2026
5bc847c
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Mar 11, 2026
29ed9d8
Merge main and regenerate tests
AlexFilipImproving Mar 11, 2026
225c756
Fix changes that git messed up in the merge
AlexFilipImproving Mar 11, 2026
8280227
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Mar 13, 2026
6f9d638
Fix some integration tests
AlexFilipImproving Mar 13, 2026
32aeebd
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Mar 16, 2026
1841331
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Mar 23, 2026
1b72e51
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Apr 1, 2026
23a5618
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Apr 10, 2026
ff40960
Remove test regeneration in generate.py
AlexFilipImproving Apr 10, 2026
d8233f7
refactor: move reducer processing to per-record model with self-parsing
AlexFilipImproving Apr 17, 2026
6cda697
Remove subprocess import
AlexFilipImproving Apr 17, 2026
8451a99
Merge branch 'main' into feature/vector-reducer-functions
AlexFilipImproving Apr 20, 2026
11ad3a0
feat: add vector value type support to expr::Value
AlexFilipImproving Mar 11, 2026
4cfd0a4
Fix formatting issue
AlexFilipImproving Mar 16, 2026
743f18c
Formatting
AlexFilipImproving Mar 16, 2026
8b6eed5
Change name from Vector to Array and modify related constructor in Value
AlexFilipImproving Apr 14, 2026
5097428
Format fix
AlexFilipImproving Apr 14, 2026
ebb47be
feat: add RANDOM_SAMPLE reducer function
rileydes-improving Apr 1, 2026
c7ac819
test: pruning RANDOM_SAMPLE unit tests to only essential
rileydes-improving Apr 1, 2026
1fa83d1
format: autoformat changes
rileydes-improving Apr 1, 2026
bbe70f9
refactor(ft_aggregate): running RandomSample initialization logic log…
rileydes-improving Apr 1, 2026
44b91ea
refactor(ft_aggregate_exec_test): update Vector to Array naming
rileydes-improving Apr 15, 2026
82e6ee3
refactor(ft_aggregate_exec): refactor RandomSample based off base cha…
rileydes-improving Apr 23, 2026
08c1dd9
test(aggregate): add RANDOM_SAMPLE error handling tests to compatibil…
rileydes-improving Apr 23, 2026
bae15fe
style: autoformat changes
rileydes-improving Apr 23, 2026
1 change: 1 addition & 0 deletions docs/commands/ft.aggregate.md
@@ -92,3 +92,4 @@ The following reducer functions are available. The reducer functions that take a
| MAX 1 <expression> | The largest numerical value of the expression. |
| AVG 1 <expression> | The numerical average of the values of the expression. |
| STDDEV 1 <expression> | The standard deviation of the values of the expression. |
| RANDOM_SAMPLE 2 <expression> <sample_size> | A random sample of values from the expression using reservoir sampling. Returns an array of up to sample_size elements. |
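
A usage sketch for the new reducer (the `products` index and `price`/`category` fields are hypothetical, mirroring the integration tests later in this PR):

```python
# Hypothetical sketch: sample up to 3 prices per category.
# Assumes a redis-py/valkey-py style client and a "products" index
# with a numeric "price" field and a tag "category" field.
result = client.execute_command(
    "FT.AGGREGATE", "products", "*",
    "LOAD", "2", "price", "category",
    "GROUPBY", "1", "@category",
    "REDUCE", "RANDOM_SAMPLE", "2", "@price", "3", "AS", "price_sample",
)
# Each group row carries "price_sample" as an array of at most 3 values.
```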
Binary file modified integration/compatibility/aggregate-answers.pickle.gz
42 changes: 35 additions & 7 deletions integration/compatibility/generate.py
@@ -150,15 +150,15 @@ def check(self, dialect, *orig_cmd):

def checkall(self, dialect, *orig_cmd, **kwargs):
'''Non-vector commands. Doesn't have support for '*' yet. '''
self.checkvec(self, dialect, orig_cmd, kwargs)
self.check(self, dialect, orig_cmd)
self.checkvec(dialect, *orig_cmd, **kwargs)
self.check(dialect, *orig_cmd)

def test_bad_numeric_data(self, key_type, dialect):
self.setup_data("bad numbers", key_type)
self.check(dialect, f"ft.search {key_type}_idx1", "@n1:[-inf inf]")
self.check(dialect, f"ft.search {key_type}_idx1", "-@n1:[-inf inf]")
self.check(dialect, f"ft.search {key_type}_idx1", "@n2:[-inf inf]")
self.check(dialect, f"ft.search {key_type}_idx1", "-@n2:[-inf inf]")
self.check(dialect, "ft.search", f"{key_type}_idx1", "@n1:[-inf inf]")
self.check(dialect, "ft.search", f"{key_type}_idx1", "-@n1:[-inf inf]")
self.check(dialect, "ft.search", f"{key_type}_idx1", "@n2:[-inf inf]")
self.check(dialect, "ft.search", f"{key_type}_idx1", "-@n2:[-inf inf]")

def test_search_reverse(self, key_type, dialect):
self.setup_data("reverse vector numbers", key_type)
@@ -263,6 +263,34 @@ def test_aggregate_groupby(self, key_type, dialect):
f"ft.aggregate {key_type}_idx1 * load 6 @__key @n1 @n2 @t1 @t2 @t3 groupby 1 @t3 reduce max 1 @n1 as nmax"
)
self.check(dialect, f'ft.aggregate {key_type}_idx1 * load 6 @__key @n1 @n2 @t1 @t2 @t3 groupby 1 @t1 reduce max 1 @n2 as nmax')

def test_aggregate_random_sample_errors(self, key_type, dialect):
"""Test error behavior for RANDOM_SAMPLE edge cases against Redis.

Only cases that Redis rejects are included. Success cases
are omitted because RANDOM_SAMPLE output is non-deterministic
and can't be compared row-by-row against a recorded answer.
"""
self.setup_data("sortable numbers", key_type)

error_cases = [
# Wrong argument counts.
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 0 as s",
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 1 @n1 as s",
# Non-numeric sample size.
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 invalid as s",
# Non-integer sample size.
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 1.5 as s",
# Negative sample size.
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 -1 as s",
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 -5 as s",
# Above cap (1000).
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 1001 as s",
f"ft.aggregate {key_type}_idx1 * load 2 @__key @n1 groupby 1 @t1 reduce random_sample 2 @n1 10000 as s",
]
for cmd in error_cases:
self.execute_command(*cmd.split())

def test_aggregate_limit(self, key_type, dialect):
self.setup_data("sortable numbers", key_type)
self.check(dialect, f"ft.aggregate {key_type}_idx1 * load 3 @__key @n1 @n2")
Expand Down Expand Up @@ -463,7 +491,7 @@ def test_search_sortby(self, key_type, dialect):

for sort_key in ["n1", "n2"]:
for direction in ["ASC", "DESC", ""]:
for return_keys in ["", "RETURN 3 @n1 @t1"]:
for return_keys in ["", "RETURN 2 @n1 @t1"]:
for wsk in ["", "WITHSORTKEYS"]:
for limit in ["LIMIT 0 5", "LIMIT 2 3", ""]:
self.check(dialect, f"ft.search {key_type}_idx1 * SORTBY {sort_key} {direction} {return_keys} {limit} {wsk}")
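
The documentation change above describes RANDOM_SAMPLE as reservoir sampling. For reference, a minimal sketch of the classic Algorithm R that this contract implies — an illustration of the technique, not the module's actual implementation:

```python
import random

def reservoir_sample(values, k):
    """Keep a uniform random sample of up to k items from a stream.

    After processing n items, each has probability k/n of being in the
    reservoir, matching the "up to sample_size elements" contract.
    """
    reservoir = []
    for i, value in enumerate(values):
        if i < k:
            reservoir.append(value)  # Fill the reservoir first.
        else:
            # Replace a random slot with probability k / (i + 1).
            j = random.randint(0, i)
            if j < k:
                reservoir[j] = value
    return reservoir
```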
160 changes: 160 additions & 0 deletions integration/test_non_vector.py
@@ -258,6 +258,162 @@ def validate_aggregate_queries(client: Valkey):
)
assert result[0] == 2

def validate_random_sample_queries(client: Valkey):
"""
Test FT.AGGREGATE with RANDOM_SAMPLE reducer.
"""
# 1. Basic RANDOM_SAMPLE functionality
# Use APPLY to create a constant field for grouping all records together
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 1000]",
"LOAD", "1", "price",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "5", "AS", "sample"
)
assert result[0] == 1
# Result should have a sample field with an array
row = dict(zip(result[1][::2], result[1][1::2]))
assert b'sample' in row
# Sample should be an array (list in Python)
sample = row[b'sample']
assert isinstance(sample, list)
assert len(sample) <= 5 # Should have at most 5 elements

# 2. RANDOM_SAMPLE with various sample sizes
# Sample size smaller than group size
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "1", "price",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "10", "AS", "sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
sample = row[b'sample']
assert isinstance(sample, list)
assert len(sample) == 10 # Should have exactly 10 elements

# Sample size larger than group size
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 5]",
"LOAD", "1", "price",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "100", "AS", "sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
sample = row[b'sample']
assert isinstance(sample, list)
assert len(sample) == 5 # Should have all 5 elements

# Sample size = 0 (empty array)
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "1", "price",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "0", "AS", "sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
sample = row[b'sample']
assert isinstance(sample, list)
assert len(sample) == 0 # Should be empty

# 3. Multiple RANDOM_SAMPLE reducers in same query
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "2", "price", "rating",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "5", "AS", "price_sample",
"REDUCE", "RANDOM_SAMPLE", "2", "@rating", "5", "AS", "rating_sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
assert b'price_sample' in row
assert b'rating_sample' in row
price_sample = row[b'price_sample']
rating_sample = row[b'rating_sample']
assert isinstance(price_sample, list)
assert isinstance(rating_sample, list)
assert len(price_sample) == 5
assert len(rating_sample) == 5
# Samples should be independent (different values)
assert price_sample != rating_sample

# 4. RANDOM_SAMPLE with GROUPBY operations
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 1000]",
"LOAD", "2", "price", "category",
"GROUPBY", "1", "@category",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "10", "AS", "sample"
)
assert result[0] == 2 # Two categories: electronics and books
for i in range(1, len(result)):
row = dict(zip(result[i][::2], result[i][1::2]))
assert b'category' in row
assert b'sample' in row
sample = row[b'sample']
assert isinstance(sample, list)
assert len(sample) == 10 # Each group should have 10 samples

# 5. RANDOM_SAMPLE with numeric fields
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "1", "price",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "5", "AS", "sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
sample = row[b'sample']
assert isinstance(sample, list)
# All values should be numeric strings
for val in sample:
assert isinstance(val, bytes)
float(val) # Should be convertible to float

# 6. RANDOM_SAMPLE with string fields
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "1", "category",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@category", "5", "AS", "sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
sample = row[b'sample']
assert isinstance(sample, list)
# All values should be strings
for val in sample:
assert isinstance(val, bytes)
assert val in [b'electronics', b'books']

# 7. RANDOM_SAMPLE with mixed-type fields (numeric and string in same property)
# This tests that RANDOM_SAMPLE handles different types correctly
result = client.execute_command(
"FT.AGGREGATE", "products", "@price:[1 100]",
"LOAD", "2", "price", "rating",
"APPLY", "1", "AS", "all",
"GROUPBY", "1", "@all",
"REDUCE", "RANDOM_SAMPLE", "2", "@price", "3", "AS", "price_sample",
"REDUCE", "RANDOM_SAMPLE", "2", "@rating", "3", "AS", "rating_sample"
)
assert result[0] == 1
row = dict(zip(result[1][::2], result[1][1::2]))
price_sample = row[b'price_sample']
rating_sample = row[b'rating_sample']
assert isinstance(price_sample, list)
assert isinstance(rating_sample, list)
assert len(price_sample) == 3
assert len(rating_sample) == 3

def validate_aggregate_complex_queries(client: Valkey):
"""
Test complex FT.AGGREGATE queries with numeric and tag fields.
@@ -488,6 +644,8 @@ def test_aggregate_complex(self):
for doc in json_docs:
assert client.execute_command(*doc) == b"OK"
validate_aggregate_complex_queries(client)
# Test RANDOM_SAMPLE functionality
validate_random_sample_queries(client)

def test_uningested_multi_field(self):
"""
Expand Down Expand Up @@ -550,6 +708,8 @@ def test_aggregate_complex_cluster(self):
for doc in aggregate_complex_json_docs:
assert cluster_client.execute_command(*doc) == b"OK"
validate_aggregate_complex_queries(cluster_client)
# Test RANDOM_SAMPLE functionality in cluster mode
validate_random_sample_queries(cluster_client)

def test_max_search_keys_fetch_limited(self):
"""
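
The tests above repeatedly convert reply rows with `dict(zip(result[i][::2], result[i][1::2]))`. A small helper capturing that idiom — a sketch assuming the RESP2 flat key/value row shape these tests rely on:

```python
def row_to_dict(row):
    """Convert one FT.AGGREGATE reply row into a field -> value dict.

    Rows arrive as flat lists alternating field names and values, e.g.
    [b"category", b"books", b"sample", [b"10", b"12"]].
    """
    return dict(zip(row[::2], row[1::2]))

# Usage, mirroring the tests above (result[0] is the row count):
# result = client.execute_command("FT.AGGREGATE", ...)
# rows = [row_to_dict(r) for r in result[1:]]
```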
2 changes: 1 addition & 1 deletion src/commands/commands.h
@@ -105,7 +105,7 @@ struct QueryCommand : public query::SearchParameters {
//
// Determine if we need full results or if we can optimize with trimming
//
virtual bool RequiresCompleteResults() const = 0;
bool RequiresCompleteResults() const override = 0;
//
// Called when query completes.
//
7 changes: 6 additions & 1 deletion src/commands/ft.aggregate.json
@@ -263,6 +263,11 @@
"name": "STDDEV",
"type": "pure-token",
"token": "STDDEV"
},
{
"name": "RANDOM_SAMPLE",
"type": "pure-token",
"token": "RANDOM_SAMPLE"
}
]
},
@@ -361,4 +366,4 @@
"module_since": "1.1.0",
"summary": "Performs a search of the specified index. The keys which match the query expression are subjected to further processing as specified"
}
}
}
35 changes: 35 additions & 0 deletions src/commands/ft_aggregate.cc
@@ -124,13 +124,48 @@ absl::Status AggregateParameters::ParseCommand(vmsdk::ArgsIterator &itr) {
return absl::OkStatus();
}

// Forward declaration for recursive serialization
void SerializeValueToResp(ValkeyModuleCtx *ctx, const expr::Value &value);

void SerializeArrayToResp(ValkeyModuleCtx *ctx, const expr::Value::Array &vec) {
ValkeyModule_ReplyWithArray(ctx, vec->size());
for (const auto &elem : *vec) {
SerializeValueToResp(ctx, elem);
}
}

void SerializeValueToResp(ValkeyModuleCtx *ctx, const expr::Value &value) {
if (value.IsArray()) {
SerializeArrayToResp(ctx, value.GetArray());
} else if (value.IsBool()) {
ValkeyModule_ReplyWithLongLong(ctx, value.GetBool() ? 1 : 0);
} else if (value.IsDouble()) {
auto value_str = value.AsString();
ValkeyModule_ReplyWithStringBuffer(ctx, value_str.data(), value_str.size());
} else if (value.IsString()) {
auto value_sv = value.GetStringView();
ValkeyModule_ReplyWithStringBuffer(ctx, value_sv.data(), value_sv.size());
} else {
// Fallback for Nil and unknown types
ValkeyModule_ReplyWithNull(ctx);
}
}

bool ReplyWithValue(ValkeyModuleCtx *ctx,
data_model::AttributeDataType data_type,
std::string_view name, indexes::IndexerType indexer_type,
const expr::Value &value, int dialect) {
if (value.IsNil()) {
return false;
}

// Handle vector values with RESP array serialization
if (value.IsArray()) {
ValkeyModule_ReplyWithSimpleString(ctx, name.data());
SerializeArrayToResp(ctx, value.GetArray());
return true;
}

if (data_type == data_model::AttributeDataType::ATTRIBUTE_DATA_TYPE_HASH) {
ValkeyModule_ReplyWithSimpleString(ctx, name.data());
auto value_sv = value.AsStringView();
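
From the client's perspective, the recursive serialization above means a RANDOM_SAMPLE result arrives as a nested RESP array inside its row. A hypothetical decoded RESP2 reply, with illustrative values only:

```python
# Sketch of a decoded reply for GROUPBY @category with one
# RANDOM_SAMPLE reducer; SerializeArrayToResp emits the sample as a
# nested array, which the client decodes as a Python list.
reply = [
    2,  # number of group rows
    [b"category", b"electronics", b"sample", [b"19.99", b"5.49", b"7.00"]],
    [b"category", b"books", b"sample", [b"12.50", b"3.99"]],
]
```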