-
Notifications
You must be signed in to change notification settings - Fork 644
Expand file tree
/
Copy patheswrap.py
More file actions
557 lines (467 loc) · 21.9 KB
/
eswrap.py
File metadata and controls
557 lines (467 loc) · 21.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
"""Elasticsearch cli commands."""
import json
import sys
import time
from collections import defaultdict
from pathlib import Path
from typing import IO, Any
import click
import elasticsearch
import kql # type: ignore[reportMissingTypeStubs]
from elasticsearch import Elasticsearch
from elasticsearch.client import AsyncSearchClient
from .config import parse_rules_config
from .main import root
from .misc import add_params, elasticsearch_options, get_elasticsearch_client, nested_get, raise_client_error
from .rule import TOMLRule
from .rule_loader import RuleCollection
from .utils import event_sort, format_command_options, get_path, normalize_timing_and_sort, unix_time_to_formatted
COLLECTION_DIR = get_path(["collections"])
MATCH_ALL: dict[str, dict[str, Any]] = {"bool": {"filter": [{"match_all": {}}]}}
RULES_CONFIG = parse_rules_config()
def add_range_to_dsl(dsl_filter: list[dict[str, Any]], start_time: str, end_time: str = "now") -> None:
    """Append an ``@timestamp`` range clause (gt *start_time*, lte *end_time*) to *dsl_filter* in place."""
    timestamp_bounds = {
        "gt": start_time,
        "lte": end_time,
        "format": "strict_date_optional_time",
    }
    dsl_filter.append({"range": {"@timestamp": timestamp_bounds}})
def parse_unique_field_results(
    rule_type: str,
    unique_fields: list[str],
    search_results: dict[str, Any],
) -> dict[str, Any]:
    """Count unique values of each field across search hits.

    Returns ``{"results": {field: {value: count}}}`` or ``{}`` when no values were found.
    For EQL rules the hits live under ``events``/``sequences`` rather than ``hits.hits``.
    """
    parsed_results: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    hits = search_results["hits"]
    # if rule.type == eql, structure is different: events or sequences instead of hits
    hits = hits["hits"] if rule_type != "eql" else hits.get("events") or hits.get("sequences", [])
    for hit in hits:
        for field in unique_fields:
            if "events" in hit:
                # EQL sequence hit: gather the field's values across every event in the sequence
                match: list[Any] = []
                for event in hit["events"]:
                    matched = nested_get(event["_source"], field)
                    match.extend([matched] if not isinstance(matched, list) else matched)  # type: ignore[reportUnknownArgumentType]
                if not match:
                    continue
            else:
                match = nested_get(hit["_source"], field)
                if not match:
                    continue
            # list values are collapsed into a single sorted comma-joined key so they count as one value
            match = ",".join(sorted(match)) if isinstance(match, list) else match  # type: ignore[reportUnknownArgumentType]
            parsed_results[field][match] += 1  # type: ignore[reportUnknownArgumentType]
    return {"results": parsed_results} if parsed_results else {}
class Events:
    """Events collected from Elasticsearch, stored as {agent_type: [event, ...]}."""

    def __init__(self, events: dict[str, Any]) -> None:
        self.events = self._normalize_event_timing(events)

    @staticmethod
    def _normalize_event_timing(events: dict[str, Any]) -> dict[str, Any]:
        """Normalize event timestamps and sort each agent-type bucket."""
        for agent_type, _events in events.items():
            events[agent_type] = normalize_timing_and_sort(_events)
        return events

    @staticmethod
    def _get_dump_dir(
        rta_name: str | None = None,
        host_id: str | None = None,
        host_os_family: str | None = None,
    ) -> Path:
        """Prepare and get the dump path.

        RTA events go directly into the unit-test data tree; everything else goes to a
        timestamped per-host collection directory.
        """
        if rta_name and host_os_family:
            dump_dir = get_path(["unit_tests", "data", "true_positives", rta_name, host_os_family])
            dump_dir.mkdir(parents=True, exist_ok=True)
            return dump_dir
        time_str = time.strftime("%Y%m%dT%H%M%SL")
        dump_dir = COLLECTION_DIR / (host_id or "unknown_host") / time_str
        dump_dir.mkdir(parents=True, exist_ok=True)
        return dump_dir

    def evaluate_against_rule(self, rule_id: str, verbose: bool = True) -> list[Any]:
        """Evaluate a rule against collected events and return the matching events.

        Raises ValueError when *rule_id* is not found in the default rule collection.
        """
        rule = RuleCollection.default().id_map.get(rule_id)
        if not rule:
            raise ValueError(f"Unable to find rule with ID {rule_id}")
        merged_events = combine_sources(*self.events.values())
        filtered = evaluate(rule, merged_events, normalize_kql_keywords=RULES_CONFIG.normalize_kql_keywords)
        if verbose:
            click.echo("Matching results found")
        return filtered

    def echo_events(self, pager: bool = False, pretty: bool = True) -> None:
        """Print events to stdout (optionally paged / pretty-printed)."""
        echo_fn = click.echo_via_pager if pager else click.echo
        echo_fn(json.dumps(self.events, indent=2 if pretty else None, sort_keys=True))

    def save(self, rta_name: str | None = None, dump_dir: Path | None = None, host_id: str | None = None) -> None:
        """Save collected events as one ndjson file per source.

        Raises ValueError when there is nothing to save. Prompts for host.os.family
        when it cannot be derived from the collected events.
        """
        if not self.events:
            raise ValueError("Nothing to save. Run Collector.run() method first or verify logging")
        host_os_family = None
        for key in self.events:
            # Guarded lookups: an empty event list previously raised IndexError and a
            # missing "os" field raised AttributeError via the chained .get() calls.
            source_events = self.events.get(key) or []
            if not source_events:
                continue
            host_info = source_events[0].get("host", {})
            if host_info.get("id") == host_id:
                host_os_family = host_info.get("os", {}).get("family")
                break
        if not host_os_family:
            click.echo(f"Unable to determine host.os.family for host_id: {host_id}")
            host_os_family = click.prompt(
                "Please enter the host.os.family for this host_id",
                type=click.Choice(["windows", "macos", "linux"]),
                default="windows",
            )
        dump_dir = dump_dir or self._get_dump_dir(rta_name=rta_name, host_id=host_id, host_os_family=host_os_family)
        for source, events in self.events.items():
            path = dump_dir / (source + ".ndjson")
            with path.open("w") as f:
                f.writelines([json.dumps(e, sort_keys=True) + "\n" for e in events])
            click.echo(f"{len(events)} events saved to: {path}")
class CollectEvents:
    """Event collector for elastic stack.

    Wraps an Elasticsearch client with query preparation and search/count helpers for
    KQL, Lucene, EQL, and raw-DSL rule queries.
    """
    def __init__(self, client: Elasticsearch, max_events: int = 3000) -> None:
        # max_events caps the default result size used by search()/search_from_rule()
        self.client = client
        self.max_events = max_events
    def _build_timestamp_map(self, index: str) -> dict[str, Any]:
        """Build a mapping of indexes to timestamp data formats."""
        mappings = self.client.indices.get_mapping(index=index)
        return {n: m["mappings"].get("properties", {}).get("@timestamp", {}) for n, m in mappings.items()}
    def _get_last_event_time(self, index: str, dsl: dict[str, Any] | None = None) -> None | str:
        """Get timestamp of most recent event, or None when the index has no events."""
        last_event = self.client.search(query=dsl, index=index, size=1, sort="@timestamp:desc")["hits"]["hits"]
        if not last_event:
            return None
        last_event = last_event[0]
        # resolve the concrete index the hit came from (index arg may be a pattern)
        index = last_event["_index"]
        timestamp = last_event["_source"]["@timestamp"]
        timestamp_map = self._build_timestamp_map(index)
        event_date_format = timestamp_map[index].get("format", "").split("||")
        # there are many native supported date formats and even custom data formats, but most, including beats use the
        # default `strict_date_optional_time`. It would be difficult to try to account for all possible formats, so this
        # will work on the default and unix time.
        if set(event_date_format) & {"epoch_millis", "epoch_second"}:
            timestamp = unix_time_to_formatted(timestamp)
        return timestamp
    @staticmethod
    def _prep_query(
        query: str | dict[str, Any],
        language: str,
        index: str | list[str] | tuple[str],
        start_time: str | None = None,
        end_time: str | None = None,
    ) -> tuple[str, dict[str, Any], str | None]:
        """Prep a query for search.

        Returns (comma-joined index string, formatted DSL body, lucene query-string or None).
        Raises ValueError for an unknown language, or when a time bound is given without
        a start time.
        """
        index_str = ",".join(index if isinstance(index, (list | tuple)) else index.split(","))
        # lucene queries are passed as a query string (q=...), not in the DSL body
        lucene_query = str(query) if language == "lucene" else None
        if language in ("kql", "kuery"):
            formatted_dsl = {"query": kql.to_dsl(query)}  # type: ignore[reportUnknownMemberType]
        elif language == "eql":
            formatted_dsl = {"query": query, "filter": MATCH_ALL}
        elif language == "lucene":
            formatted_dsl: dict[str, Any] = {"query": {"bool": {"filter": []}}}
        elif language == "dsl":
            formatted_dsl = {"query": query}
        else:
            raise ValueError(f"Unknown search language: {language}")
        if start_time or end_time:
            end_time = end_time or "now"
            # EQL puts the range under the top-level "filter"; other languages under the
            # bool query's filter list. NOTE(review): assumes kql.to_dsl / dsl input
            # produce a bool query at the top level — confirm for raw-DSL callers.
            dsl = (
                formatted_dsl["filter"]["bool"]["filter"]
                if language == "eql"
                else formatted_dsl["query"]["bool"].setdefault("filter", [])
            )
            if not start_time:
                raise ValueError("No start time provided")
            add_range_to_dsl(dsl, start_time, end_time)
        return index_str, formatted_dsl, lucene_query
    def search(  # noqa: PLR0913
        self,
        query: str | dict[str, Any],
        language: str,
        index: str | list[str] = "*",
        start_time: str | None = None,
        end_time: str | None = None,
        size: int | None = None,
        **kwargs: Any,
    ) -> list[Any]:
        """Search an elasticsearch instance and return the raw hits (events/sequences for EQL)."""
        index_str, formatted_dsl, lucene_query = self._prep_query(
            query=query, language=language, index=index, start_time=start_time, end_time=end_time
        )
        formatted_dsl.update(size=size or self.max_events)
        if language == "eql":
            results = self.client.eql.search(body=formatted_dsl, index=index_str, **kwargs)["hits"]
            results = results.get("events") or results.get("sequences", [])
        else:
            results = self.client.search(
                body=formatted_dsl,
                q=lucene_query,
                index=index_str,
                allow_no_indices=True,
                ignore_unavailable=True,
                **kwargs,
            )["hits"]["hits"]
        return results
    def search_from_rule(
        self,
        rules: RuleCollection,
        start_time: str | None = None,
        end_time: str = "now",
        size: int | None = None,
    ) -> dict[str, Any]:
        """Search an elasticsearch instance using a rule.

        Returns {rule_id: parsed unique-field results or an error marker dict}.
        """
        async_client = AsyncSearchClient(self.client)
        survey_results: dict[str, Any] = {}
        multi_search: list[dict[str, Any]] = []
        multi_search_rules: list[TOMLRule] = []
        async_searches: list[tuple[TOMLRule, Any]] = []
        eql_searches: list[tuple[TOMLRule, dict[str, Any]]] = []
        for rule in rules:
            if not rule.contents.data.get("query"):
                continue
            language = rule.contents.data.get("language")
            query = rule.contents.data.query  # type: ignore[reportAttributeAccessIssue]
            rule_type = rule.contents.data.type
            index_str, formatted_dsl, _ = self._prep_query(
                query=query,  # type: ignore[reportUnknownArgumentType]
                language=language,  # type: ignore[reportUnknownArgumentType]
                index=rule.contents.data.get("index", "*"),  # type: ignore[reportUnknownArgumentType]
                start_time=start_time,
                end_time=end_time,
            )
            formatted_dsl.update(size=size or self.max_events)
            # prep for searches: msearch for kql | async search for lucene | eql client search for eql
            if language == "kuery":
                multi_search_rules.append(rule)
                # msearch expects alternating header/body entries
                multi_search.append({"index": index_str, "allow_no_indices": "true", "ignore_unavailable": "true"})
                multi_search.append(formatted_dsl)
            elif language == "lucene":
                # wait for 0 to try and force async with no immediate results (not guaranteed)
                result = async_client.submit(
                    body=formatted_dsl,
                    q=query,  # type: ignore[reportUnknownArgumentType]
                    index=index_str,
                    allow_no_indices=True,
                    ignore_unavailable=True,
                    wait_for_completion_timeout=0,
                )
                if result["is_running"] is True:
                    async_searches.append((rule, result["id"]))
                else:
                    # search completed synchronously despite the 0 timeout
                    survey_results[rule.id] = parse_unique_field_results(
                        rule_type, ["process.name"], result["response"]
                    )
            elif language == "eql":
                eql_body: dict[str, Any] = {
                    "index": index_str,
                    "params": {"ignore_unavailable": "true", "allow_no_indices": "true"},
                    "body": {"query": query, "filter": formatted_dsl["filter"]},
                }
                eql_searches.append((rule, eql_body))
        # assemble search results
        multi_search_results = self.client.msearch(searches=multi_search)
        for index, result in enumerate(multi_search_results["responses"]):
            try:
                rule = multi_search_rules[index]
                survey_results[rule.id] = parse_unique_field_results(
                    rule.contents.data.type,
                    rule.contents.data.unique_fields,  # type: ignore[reportAttributeAccessIssue]
                    result,
                )
            except KeyError:
                # a failed msearch response lacks the expected "hits" structure
                survey_results[multi_search_rules[index].id] = {"error_retrieving_results": True}
        for entry in eql_searches:
            rule, search_args = entry
            try:
                result = self.client.eql.search(**search_args)
                survey_results[rule.id] = parse_unique_field_results(
                    rule.contents.data.type,
                    rule.contents.data.unique_fields,  # type: ignore[reportAttributeAccessIssue]
                    result,  # type: ignore[reportAttributeAccessIssue]
                )
            except (elasticsearch.NotFoundError, elasticsearch.RequestError) as e:
                survey_results[rule.id] = {"error_retrieving_results": True, "error": e.info["error"]["reason"]}
        for entry in async_searches:
            rule: TOMLRule
            rule, async_id = entry
            result = async_client.get(id=async_id)["response"]
            survey_results[rule.id] = parse_unique_field_results(rule.contents.data.type, ["process.name"], result)
        return survey_results
    def count(
        self,
        query: str,
        language: str,
        index: str | list[str],
        start_time: str | None = None,
        end_time: str | None = "now",
    ) -> Any:
        """Get a count of documents from elasticsearch matching the query."""
        index_str, formatted_dsl, lucene_query = self._prep_query(
            query=query,
            language=language,
            index=index,
            start_time=start_time,
            end_time=end_time,
        )
        # EQL API has no count endpoint, so fall back to a bounded search and count the hits
        if language == "eql":
            results = self.search(
                query=query,
                language=language,
                index=index,
                start_time=start_time,
                end_time=end_time,
                size=1000,
            )
            return len(results)
        resp = self.client.count(
            body=formatted_dsl,
            index=index_str,
            q=lucene_query,
            allow_no_indices=True,
            ignore_unavailable=True,
        )
        return resp["count"]
    def count_from_rule(
        self,
        rules: RuleCollection,
        start_time: str | None = None,
        end_time: str | None = "now",
    ) -> dict[str, Any]:
        """Get a count of documents from elasticsearch using a rule.

        Returns {rule_id: {"rule_id", "name", "search_count"}}; a count of -1 marks
        a failed search.
        """
        survey_results: dict[str, Any] = {}
        for rule in rules.rules:
            rule_results: dict[str, Any] = {"rule_id": rule.id, "name": rule.name}
            if not rule.contents.data.get("query"):
                continue
            try:
                rule_results["search_count"] = self.count(
                    query=rule.contents.data.query,  # type: ignore[reportAttributeAccessIssue]
                    language=rule.contents.data.language,  # type: ignore[reportAttributeAccessIssue]
                    index=rule.contents.data.get("index", "*"),  # type: ignore[reportAttributeAccessIssue]
                    start_time=start_time,
                    end_time=end_time,
                )
            except (elasticsearch.NotFoundError, elasticsearch.RequestError):
                rule_results["search_count"] = -1
            survey_results[rule.id] = rule_results
        return survey_results
def evaluate(rule: TOMLRule, events: list[Any], normalize_kql_keywords: bool = False) -> list[Any]:
    """Evaluate a rule's KQL query against *events*, returning the matching events."""
    predicate = kql.get_evaluator(kql.parse(rule.query), normalize_kql_keywords=normalize_kql_keywords)  # type: ignore[reportUnknownMemberType]
    return [event for event in events if predicate(event)]  # type: ignore[reportUnknownMemberType]
def combine_sources(*sources: list[Any]) -> list[Any]:
    """Combine lists of events from multiple sources into one time-sorted list.

    Takes each source list as a separate positional argument (varargs) to match the
    call site ``combine_sources(*self.events.values())``; the previous single-parameter
    signature would raise TypeError when more than one source was unpacked.
    """
    combined: list[Any] = []
    for source in sources:
        # shallow-copy so the caller's lists are not aliased into the result
        combined.extend(source.copy())
    return event_sort(combined)
class CollectEventsWithDSL(CollectEvents):
    """Collect events from elasticsearch using a raw DSL query."""

    @staticmethod
    def _group_events_by_type(events: list[Any]) -> dict[str, list[Any]]:
        """Group event sources by their agent.type."""
        grouped: dict[str, list[Any]] = {}
        for hit in events:
            source = hit["_source"]
            agent_type = source["agent"]["type"]
            grouped.setdefault(agent_type, []).append(source)
        return grouped

    def run(self, dsl: dict[str, Any], indexes: str | list[str], start_time: str) -> Events:
        """Collect the events and return them wrapped in an Events container."""
        raw_hits = self.search(
            dsl,
            language="dsl",
            index=indexes,
            start_time=start_time,
            end_time="now",
            size=5000,
            sort=[{"@timestamp": {"order": "asc"}}],
        )
        return Events(self._group_events_by_type(raw_hits))
@root.command("normalize-data")
@click.argument("events-file", type=Path)
def normalize_data(events_file: Path) -> None:
    """Normalize Elasticsearch data timestamps and sort."""
    parsed_events = [json.loads(line) for line in events_file.read_text().splitlines()]
    Events({events_file.name: parsed_events}).save(dump_dir=events_file.parent)
@root.group("es")
@add_params(*elasticsearch_options)
@click.pass_context
def es_group(ctx: click.Context, **kwargs: Any) -> None:
    """Commands for integrating with Elasticsearch."""
    _ = ctx.ensure_object(dict)  # type: ignore[reportUnknownVariableType]
    # only initialize an es client if the subcommand is invoked without help (hacky)
    # NOTE(review): this inspects sys.argv directly rather than click's parse state,
    # so it only catches --help when it is the final argument — confirm acceptable
    if sys.argv[-1] in ctx.help_option_names:
        click.echo("Elasticsearch client:")
        click.echo(format_command_options(ctx))
    else:
        ctx.obj["es"] = get_elasticsearch_client(ctx=ctx, **kwargs)
@es_group.command("collect-events")
@click.argument("host-id")
@click.option("--query", "-q", help="KQL query to scope search")
@click.option("--index", "-i", multiple=True, help="Index(es) to search against (default: all indexes)")
@click.option("--rta-name", "-r", help="Name of RTA in order to save events directly to unit tests data directory")
@click.option("--rule-id", help="Updates rule mapping in rule-mapping.yaml file (requires --rta-name)")
@click.option("--view-events", is_flag=True, help="Print events after saving")
@click.pass_context
def collect_events(  # noqa: PLR0913
    ctx: click.Context,
    host_id: str,
    query: str,
    index: list[str],
    rta_name: str,
    rule_id: str,
    view_events: bool,
) -> Events:
    """Collect events from Elasticsearch."""
    client: Elasticsearch = ctx.obj["es"]
    # scope the search to the target host; an optional user query narrows it further
    dsl = kql.to_dsl(query) if query else MATCH_ALL  # type: ignore[reportUnknownMemberType]
    dsl["bool"].setdefault("filter", []).append(  # type: ignore[reportUnknownMemberType]
        {
            "bool": {
                "should": [{"match_phrase": {"host.id": host_id}}],
            },
        }
    )
    try:
        collector = CollectEventsWithDSL(client)
        start = time.time()
        click.pause("Press any key once detonation is complete ...")
        # pad the window by 5s so events emitted just before the keypress are included
        start_time = f"now-{round(time.time() - start) + 5}s"
        events = collector.run(dsl, index or "*", start_time)  # type: ignore[reportUnknownArgument]
        events.save(rta_name=rta_name, host_id=host_id)
        if rta_name and rule_id:
            _ = events.evaluate_against_rule(rule_id)
        if view_events and events.events:
            events.echo_events(pager=True)
    except AssertionError as e:
        error_msg = "No events collected! Verify events are streaming and that the agent-hostname is correct"
        raise_client_error(error_msg, e, ctx=ctx)
    # NOTE(review): if raise_client_error can return instead of raising, `events` may be
    # unbound here — confirm it always raises
    return events
@es_group.command("index-rules")
@click.option("--query", "-q", help="Optional KQL query to limit to specific rules")
@click.option("--from-file", "-f", type=click.File("r"), help="Load a previously saved uploadable bulk file")
@click.option("--save_files", "-s", is_flag=True, help="Optionally save the bulk request to a file")
@click.pass_context
def index_repo(ctx: click.Context, query: str, from_file: IO[Any] | None, save_files: bool) -> None:
    """Index rules based on KQL search results to an elasticsearch instance."""
    # local import avoids a circular import with .main at module load time
    from .main import generate_rules_index
    es_client: Elasticsearch = ctx.obj["es"]
    if from_file:
        bulk_upload_docs = from_file.read()
        # light validation only
        try:
            index_body = [json.loads(line) for line in bulk_upload_docs.splitlines()]
            click.echo(f"{len([r for r in index_body if 'rule' in r])} rules included")
        except json.JSONDecodeError:
            # NOTE(review): presumably raise_client_error raises here; otherwise the
            # malformed payload would still be bulk-uploaded below — confirm
            raise_client_error(f"Improperly formatted bulk request file: {from_file.name}")
    else:
        bulk_upload_docs, _ = ctx.invoke(generate_rules_index, query=query, save_files=save_files)
    _ = es_client.bulk(operations=bulk_upload_docs)