diff --git a/openstack/neutron/alerts/openstack/neutron.alerts b/openstack/neutron/alerts/openstack/neutron.alerts deleted file mode 100644 index c499760c4b5..00000000000 --- a/openstack/neutron/alerts/openstack/neutron.alerts +++ /dev/null @@ -1,53 +0,0 @@ -groups: -- name: neutron.alerts - rules: - - alert: OpenstackNeutronMonitorAgentHeartbeat - expr: max(openstack_neutron_monitor_agents_heartbeat_seconds) by (agent_type) > 75 - for: 10m - labels: - context: Agent Heartbeat - support_group: network-api - dashboard: neutron - service: Neutron - severity: warning - tier: os - meta: 'Agent {{ $labels.agent_type }} Heartbeat is above 75secs in {{ $labels.host }}' - playbook: docs/support/playbook/neutron/agent_heartbeat - annotations: - description: Agent {{ $labels.agent_type }} Heartbeat is above 75secs in {{ $labels.host }} - summary: Openstack Neutron Metric to monitor Agents Heartbeat - - - alert: OpenstackNeutronNetworkSegmentsExhaustion - expr: sum(openstack_neutron_network_segments_free) by (hostgroup) / sum(openstack_neutron_network_segments_total) by (hostgroup) < 0.2 - for: 5m - labels: - severity: warning - support_group: network-api - tier: os - service: neutron - context: '{{ $labels.context }}' - dashboard: neutron - meta: 'Network segment `{{ $labels.hostgroup }}` has less than 20% free allocations left.' - playbook: docs/devops/alert/neutron/#openstackneutronnetworksegmentsexhaustion - annotations: - description: 'Network segment `{{ $labels.hostgroup }}` has less than 20% free allocations left.' - summary: Openstack Neutron free network allocations exhaustion - - - alert: OpenstackNeutronAsr1KDriverNetconfUnreachable - expr: | - sum by (device, host) (rate(neutron_asr1k_l2_device_unreachable_total[5m])) + - sum by (device, host) (rate(neutron_asr1k_l3_device_unreachable_total[5m])) - > 0 - for: 20m - labels: - severity: critical - support_group: network-data - tier: net - service: asr - context: '{{ $labels.host }}' - dashboard: asr1k-driver-dashboard - meta: Asr1k driver for agent {{ $labels.host }} cannot reach device `{{ $labels.device }}` via netconf-yang. - playbook: docs/devops/alert/network/router#OpenstackNeutronAsr1KDriverNetconfUnreachable - annotations: - description: Asr1k driver for agent {{ $labels.host }} cannot reach device `{{ $labels.device }}` via netconf-yang. - summary: Asr1k driver cannot reach device diff --git a/prometheus-exporters/network-generic-ssh-exporter/alerts/asr.alerts b/prometheus-exporters/network-generic-ssh-exporter/alerts/asr.alerts index e52d4592854..3be909a0d06 100644 --- a/prometheus-exporters/network-generic-ssh-exporter/alerts/asr.alerts +++ b/prometheus-exporters/network-generic-ssh-exporter/alerts/asr.alerts @@ -10,7 +10,7 @@ groups: service: asr context: asr meta: "Neutron Router `{{ $labels.name }}` is reporting ARP punt policer drops" - playbook: 'docs/support/playbook/network/control_plane_router/arp_punt_drop' + playbook: 'docs/network/playbooks/control_plane_router/arp_punt_drop/' dashboard: neutron-router annotations: description: "Neutron Router `{{ $labels.model }}` `{{ $labels.name }}` is reporting ARP punt policer drops" @@ -26,7 +26,7 @@ groups: service: asr context: asr meta: "NAT table on Neutron Router `{{ $labels.name }}` has 3M translations for 15 mins. The device will stop creating new sessions soon." - playbook: 'docs/devops/alert/network/router#asr_nat_table_overflow' + playbook: 'docs/network/playbooks/router#NetworkAsrNatTableFull' dashboard: neutron-router spc: "ServiceAreaCode=04&TicketType=01&Priority=1&ServiceName=NW_CLOUD_CC&ServiceUnit=10&Subject=NetworkAsrNatTableIsNearlyFull+-+devicename%3A+{{ $labels.name }}&Description=NAT+table+on+ASR+devicename+{{ $labels.name }}+is+nearly+full+for+15+min+with+more+than+2M+NAT+translations.+This+will+stop+creating+new+NAT+sessions+soon." annotations: @@ -44,7 +44,7 @@ groups: service: asr context: asr meta: "Cluster down! All ASR routers in the cluster `{{ $labels.cluster }}` are inactive." - playbook: 'docs/devops/alert/network/router#asr_both_devices_down' + playbook: 'docs/network/playbooks/router#asr_both_devices_down' dashboard: neutron-router annotations: description: "Cluster down! All ASR routers in the cluster `{{ $labels.cluster }}` are inactive." @@ -61,7 +61,7 @@ groups: service: asr context: asr meta: "Split brain! All ASR routers in the cluster `{{ $labels.cluster }}` are active." - playbook: 'docs/devops/alert/network/router#asr_both_devices_up' + playbook: 'docs/network/playbooks/router#asr_both_devices_up' dashboard: neutron-router annotations: description: "Split brain! All ASR routers in the cluster `{{ $labels.cluster }}` are active." @@ -78,7 +78,7 @@ groups: service: asr context: asr meta: "Redundancy synchronization errors on `{{ $labels.name }}`." - playbook: 'docs/devops/alert/network/router#NetworkAsrRedundancyReplicationErrors' + playbook: 'docs/network/playbooks/router#NetworkAsrRedundancyReplicationErrors' dashboard: neutron-router annotations: description: "Redundancy synchronization errors on `{{ $labels.name }}`." @@ -95,7 +95,7 @@ groups: service: asr context: asr meta: "Gatekeeper cache on `{{ $labels.name }}` is almost full." - playbook: 'docs/devops/alert/network/router#NetworkAsrNatGatekeeperCacheOverflow' + playbook: 'docs/network/playbooks/router#NetworkAsrNatGatekeeperCacheOverflow' dashboard: neutron-router annotations: description: "Gatekeeper cache on `{{ $labels.name }}` is almost full." @@ -113,7 +113,7 @@ groups: service: asr context: asr meta: "BGP on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` in VRF `{{ $labels.vrf }}` went down. DAPnets in this VRF are at risk." - playbook: 'docs/devops/alert/network/router#NetworkAsrBgpFabricPeerDown' + playbook: 'docs/network/playbooks/router#NetworkAsrBgpFabricPeerDown' dashboard: neutron-router-bgp annotations: description: "BGP on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` in VRF `{{ $labels.vrf }}` went down. DAPnets in this VRF are at risk." @@ -132,7 +132,7 @@ groups: service: asr context: asr meta: "Neutron Router mesh BGP peering on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` went down. L3VPN outage!" - playbook: 'docs/devops/alert/network/router#NetworkAsrBgpNeutronPeerDown' + playbook: 'docs/network/playbooks/router#NetworkAsrBgpNeutronPeerDown' dashboard: neutron-router-bgp annotations: description: "Neutron Router mesh BGP peering on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` went down. L3VPN outage!" @@ -150,7 +150,7 @@ groups: service: asr context: asr meta: "BGP peering on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` ) (core) went down. L3VPN impacted." - playbook: 'docs/devops/alert/network/router#NetworkAsrBgpCorePeerDown' + playbook: 'docs/network/playbooks/router#NetworkAsrBgpCorePeerDown' dashboard: neutron-router-bgp annotations: description: "BGP peering on `{{ $labels.name }}` to `{{ $labels.peer_ip }}` ) (core) went down. L3VPN impacted." @@ -169,7 +169,7 @@ groups: tier: net service: asr context: asr - playbook: 'docs/devops/alert/network/router#NetworkAsrNatTcpPortPreAllocationLow' + playbook: 'docs/network/playbooks/router#NetworkAsrNatTcpPortPreAllocationLow' dashboard: neutron-router annotations: summary: "NAT has too few preallocated TCP ports" @@ -188,7 +188,7 @@ groups: tier: net service: asr context: asr - playbook: 'docs/devops/alert/network/router#NetworkAsrNatTcpPortPreAllocationLow' + playbook: 'docs/network/playbooks/router#NetworkAsrNatTcpPortPreAllocationLow' dashboard: neutron-router annotations: summary: "NAT has too few preallocated UDP ports" @@ -204,7 +204,7 @@ groups: tier: net service: asr context: asr - playbook: 'docs/devops/alert/network/router.html#NetworkAsrHighMemoryUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrHighMemoryUtilization' dashboard: neutron-router annotations: summary: "Control Processor Memory Utilization on `{{ $labels.name }}` is very High" @@ -221,7 +221,7 @@ groups: service: asr context: asr meta: "Cisco ASR990x device `{{ $labels.name }}` with reference clock `{{ $labels.reference_clock }}` has a high root dispersion." - playbook: 'docs/devops/alert/network/ntp' + playbook: 'docs/network/playbooks/ntp/' annotations: description: "Cisco ASR990x device `{{ $labels.name }}` has a high root dispersion." summary: "Cisco ASR990x device `{{ $labels.name }}` has a high root dispersion." @@ -236,7 +236,7 @@ groups: service: asr context: asr meta: "Cisco ASR990x device `{{ $labels.name }}` with reference clock `{{ $labels.reference_clock }}` has a high NTP offset." - playbook: 'docs/devops/alert/network/ntp' + playbook: 'docs/network/playbooks/ntp/' annotations: description: "Cisco ASR990x device `{{ $labels.name }}` has a high NTP offset." summary: "Cisco ASR990x device `{{ $labels.name }}` has a high NTP offset." diff --git a/prometheus-exporters/snmp-exporter/alerts/snmp-asr.alerts b/prometheus-exporters/snmp-exporter/alerts/snmp-asr.alerts index 7491041cd83..73a831122e1 100644 --- a/prometheus-exporters/snmp-exporter/alerts/snmp-asr.alerts +++ b/prometheus-exporters/snmp-exporter/alerts/snmp-asr.alerts @@ -12,7 +12,7 @@ groups: service: asr context: asr meta: "Interface `{{ $labels.ifDescr }}` of ASR devicename `{{ $labels.devicename }}` experiencing high bandwidth utilization." - playbook: 'docs/devops/alert/network/router#NetworkAsrInterfaceOverUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrInterfaceOverUtilization' dashboard: neutron-datapath-bandwith annotations: description: "Interface `{{ $labels.ifDescr }}` of ASR devicename `{{ $labels.devicename }}` experiencing high egress bandwidth utilization.Immediate action required" @@ -28,7 +28,7 @@ groups: service: asr context: asr meta: "Interface `{{ $labels.ifDescr }}` of ASR devicename `{{ $labels.devicename }}` experiencing high bandwidth utilization." - playbook: 'docs/devops/alert/network/router#NetworkAsrInterfaceOverUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrInterfaceOverUtilization' dashboard: neutron-datapath-bandwith annotations: description: "Interface `{{ $labels.ifDescr }}` of ASR devicename `{{ $labels.devicename }}` experiencing high ingress bandwidth utilization.Immediate action required" @@ -45,7 +45,7 @@ groups: service: asr context: asr meta: "IOSD process on `{{ $labels.devicename }}` is using a lot of CPU." - playbook: 'docs/devops/alert/network/router#NetworkAsrHighIosdCpuUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrHighIosdCpuUtilization' dashboard: neutron-router annotations: description: "IOSD process on `{{ $labels.devicename }}` is using a lot of CPU." @@ -62,7 +62,7 @@ groups: service: asr context: asr meta: "The Control Processor CPU on `{{ $labels.devicename }}` is very busy." - playbook: 'docs/devops/alert/network/router#NetworkAsrHighCpuUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrHighCpuUtilization' dashboard: neutron-router annotations: description: "The Control Processor CPU on `{{ $labels.devicename }}` is very busy." @@ -91,7 +91,7 @@ groups: service: asr context: asr meta: "A high QFP utilization is seen on `{{ $labels.devicename }}`. This may impact forwarding." - playbook: 'docs/devops/alert/network/router#NetworkAsrHighQfpCpuUtilization' + playbook: 'docs/network/playbooks/router#NetworkAsrHighQfpCpuUtilization' dashboard: neutron-router annotations: description: "A high QFP utilization is seen on `{{ $labels.devicename }}`. This may impact forwarding."