Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
64eb38f
test 1
okdas May 10, 2023
9484791
test
okdas May 10, 2023
86c92c8
try token
okdas May 10, 2023
bd02e3f
try to submit
okdas May 10, 2023
8b8ac77
use GITHUB_WORKSPACE
okdas May 10, 2023
593ee08
troubleshoot
okdas May 10, 2023
e5951ab
troubleshoot
okdas May 10, 2023
192c154
test this
okdas May 10, 2023
bd8eeaf
try this
okdas May 10, 2023
cc9c0f0
added perms
okdas May 10, 2023
e4bdc03
Empty-Commit
okdas May 10, 2023
6238b95
Empty-Commit
okdas May 10, 2023
674055c
put it all together
okdas May 11, 2023
271f321
try without artifacts
okdas May 11, 2023
acb9d29
safedump
okdas May 11, 2023
9c489f2
add cluster manager sts pods kill
okdas May 12, 2023
3dc987c
time limit wait-for-infra
okdas May 12, 2023
f2aae81
fix linting errors
okdas May 12, 2023
0036d05
move into the separate CI
okdas May 12, 2023
9ade326
fix spell mistake
okdas May 16, 2023
79d691f
Merge branch 'main' into e2e-automation
okdas May 16, 2023
5207af2
Update build/localnet/cluster-manager/sts_kill.go
okdas May 17, 2023
3bef80c
Update build/localnet/cluster-manager/sts_kill.go
okdas May 17, 2023
a13b2b3
Update build/localnet/cluster-manager/sts_kill.go
okdas May 17, 2023
39f07c2
Update build/localnet/cluster-manager/sts_kill.go
okdas May 17, 2023
293f5d7
requested changes
okdas May 17, 2023
385d7a3
Merge remote-tracking branch 'origin/main' into e2e-automation
okdas May 17, 2023
30b7557
bump the date
okdas May 17, 2023
e1a66e9
remove unused sa
okdas May 17, 2023
6de718c
update pointers
okdas May 18, 2023
5b0da95
Update .github/workflows/e2e-test.yml
okdas May 18, 2023
6bc8ee2
requested changes
okdas May 19, 2023
5f8da74
[CI] Add inline error check linter (#770)
okdas May 18, 2023
535246f
bump changelog
okdas May 19, 2023
1e6aad9
Merge remote-tracking branch 'origin/main' into e2e-automation
okdas May 19, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
name: E2E test on DevNet

# Only trigger when the build workflow succeeded, and allow manual triggering.
# NOTE(review): on `workflow_run` and `workflow_dispatch` events the
# `github.event.pull_request` payload is not populated, so the label guard
# below likely evaluates to false and skips the job — confirm the intended
# trigger/guard combination against the Actions event payload docs.
on:
  workflow_dispatch:
  workflow_run:
    workflows: ["Test, build and push artifacts"]
    types:
      - completed

jobs:
  e2e-tests:
    runs-on: ubuntu-latest
    # Only run for PRs explicitly opted in via the `e2e-devnet-test` label.
    if: contains(github.event.pull_request.labels.*.name, 'e2e-devnet-test')
    env:
      ARGO_SERVER: "workflows.dev-us-east4-1.poktnodes.network:8443"
      ARGO_HTTP1: true
      ARGO_SECURE: true
    permissions:
      contents: "read"
      id-token: "write" # required for google-github-actions/auth workload identity
    steps:
      # Authenticate to GCP so we can fetch GKE credentials.
      - id: "auth"
        uses: "google-github-actions/auth@v1"
        with:
          credentials_json: "${{ secrets.ARGO_WORKFLOW_EXTERNAL }}"

      - id: "get-credentials"
        uses: "google-github-actions/get-gke-credentials@v1"
        with:
          cluster_name: "nodes-gcp-dev-us-east4-1"
          location: "us-east4"

      # Install the Argo Workflows CLI (pinned version).
      - id: "install-argo"
        run: |
          curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.4.7/argo-linux-amd64.gz
          gunzip argo-linux-amd64.gz
          chmod +x argo-linux-amd64
          mv ./argo-linux-amd64 /usr/local/bin/argo
          argo version

      # Poll until the per-PR DevNet namespace has the e2e workflow template,
      # giving up after 15 minutes.
      - id: "wait-for-infra"
        shell: bash
        run: |
          start_time=$(date +%s) # store current time
          timeout=900 # 15 minute timeout in seconds

          until argo template get dev-e2e-tests --namespace=devnet-issue-${{ github.event.pull_request.number }}; do
            current_time=$(date +%s)
            elapsed_time=$(( current_time - start_time ))
            if (( elapsed_time > timeout )); then
              echo "Timeout of $timeout seconds reached. Exiting..."
              exit 1
            fi
            echo "Waiting for devnet-issue-${{ github.event.pull_request.number }} to be provisioned..."
            sleep 5
          done

      # Submit the e2e workflow and stream its logs until completion.
      - id: "run-e2e-tests"
        run: |
          argo submit --wait --log --namespace devnet-issue-${{ github.event.pull_request.number }} --from 'wftmpl/dev-e2e-tests' --parameter gitsha="${{ github.event.pull_request.head.sha }}"
5 changes: 5 additions & 0 deletions build/docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]


## [0.0.0.43] - 2023-05-18

- Added functionality to `cluster-manager` to delete crashed pods so StatefulSetController would recreate them with a new version.

## [0.0.0.42] - 2023-05-12

- Added private keys for all (except fisherman) actors
Expand Down
137 changes: 137 additions & 0 deletions build/localnet/cluster-manager/crashed_pods_deleter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package main

// Monitors Pods created by StatefulSets, and if the Pods are in a `CrashLoopBackOff` status,
// and they have a different image tag - kill them. StatefulSet would then recreate the Pod with a new image.

import (
"context"
"errors"
"strings"

pocketk8s "github.com/pokt-network/pocket/shared/k8s"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
watch "k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
appstypedv1 "k8s.io/client-go/kubernetes/typed/apps/v1"
coretypedv1 "k8s.io/client-go/kubernetes/typed/core/v1"
)

// Loop through existing pods and set up a watch for new Pods so we don't hit Kubernetes API all the time
// This is a blocking function, intended for running in a goroutine
// initCrashedPodsDeleter sweeps existing Pods once, deleting any that are
// crash-looping with a stale image (see deleteCrashedPods), then watches for
// new/updated Pods and applies the same check, so we don't have to poll the
// Kubernetes API continuously.
//
// This is a blocking function, intended for running in a goroutine. It
// returns early if the watch cannot be established.
func initCrashedPodsDeleter(client *kubernetes.Clientset) {
	stsClient := client.AppsV1().StatefulSets(pocketk8s.CurrentNamespace)
	podClient := client.CoreV1().Pods(pocketk8s.CurrentNamespace)

	// Initial sweep: delete already-crashed Pods that predate the watch.
	podList, err := podClient.List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		// The original code logged and fell through, dereferencing the nil
		// podList below; skip the sweep instead and rely on the watch.
		logger.Error().Err(err).Msg("error listing pods on init")
	} else {
		for i := range podList.Items {
			pod := &podList.Items[i]
			if err := deleteCrashedPods(pod, stsClient, podClient); err != nil {
				logger.Error().Err(err).Msg("error deleting crashed pod on init")
			}
		}
	}

	// Set up a watch for new Pods.
	w, err := podClient.Watch(context.TODO(), metav1.ListOptions{})
	if err != nil {
		// Without a watch there is nothing left to do; returning avoids the
		// nil-pointer panic the original would hit on w.ResultChan().
		logger.Error().Err(err).Msg("error setting up watch for new pods")
		return
	}
	defer w.Stop()

	for event := range w.ResultChan() {
		switch event.Type {
		case watch.Added, watch.Modified:
			pod, ok := event.Object.(*corev1.Pod)
			if !ok {
				logger.Error().Msg("error casting pod on watch")
				continue
			}

			if err := deleteCrashedPods(pod, stsClient, podClient); err != nil {
				logger.Error().Err(err).Msg("error deleting crashed pod on watch")
			}
		}
	}
}

// isContainerStatusErroneous reports whether the container is stuck in a
// waiting state whose reason looks like a failure: any reason beginning with
// "Err" (e.g. ErrImagePull) or ending with "BackOff" (e.g. CrashLoopBackOff,
// ImagePullBackOff).
func isContainerStatusErroneous(status *corev1.ContainerStatus) bool {
	waiting := status.State.Waiting
	if waiting == nil {
		return false
	}
	if strings.HasPrefix(waiting.Reason, "Err") {
		return true
	}
	return strings.HasSuffix(waiting.Reason, "BackOff")
}

// deleteCrashedPods deletes the given Pod when the container named by its
// "cluster-manager-delete-on-crash-container" annotation is in an erroneous
// waiting state AND the owning StatefulSet's template specifies a different
// image for that container. The StatefulSet controller then recreates the
// Pod with the new image. Pods without the annotation are left alone.
//
// Returns an error if the owning StatefulSet cannot be determined or any
// Kubernetes API call fails.
func deleteCrashedPods(
	pod *corev1.Pod,
	stsClient appstypedv1.StatefulSetInterface,
	podClient coretypedv1.PodInterface,
) error {
	// If the annotation is present, we monitor the Pod.
	containerToMonitor, ok := pod.Annotations["cluster-manager-delete-on-crash-container"]
	if !ok {
		return nil
	}

	for ci := range pod.Spec.Containers {
		podContainer := &pod.Spec.Containers[ci]

		// Only proceed if this container is the one we monitor.
		if podContainer.Name != containerToMonitor {
			continue
		}

		for si := range pod.Status.ContainerStatuses {
			containerStatus := &pod.Status.ContainerStatuses[si]

			// Only the monitored container's status is relevant. The original
			// matched ANY container's erroneous status, which could delete a
			// Pod whose sidecar crashed while the monitored container was fine.
			if containerStatus.Name != containerToMonitor {
				continue
			}

			// Only proceed if the container is in some sort of Err status.
			if !isContainerStatusErroneous(containerStatus) {
				continue
			}

			// Find the StatefulSet that owns the Pod.
			var stsName string
			for _, ownerRef := range pod.OwnerReferences {
				if ownerRef.Kind == "StatefulSet" {
					stsName = ownerRef.Name
					break
				}
			}

			if stsName == "" {
				return errors.New("no StatefulSet found for this pod")
			}

			sts, err := stsClient.Get(context.TODO(), stsName, metav1.GetOptions{})
			if err != nil {
				return err
			}

			// Find the monitored container in the StatefulSet template and
			// compare images.
			for sci := range sts.Spec.Template.Spec.Containers {
				stsContainer := &sts.Spec.Template.Spec.Containers[sci]
				if stsContainer.Name != containerToMonitor {
					continue
				}

				if stsContainer.Image == podContainer.Image {
					logger.Info().Str("pod", pod.Name).Msg("pod crashed, but image is the same, not deleting")
					continue
				}

				// Images differ: delete the Pod so the StatefulSet controller
				// recreates it with the template's image.
				deletePolicy := metav1.DeletePropagationForeground
				if err := podClient.Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
					PropagationPolicy: &deletePolicy,
				}); err != nil {
					return err
				}

				logger.Info().Str("pod", pod.Name).Msg("deleted crashed pod")
				// The Pod is gone; a second Delete attempt (as the original's
				// continued looping allowed) would fail with NotFound.
				return nil
			}
		}
	}

	return nil
}
3 changes: 3 additions & 0 deletions build/localnet/cluster-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ func main() {
panic(err.Error())
}

// Monitor for crashed pods and delete them
go initCrashedPodsDeleter(clientset)

validatorKeysMap, err := pocketk8s.FetchValidatorPrivateKeys(clientset)
if err != nil {
panic(err)
Expand Down
39 changes: 38 additions & 1 deletion build/localnet/manifests/cluster-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,41 @@ spec:
env:
- name: RPC_HOST
value: pocket-full-nodes
serviceAccountName: cluster-manager-account
serviceAccountName: cluster-manager
---
# Service account the cluster-manager Deployment runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-manager
---
# Grants the cluster-manager service account the Role below.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cluster-manager
subjects:
  - kind: ServiceAccount
    name: cluster-manager
    apiGroup: ""
roleRef:
  kind: Role
  name: cluster-manager
  apiGroup: ""
---
# Minimal permissions for the cluster-manager: read the validator key secret,
# observe services and pods, delete crashed pods, and read statefulsets to
# compare template images.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cluster-manager
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    resourceNames: ["validators-private-keys"]
    verbs: ["get"]
  - apiGroups: [""]
    resources: ["services", "pods"]
    verbs: ["watch", "list", "get"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["delete"]
  - apiGroups: ["apps"]
    resources: ["statefulsets"]
    verbs: ["get"]
17 changes: 0 additions & 17 deletions build/localnet/manifests/role-bindings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,7 @@ subjects:
- kind: ServiceAccount
name: debug-client-account
apiGroup: ""
- kind: ServiceAccount
name: cluster-manager-account
apiGroup: ""
roleRef:
kind: Role
name: private-keys-viewer
apiGroup: ""
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: services-watcher-binding
namespace: default
subjects:
- kind: ServiceAccount
name: cluster-manager-account
apiGroup: ""
roleRef:
kind: Role
name: services-watcher
apiGroup: ""
6 changes: 0 additions & 6 deletions build/localnet/manifests/service-accounts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,3 @@ kind: ServiceAccount
metadata:
name: debug-client-account
namespace: default
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: cluster-manager-account
namespace: default