diff --git a/README.md b/README.md index 7631924f..494683bf 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ An [Ansible AWX](https://github.com/ansible/awx) operator for Kubernetes built w * [Upgrade of instances without auto upgrade](#upgrade-of-instances-without-auto-upgrade) * [Service Account](#service-account) * [Labeling operator managed objects](#labeling-operator-managed-objects) + * [Pods termination grace period](#pods-termination-grace-period) * [Uninstall](#uninstall) * [Upgrading](#upgrading) * [Backup](#backup) @@ -1248,6 +1249,46 @@ spec: ... ``` +#### Pods termination grace period + +During deployment restarts or new rollouts, when old ReplicaSet Pods are being +terminated, the corresponding jobs which are managed (executed or controlled) +by old AWX Pods may end up in `Error` state as there is no mechanism to +transfer them to the newly spawned AWX Pods. To work around the problem one +could set `termination_grace_period_seconds` in AWX spec, which does the +following: + +* It sets the corresponding + [`terminationGracePeriodSeconds`](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination) + Pod spec of the AWX Deployment to the value provided + + > The grace period is the duration in seconds after the processes running in + > the pod are sent a termination signal and the time when the processes are + > forcibly halted with a kill signal + +* It adds a + [`PreStop`](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#hook-handler-execution) + hook script, which will keep AWX Pods in terminating state until it finished, + up to `terminationGracePeriodSeconds`. + + > This grace period applies to the total time it takes for both the PreStop + > hook to execute and for the Container to stop normally + + While the hook script just waits until the corresponding AWX Pod (instance) + no longer has any managed jobs, in which case it finishes with success and + hands over the overall Pod termination process to normal AWX processes. + +One may want to set this value to the maximum duration they accept to wait for +the affected Jobs to finish. Keeping in mind that such finishing jobs may +increase Pods termination time in such situations as `kubectl rollout restart`, +AWX upgrade by the operator, or Kubernetes [API-initiated +evictions](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/). + + +| Name | Description | Default | +| -------------------------------- | --------------------------------------------------------------- | ------- | +| termination_grace_period_seconds | Optional duration in seconds pods needs to terminate gracefully | not set | + ### Uninstall ### To uninstall an AWX deployment instance, you basically need to remove the AWX kind related to that instance. For example, to delete an AWX instance named awx-demo, you would do: diff --git a/config/crd/bases/awx.ansible.com_awxs.yaml b/config/crd/bases/awx.ansible.com_awxs.yaml index 0d645191..dbca15f5 100644 --- a/config/crd/bases/awx.ansible.com_awxs.yaml +++ b/config/crd/bases/awx.ansible.com_awxs.yaml @@ -526,6 +526,10 @@ spec: type: array type: object type: object + termination_grace_period_seconds: + description: Optional duration in seconds pods needs to terminate gracefully + type: integer + format: int32 service_labels: description: Additional labels to apply to the service type: string diff --git a/config/manifests/bases/awx-operator.clusterserviceversion.yaml b/config/manifests/bases/awx-operator.clusterserviceversion.yaml index 46e0fd94..43018338 100644 --- a/config/manifests/bases/awx-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/awx-operator.clusterserviceversion.yaml @@ -622,6 +622,11 @@ spec: x-descriptors: - urn:alm:descriptor:com.tectonic.ui:advanced - urn:alm:descriptor:com.tectonic.ui:hidden + - displayName: Termination Grace Period Seconds + path: termination_grace_period_seconds + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:advanced + - urn:alm:descriptor:com.tectonic.ui:hidden - displayName: Service Labels path: service_labels x-descriptors: diff --git a/roles/installer/files/pre-stop/termination-env b/roles/installer/files/pre-stop/termination-env new file mode 100644 index 00000000..f9d58fa8 --- /dev/null +++ b/roles/installer/files/pre-stop/termination-env @@ -0,0 +1,66 @@ +# file, which when exists, indicates that `master` script has successfully +# completed pre-stop script execution +marker_file="${PRE_STOP_MARKER_FILE:-/var/lib/pre-stop/.termination_marker}" + +# file which the running `master` script continuously updates (mtime) to +# indicate it's still running. this file is then read by `watcher`s to +# understand if they still have to wait for `termination_marker` +heartbeat_file="${PRE_STOP_HEARTBEAT_FILE:-/var/lib/pre-stop/.heartbeat}" + +# file which: +# * `watcher`s create when they bail out because they didn't see the +# `heartbeat_file` to be updated within `$heartbeat_failed_threshold`; +# * `master` creates when its handler command fails; +# when scripts see such file, they also give up +bailout_file="${PRE_STOP_BAILOUT_FILE:-/var/lib/pre-stop/.bailout}" +heartbeat_threshold="${PRE_STOP_HEARTBEAT_THRESHOLD:-60}" + +# where the scripts' stdout/stderr are streamed +stdout="${PRE_STOP_STDOUT:-/proc/1/fd/1}" +stderr="${PRE_STOP_STDERR:-/proc/1/fd/2}" + +# command the `master` script executes, which when successfully finishes, +# causes the script to create the `marker_file` +handler="${PRE_STOP_HANDLER:-bash -c \"PYTHONUNBUFFERED=x awx-manage disable_instance --wait --retry=inf\"}" + +log_prefix="${PRE_STOP_LOG_PREFIX:-preStop.exec}" +[[ -n ${PRE_STOP_LOG_ROLE} ]] && log_prefix="${log_prefix}] [$PRE_STOP_LOG_ROLE" + +# interval at which `watcher`s check for `marker_file` presence +recheck_sleep="${PRE_STOP_RECHECK_SLEEP:-1}" +# interval at which `watcher`s report into $stdout that they are still watching +report_every="${PRE_STOP_REPORT_EVERY:-30}" + +function log { + printf "[%s] $1\n" "$log_prefix" "${@:2}" +} + +function parameters_string { + for param in "$@"; do + printf "%s=\"%s\"\n" "$param" "${!param}" + done | paste -s -d ' ' +} + +function check_bailout { + if [[ -f $bailout_file ]]; then + log "\"%s\" file has been detected, accepting bail out signal and failing the hook script" \ + "$bailout_file" + exit 1 + fi +} + +function check_heartbeat { + if [[ -f $heartbeat_file ]]; then + delta=$(( $(date +%s) - $(stat -c %Y "$heartbeat_file") )) + else + delta=$(( $(date +%s) - $1 )) + fi + + if [[ $delta -gt $heartbeat_threshold ]]; then + log "The heartbeat file hasn't been updated since %ss, which is above the threshold of %ds, assuming the master is not operating and failing the hook script" \ + $delta + $heartbeat_threshold + touch "$bailout_file" + exit 1 + fi +} diff --git a/roles/installer/files/pre-stop/termination-master b/roles/installer/files/pre-stop/termination-master new file mode 100755 index 00000000..a34df232 --- /dev/null +++ b/roles/installer/files/pre-stop/termination-master @@ -0,0 +1,50 @@ +#/usr/bin/env bash + +PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-master}" +source $(dirname "$0")/termination-env + +{ + +log "The hook has started: %s" \ + "$(parameters_string \ + "marker_file" \ + "heartbeat_file" \ + "bailout_file" \ + "handler" \ + )" + +touch "$heartbeat_file" + +set -o pipefail +eval "$handler" 2>&1 | while IFS= read -r line; do + # we check the files here and break early, but overall script termination + # happens later - as we need to distinguish between files detection and + # command failure, while bash doesn't offer a simple way to do this here + # inside the loop (`exit` does not terminate the script) + [[ -f $bailout_file ]] && break + [[ -f $marker_file ]] && break + + log "[handler] %s" "$line" + touch "$heartbeat_file" +done +ec=$? +set +o pipefail + +# process various cases in specific order +check_bailout + +if [[ -f $marker_file ]]; then + log "Done! The marker file has been detected, assuming some other instance of the script has run to completion" + exit 0 +elif [[ $ec -ne 0 ]]; then + log "The handler has failed with \"%d\" exit code, failing the hook script too" \ + $ec + # signal others to bail out + touch "$bailout_file" + exit $ec +else + log "Done! Generating the marker file allowing to proceed to termination" + touch "$marker_file" +fi + +} > "$stdout" 2> "$stderr" diff --git a/roles/installer/files/pre-stop/termination-waiter b/roles/installer/files/pre-stop/termination-waiter new file mode 100755 index 00000000..e7762c59 --- /dev/null +++ b/roles/installer/files/pre-stop/termination-waiter @@ -0,0 +1,33 @@ +#/usr/bin/env bash + +PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-waiter}" +source $(dirname "$0")/termination-env + +{ + +log "The hook has started: %s" \ + "$(parameters_string \ + "marker_file" \ + "heartbeat_file" \ + "bailout_file" \ + "recheck_sleep" \ + "report_every" \ + )" + +n=0 +checks_started=$(date +%s) + +while ! [[ -f $marker_file ]]; do + check_bailout + check_heartbeat $checks_started + + if [[ $(($n % $report_every)) -eq 0 ]]; then + log "Waiting for the marker file to be accessible..." + fi + n=$(($n + 1)) + sleep $recheck_sleep +done + +log "The marker file found, exiting to proceed to termination" + +} > "$stdout" 2> "$stderr" diff --git a/roles/installer/tasks/resources_configuration.yml b/roles/installer/tasks/resources_configuration.yml index 0d56eb1c..f4286443 100644 --- a/roles/installer/tasks/resources_configuration.yml +++ b/roles/installer/tasks/resources_configuration.yml @@ -208,6 +208,7 @@ wait: yes loop: - 'configmaps/config' + - 'configmaps/pre_stop_scripts' - 'secrets/app_credentials' - 'rbac/service_account' - 'storage/persistent' diff --git a/roles/installer/templates/configmaps/pre_stop_scripts.yaml.j2 b/roles/installer/templates/configmaps/pre_stop_scripts.yaml.j2 new file mode 100644 index 00000000..4aeff8c4 --- /dev/null +++ b/roles/installer/templates/configmaps/pre_stop_scripts.yaml.j2 @@ -0,0 +1,16 @@ +{% if termination_grace_period_seconds is defined %} +apiVersion: v1 +kind: ConfigMap +metadata: + name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts' + namespace: '{{ ansible_operator_meta.namespace }}' + labels: + {{ lookup("template", "../common/templates/labels/common.yaml.j2") | indent(width=4) | trim }} +data: + termination-master: | + {{ lookup("file", "files/pre-stop/termination-master") | indent(width=4) | trim }} + termination-waiter: | + {{ lookup("file", "files/pre-stop/termination-waiter") | indent(width=4) | trim }} + termination-env: | + {{ lookup("file", "files/pre-stop/termination-env") | indent(width=4) | trim }} +{% endif %} diff --git a/roles/installer/templates/deployments/deployment.yaml.j2 b/roles/installer/templates/deployments/deployment.yaml.j2 index 81e0a519..3c5de60f 100644 --- a/roles/installer/templates/deployments/deployment.yaml.j2 +++ b/roles/installer/templates/deployments/deployment.yaml.j2 @@ -23,6 +23,7 @@ spec: annotations: {% for template in [ "configmaps/config", + "configmaps/pre_stop_scripts", "secrets/app_credentials", "storage/persistent", ] %} @@ -139,6 +140,23 @@ spec: mountPath: "/var/run/redis" - name: "{{ ansible_operator_meta.name }}-redis-data" mountPath: "/data" +{% if termination_grace_period_seconds is defined %} + - name: pre-stop-data + mountPath: /var/lib/pre-stop + - name: pre-stop-scripts + mountPath: /var/lib/pre-stop/scripts + lifecycle: + preStop: + exec: + command: + - bash + - -c + # redis image doesn't support writing to `/proc/1/fd/*` + - > + PRE_STOP_STDOUT=/dev/stdout + PRE_STOP_STDERR=/dev/stderr + /var/lib/pre-stop/scripts/termination-waiter +{% endif %} resources: {{ redis_resource_requirements }} - image: '{{ _image }}' name: '{{ ansible_operator_meta.name }}-web' @@ -226,6 +244,18 @@ spec: {% endif %} {% if web_extra_volume_mounts -%} {{ web_extra_volume_mounts | indent(width=12, first=True) }} +{% endif %} +{% if termination_grace_period_seconds is defined %} + - name: pre-stop-data + mountPath: /var/lib/pre-stop + - name: pre-stop-scripts + mountPath: /var/lib/pre-stop/scripts + lifecycle: + preStop: + exec: + command: + - bash + - /var/lib/pre-stop/scripts/termination-waiter {% endif %} env: - name: MY_POD_NAMESPACE @@ -308,6 +338,18 @@ spec: {% endif %} {% if task_extra_volume_mounts -%} {{ task_extra_volume_mounts | indent(width=12, first=True) }} +{% endif %} +{% if termination_grace_period_seconds is defined %} + - name: pre-stop-data + mountPath: /var/lib/pre-stop + - name: pre-stop-scripts + mountPath: /var/lib/pre-stop/scripts + lifecycle: + preStop: + exec: + command: + - bash + - /var/lib/pre-stop/scripts/termination-master {% endif %} env: - name: SUPERVISOR_WEB_CONFIG_PATH @@ -377,6 +419,18 @@ spec: mountPath: "/var/lib/awx/projects" {% if ee_extra_volume_mounts -%} {{ ee_extra_volume_mounts | indent(width=12, first=True) }} +{% endif %} +{% if termination_grace_period_seconds is defined %} + - name: pre-stop-data + mountPath: /var/lib/pre-stop + - name: pre-stop-scripts + mountPath: /var/lib/pre-stop/scripts + lifecycle: + preStop: + exec: + command: + - bash + - /var/lib/pre-stop/scripts/termination-waiter {% endif %} env: {% if development_mode | bool %} @@ -412,6 +466,9 @@ spec: {% if security_context_settings|length %} {{ security_context_settings | to_nice_yaml | indent(8) }} {% endif %} +{% endif %} +{% if termination_grace_period_seconds is defined %} + terminationGracePeriodSeconds: {{ termination_grace_period_seconds }} {% endif %} volumes: {% if bundle_ca_crt %} @@ -441,6 +498,14 @@ spec: items: - key: ldap-ca.crt path: 'ldap-ca.crt' +{% endif %} +{% if termination_grace_period_seconds is defined %} + - name: pre-stop-data + emptyDir: {} + - name: pre-stop-scripts + configMap: + name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts' + defaultMode: 0775 {% endif %} - name: "{{ ansible_operator_meta.name }}-application-credentials" secret: