AWX: Add termination_grace_period_seconds

This commit is contained in:
Stanislav Zaprudskiy
2022-12-23 13:57:13 +01:00
parent f042cb3d00
commit 336ea58a0a
9 changed files with 281 additions and 0 deletions

View File

@@ -49,6 +49,7 @@ An [Ansible AWX](https://github.com/ansible/awx) operator for Kubernetes built w
* [Upgrade of instances without auto upgrade](#upgrade-of-instances-without-auto-upgrade)
* [Service Account](#service-account)
* [Labeling operator managed objects](#labeling-operator-managed-objects)
* [Pods termination grace period](#pods-termination-grace-period)
* [Uninstall](#uninstall)
* [Upgrading](#upgrading)
* [Backup](#backup)
@@ -1248,6 +1249,46 @@ spec:
...
```
#### Pods termination grace period
During deployment restarts or new rollouts, when old ReplicaSet Pods are being
terminated, the corresponding jobs which are managed (executed or controlled)
by old AWX Pods may end up in `Error` state as there is no mechanism to
transfer them to the newly spawned AWX Pods. To work around the problem one
could set `termination_grace_period_seconds` in AWX spec, which does the
following:
* It sets the corresponding
[`terminationGracePeriodSeconds`](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination)
Pod spec of the AWX Deployment to the value provided
> The grace period is the duration in seconds after the processes running in
> the pod are sent a termination signal and the time when the processes are
> forcibly halted with a kill signal
* It adds a
[`PreStop`](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#hook-handler-execution)
hook script, which will keep AWX Pods in terminating state until it finished,
up to `terminationGracePeriodSeconds`.
> This grace period applies to the total time it takes for both the PreStop
> hook to execute and for the Container to stop normally
While the hook script just waits until the corresponding AWX Pod (instance)
no longer has any managed jobs, in which case it finishes with success and
hands over the overall Pod termination process to normal AWX processes.
One may want to set this value to the maximum duration they accept to wait for
the affected Jobs to finish. Keeping in mind that such finishing jobs may
increase Pods termination time in such situations as `kubectl rollout restart`,
AWX upgrade by the operator, or Kubernetes [API-initiated
evictions](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/).
| Name | Description | Default |
| -------------------------------- | --------------------------------------------------------------- | ------- |
| termination_grace_period_seconds | Optional duration in seconds pods needs to terminate gracefully | not set |
### Uninstall ###
To uninstall an AWX deployment instance, you basically need to remove the AWX kind related to that instance. For example, to delete an AWX instance named awx-demo, you would do:

View File

@@ -526,6 +526,10 @@ spec:
type: array
type: object
type: object
termination_grace_period_seconds:
description: Optional duration in seconds pods needs to terminate gracefully
type: integer
format: int32
service_labels:
description: Additional labels to apply to the service
type: string

View File

@@ -622,6 +622,11 @@ spec:
x-descriptors:
- urn:alm:descriptor:com.tectonic.ui:advanced
- urn:alm:descriptor:com.tectonic.ui:hidden
- displayName: Termination Grace Period Seconds
path: termination_grace_period_seconds
x-descriptors:
- urn:alm:descriptor:com.tectonic.ui:advanced
- urn:alm:descriptor:com.tectonic.ui:hidden
- displayName: Service Labels
path: service_labels
x-descriptors:

View File

@@ -0,0 +1,66 @@
# file, which when exists, indicates that `master` script has successfully
# completed pre-stop script execution
marker_file="${PRE_STOP_MARKER_FILE:-/var/lib/pre-stop/.termination_marker}"
# file which the running `master` script continuously updates (mtime) to
# indicate it's still running. this file is then read by `watcher`s to
# understand if they still have to wait for `termination_marker`
heartbeat_file="${PRE_STOP_HEARTBEAT_FILE:-/var/lib/pre-stop/.heartbeat}"
# file which:
# * `watcher`s create when they bail out because they didn't see the
# `heartbeat_file` to be updated within `$heartbeat_failed_threshold`;
# * `master` creates when its handler command fails;
# when scripts see such file, they also give up
bailout_file="${PRE_STOP_BAILOUT_FILE:-/var/lib/pre-stop/.bailout}"
heartbeat_threshold="${PRE_STOP_HEARTBEAT_THRESHOLD:-60}"
# where the scripts' stdout/stderr are streamed
stdout="${PRE_STOP_STDOUT:-/proc/1/fd/1}"
stderr="${PRE_STOP_STDERR:-/proc/1/fd/2}"
# command the `master` script executes, which when successfully finishes,
# causes the script to create the `marker_file`
handler="${PRE_STOP_HANDLER:-bash -c \"PYTHONUNBUFFERED=x awx-manage disable_instance --wait --retry=inf\"}"
log_prefix="${PRE_STOP_LOG_PREFIX:-preStop.exec}"
[[ -n ${PRE_STOP_LOG_ROLE} ]] && log_prefix="${log_prefix}] [$PRE_STOP_LOG_ROLE"
# interval at which `watcher`s check for `marker_file` presence
recheck_sleep="${PRE_STOP_RECHECK_SLEEP:-1}"
# interval at which `watcher`s report into $stdout that they are still watching
report_every="${PRE_STOP_REPORT_EVERY:-30}"
function log {
printf "[%s] $1\n" "$log_prefix" "${@:2}"
}
function parameters_string {
for param in "$@"; do
printf "%s=\"%s\"\n" "$param" "${!param}"
done | paste -s -d ' '
}
function check_bailout {
if [[ -f $bailout_file ]]; then
log "\"%s\" file has been detected, accepting bail out signal and failing the hook script" \
"$bailout_file"
exit 1
fi
}
function check_heartbeat {
if [[ -f $heartbeat_file ]]; then
delta=$(( $(date +%s) - $(stat -c %Y "$heartbeat_file") ))
else
delta=$(( $(date +%s) - $1 ))
fi
if [[ $delta -gt $heartbeat_threshold ]]; then
log "The heartbeat file hasn't been updated since %ss, which is above the threshold of %ds, assuming the master is not operating and failing the hook script" \
$delta
$heartbeat_threshold
touch "$bailout_file"
exit 1
fi
}

View File

@@ -0,0 +1,50 @@
#/usr/bin/env bash
PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-master}"
source $(dirname "$0")/termination-env
{
log "The hook has started: %s" \
"$(parameters_string \
"marker_file" \
"heartbeat_file" \
"bailout_file" \
"handler" \
)"
touch "$heartbeat_file"
set -o pipefail
eval "$handler" 2>&1 | while IFS= read -r line; do
# we check the files here and break early, but overall script termination
# happens later - as we need to distinguish between files detection and
# command failure, while bash doesn't offer a simple way to do this here
# inside the loop (`exit` does not terminate the script)
[[ -f $bailout_file ]] && break
[[ -f $marker_file ]] && break
log "[handler] %s" "$line"
touch "$heartbeat_file"
done
ec=$?
set +o pipefail
# process various cases in specific order
check_bailout
if [[ -f $marker_file ]]; then
log "Done! The marker file has been detected, assuming some other instance of the script has run to completion"
exit 0
elif [[ $ec -ne 0 ]]; then
log "The handler has failed with \"%d\" exit code, failing the hook script too" \
$ec
# signal others to bail out
touch "$bailout_file"
exit $ec
else
log "Done! Generating the marker file allowing to proceed to termination"
touch "$marker_file"
fi
} > "$stdout" 2> "$stderr"

View File

@@ -0,0 +1,33 @@
#/usr/bin/env bash
PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-waiter}"
source $(dirname "$0")/termination-env
{
log "The hook has started: %s" \
"$(parameters_string \
"marker_file" \
"heartbeat_file" \
"bailout_file" \
"recheck_sleep" \
"report_every" \
)"
n=0
checks_started=$(date +%s)
while ! [[ -f $marker_file ]]; do
check_bailout
check_heartbeat $checks_started
if [[ $(($n % $report_every)) -eq 0 ]]; then
log "Waiting for the marker file to be accessible..."
fi
n=$(($n + 1))
sleep $recheck_sleep
done
log "The marker file found, exiting to proceed to termination"
} > "$stdout" 2> "$stderr"

View File

@@ -208,6 +208,7 @@
wait: yes
loop:
- 'configmaps/config'
- 'configmaps/pre_stop_scripts'
- 'secrets/app_credentials'
- 'rbac/service_account'
- 'storage/persistent'

View File

@@ -0,0 +1,16 @@
{% if termination_grace_period_seconds is defined %}
apiVersion: v1
kind: ConfigMap
metadata:
name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts'
namespace: '{{ ansible_operator_meta.namespace }}'
labels:
{{ lookup("template", "../common/templates/labels/common.yaml.j2") | indent(width=4) | trim }}
data:
termination-master: |
{{ lookup("file", "files/pre-stop/termination-master") | indent(width=4) | trim }}
termination-waiter: |
{{ lookup("file", "files/pre-stop/termination-waiter") | indent(width=4) | trim }}
termination-env: |
{{ lookup("file", "files/pre-stop/termination-env") | indent(width=4) | trim }}
{% endif %}

View File

@@ -23,6 +23,7 @@ spec:
annotations:
{% for template in [
"configmaps/config",
"configmaps/pre_stop_scripts",
"secrets/app_credentials",
"storage/persistent",
] %}
@@ -139,6 +140,23 @@ spec:
mountPath: "/var/run/redis"
- name: "{{ ansible_operator_meta.name }}-redis-data"
mountPath: "/data"
{% if termination_grace_period_seconds is defined %}
- name: pre-stop-data
mountPath: /var/lib/pre-stop
- name: pre-stop-scripts
mountPath: /var/lib/pre-stop/scripts
lifecycle:
preStop:
exec:
command:
- bash
- -c
# redis image doesn't support writing to `/proc/1/fd/*`
- >
PRE_STOP_STDOUT=/dev/stdout
PRE_STOP_STDERR=/dev/stderr
/var/lib/pre-stop/scripts/termination-waiter
{% endif %}
resources: {{ redis_resource_requirements }}
- image: '{{ _image }}'
name: '{{ ansible_operator_meta.name }}-web'
@@ -226,6 +244,18 @@ spec:
{% endif %}
{% if web_extra_volume_mounts -%}
{{ web_extra_volume_mounts | indent(width=12, first=True) }}
{% endif %}
{% if termination_grace_period_seconds is defined %}
- name: pre-stop-data
mountPath: /var/lib/pre-stop
- name: pre-stop-scripts
mountPath: /var/lib/pre-stop/scripts
lifecycle:
preStop:
exec:
command:
- bash
- /var/lib/pre-stop/scripts/termination-waiter
{% endif %}
env:
- name: MY_POD_NAMESPACE
@@ -308,6 +338,18 @@ spec:
{% endif %}
{% if task_extra_volume_mounts -%}
{{ task_extra_volume_mounts | indent(width=12, first=True) }}
{% endif %}
{% if termination_grace_period_seconds is defined %}
- name: pre-stop-data
mountPath: /var/lib/pre-stop
- name: pre-stop-scripts
mountPath: /var/lib/pre-stop/scripts
lifecycle:
preStop:
exec:
command:
- bash
- /var/lib/pre-stop/scripts/termination-master
{% endif %}
env:
- name: SUPERVISOR_WEB_CONFIG_PATH
@@ -377,6 +419,18 @@ spec:
mountPath: "/var/lib/awx/projects"
{% if ee_extra_volume_mounts -%}
{{ ee_extra_volume_mounts | indent(width=12, first=True) }}
{% endif %}
{% if termination_grace_period_seconds is defined %}
- name: pre-stop-data
mountPath: /var/lib/pre-stop
- name: pre-stop-scripts
mountPath: /var/lib/pre-stop/scripts
lifecycle:
preStop:
exec:
command:
- bash
- /var/lib/pre-stop/scripts/termination-waiter
{% endif %}
env:
{% if development_mode | bool %}
@@ -412,6 +466,9 @@ spec:
{% if security_context_settings|length %}
{{ security_context_settings | to_nice_yaml | indent(8) }}
{% endif %}
{% endif %}
{% if termination_grace_period_seconds is defined %}
terminationGracePeriodSeconds: {{ termination_grace_period_seconds }}
{% endif %}
volumes:
{% if bundle_ca_crt %}
@@ -441,6 +498,14 @@ spec:
items:
- key: ldap-ca.crt
path: 'ldap-ca.crt'
{% endif %}
{% if termination_grace_period_seconds is defined %}
- name: pre-stop-data
emptyDir: {}
- name: pre-stop-scripts
configMap:
name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts'
defaultMode: 0775
{% endif %}
- name: "{{ ansible_operator_meta.name }}-application-credentials"
secret: