mirror of
https://github.com/ansible/awx-operator.git
synced 2026-03-26 21:33:14 +00:00
AWX: Add termination_grace_period_seconds
This commit is contained in:
41
README.md
41
README.md
@@ -49,6 +49,7 @@ An [Ansible AWX](https://github.com/ansible/awx) operator for Kubernetes built w
|
||||
* [Upgrade of instances without auto upgrade](#upgrade-of-instances-without-auto-upgrade)
|
||||
* [Service Account](#service-account)
|
||||
* [Labeling operator managed objects](#labeling-operator-managed-objects)
|
||||
* [Pods termination grace period](#pods-termination-grace-period)
|
||||
* [Uninstall](#uninstall)
|
||||
* [Upgrading](#upgrading)
|
||||
* [Backup](#backup)
|
||||
@@ -1248,6 +1249,46 @@ spec:
|
||||
...
|
||||
```
|
||||
|
||||
#### Pods termination grace period
|
||||
|
||||
During deployment restarts or new rollouts, when old ReplicaSet Pods are being
|
||||
terminated, the corresponding jobs which are managed (executed or controlled)
|
||||
by old AWX Pods may end up in `Error` state as there is no mechanism to
|
||||
transfer them to the newly spawned AWX Pods. To work around the problem one
|
||||
could set `termination_grace_period_seconds` in AWX spec, which does the
|
||||
following:
|
||||
|
||||
* It sets the corresponding
|
||||
[`terminationGracePeriodSeconds`](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination)
|
||||
Pod spec of the AWX Deployment to the value provided
|
||||
|
||||
> The grace period is the duration in seconds after the processes running in
|
||||
> the pod are sent a termination signal and the time when the processes are
|
||||
> forcibly halted with a kill signal
|
||||
|
||||
* It adds a
|
||||
[`PreStop`](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#hook-handler-execution)
|
||||
hook script, which will keep AWX Pods in terminating state until it finished,
|
||||
up to `terminationGracePeriodSeconds`.
|
||||
|
||||
> This grace period applies to the total time it takes for both the PreStop
|
||||
> hook to execute and for the Container to stop normally
|
||||
|
||||
While the hook script just waits until the corresponding AWX Pod (instance)
|
||||
no longer has any managed jobs, in which case it finishes with success and
|
||||
hands over the overall Pod termination process to normal AWX processes.
|
||||
|
||||
One may want to set this value to the maximum duration they accept to wait for
|
||||
the affected Jobs to finish. Keeping in mind that such finishing jobs may
|
||||
increase Pods termination time in such situations as `kubectl rollout restart`,
|
||||
AWX upgrade by the operator, or Kubernetes [API-initiated
|
||||
evictions](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/).
|
||||
|
||||
|
||||
| Name | Description | Default |
|
||||
| -------------------------------- | --------------------------------------------------------------- | ------- |
|
||||
| termination_grace_period_seconds | Optional duration in seconds pods needs to terminate gracefully | not set |
|
||||
|
||||
### Uninstall ###
|
||||
|
||||
To uninstall an AWX deployment instance, you basically need to remove the AWX kind related to that instance. For example, to delete an AWX instance named awx-demo, you would do:
|
||||
|
||||
@@ -526,6 +526,10 @@ spec:
|
||||
type: array
|
||||
type: object
|
||||
type: object
|
||||
termination_grace_period_seconds:
|
||||
description: Optional duration in seconds pods needs to terminate gracefully
|
||||
type: integer
|
||||
format: int32
|
||||
service_labels:
|
||||
description: Additional labels to apply to the service
|
||||
type: string
|
||||
|
||||
@@ -622,6 +622,11 @@ spec:
|
||||
x-descriptors:
|
||||
- urn:alm:descriptor:com.tectonic.ui:advanced
|
||||
- urn:alm:descriptor:com.tectonic.ui:hidden
|
||||
- displayName: Termination Grace Period Seconds
|
||||
path: termination_grace_period_seconds
|
||||
x-descriptors:
|
||||
- urn:alm:descriptor:com.tectonic.ui:advanced
|
||||
- urn:alm:descriptor:com.tectonic.ui:hidden
|
||||
- displayName: Service Labels
|
||||
path: service_labels
|
||||
x-descriptors:
|
||||
|
||||
66
roles/installer/files/pre-stop/termination-env
Normal file
66
roles/installer/files/pre-stop/termination-env
Normal file
@@ -0,0 +1,66 @@
|
||||
# file, which when exists, indicates that `master` script has successfully
|
||||
# completed pre-stop script execution
|
||||
marker_file="${PRE_STOP_MARKER_FILE:-/var/lib/pre-stop/.termination_marker}"
|
||||
|
||||
# file which the running `master` script continuously updates (mtime) to
|
||||
# indicate it's still running. this file is then read by `watcher`s to
|
||||
# understand if they still have to wait for `termination_marker`
|
||||
heartbeat_file="${PRE_STOP_HEARTBEAT_FILE:-/var/lib/pre-stop/.heartbeat}"
|
||||
|
||||
# file which:
|
||||
# * `watcher`s create when they bail out because they didn't see the
|
||||
# `heartbeat_file` to be updated within `$heartbeat_failed_threshold`;
|
||||
# * `master` creates when its handler command fails;
|
||||
# when scripts see such file, they also give up
|
||||
bailout_file="${PRE_STOP_BAILOUT_FILE:-/var/lib/pre-stop/.bailout}"
|
||||
heartbeat_threshold="${PRE_STOP_HEARTBEAT_THRESHOLD:-60}"
|
||||
|
||||
# where the scripts' stdout/stderr are streamed
|
||||
stdout="${PRE_STOP_STDOUT:-/proc/1/fd/1}"
|
||||
stderr="${PRE_STOP_STDERR:-/proc/1/fd/2}"
|
||||
|
||||
# command the `master` script executes, which when successfully finishes,
|
||||
# causes the script to create the `marker_file`
|
||||
handler="${PRE_STOP_HANDLER:-bash -c \"PYTHONUNBUFFERED=x awx-manage disable_instance --wait --retry=inf\"}"
|
||||
|
||||
log_prefix="${PRE_STOP_LOG_PREFIX:-preStop.exec}"
|
||||
[[ -n ${PRE_STOP_LOG_ROLE} ]] && log_prefix="${log_prefix}] [$PRE_STOP_LOG_ROLE"
|
||||
|
||||
# interval at which `watcher`s check for `marker_file` presence
|
||||
recheck_sleep="${PRE_STOP_RECHECK_SLEEP:-1}"
|
||||
# interval at which `watcher`s report into $stdout that they are still watching
|
||||
report_every="${PRE_STOP_REPORT_EVERY:-30}"
|
||||
|
||||
function log {
|
||||
printf "[%s] $1\n" "$log_prefix" "${@:2}"
|
||||
}
|
||||
|
||||
function parameters_string {
|
||||
for param in "$@"; do
|
||||
printf "%s=\"%s\"\n" "$param" "${!param}"
|
||||
done | paste -s -d ' '
|
||||
}
|
||||
|
||||
function check_bailout {
|
||||
if [[ -f $bailout_file ]]; then
|
||||
log "\"%s\" file has been detected, accepting bail out signal and failing the hook script" \
|
||||
"$bailout_file"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
function check_heartbeat {
|
||||
if [[ -f $heartbeat_file ]]; then
|
||||
delta=$(( $(date +%s) - $(stat -c %Y "$heartbeat_file") ))
|
||||
else
|
||||
delta=$(( $(date +%s) - $1 ))
|
||||
fi
|
||||
|
||||
if [[ $delta -gt $heartbeat_threshold ]]; then
|
||||
log "The heartbeat file hasn't been updated since %ss, which is above the threshold of %ds, assuming the master is not operating and failing the hook script" \
|
||||
$delta
|
||||
$heartbeat_threshold
|
||||
touch "$bailout_file"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
50
roles/installer/files/pre-stop/termination-master
Executable file
50
roles/installer/files/pre-stop/termination-master
Executable file
@@ -0,0 +1,50 @@
|
||||
#/usr/bin/env bash
|
||||
|
||||
PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-master}"
|
||||
source $(dirname "$0")/termination-env
|
||||
|
||||
{
|
||||
|
||||
log "The hook has started: %s" \
|
||||
"$(parameters_string \
|
||||
"marker_file" \
|
||||
"heartbeat_file" \
|
||||
"bailout_file" \
|
||||
"handler" \
|
||||
)"
|
||||
|
||||
touch "$heartbeat_file"
|
||||
|
||||
set -o pipefail
|
||||
eval "$handler" 2>&1 | while IFS= read -r line; do
|
||||
# we check the files here and break early, but overall script termination
|
||||
# happens later - as we need to distinguish between files detection and
|
||||
# command failure, while bash doesn't offer a simple way to do this here
|
||||
# inside the loop (`exit` does not terminate the script)
|
||||
[[ -f $bailout_file ]] && break
|
||||
[[ -f $marker_file ]] && break
|
||||
|
||||
log "[handler] %s" "$line"
|
||||
touch "$heartbeat_file"
|
||||
done
|
||||
ec=$?
|
||||
set +o pipefail
|
||||
|
||||
# process various cases in specific order
|
||||
check_bailout
|
||||
|
||||
if [[ -f $marker_file ]]; then
|
||||
log "Done! The marker file has been detected, assuming some other instance of the script has run to completion"
|
||||
exit 0
|
||||
elif [[ $ec -ne 0 ]]; then
|
||||
log "The handler has failed with \"%d\" exit code, failing the hook script too" \
|
||||
$ec
|
||||
# signal others to bail out
|
||||
touch "$bailout_file"
|
||||
exit $ec
|
||||
else
|
||||
log "Done! Generating the marker file allowing to proceed to termination"
|
||||
touch "$marker_file"
|
||||
fi
|
||||
|
||||
} > "$stdout" 2> "$stderr"
|
||||
33
roles/installer/files/pre-stop/termination-waiter
Executable file
33
roles/installer/files/pre-stop/termination-waiter
Executable file
@@ -0,0 +1,33 @@
|
||||
#/usr/bin/env bash
|
||||
|
||||
PRE_STOP_LOG_ROLE="${PRE_STOP_LOG_ROLE:-waiter}"
|
||||
source $(dirname "$0")/termination-env
|
||||
|
||||
{
|
||||
|
||||
log "The hook has started: %s" \
|
||||
"$(parameters_string \
|
||||
"marker_file" \
|
||||
"heartbeat_file" \
|
||||
"bailout_file" \
|
||||
"recheck_sleep" \
|
||||
"report_every" \
|
||||
)"
|
||||
|
||||
n=0
|
||||
checks_started=$(date +%s)
|
||||
|
||||
while ! [[ -f $marker_file ]]; do
|
||||
check_bailout
|
||||
check_heartbeat $checks_started
|
||||
|
||||
if [[ $(($n % $report_every)) -eq 0 ]]; then
|
||||
log "Waiting for the marker file to be accessible..."
|
||||
fi
|
||||
n=$(($n + 1))
|
||||
sleep $recheck_sleep
|
||||
done
|
||||
|
||||
log "The marker file found, exiting to proceed to termination"
|
||||
|
||||
} > "$stdout" 2> "$stderr"
|
||||
@@ -208,6 +208,7 @@
|
||||
wait: yes
|
||||
loop:
|
||||
- 'configmaps/config'
|
||||
- 'configmaps/pre_stop_scripts'
|
||||
- 'secrets/app_credentials'
|
||||
- 'rbac/service_account'
|
||||
- 'storage/persistent'
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts'
|
||||
namespace: '{{ ansible_operator_meta.namespace }}'
|
||||
labels:
|
||||
{{ lookup("template", "../common/templates/labels/common.yaml.j2") | indent(width=4) | trim }}
|
||||
data:
|
||||
termination-master: |
|
||||
{{ lookup("file", "files/pre-stop/termination-master") | indent(width=4) | trim }}
|
||||
termination-waiter: |
|
||||
{{ lookup("file", "files/pre-stop/termination-waiter") | indent(width=4) | trim }}
|
||||
termination-env: |
|
||||
{{ lookup("file", "files/pre-stop/termination-env") | indent(width=4) | trim }}
|
||||
{% endif %}
|
||||
@@ -23,6 +23,7 @@ spec:
|
||||
annotations:
|
||||
{% for template in [
|
||||
"configmaps/config",
|
||||
"configmaps/pre_stop_scripts",
|
||||
"secrets/app_credentials",
|
||||
"storage/persistent",
|
||||
] %}
|
||||
@@ -139,6 +140,23 @@ spec:
|
||||
mountPath: "/var/run/redis"
|
||||
- name: "{{ ansible_operator_meta.name }}-redis-data"
|
||||
mountPath: "/data"
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
- name: pre-stop-data
|
||||
mountPath: /var/lib/pre-stop
|
||||
- name: pre-stop-scripts
|
||||
mountPath: /var/lib/pre-stop/scripts
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- bash
|
||||
- -c
|
||||
# redis image doesn't support writing to `/proc/1/fd/*`
|
||||
- >
|
||||
PRE_STOP_STDOUT=/dev/stdout
|
||||
PRE_STOP_STDERR=/dev/stderr
|
||||
/var/lib/pre-stop/scripts/termination-waiter
|
||||
{% endif %}
|
||||
resources: {{ redis_resource_requirements }}
|
||||
- image: '{{ _image }}'
|
||||
name: '{{ ansible_operator_meta.name }}-web'
|
||||
@@ -226,6 +244,18 @@ spec:
|
||||
{% endif %}
|
||||
{% if web_extra_volume_mounts -%}
|
||||
{{ web_extra_volume_mounts | indent(width=12, first=True) }}
|
||||
{% endif %}
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
- name: pre-stop-data
|
||||
mountPath: /var/lib/pre-stop
|
||||
- name: pre-stop-scripts
|
||||
mountPath: /var/lib/pre-stop/scripts
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- bash
|
||||
- /var/lib/pre-stop/scripts/termination-waiter
|
||||
{% endif %}
|
||||
env:
|
||||
- name: MY_POD_NAMESPACE
|
||||
@@ -308,6 +338,18 @@ spec:
|
||||
{% endif %}
|
||||
{% if task_extra_volume_mounts -%}
|
||||
{{ task_extra_volume_mounts | indent(width=12, first=True) }}
|
||||
{% endif %}
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
- name: pre-stop-data
|
||||
mountPath: /var/lib/pre-stop
|
||||
- name: pre-stop-scripts
|
||||
mountPath: /var/lib/pre-stop/scripts
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- bash
|
||||
- /var/lib/pre-stop/scripts/termination-master
|
||||
{% endif %}
|
||||
env:
|
||||
- name: SUPERVISOR_WEB_CONFIG_PATH
|
||||
@@ -377,6 +419,18 @@ spec:
|
||||
mountPath: "/var/lib/awx/projects"
|
||||
{% if ee_extra_volume_mounts -%}
|
||||
{{ ee_extra_volume_mounts | indent(width=12, first=True) }}
|
||||
{% endif %}
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
- name: pre-stop-data
|
||||
mountPath: /var/lib/pre-stop
|
||||
- name: pre-stop-scripts
|
||||
mountPath: /var/lib/pre-stop/scripts
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command:
|
||||
- bash
|
||||
- /var/lib/pre-stop/scripts/termination-waiter
|
||||
{% endif %}
|
||||
env:
|
||||
{% if development_mode | bool %}
|
||||
@@ -412,6 +466,9 @@ spec:
|
||||
{% if security_context_settings|length %}
|
||||
{{ security_context_settings | to_nice_yaml | indent(8) }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
terminationGracePeriodSeconds: {{ termination_grace_period_seconds }}
|
||||
{% endif %}
|
||||
volumes:
|
||||
{% if bundle_ca_crt %}
|
||||
@@ -441,6 +498,14 @@ spec:
|
||||
items:
|
||||
- key: ldap-ca.crt
|
||||
path: 'ldap-ca.crt'
|
||||
{% endif %}
|
||||
{% if termination_grace_period_seconds is defined %}
|
||||
- name: pre-stop-data
|
||||
emptyDir: {}
|
||||
- name: pre-stop-scripts
|
||||
configMap:
|
||||
name: '{{ ansible_operator_meta.name }}-{{ deployment_type }}-pre-stop-scripts'
|
||||
defaultMode: 0775
|
||||
{% endif %}
|
||||
- name: "{{ ansible_operator_meta.name }}-application-credentials"
|
||||
secret:
|
||||
|
||||
Reference in New Issue
Block a user