MariaDB Galera rolling operations (paved road)

Paved road case study.

Scope & windows

Multi-node MariaDB Galera cluster behind a load balancer.

Rolling node ops during short windows; no write loss.

Role

Platform/automation engineer (lane design + implementation).

Approach

Opinionated lane with tags: preflight → change → validate.

Rolling execution with serial: 1 across the hostgroup; any_errors_fatal: true.

LB drain per node via HAProxy runtime socket; wait until 0 live conns (runtime JSON).

wsrep gates before/after: Synced, ready=ON, cluster Primary.

Validate with retries/delays; enable on LB only after gates pass; simple write probe.

Job logs = audit trail; runs pin a git ref for repeatability.

Results

Predictable windows; fewer manual steps; lower incident risk.

Clear pass/fail gates; easy to pause/rollback per node.

Auditable runs (who/what/which ref).

Confidentiality

Client artifacts can't be shared.

Examples are anonymized and recreated; configs, names, and IPs are placeholders.

Receipts use the actual stack and are representative.

Code snippets

Prereqs:

HAProxy admin socket (e.g. /run/haproxy/admin.sock)

socat & jq on LB nodes

MySQL auth via ~/.my.cnf or vault

1) Top-level playbook — roll 1-by-1, fail closed

Ansible - galera-rolling-updates.yml

# galera-rolling-updates.yml — rolling lane for the whole group
#
# Runs preflight → change → validate on each host in turn. If any stage fails,
# the rescue section re-issues the LB disable for that node and fails the run;
# the always section prints the node's LB state either way.
#
# NOTE(review): the HAProxy socket commands below run on the play host (there
# is no delegate_to). Confirm the admin socket, socat and jq are available
# there, or delegate these tasks to the LB host.
- name: Galera rolling lane (preflight, change, validate)
  hosts: galera_nodes
  become: true
  gather_facts: false
  # One host per batch unless the caller passes batch_size.
  serial: "{{ batch_size | default(1) }}"
  any_errors_fatal: true
  max_fail_percentage: 0
  vars:
    haproxy_socket: /run/haproxy/admin.sock
    backend: galera-backend
    server: "{{ inventory_hostname_short }}"
  tasks:
    - name: Roll this node through the lane
      block:
        - import_tasks: preflight.yml     # cluster OK + drain this node
          tags: [preflight]
        - import_tasks: change.yml        # package/config/restart
          tags: [change]
        - import_tasks: validate.yml      # wait → wsrep gates → enable → probe
          tags: [validate]
      rescue:
        - name: Keep node drained on failure (fail closed)
          # The trailing \n is interpreted by printf, so the whole command
          # stays on one logical line inside the block scalar.
          shell: |
            printf 'disable server %s/%s\n' "$BACKEND" "$SERVER" \
              | socat - "$HAPROXY_SOCKET"
          args:
            executable: /bin/bash
          environment:
            HAPROXY_SOCKET: "{{ haproxy_socket }}"
            BACKEND: "{{ backend }}"
            SERVER: "{{ server }}"
          changed_when: true
        - name: Stop rollout; investigate this node
          fail:
            msg: "Node {{ inventory_hostname }} failed and remains DISABLED on LB."
      always:
        - name: Report LB state for this node
          # jq string interpolation needs the backslash form \(...);
          # without it the literal text "(.svname)" is printed.
          # NOTE(review): the filter assumes each element of the JSON output
          # exposes pxname/svname/status/scur as top-level keys — verify
          # against the output of the HAProxy version in use.
          shell: |
            printf 'show stat json\n' | socat - "$HAPROXY_SOCKET" \
            | jq -r --arg b "$BACKEND" --arg s "$SERVER" \
                 '.[]|select(.pxname==$b and .svname==$s)|"\(.svname): \(.status) scur=\(.scur)"'
          args:
            executable: /bin/bash
          environment:
            HAPROXY_SOCKET: "{{ haproxy_socket }}"
            BACKEND: "{{ backend }}"
            SERVER: "{{ server }}"
          changed_when: false

2) preflight.yml — cluster OK, then drain + wait to 0 live conns

Ansible - preflight.yml

# 2.1 Cluster must be healthy before touching LB
# The gate fails closed: pipefail surfaces a mysql error, and the END rule
# requires exactly three status rows, so an empty or partial result (for
# example mysql could not connect) is a failure rather than a silent pass.
- name: wsrep must be Primary, Synced, ready
  shell: |
    set -o pipefail
    mysql -Nse "SHOW STATUS LIKE 'wsrep_cluster_status';
                SHOW STATUS LIKE 'wsrep_local_state_comment';
                SHOW STATUS LIKE 'wsrep_ready'" \
    | awk '
      NR==1 && $2!="Primary" {bad=1}
      NR==2 && $2!="Synced"  {bad=1}
      NR==3 && $2!="ON"      {bad=1}
      END {exit (bad || NR!=3)}'
  args:
    executable: /bin/bash
  changed_when: false

# 2.2 Drain this node from HAProxy and wait until scur==0
# Polls once per second for up to 60 attempts. A poll that yields no numeric
# scur (socket error, or backend/server not found in the output) aborts the
# task instead of being read as "0 connections".
# Set preflight_drain=false to run only the health gate above (default: drain).
# NOTE(review): the jq filter assumes each element of the JSON output exposes
# pxname/svname/scur as top-level keys — verify against the HAProxy version
# in use.
- name: Drain and wait for 0 live connections (HAProxy runtime JSON)
  shell: |
    set -o pipefail
    printf 'disable server %s/%s\n' "$BACKEND" "$SERVER" | socat - "$HAPROXY_SOCKET"
    for _ in {1..60}; do
      scur=$(printf 'show stat json\n' | socat - "$HAPROXY_SOCKET" \
        | jq -r --arg b "$BACKEND" --arg s "$SERVER" \
          '.[]|select(.pxname==$b and .svname==$s)|.scur // 0')
      if [[ ! "$scur" =~ ^[0-9]+$ ]]; then
        echo "no scur reading for $BACKEND/$SERVER (got: '$scur')" >&2
        exit 1
      fi
      [ "$scur" = "0" ] && exit 0
      sleep 1
    done
    echo "not drained"; exit 1
  args:
    executable: /bin/bash
  environment:
    HAPROXY_SOCKET: "{{ haproxy_socket }}"
    BACKEND: "{{ backend }}"
    SERVER: "{{ server }}"
  when: preflight_drain | default(true) | bool
  changed_when: true

3) change.yml — example node ops (service update/restart)

Ansible - change.yml

# Example node operation: stop the service, bring the package to the latest
# available version, then start the service again and enable it at boot.
# Swap in whatever package/service/config tasks the change actually needs.
- name: Stop MariaDB
  service:
    name: mariadb
    state: stopped

- name: Update MariaDB (dnf example; replace on apt-based)
  package:
    name: MariaDB-server
    state: latest

- name: Start MariaDB
  service:
    name: mariadb
    state: started
    enabled: true

4) validate.yml — retries + enable only when green

Ansible - validate.yml

# 4.0 Retry rhythm
# Each wait below retries validate_retries times with validate_delay seconds
# between attempts (defaults: 30 × 5 s ≈ 2.5 min). Override either variable
# from the play or with -e. A task list cannot carry a bare `vars:` entry,
# so the defaults are applied inline.

# 4.1 Wait for mysqld to accept connections
- name: Wait for MySQL (local ping)
  shell: mysqladmin ping -h 127.0.0.1 --silent
  register: ping
  retries: "{{ validate_retries | default(30) }}"
  delay: "{{ validate_delay | default(5) }}"
  until: ping.rc == 0
  changed_when: false

# 4.2 Wait for wsrep gates (Synced/ON/Primary)
# Fails closed: pipefail surfaces a mysql error and the END rule requires
# exactly three status rows, so an empty result is retried, not accepted.
- name: Wait for wsrep gates
  shell: |
    set -o pipefail
    mysql -Nse "SHOW STATUS LIKE 'wsrep_local_state_comment';
                SHOW STATUS LIKE 'wsrep_ready';
                SHOW STATUS LIKE 'wsrep_cluster_status'" \
    | awk '
      NR==1 && $2!="Synced"  {bad=1}
      NR==2 && $2!="ON"      {bad=1}
      NR==3 && $2!="Primary" {bad=1}
      END {exit (bad || NR!=3)}'
  args:
    executable: /bin/bash
  register: wsrep
  retries: "{{ validate_retries | default(30) }}"
  delay: "{{ validate_delay | default(5) }}"
  until: wsrep.rc == 0
  changed_when: false

# 4.3 Enable node on HAProxy (only after gates pass)
- name: Enable node on HAProxy
  shell: |
    printf 'enable server %s/%s\n' "$BACKEND" "$SERVER" | socat - "$HAPROXY_SOCKET"
  args:
    executable: /bin/bash
  environment:
    HAPROXY_SOCKET: "{{ haproxy_socket }}"
    BACKEND: "{{ backend }}"
    SERVER: "{{ server }}"
  changed_when: true

# 4.4 Wait until HAProxy reports the server UP
# jq -e exits non-zero when the filter yields false or nothing at all, so a
# missing server entry keeps retrying instead of passing.
# NOTE(review): the filter assumes each element of the JSON output exposes
# pxname/svname/status as top-level keys — verify against the HAProxy
# version in use.
- name: Wait for HAProxy to show UP
  shell: |
    printf 'show stat json\n' | socat - "$HAPROXY_SOCKET" \
    | jq -e --arg b "$BACKEND" --arg s "$SERVER" \
      '.[]|select(.pxname==$b and .svname==$s)|.status=="UP"'
  args:
    executable: /bin/bash
  environment:
    HAPROXY_SOCKET: "{{ haproxy_socket }}"
    BACKEND: "{{ backend }}"
    SERVER: "{{ server }}"
  register: up
  retries: "{{ validate_retries | default(30) }}"
  delay: "{{ validate_delay | default(5) }}"
  until: up.rc == 0
  changed_when: false

# 4.5 Simple write probe
# The probe table has a primary key because Galera does not support DELETE on
# tables without one. REPLACE keeps the probe re-runnable if an earlier run
# left the row behind.
# NOTE(review): CREATE TABLE IF NOT EXISTS will not alter a health.t that
# already exists without a primary key — drop it once if so.
- name: Write probe (create/insert/delete)
  shell: |
    mysql -e "CREATE DATABASE IF NOT EXISTS health;
              CREATE TABLE IF NOT EXISTS health.t (i INT PRIMARY KEY) ENGINE=InnoDB;
              REPLACE INTO health.t VALUES (1);
              DELETE FROM health.t WHERE i = 1;"
  args:
    executable: /bin/bash
  register: probe
  retries: 3
  delay: 5
  until: probe.rc == 0
  changed_when: true

5) Rundeck job — run the whole hostgroup (serial in play)

Rundeck - job.yml

# Rundeck job: two sequential ansible-playbook runs against one hostgroup.
# Commands use folded scalars (>-) so each renders as a single line; a
# trailing backslash in a plain YAML scalar would reach the shell as an
# escaped space and corrupt the next argument.
- id: galera-rolling
  name: Galera rolling op (hostgroup)
  sequence:
    # Stop at the first failing step so Phase 2 never runs after a failed
    # Phase 1.
    keepgoing: false
    commands:
      # Phase 1: preflight only (sanity)
      # NOTE(review): `-t preflight` runs every task imported under that tag,
      # which includes the LB drain for each host in turn — confirm this pass
      # is meant to change LB state before running it against prod.
      - exec: >-
          ansible-playbook -i inv/${option.env} galera-rolling-updates.yml
          -l ${option.hostgroup} -t preflight
      # Phase 2: full lane (preflight→change→validate), serial in play controls order
      - exec: >-
          ansible-playbook -i inv/${option.env} galera-rolling-updates.yml
          -l ${option.hostgroup}
  options:
    - name: env
      values: [dev, test, prod]
      required: true
    - name: hostgroup
      values: [galera_nodes]
      required: true