Fix windmill postgres upgrade reliability: persistent-volume staging dir, tighten sentinels, supervisorctl

Agent-Logs-Url: https://github.com/nextcloud/all-in-one/sessions/5e1df9cc-b63e-4a5a-9148-38907b6d8db9

Co-authored-by: szaimen <42591237+szaimen@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-04-28 11:15:38 +00:00
committed by GitHub
parent c60861d79c
commit e077aea86d
2 changed files with 76 additions and 25 deletions

View File

@@ -31,6 +31,11 @@ fi
PGDATA="/var/lib/postgresql/data"
DUMP_FILE="$PGDATA/windmill-db-dump.sql"
# Staging directory for the new cluster during a major-version upgrade.
# It lives INSIDE $PGDATA (the persistent volume): the new cluster is built
# here while the dump file stays where it is, so the dump never has to leave
# the persistent volume and is not lost if the container crashes mid-upgrade.
UPGRADE_DIR="$PGDATA/upgrade_data"
# Current PG major version as shipped in this image
CURRENT_PG_MAJOR=$(cat /etc/postgres-major-version 2>/dev/null)
@@ -38,6 +43,9 @@ CURRENT_PG_MAJOR=$(cat /etc/postgres-major-version 2>/dev/null)
if [ -f "$PGDATA/import.failed" ]; then
echo "The database import failed the last time. Please restore a backup and try again."
echo "For further clues on what went wrong, look at the logs above."
if [ -d "$UPGRADE_DIR" ]; then
echo "The staged upgraded cluster is still present at $UPGRADE_DIR."
fi
exit 1
fi
@@ -48,6 +56,14 @@ if [ -f "$PGDATA/export.failed" ]; then
exit 1
fi
# ── Clean up any leftover upgrade staging directory ──────────────────────────
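# Normally absent. The upgrade creates import.failed before this directory and
# removes it only after the directory is gone, so an interrupted upgrade is
# stopped by the import.failed check above and never gets here. This branch is
# only reached if the directory was left behind without the sentinel, e.g.
# after import.failed was removed by hand.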
if [ -d "$UPGRADE_DIR" ]; then
echo "Removing leftover upgrade staging directory..."
rm -rf "$UPGRADE_DIR"
fi
# Write the standard pg_hba.conf and listen_addresses settings into a data directory.
configure_pg() {
local datadir="$1"
@@ -85,34 +101,29 @@ if [ -f "$PGDATA/PG_VERSION" ]; then
echo "Restoring database from dump into new PostgreSQL $CURRENT_PG_MAJOR cluster."
# Copy dump out of PGDATA before wiping it
cp "$DUMP_FILE" /tmp/windmill-db-dump.sql
# Mark import as in-progress
# (written to /tmp because PGDATA is about to be wiped)
IMPORT_FAILED_TMP=/tmp/windmill-import.failed
touch "$IMPORT_FAILED_TMP"
# Set the sentinel BEFORE any destructive operation so that a crash at
# any point leaves the guard in place and blocks the next start.
touch "$PGDATA/import.failed"
set -ex
# Remove old data directory
rm -rf "${PGDATA:?}/"*
# Initialise the new cluster in a subdirectory of the persistent volume.
# This keeps the dump file ($DUMP_FILE) untouched throughout the upgrade;
# no data is ever copied to or relied upon from tmpfs.
rm -rf "$UPGRADE_DIR"
mkdir "$UPGRADE_DIR"
# Initialise a fresh cluster for the new major version
initdb -D "$PGDATA" \
initdb -D "$UPGRADE_DIR" \
--username=windmill \
--auth-local=trust \
--auth-host=trust \
--no-instructions
configure_pg "$PGDATA"
configure_pg "$UPGRADE_DIR"
# Mark import as in-progress inside the new data dir
touch "$PGDATA/import.failed"
# Start postgres temporarily on an alternate TCP port so we can import
export PGPORT=11000
postgres -D "$PGDATA" -h 127.0.0.1 -p 11000 &
# Start postgres temporarily on an alternate TCP port so we can import.
# Use explicit flags; do NOT export PGPORT to avoid side-effects.
postgres -D "$UPGRADE_DIR" -h 127.0.0.1 -p 11000 &
TEMP_PG_PID=$!
# Wait until postgres accepts connections
@@ -125,19 +136,36 @@ if [ -f "$PGDATA/PG_VERSION" ]; then
psql -h 127.0.0.1 -p 11000 -U windmill -d postgres \
-c "CREATE DATABASE windmill OWNER windmill;"
# Restore from dump
# Restore from dump. $DUMP_FILE still lives in $PGDATA — it was never
# wiped because we used $UPGRADE_DIR for the new cluster.
echo "Restoring the database from dump..."
psql -h 127.0.0.1 -p 11000 -U windmill -d windmill < /tmp/windmill-db-dump.sql
psql -h 127.0.0.1 -p 11000 -U windmill -d windmill < "$DUMP_FILE"
# Stop the temporary postgres cleanly
pg_ctl -D "$PGDATA" stop -m smart -t 1800
pg_ctl -D "$UPGRADE_DIR" stop -m smart -t 1800
wait "$TEMP_PG_PID" 2>/dev/null || true
unset PGPORT
# ── Swap the upgraded cluster into the main PGDATA slot ──────────────
# Remove all old cluster files except: the staging dir, the dump,
# the import log, and the import.failed sentinel.
DUMP_BASENAME="$(basename "$DUMP_FILE")"
find "$PGDATA" -maxdepth 1 -mindepth 1 \
! -name 'upgrade_data' \
! -name "$DUMP_BASENAME" \
! -name 'database-import.log' \
! -name 'import.failed' \
-exec rm -rf {} +
# Move the new cluster files into PGDATA
find "$UPGRADE_DIR" -maxdepth 1 -mindepth 1 -exec mv -t "$PGDATA" {} +
# Remove the now-empty staging directory
rmdir "$UPGRADE_DIR"
set +ex
# Remove sentinel files
rm -f "$PGDATA/import.failed" "$IMPORT_FAILED_TMP"
# Remove the sentinel only after the swap has fully completed
rm "$PGDATA/import.failed"
echo "PostgreSQL upgrade to $CURRENT_PG_MAJOR complete."
fi
fi
@@ -167,6 +195,17 @@ fi
# ── Dump database and shut down on container stop ────────────────────────────
do_database_dump() {
# Stop windmill first so it is not writing to the database during the dump.
supervisorctl -c /supervisord.conf stop windmill 2>/dev/null || true
# Verify postgres is still accepting connections before attempting the dump.
if ! pg_isready -h /var/run/postgresql -U windmill -q; then
echo "WARNING: postgres is not ready; skipping dump."
kill "$SUPERVISORD_PID" 2>/dev/null || true
wait "$SUPERVISORD_PID" 2>/dev/null || true
return
fi
set -x
touch "$PGDATA/export.failed"
rm -f "$DUMP_FILE.temp"
@@ -175,10 +214,12 @@ do_database_dump() {
rm "$PGDATA/export.failed"
echo "Database dump successful!"
else
rm -f "$DUMP_FILE.temp"
echo "Database dump unsuccessful!"
fi
set +x
# Stop supervisord (which stops postgres and windmill)
# Stop supervisord (which stops postgres and any remaining programs)
kill "$SUPERVISORD_PID" 2>/dev/null || true
wait "$SUPERVISORD_PID" 2>/dev/null || true
}

View File

@@ -7,6 +7,16 @@ logfile_maxbytes=50MB
logfile_backups=10
loglevel=error
[unix_http_server]
file=/var/run/supervisord/supervisor.sock
chmod=0700
[supervisorctl]
serverurl=unix:///var/run/supervisord/supervisor.sock
[rpcinterface:supervisor]
supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface
[program:postgresql]
command=postgres -D /var/lib/postgresql/data -k /var/run/postgresql
stdout_logfile=/dev/stdout