From 46eb2dfc7dbf76e2a271de44b0879ff26e5408e7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:31:14 +0000 Subject: [PATCH 1/5] fix: prevent 502 Bad Gateway via PHP-FPM worker pool exhaustion and cold-start latency - Add request_terminate_timeout = PHP_MAX_TIME in start.sh: without this (default 0 = disabled) workers blocked on a slow DB query, stalled Redis connection, or hung syscall are never reaped. Over time they fill pm.max_children and Apache returns 502 Bad Gateway to the reverse proxy. - Set pm.process_idle_timeout = 300s in Dockerfile: the upstream default of 10 s kills all idle workers after a brief quiet period. The next request burst must then wait for fresh PHP-FPM forks; on a loaded host that spawn latency can push Apache past its FastCGI deadline and produce a 502. 300 s keeps a warm pool through normal desktop-sync polling cycles. - Add a dedicated 502 troubleshooting subsection to reverse-proxy.md documenting the six most common causes (proxy timeout, worker exhaustion, stuck workers, Redis session lock contention, container cold start, Caddy cert renewal) with actionable diagnostics. Agent-Logs-Url: https://github.com/nextcloud/all-in-one/sessions/2fd7a6d1-bfdb-4f26-a8d0-cd54a7307999 Co-authored-by: szaimen <42591237+szaimen@users.noreply.github.com> --- Containers/nextcloud/Dockerfile | 8 ++++++++ Containers/nextcloud/start.sh | 9 +++++++++ reverse-proxy.md | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/Containers/nextcloud/Dockerfile b/Containers/nextcloud/Dockerfile index e98bbd9c..b2fa0691 100644 --- a/Containers/nextcloud/Dockerfile +++ b/Containers/nextcloud/Dockerfile @@ -250,6 +250,14 @@ RUN set -ex; \ # We don't actually expect so many children but don't want to limit it artificially because people will report issues otherwise. # Also children will usually be terminated again after the process is done due to the ondemand setting sed -i 's/^pm.max_children =.*/pm.max_children = 5000/' /usr/local/etc/php-fpm.d/www.conf; \ +# With pm = ondemand, workers are killed after pm.process_idle_timeout seconds +# of inactivity. The upstream default is 10 s, which is aggressive: after a +# brief quiet period (e.g. desktop-sync clients polling every few seconds), all +# workers are reaped and the next request burst must wait for fresh forks. On +# a loaded host that spawn latency can push Apache past its FastCGI timeout and +# produce a 502. 300 s (5 min) keeps a warm pool through normal sync-client +# polling cycles while still reclaiming memory during genuinely idle periods. + sed -i 's/^;*pm.process_idle_timeout.*/pm.process_idle_timeout = 300s/' /usr/local/etc/php-fpm.d/www.conf; \ sed -i 's|access.log = /proc/self/fd/2|access.log = /proc/self/fd/1|' /usr/local/etc/php-fpm.d/docker.conf; \ \ echo "[ -n \"\$TERM\" ] && [ -f /root.motd ] && cat /root.motd" >> /root/.bashrc; \ diff --git a/Containers/nextcloud/start.sh b/Containers/nextcloud/start.sh index 51d1f346..41cd0122 100644 --- a/Containers/nextcloud/start.sh +++ b/Containers/nextcloud/start.sh @@ -156,6 +156,15 @@ while [ "$THIS_IS_AIO" = "true" ] && [ -z "$(dig nextcloud-aio-apache A +short + sleep 5 done +# Set request_terminate_timeout so that PHP-FPM forcibly kills workers that +# exceed the wall-clock limit. Without this (default = 0 = disabled) a worker +# stuck on a slow DB query, a stalled Redis connection, or a hung syscall is +# never reaped. Over time these zombies fill up pm.max_children, leaving no +# free slots for legitimate requests and causing Apache to return 502 Bad +# Gateway upstream. Setting it equal to PHP_MAX_TIME means a worker lives at +# most as long as a PHP script is allowed to run, which keeps the pool healthy. +sed -i "s|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|" /usr/local/etc/php-fpm.d/www.conf + set -x # shellcheck disable=SC2235 if [ "$THIS_IS_AIO" = "true" ] && [ "$APACHE_PORT" = 443 ]; then diff --git a/reverse-proxy.md b/reverse-proxy.md index ea81c844..d12e56ab 100644 --- a/reverse-proxy.md +++ b/reverse-proxy.md @@ -1240,6 +1240,27 @@ If something does not work, follow the steps below: 1. Try to configure everything from scratch - if it still does not work by following https://github.com/nextcloud/all-in-one#how-to-properly-reset-the-instance. 1. As last resort, you may disable the domain validation by adding `--env SKIP_DOMAIN_VALIDATION=true` to the docker run command. But only use this if you are completely sure that you've correctly configured everything! Also see [this documentation](https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation). +#### Troubleshooting 502 Bad Gateway errors + +A **502 Bad Gateway** response from your reverse proxy means the proxy successfully connected to AIO's Apache/Caddy layer but received an invalid or no response from the backend. Common causes and fixes: + +1. **Reverse proxy read timeout too short** — The nginx default `proxy_read_timeout` is only 60 seconds. Long-running operations (large file uploads, remote storage scans, etc.) can exceed this limit, causing nginx to close the connection and show 502 (or 504) to the client. Make sure your reverse proxy timeout is set higher than AIO's `NEXTCLOUD_MAX_TIME` (default 3600 s). The sample configs in this document already set `proxy_read_timeout 3610s` for nginx and equivalent values for other proxies. + +2. **PHP-FPM worker pool exhausted** — Each PHP request is served by a PHP-FPM worker. Workers are spawned on demand and terminated after 5 minutes of idle time. If a burst of concurrent requests arrives and no workers are available (e.g. all current workers are busy with long operations), Apache returns 502 to Caddy, which returns it to your proxy. Check worker usage with: + ``` + sudo docker exec nextcloud-aio-nextcloud php-fpm -t && \ + sudo docker exec nextcloud-aio-nextcloud ps aux | grep php-fpm | wc -l + ``` + If the count is consistently high, consider increasing `PHP_MEMORY_LIMIT` so individual workers use less memory and the host can sustain more of them, or reduce concurrent background jobs. + +3. **Stuck PHP-FPM workers** — Without a `request_terminate_timeout`, a PHP worker that blocks indefinitely (e.g. waiting on a slow database query or a stalled Redis connection) is never killed. Over time, these stuck workers consume all available slots in `pm.max_children`, leaving no free workers for new requests and causing a cascade of 502 errors. AIO now sets `request_terminate_timeout` automatically, but if you run a custom PHP-FPM config you must set it yourself. + +4. **Redis session lock contention** — When one request holds a session lock (e.g. a long file upload or a background scan), other requests for the same user session queue up waiting for the lock. If the lock wait time exceeds the reverse proxy timeout the proxy sees a 502. You can reduce this with session-less API calls or by checking for Redis performance issues: `sudo docker logs nextcloud-aio-redis`. + +5. **AIO containers not fully started** — After a container restart there is a brief window where PHP-FPM accepts TCP connections but is not yet ready to serve requests. Retrying the failed request usually succeeds. To confirm, check: `sudo docker logs nextcloud-aio-nextcloud` and `sudo docker logs nextcloud-aio-apache` for startup messages. + +6. **Apache or Caddy restart during certificate renewal** — Caddy renews TLS certificates automatically and briefly restarts. During those few seconds your reverse proxy may see a 502. These are transient and self-healing. + ### 8. Removing the reverse proxy If you, at some point, want to remove the reverse proxy, here are some general steps: From 2bc3c4e7ecc3c8ad341e1f80c0352f9ac5669af7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:32:37 +0000 Subject: [PATCH 2/5] refine: improve sed pattern specificity and remove hardcoded timeout from docs - Use \s*= in pm.process_idle_timeout sed pattern to match only setting lines, not comment-only lines that mention the setting name - Remove hardcoded '5 minutes' from docs; reference pm.process_idle_timeout by name so it stays accurate if the value changes Agent-Logs-Url: https://github.com/nextcloud/all-in-one/sessions/2fd7a6d1-bfdb-4f26-a8d0-cd54a7307999 Co-authored-by: szaimen <42591237+szaimen@users.noreply.github.com> --- Containers/nextcloud/Dockerfile | 2 +- reverse-proxy.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Containers/nextcloud/Dockerfile b/Containers/nextcloud/Dockerfile index b2fa0691..5d83b9f5 100644 --- a/Containers/nextcloud/Dockerfile +++ b/Containers/nextcloud/Dockerfile @@ -257,7 +257,7 @@ RUN set -ex; \ # a loaded host that spawn latency can push Apache past its FastCGI timeout and # produce a 502. 300 s (5 min) keeps a warm pool through normal sync-client # polling cycles while still reclaiming memory during genuinely idle periods. - sed -i 's/^;*pm.process_idle_timeout.*/pm.process_idle_timeout = 300s/' /usr/local/etc/php-fpm.d/www.conf; \ + sed -i 's/^;*pm.process_idle_timeout\s*=.*/pm.process_idle_timeout = 300s/' /usr/local/etc/php-fpm.d/www.conf; \ sed -i 's|access.log = /proc/self/fd/2|access.log = /proc/self/fd/1|' /usr/local/etc/php-fpm.d/docker.conf; \ \ echo "[ -n \"\$TERM\" ] && [ -f /root.motd ] && cat /root.motd" >> /root/.bashrc; \ diff --git a/reverse-proxy.md b/reverse-proxy.md index d12e56ab..efca9eaf 100644 --- a/reverse-proxy.md +++ b/reverse-proxy.md @@ -1246,7 +1246,7 @@ A **502 Bad Gateway** response from your reverse proxy means the proxy successfu 1. **Reverse proxy read timeout too short** — The nginx default `proxy_read_timeout` is only 60 seconds. Long-running operations (large file uploads, remote storage scans, etc.) can exceed this limit, causing nginx to close the connection and show 502 (or 504) to the client. Make sure your reverse proxy timeout is set higher than AIO's `NEXTCLOUD_MAX_TIME` (default 3600 s). The sample configs in this document already set `proxy_read_timeout 3610s` for nginx and equivalent values for other proxies. -2. **PHP-FPM worker pool exhausted** — Each PHP request is served by a PHP-FPM worker. Workers are spawned on demand and terminated after 5 minutes of idle time. If a burst of concurrent requests arrives and no workers are available (e.g. all current workers are busy with long operations), Apache returns 502 to Caddy, which returns it to your proxy. Check worker usage with: +2. **PHP-FPM worker pool exhausted** — Each PHP request is served by a PHP-FPM worker. Workers are spawned on demand and terminated after a configurable idle timeout (`pm.process_idle_timeout`). If a burst of concurrent requests arrives and no workers are available (e.g. all current workers are busy with long operations), Apache returns 502 to Caddy, which returns it to your proxy. Check worker usage with: ``` sudo docker exec nextcloud-aio-nextcloud php-fpm -t && \ sudo docker exec nextcloud-aio-nextcloud ps aux | grep php-fpm | wc -l From ffd9dd2da8caf7a33b9c5cca03070e827386895d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:54:49 +0000 Subject: [PATCH 3/5] refactor: move request_terminate_timeout to Dockerfile; remove 502 docs section - Remove request_terminate_timeout sed from start.sh (was runtime-dynamic) - Add request_terminate_timeout sed to Dockerfile alongside pm.process_idle_timeout so it is baked into the image permanently at build time - Remove the 502 Bad Gateway troubleshooting subsection from reverse-proxy.md Agent-Logs-Url: https://github.com/nextcloud/all-in-one/sessions/bc8a9920-0b43-4645-9591-180fa8783767 Co-authored-by: szaimen <42591237+szaimen@users.noreply.github.com> --- Containers/nextcloud/Dockerfile | 7 +++++++ Containers/nextcloud/start.sh | 9 --------- reverse-proxy.md | 21 --------------------- 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/Containers/nextcloud/Dockerfile b/Containers/nextcloud/Dockerfile index 5d83b9f5..66fc34b9 100644 --- a/Containers/nextcloud/Dockerfile +++ b/Containers/nextcloud/Dockerfile @@ -258,6 +258,13 @@ RUN set -ex; \ # produce a 502. 300 s (5 min) keeps a warm pool through normal sync-client # polling cycles while still reclaiming memory during genuinely idle periods. sed -i 's/^;*pm.process_idle_timeout\s*=.*/pm.process_idle_timeout = 300s/' /usr/local/etc/php-fpm.d/www.conf; \ +# Set request_terminate_timeout so that PHP-FPM forcibly kills workers that +# exceed the wall-clock limit. Without this (default = 0 = disabled) a worker +# stuck on a slow DB query, a stalled Redis connection, or a hung syscall is +# never reaped. Over time these zombies fill up pm.max_children, leaving no +# free slots for legitimate requests and causing Apache to return 502 Bad +# Gateway upstream. + sed -i "s|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|" /usr/local/etc/php-fpm.d/www.conf; \ sed -i 's|access.log = /proc/self/fd/2|access.log = /proc/self/fd/1|' /usr/local/etc/php-fpm.d/docker.conf; \ \ echo "[ -n \"\$TERM\" ] && [ -f /root.motd ] && cat /root.motd" >> /root/.bashrc; \ diff --git a/Containers/nextcloud/start.sh b/Containers/nextcloud/start.sh index 41cd0122..51d1f346 100644 --- a/Containers/nextcloud/start.sh +++ b/Containers/nextcloud/start.sh @@ -156,15 +156,6 @@ while [ "$THIS_IS_AIO" = "true" ] && [ -z "$(dig nextcloud-aio-apache A +short + sleep 5 done -# Set request_terminate_timeout so that PHP-FPM forcibly kills workers that -# exceed the wall-clock limit. Without this (default = 0 = disabled) a worker -# stuck on a slow DB query, a stalled Redis connection, or a hung syscall is -# never reaped. Over time these zombies fill up pm.max_children, leaving no -# free slots for legitimate requests and causing Apache to return 502 Bad -# Gateway upstream. Setting it equal to PHP_MAX_TIME means a worker lives at -# most as long as a PHP script is allowed to run, which keeps the pool healthy. -sed -i "s|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|" /usr/local/etc/php-fpm.d/www.conf - set -x # shellcheck disable=SC2235 if [ "$THIS_IS_AIO" = "true" ] && [ "$APACHE_PORT" = 443 ]; then diff --git a/reverse-proxy.md b/reverse-proxy.md index efca9eaf..ea81c844 100644 --- a/reverse-proxy.md +++ b/reverse-proxy.md @@ -1240,27 +1240,6 @@ If something does not work, follow the steps below: 1. Try to configure everything from scratch - if it still does not work by following https://github.com/nextcloud/all-in-one#how-to-properly-reset-the-instance. 1. As last resort, you may disable the domain validation by adding `--env SKIP_DOMAIN_VALIDATION=true` to the docker run command. But only use this if you are completely sure that you've correctly configured everything! Also see [this documentation](https://github.com/nextcloud/all-in-one#how-to-skip-the-domain-validation). -#### Troubleshooting 502 Bad Gateway errors - -A **502 Bad Gateway** response from your reverse proxy means the proxy successfully connected to AIO's Apache/Caddy layer but received an invalid or no response from the backend. Common causes and fixes: - -1. **Reverse proxy read timeout too short** — The nginx default `proxy_read_timeout` is only 60 seconds. Long-running operations (large file uploads, remote storage scans, etc.) can exceed this limit, causing nginx to close the connection and show 502 (or 504) to the client. Make sure your reverse proxy timeout is set higher than AIO's `NEXTCLOUD_MAX_TIME` (default 3600 s). The sample configs in this document already set `proxy_read_timeout 3610s` for nginx and equivalent values for other proxies. - -2. **PHP-FPM worker pool exhausted** — Each PHP request is served by a PHP-FPM worker. Workers are spawned on demand and terminated after a configurable idle timeout (`pm.process_idle_timeout`). If a burst of concurrent requests arrives and no workers are available (e.g. all current workers are busy with long operations), Apache returns 502 to Caddy, which returns it to your proxy. Check worker usage with: - ``` - sudo docker exec nextcloud-aio-nextcloud php-fpm -t && \ - sudo docker exec nextcloud-aio-nextcloud ps aux | grep php-fpm | wc -l - ``` - If the count is consistently high, consider increasing `PHP_MEMORY_LIMIT` so individual workers use less memory and the host can sustain more of them, or reduce concurrent background jobs. - -3. **Stuck PHP-FPM workers** — Without a `request_terminate_timeout`, a PHP worker that blocks indefinitely (e.g. waiting on a slow database query or a stalled Redis connection) is never killed. Over time, these stuck workers consume all available slots in `pm.max_children`, leaving no free workers for new requests and causing a cascade of 502 errors. AIO now sets `request_terminate_timeout` automatically, but if you run a custom PHP-FPM config you must set it yourself. - -4. **Redis session lock contention** — When one request holds a session lock (e.g. a long file upload or a background scan), other requests for the same user session queue up waiting for the lock. If the lock wait time exceeds the reverse proxy timeout the proxy sees a 502. You can reduce this with session-less API calls or by checking for Redis performance issues: `sudo docker logs nextcloud-aio-redis`. - -5. **AIO containers not fully started** — After a container restart there is a brief window where PHP-FPM accepts TCP connections but is not yet ready to serve requests. Retrying the failed request usually succeeds. To confirm, check: `sudo docker logs nextcloud-aio-nextcloud` and `sudo docker logs nextcloud-aio-apache` for startup messages. - -6. **Apache or Caddy restart during certificate renewal** — Caddy renews TLS certificates automatically and briefly restarts. During those few seconds your reverse proxy may see a 502. These are transient and self-healing. - ### 8. Removing the reverse proxy If you, at some point, want to remove the reverse proxy, here are some general steps: From 457a0d9fef973226309532490ea5c58bbbc5f0db Mon Sep 17 00:00:00 2001 From: "Simon L." Date: Mon, 27 Apr 2026 17:56:07 +0200 Subject: [PATCH 4/5] Apply suggestion from @szaimen Signed-off-by: Simon L. --- Containers/nextcloud/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Containers/nextcloud/Dockerfile b/Containers/nextcloud/Dockerfile index 66fc34b9..9dbf8460 100644 --- a/Containers/nextcloud/Dockerfile +++ b/Containers/nextcloud/Dockerfile @@ -264,7 +264,7 @@ RUN set -ex; \ # never reaped. Over time these zombies fill up pm.max_children, leaving no # free slots for legitimate requests and causing Apache to return 502 Bad # Gateway upstream. - sed -i "s|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|" /usr/local/etc/php-fpm.d/www.conf; \ + sed -i 's|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|' /usr/local/etc/php-fpm.d/www.conf; \ sed -i 's|access.log = /proc/self/fd/2|access.log = /proc/self/fd/1|' /usr/local/etc/php-fpm.d/docker.conf; \ \ echo "[ -n \"\$TERM\" ] && [ -f /root.motd ] && cat /root.motd" >> /root/.bashrc; \ From 461f9e14c1ba2cd8413d7f7669ca52bae448053d Mon Sep 17 00:00:00 2001 From: "Simon L." Date: Mon, 27 Apr 2026 17:57:47 +0200 Subject: [PATCH 5/5] Apply suggestion from @szaimen Signed-off-by: Simon L. --- Containers/nextcloud/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Containers/nextcloud/Dockerfile b/Containers/nextcloud/Dockerfile index 9dbf8460..2573fb24 100644 --- a/Containers/nextcloud/Dockerfile +++ b/Containers/nextcloud/Dockerfile @@ -264,7 +264,7 @@ RUN set -ex; \ # never reaped. Over time these zombies fill up pm.max_children, leaving no # free slots for legitimate requests and causing Apache to return 502 Bad # Gateway upstream. - sed -i 's|^;*request_terminate_timeout = .*|request_terminate_timeout = ${PHP_MAX_TIME}|' /usr/local/etc/php-fpm.d/www.conf; \ + sed -i "s|^;*request_terminate_timeout = .*|request_terminate_timeout = \${PHP_MAX_TIME}|" /usr/local/etc/php-fpm.d/www.conf; \ sed -i 's|access.log = /proc/self/fd/2|access.log = /proc/self/fd/1|' /usr/local/etc/php-fpm.d/docker.conf; \ \ echo "[ -n \"\$TERM\" ] && [ -f /root.motd ] && cat /root.motd" >> /root/.bashrc; \