From f145e90e322256f1c4e1eea8e175d2e188ceab43 Mon Sep 17 00:00:00 2001 From: Roy Williams Date: Wed, 12 Feb 2020 20:35:03 -0500 Subject: [PATCH] Set `max_accept` on `gevent` worker-class to 1 when workers > 1 We've had really terrible tail latencies with gevent and gunicorn under load. Inspecting our services with strace, we see the following: ``` 23:11:01.651529 accept4(5, {sa_family=AF_UNIX}, [110->2], SOCK_CLOEXEC) = 223 <0.000015> ..{18 successful calls to accept4}... 23:11:01.652590 accept4(5, {sa_family=AF_UNIX}, [110->2], SOCK_CLOEXEC) = 249 <0.000010> 23:11:01.652647 accept4(5, 0x7ffcd46c09d0, [110], SOCK_CLOEXEC) = -1 EAGAIN (Resource temporarily unavailable) <0.000012> 23:11:01.657622 getsockname(5, {sa_family=AF_UNIX, sun_path="/run/gunicorn/gunicorn.sock"}, [110->30]) = 0 <0.000009> 23:11:01.657682 recvfrom(223, "XXX"..., 8192, 0, NULL, NULL) = 511 <0.000011> ..{16 calls to recvfrom}... 23:11:01.740726 recvfrom(243, "XXX"..., 8192, 0, NULL, NULL) = 511 <0.000012> 23:11:01.746074 getsockname(5, {sa_family=AF_UNIX, sun_path="/run/gunicorn/gunicorn.sock"}, [110->30]) = 0 <0.000013> 23:11:01.746153 recvfrom(246, "XXX"..., 8192, 0, NULL, NULL) = 511 <0.000014> 23:11:01.751540 getsockname(5, {sa_family=AF_UNIX, sun_path="/run/gunicorn/gunicorn.sock"}, [110->30]) = 0 <0.000010> 23:11:01.751599 recvfrom(249, "XXX"..., 8192, 0, NULL, NULL) = 511 <0.000013> ``` Notice we see a flurry of 20 `accept4`s followed by 20 calls to `recvfrom`. Each call to `recvfrom` happens 5ms after the previous one, so the last `recvfrom` is called ~100ms after the call to `accept4` for that fd. 
gevent suggests setting `max_accept` to a lower value when there are multiple worker processes on the same listening socket: https://github.com/gevent/gevent/blob/785b7b5546fcd0a184ea954f5d358539c530d95f/src/gevent/baseserver.py#L89-L102 gevent sets `max_accept` to `1` when `wsgi.multiprocess` is True: https://github.com/gevent/gevent/blob/9d27d269ed01a7e752966caa7a6f85d773780a1a/src/gevent/pywsgi.py#L1470-L1472 gunicorn does in fact set `wsgi.multiprocess` when the number of workers is > 1: https://github.com/benoitc/gunicorn/blob/e4e20f273e95f505277a8dadf390bbdd162cfff4/gunicorn/http/wsgi.py#L73 and this gets passed to `gevent.pywsgi.WSGIServer`: https://github.com/benoitc/gunicorn/blob/e4e20f273e95f505277a8dadf390bbdd162cfff4/gunicorn/workers/ggevent.py#L67-L75 However, when `worker-class` is `gevent`, we directly create a `gevent.server.StreamServer`: https://github.com/benoitc/gunicorn/blob/e4e20f273e95f505277a8dadf390bbdd162cfff4/gunicorn/workers/ggevent.py#L77-L78 Fixing this dropped the p50 response time on an especially problematic benchmark from 250ms to 115ms. --- gunicorn/workers/ggevent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gunicorn/workers/ggevent.py b/gunicorn/workers/ggevent.py index 57340221..3941814f 100644 --- a/gunicorn/workers/ggevent.py +++ b/gunicorn/workers/ggevent.py @@ -76,6 +76,8 @@ class GeventWorker(AsyncWorker): else: hfun = partial(self.handle, s) server = StreamServer(s, handle=hfun, spawn=pool, **ssl_args) + if self.cfg.workers > 1: + server.max_accept = 1 server.start() servers.append(server)