Add optional fast HTTP parser for ASGI workers

- Add http_parser config setting (auto/fast/python)
- Add gunicorn_h1c as optional dependency [fast]
- Add unified HttpParser class with fallback to pure Python
- Parser tries gunicorn_h1c in 'auto' mode, falls back gracefully
- 'fast' mode requires gunicorn_h1c, 'python' forces pure Python

Install with: pip install gunicorn[fast]
This commit is contained in:
Benoit Chesneau 2026-03-21 09:19:41 +01:00
parent 2cc38503b7
commit b833a9b6df
3 changed files with 679 additions and 0 deletions

639
gunicorn/asgi/parser.py Normal file
View File

@ -0,0 +1,639 @@
#
# This file is part of gunicorn released under the MIT license.
# See the NOTICE for more information.
"""
Unified HTTP parser interface for ASGI workers.
Provides a common interface for both fast C parser (gunicorn_h1c)
and the pure Python parser, with incremental (push-based) parsing.
"""
import re
import ipaddress
import socket
import struct
from gunicorn.http.errors import (
InvalidHeader, InvalidHeaderName, NoMoreData,
InvalidRequestLine, InvalidRequestMethod, InvalidHTTPVersion,
LimitRequestLine, LimitRequestHeaders,
UnsupportedTransferCoding, ObsoleteFolding,
InvalidProxyLine, InvalidProxyHeader, ForbiddenProxyRequest,
InvalidSchemeHeaders, ExpectationFailed,
)
from gunicorn.http.message import PP_V2_SIGNATURE, PPCommand, PPFamily, PPProtocol
from gunicorn.util import bytes_to_str, split_request_uri
MAX_REQUEST_LINE = 8190
MAX_HEADERS = 32768
DEFAULT_MAX_HEADERFIELD_SIZE = 8190
# Reuse regex patterns
RFC9110_5_6_2_TOKEN_SPECIALS = r"!#$%&'*+-.^_`|~"
TOKEN_RE = re.compile(r"[%s0-9a-zA-Z]+" % (re.escape(RFC9110_5_6_2_TOKEN_SPECIALS)))
METHOD_BADCHAR_RE = re.compile("[a-z#]")
VERSION_RE = re.compile(r"HTTP/(\d)\.(\d)")
RFC9110_5_5_INVALID_AND_DANGEROUS = re.compile(r"[\0\r\n]")
def _ip_in_allow_list(ip_str, allow_list, networks):
"""Check if IP address is in the allow list."""
if '*' in allow_list:
return True
try:
ip = ipaddress.ip_address(ip_str)
except ValueError:
return False
for network in networks:
if ip in network:
return True
return False
class ParseResult:
"""Result of header parsing."""
__slots__ = (
'method', 'uri', 'path', 'query', 'fragment', 'version',
'headers', 'scheme', 'content_length', 'chunked',
'keep_alive', 'consumed', 'proxy_protocol_info',
'must_close', 'expect_100_continue',
)
def __init__(self):
self.method = None
self.uri = None
self.path = None
self.query = None
self.fragment = None
self.version = None
self.headers = []
self.scheme = "http"
self.content_length = 0
self.chunked = False
self.keep_alive = True
self.consumed = 0
self.proxy_protocol_info = None
self.must_close = False
self.expect_100_continue = False
class HttpParser:
"""Unified incremental HTTP parser.
Works with both gunicorn_h1c (fast C extension) and pure Python parsing.
Designed for push-based parsing where data arrives via data_received().
"""
# Class-level cache for fast parser availability (import check is expensive)
_fast_available = None
_h1c_module = None
def __init__(self, cfg, peer_addr, is_ssl=False, req_number=1, is_trusted_proxy=False):
"""Initialize the parser.
Args:
cfg: gunicorn config object
peer_addr: client address tuple (host, port)
is_ssl: whether this is an SSL connection
req_number: request number on this connection (for proxy protocol)
is_trusted_proxy: whether peer is in forwarded_allow_ips (pre-computed)
"""
self.cfg = cfg
self.peer_addr = peer_addr
self.is_ssl = is_ssl
self.req_number = req_number
self._is_trusted_proxy = is_trusted_proxy
self._result = None
# Limits
self.limit_request_line = cfg.limit_request_line
if self.limit_request_line < 0 or self.limit_request_line >= MAX_REQUEST_LINE:
self.limit_request_line = MAX_REQUEST_LINE
self.limit_request_fields = cfg.limit_request_fields
if self.limit_request_fields <= 0 or self.limit_request_fields > MAX_HEADERS:
self.limit_request_fields = MAX_HEADERS
self.limit_request_field_size = cfg.limit_request_field_size
if self.limit_request_field_size < 0:
self.limit_request_field_size = DEFAULT_MAX_HEADERFIELD_SIZE
max_header_field_size = self.limit_request_field_size or DEFAULT_MAX_HEADERFIELD_SIZE
self.max_buffer_headers = self.limit_request_fields * (max_header_field_size + 2) + 4
# Use cached fast parser check (import is expensive, do once per process)
self._use_fast = self._check_fast_available()
def _check_fast_available(self):
"""Check if fast C parser is available (cached at class level)."""
parser_setting = getattr(self.cfg, 'http_parser', 'auto')
if parser_setting == 'python':
return False
# Use class-level cache to avoid repeated import checks
if HttpParser._fast_available is None:
try:
import gunicorn_h1c
HttpParser._fast_available = True
HttpParser._h1c_module = gunicorn_h1c
except ImportError:
HttpParser._fast_available = False
if not HttpParser._fast_available and parser_setting == 'fast':
raise RuntimeError("gunicorn_h1c not installed but http_parser='fast'")
return HttpParser._fast_available
def feed(self, buffer):
"""Parse buffer incrementally.
Args:
buffer: bytearray containing received data
Returns:
ParseResult if headers are complete, None if more data needed
Raises:
Various HTTP parsing errors for malformed requests
"""
if self._use_fast:
return self._feed_fast(buffer)
else:
return self._feed_python(buffer)
def _feed_fast(self, buffer):
"""Parse using fast C parser."""
try:
result = HttpParser._h1c_module.parse_request(bytes(buffer))
# gunicorn_h1c returns bytes, convert to strings (latin-1)
pr = ParseResult()
pr.method = bytes_to_str(result['method'])
# gunicorn_h1c returns 'path' which is the full URI (path?query)
pr.uri = bytes_to_str(result['path'])
# Parse path/query from URI
try:
parts = split_request_uri(pr.uri)
pr.path = parts.path or ""
pr.query = parts.query or ""
pr.fragment = parts.fragment or ""
except ValueError:
pr.path = pr.uri
pr.query = ""
pr.fragment = ""
pr.version = (1, result['minor_version'])
# Headers - convert to uppercase strings
pr.headers = [(bytes_to_str(n).upper(), bytes_to_str(v)) for n, v in result['headers']]
pr.consumed = result['consumed']
pr.keep_alive = result['minor_version'] >= 1
pr.scheme = "https" if self.is_ssl else "http"
# Parse body info from headers
self._parse_body_info(pr)
self._result = pr
return pr
except Exception as e:
if "incomplete" in str(e).lower():
return None
raise
def _feed_python(self, buffer):
"""Parse using pure Python parser."""
# Handle proxy protocol on first request
mode = self.cfg.proxy_protocol
proxy_info = None
buf_offset = 0
if mode != "off" and self.req_number == 1:
# Check for proxy protocol
if len(buffer) < 12:
return None # Need more data
if mode in ("v2", "auto") and buffer[:12] == PP_V2_SIGNATURE:
self._proxy_protocol_access_check()
consumed, proxy_info = self._parse_proxy_v2(buffer)
if consumed is None:
return None # Need more data
buf_offset = consumed
elif mode in ("v1", "auto") and buffer[:6] == b"PROXY ":
self._proxy_protocol_access_check()
consumed, proxy_info = self._parse_proxy_v1(buffer)
if consumed is None:
return None # Need more data
buf_offset = consumed
# Find request line
idx = buffer.find(b"\r\n", buf_offset)
if idx < 0:
if len(buffer) - buf_offset > self.limit_request_line:
raise LimitRequestLine(len(buffer) - buf_offset, self.limit_request_line)
return None # Need more data
line_len = idx - buf_offset
if line_len > self.limit_request_line:
raise LimitRequestLine(line_len, self.limit_request_line)
request_line = bytes(buffer[buf_offset:idx])
headers_start = idx + 2
# Find end of headers
headers_end = buffer.find(b"\r\n\r\n", headers_start)
if headers_end < 0:
# Check for empty headers case
if buffer[headers_start:headers_start + 2] == b"\r\n":
headers_end = headers_start
else:
if len(buffer) - headers_start > self.max_buffer_headers:
raise LimitRequestHeaders("max buffer headers")
return None # Need more data
# Parse request line
pr = ParseResult()
pr.proxy_protocol_info = proxy_info
self._parse_request_line(request_line, pr)
# Parse headers (if any)
if buffer[headers_start:headers_start + 2] == b"\r\n":
# Empty headers
pr.consumed = headers_start + 2
else:
headers_data = bytes(buffer[headers_start:headers_end])
pr.headers = self._parse_headers(headers_data)
pr.consumed = headers_end + 4
# Set scheme
pr.scheme = "https" if self.is_ssl else "http"
# Check for scheme headers from trusted proxy
self._apply_scheme_headers(pr)
# Parse body info
self._parse_body_info(pr)
# Determine keep-alive
pr.keep_alive = self._should_keep_alive(pr)
self._result = pr
return pr
def _proxy_protocol_access_check(self):
"""Check if proxy protocol is allowed from this peer."""
if isinstance(self.peer_addr, tuple):
if not _ip_in_allow_list(
self.peer_addr[0],
self.cfg.proxy_allow_ips,
self.cfg.proxy_allow_networks()
):
raise ForbiddenProxyRequest(self.peer_addr[0])
def _parse_proxy_v1(self, buffer):
"""Parse PROXY protocol v1 (text format).
Returns (consumed, info) or (None, None) if incomplete.
"""
idx = buffer.find(b"\r\n")
if idx < 0:
return None, None
line = bytes_to_str(bytes(buffer[:idx]))
bits = line.split(" ")
if len(bits) != 6:
raise InvalidProxyLine(line)
proto = bits[1]
s_addr = bits[2]
d_addr = bits[3]
if proto not in ["TCP4", "TCP6"]:
raise InvalidProxyLine("protocol '%s' not supported" % proto)
if proto == "TCP4":
try:
socket.inet_pton(socket.AF_INET, s_addr)
socket.inet_pton(socket.AF_INET, d_addr)
except OSError:
raise InvalidProxyLine(line)
elif proto == "TCP6":
try:
socket.inet_pton(socket.AF_INET6, s_addr)
socket.inet_pton(socket.AF_INET6, d_addr)
except OSError:
raise InvalidProxyLine(line)
try:
s_port = int(bits[4])
d_port = int(bits[5])
except ValueError:
raise InvalidProxyLine("invalid port %s" % line)
if not ((0 <= s_port <= 65535) and (0 <= d_port <= 65535)):
raise InvalidProxyLine("invalid port %s" % line)
info = {
"proxy_protocol": proto,
"client_addr": s_addr,
"client_port": s_port,
"proxy_addr": d_addr,
"proxy_port": d_port
}
return idx + 2, info
def _parse_proxy_v2(self, buffer):
"""Parse PROXY protocol v2 (binary format).
Returns (consumed, info) or (None, None) if incomplete.
"""
if len(buffer) < 16:
return None, None
ver_cmd = buffer[12]
fam_proto = buffer[13]
length = struct.unpack(">H", bytes(buffer[14:16]))[0]
version = (ver_cmd & 0xF0) >> 4
if version != 2:
raise InvalidProxyHeader("unsupported version %d" % version)
command = ver_cmd & 0x0F
if command not in (PPCommand.LOCAL, PPCommand.PROXY):
raise InvalidProxyHeader("unsupported command %d" % command)
total_size = 16 + length
if len(buffer) < total_size:
return None, None
if command == PPCommand.LOCAL:
info = {
"proxy_protocol": "LOCAL",
"client_addr": None,
"client_port": None,
"proxy_addr": None,
"proxy_port": None
}
return total_size, info
family = (fam_proto & 0xF0) >> 4
protocol = fam_proto & 0x0F
if protocol != PPProtocol.STREAM:
raise InvalidProxyHeader("only TCP protocol is supported")
addr_data = bytes(buffer[16:16 + length])
if family == PPFamily.INET:
if length < 12:
raise InvalidProxyHeader("insufficient address data for IPv4")
s_addr = socket.inet_ntop(socket.AF_INET, addr_data[0:4])
d_addr = socket.inet_ntop(socket.AF_INET, addr_data[4:8])
s_port = struct.unpack(">H", addr_data[8:10])[0]
d_port = struct.unpack(">H", addr_data[10:12])[0]
proto = "TCP4"
elif family == PPFamily.INET6:
if length < 36:
raise InvalidProxyHeader("insufficient address data for IPv6")
s_addr = socket.inet_ntop(socket.AF_INET6, addr_data[0:16])
d_addr = socket.inet_ntop(socket.AF_INET6, addr_data[16:32])
s_port = struct.unpack(">H", addr_data[32:34])[0]
d_port = struct.unpack(">H", addr_data[34:36])[0]
proto = "TCP6"
elif family == PPFamily.UNSPEC:
info = {
"proxy_protocol": "UNSPEC",
"client_addr": None,
"client_port": None,
"proxy_addr": None,
"proxy_port": None
}
return total_size, info
else:
raise InvalidProxyHeader("unsupported address family %d" % family)
info = {
"proxy_protocol": proto,
"client_addr": s_addr,
"client_port": s_port,
"proxy_addr": d_addr,
"proxy_port": d_port
}
return total_size, info
def _parse_request_line(self, line_bytes, result):
"""Parse the HTTP request line."""
bits = [bytes_to_str(bit) for bit in line_bytes.split(b" ", 2)]
if len(bits) != 3:
raise InvalidRequestLine(bytes_to_str(line_bytes))
# Method
result.method = bits[0]
if not self.cfg.permit_unconventional_http_method:
if METHOD_BADCHAR_RE.search(result.method):
raise InvalidRequestMethod(result.method)
if not 3 <= len(bits[0]) <= 20:
raise InvalidRequestMethod(result.method)
if not TOKEN_RE.fullmatch(result.method):
raise InvalidRequestMethod(result.method)
if self.cfg.casefold_http_method:
result.method = result.method.upper()
# URI
result.uri = bits[1]
if len(result.uri) == 0:
raise InvalidRequestLine(bytes_to_str(line_bytes))
try:
parts = split_request_uri(result.uri)
except ValueError:
raise InvalidRequestLine(bytes_to_str(line_bytes))
result.path = parts.path or ""
result.query = parts.query or ""
result.fragment = parts.fragment or ""
# Version
match = VERSION_RE.fullmatch(bits[2])
if match is None:
raise InvalidHTTPVersion(bits[2])
result.version = (int(match.group(1)), int(match.group(2)))
if not (1, 0) <= result.version < (2, 0):
if not self.cfg.permit_unconventional_http_version:
raise InvalidHTTPVersion(result.version)
def _parse_headers(self, data):
"""Parse HTTP headers from raw data."""
headers = []
lines = [bytes_to_str(line) for line in data.split(b"\r\n")]
num_lines = len(lines)
i = 0
while i < num_lines:
if len(headers) >= self.limit_request_fields:
raise LimitRequestHeaders("limit request headers fields")
curr = lines[i]
i += 1
header_length = len(curr) + len("\r\n")
if curr.find(":") <= 0:
raise InvalidHeader(curr)
name, value = curr.split(":", 1)
if self.cfg.strip_header_spaces:
name = name.rstrip(" \t")
if not TOKEN_RE.fullmatch(name):
raise InvalidHeaderName(name)
name = name.upper()
value = [value.strip(" \t")]
# Handle obsolete folding
while i < num_lines and lines[i].startswith((" ", "\t")):
if not self.cfg.permit_obsolete_folding:
raise ObsoleteFolding(name)
curr = lines[i]
i += 1
header_length += len(curr) + len("\r\n")
if header_length > self.limit_request_field_size > 0:
raise LimitRequestHeaders("limit request headers fields size")
value.append(curr.strip("\t "))
value = " ".join(value)
if RFC9110_5_5_INVALID_AND_DANGEROUS.search(value):
raise InvalidHeader(name)
if header_length > self.limit_request_field_size > 0:
raise LimitRequestHeaders("limit request headers fields size")
# Handle underscore in header names
if "_" in name:
forwarder_headers = self.cfg.forwarder_headers
if name in forwarder_headers or "*" in forwarder_headers:
pass
elif self.cfg.header_map == "dangerous":
pass
elif self.cfg.header_map == "drop":
continue
else:
raise InvalidHeaderName(name)
headers.append((name, value))
return headers
def _apply_scheme_headers(self, result):
"""Apply scheme headers from trusted proxy."""
if not isinstance(self.peer_addr, tuple):
return
# Use pre-computed trusted proxy check (avoids IP parsing on every request)
if not self._is_trusted_proxy:
return
secure_scheme_headers = self.cfg.secure_scheme_headers
scheme_header = False
for name, value in result.headers:
if name == "EXPECT":
if value.lower() == "100-continue":
if result.version >= (1, 1):
result.expect_100_continue = True
else:
raise ExpectationFailed(value)
if name in secure_scheme_headers:
secure = value == secure_scheme_headers[name]
scheme = "https" if secure else "http"
if scheme_header:
if scheme != result.scheme:
raise InvalidSchemeHeaders()
else:
scheme_header = True
result.scheme = scheme
def _parse_body_info(self, result):
"""Parse Content-Length and Transfer-Encoding from headers."""
chunked = False
content_length = None
for name, value in result.headers:
if name == "CONTENT-LENGTH":
if content_length is not None:
raise InvalidHeader("CONTENT-LENGTH")
content_length = value
elif name == "TRANSFER-ENCODING":
vals = [v.strip() for v in value.split(',')]
for val in vals:
if val.lower() == "chunked":
if chunked:
raise InvalidHeader("TRANSFER-ENCODING")
chunked = True
elif val.lower() == "identity":
if chunked:
raise InvalidHeader("TRANSFER-ENCODING")
elif val.lower() in ('compress', 'deflate', 'gzip'):
if chunked:
raise InvalidHeader("TRANSFER-ENCODING")
result.must_close = True
else:
raise UnsupportedTransferCoding(value)
if chunked:
if result.version < (1, 1):
raise InvalidHeader("TRANSFER-ENCODING")
if content_length is not None:
raise InvalidHeader("CONTENT-LENGTH")
result.chunked = True
result.content_length = -1
elif content_length is not None:
try:
if str(content_length).isnumeric():
result.content_length = int(content_length)
else:
raise InvalidHeader("CONTENT-LENGTH")
except ValueError:
raise InvalidHeader("CONTENT-LENGTH")
if result.content_length < 0:
raise InvalidHeader("CONTENT-LENGTH")
else:
result.content_length = 0
def _should_keep_alive(self, result):
"""Determine if connection should be kept alive."""
if result.must_close:
return False
for name, value in result.headers:
if name == "CONNECTION":
v = value.lower().strip(" \t")
if v == "close":
return False
elif v == "keep-alive":
return True
break
return result.version > (1, 0)
def reset(self):
"""Reset parser state for next request on keep-alive connection."""
self._result = None
self.req_number += 1

View File

@ -2772,6 +2772,21 @@ def validate_asgi_lifespan(val):
return val
def validate_http_parser(val):
"""Validate http_parser setting.
Accepts: auto, fast, python
"""
if val is None:
return "auto"
if not isinstance(val, str):
raise TypeError("http_parser must be a string")
val = val.lower().strip()
if val not in ("auto", "fast", "python"):
raise ValueError("http_parser must be: auto, fast, or python")
return val
class ASGILoop(Setting):
name = "asgi_loop"
section = "Worker Processes"
@ -2845,6 +2860,30 @@ class ASGIDisconnectGracePeriod(Setting):
"""
class HttpParser(Setting):
name = "http_parser"
section = "Worker Processes"
cli = ["--http-parser"]
meta = "STRING"
validator = validate_http_parser
default = "auto"
desc = """\
HTTP parser implementation for ASGI workers.
- auto: Use gunicorn_h1c if available, otherwise pure Python (default)
- fast: Require gunicorn_h1c C extension (fail if unavailable)
- python: Force pure Python parser
The gunicorn_h1c C extension provides significantly faster HTTP
parsing using picohttpparser with SIMD optimizations. Install it
with: pip install gunicorn[fast]
This setting only affects the ``asgi`` worker type.
.. versionadded:: 25.0.0
"""
class RootPath(Setting):
name = "root_path"
section = "Server Mechanics"

View File

@ -53,6 +53,7 @@ tornado = ["tornado>=6.5.0"]
gthread = []
setproctitle = ["setproctitle"]
http2 = ["h2>=4.1.0"]
fast = ["gunicorn_h1c>=0.1.0"]
testing = [
"gevent>=24.10.1",
"eventlet>=0.40.3",