Fix non-ASCII URL handling in ASGI worker

Percent-decode path to UTF-8 and preserve raw_path as original bytes
per ASGI spec. Fixes #3543
This commit is contained in:
Benoit Chesneau 2026-03-22 16:35:05 +01:00
parent 932331d8a4
commit ba1aaa5e33
3 changed files with 105 additions and 10 deletions

View File

@ -443,7 +443,7 @@ class CallbackRequest:
__slots__ = (
'method', 'uri', 'path', 'query', 'fragment', 'version',
'headers', 'headers_bytes', 'scheme',
'headers', 'headers_bytes', 'scheme', 'raw_path',
'content_length', 'chunked', 'must_close',
'proxy_protocol_info', '_expect_100_continue',
)
@ -458,6 +458,7 @@ class CallbackRequest:
self.headers = []
self.headers_bytes = []
self.scheme = "http"
self.raw_path = b''
self.content_length = 0
self.chunked = False
self.must_close = False
@ -475,20 +476,27 @@ class CallbackRequest:
Returns:
CallbackRequest instance
"""
from urllib.parse import unquote_to_bytes
req = cls()
req.method = parser.method.decode('ascii')
# Parse path and query from URL
raw_path = parser.path
if b'?' in raw_path:
path_part, query_part = raw_path.split(b'?', 1)
req.path = path_part.decode('latin-1')
# Per ASGI spec:
# - path: percent-decoded UTF-8 string
# - raw_path: original bytes as received
raw_url = parser.path
if b'?' in raw_url:
path_part, query_part = raw_url.split(b'?', 1)
req.raw_path = path_part # Store original bytes
req.path = unquote_to_bytes(path_part).decode('utf-8', errors='replace')
req.query = query_part.decode('latin-1')
else:
req.path = raw_path.decode('latin-1')
req.raw_path = raw_url # Store original bytes
req.path = unquote_to_bytes(raw_url).decode('utf-8', errors='replace')
req.query = ''
req.uri = raw_path.decode('latin-1')
req.uri = raw_url.decode('latin-1')
req.fragment = ''
req.version = parser.http_version

View File

@ -921,7 +921,7 @@ class ASGIProtocol(asyncio.Protocol):
"method": request.method,
"scheme": request.scheme,
"path": request.path,
"raw_path": request.path.encode("latin-1") if request.path else b"",
"raw_path": request.raw_path if request.raw_path else b"",
"query_string": request.query.encode("latin-1") if request.query else b"",
"root_path": self.cfg.root_path or "",
"headers": headers,
@ -985,7 +985,7 @@ class ASGIProtocol(asyncio.Protocol):
"http_version": f"{request.version[0]}.{request.version[1]}",
"scheme": "wss" if request.scheme == "https" else "ws",
"path": request.path,
"raw_path": request.path.encode("latin-1") if request.path else b"",
"raw_path": request.raw_path if request.raw_path else b"",
"query_string": request.query.encode("latin-1") if request.query else b"",
"root_path": self.cfg.root_path or "",
"headers": headers,
@ -1457,7 +1457,7 @@ class ASGIProtocol(asyncio.Protocol):
"method": request.method,
"scheme": request.scheme,
"path": request.path,
"raw_path": request.path.encode("latin-1") if request.path else b"",
"raw_path": getattr(request, 'raw_path', None) or (request.path.encode("latin-1") if request.path else b""),
"query_string": request.query.encode("latin-1") if request.query else b"",
"root_path": self.cfg.root_path or "",
"headers": headers,

View File

@ -429,3 +429,90 @@ class TestCallbackBehavior:
assert parser.is_complete
assert body_chunks == [] # Body was skipped
class TestCallbackRequest:
    """Check the path fields of a CallbackRequest built from parser state.

    Per ASGI spec:
    - path: percent-decoded UTF-8 string
    - raw_path: original bytes as received
    """

    @staticmethod
    def _request_from(http_parser, raw_request):
        """Feed ``raw_request`` to a fresh parser and build a request from it.

        Args:
            http_parser: value handed to ``get_parser_class`` to select the
                parser implementation under test.
            raw_request: complete HTTP request bytes fed to the parser.

        Returns:
            The object returned by ``CallbackRequest.from_parser``.
        """
        from gunicorn.asgi.parser import CallbackRequest

        parser = get_parser_class(http_parser)()
        parser.feed(raw_request)
        return CallbackRequest.from_parser(parser)

    def test_non_ascii_path_decoding(self, http_parser):
        """A percent-encoded UTF-8 path is decoded; raw_path stays encoded."""
        # %C3%B6 is the UTF-8 percent-encoding of "ö" (U+00F6).
        request = self._request_from(
            http_parser,
            b"GET /%C3%B6/ HTTP/1.1\r\nHost: test\r\n\r\n",
        )

        # Decoded text on one side, untouched request bytes on the other.
        assert request.path == "/\u00f6/"
        assert request.raw_path == b"/%C3%B6/"

    def test_non_ascii_path_with_query(self, http_parser):
        """The query string is split off and kept out of both path fields."""
        # %E6%97%A5%E6%9C%AC is the UTF-8 percent-encoding of "日本".
        request = self._request_from(
            http_parser,
            b"GET /%E6%97%A5%E6%9C%AC/?q=test HTTP/1.1\r\nHost: test\r\n\r\n",
        )

        assert request.path == "/\u65e5\u672c/"
        assert request.raw_path == b"/%E6%97%A5%E6%9C%AC/"
        assert request.query == "q=test"

    def test_invalid_utf8_path(self, http_parser):
        """Bytes that are not valid UTF-8 yield U+FFFD in the decoded path."""
        # %FF decodes to a byte that is invalid in UTF-8.
        request = self._request_from(
            http_parser,
            b"GET /%FF HTTP/1.1\r\nHost: test\r\n\r\n",
        )

        assert "\ufffd" in request.path
        assert request.raw_path == b"/%FF"

    def test_simple_ascii_path(self, http_parser):
        """A plain ASCII path is identical in path and raw_path."""
        request = self._request_from(
            http_parser,
            b"GET /api/users HTTP/1.1\r\nHost: test\r\n\r\n",
        )

        assert request.path == "/api/users"
        assert request.raw_path == b"/api/users"

    def test_percent_encoded_ascii(self, http_parser):
        """Percent-encoded ASCII (here %20, a space) is decoded in path only."""
        request = self._request_from(
            http_parser,
            b"GET /hello%20world HTTP/1.1\r\nHost: test\r\n\r\n",
        )

        assert request.path == "/hello world"
        assert request.raw_path == b"/hello%20world"