diff --git a/gunicorn/asgi/parser.py b/gunicorn/asgi/parser.py index 000a41a6..e7432a77 100644 --- a/gunicorn/asgi/parser.py +++ b/gunicorn/asgi/parser.py @@ -443,7 +443,7 @@ class CallbackRequest: __slots__ = ( 'method', 'uri', 'path', 'query', 'fragment', 'version', - 'headers', 'headers_bytes', 'scheme', + 'headers', 'headers_bytes', 'scheme', 'raw_path', 'content_length', 'chunked', 'must_close', 'proxy_protocol_info', '_expect_100_continue', ) @@ -458,6 +458,7 @@ class CallbackRequest: self.headers = [] self.headers_bytes = [] self.scheme = "http" + self.raw_path = b'' self.content_length = 0 self.chunked = False self.must_close = False @@ -475,20 +476,27 @@ class CallbackRequest: Returns: CallbackRequest instance """ + from urllib.parse import unquote_to_bytes + req = cls() req.method = parser.method.decode('ascii') # Parse path and query from URL - raw_path = parser.path - if b'?' in raw_path: - path_part, query_part = raw_path.split(b'?', 1) - req.path = path_part.decode('latin-1') + # Per ASGI spec: + # - path: percent-decoded UTF-8 string + # - raw_path: original bytes as received + raw_url = parser.path + if b'?' in raw_url: + path_part, query_part = raw_url.split(b'?', 1) + req.raw_path = path_part # Store original bytes + req.path = unquote_to_bytes(path_part).decode('utf-8', errors='replace') req.query = query_part.decode('latin-1') else: - req.path = raw_path.decode('latin-1') + req.raw_path = raw_url # Store original bytes + req.path = unquote_to_bytes(raw_url).decode('utf-8', errors='replace') req.query = '' - req.uri = raw_path.decode('latin-1') + req.uri = raw_url.decode('latin-1') req.fragment = '' req.version = parser.http_version diff --git a/gunicorn/asgi/protocol.py b/gunicorn/asgi/protocol.py index 0e29038e..b0dc9acb 100644 --- a/gunicorn/asgi/protocol.py +++ b/gunicorn/asgi/protocol.py @@ -921,7 +921,7 @@ class ASGIProtocol(asyncio.Protocol): "method": request.method, "scheme": request.scheme, "path": request.path, - "raw_path": request.path.encode("latin-1") if request.path else b"", + "raw_path": request.raw_path if request.raw_path else b"", "query_string": request.query.encode("latin-1") if request.query else b"", "root_path": self.cfg.root_path or "", "headers": headers, @@ -985,7 +985,7 @@ class ASGIProtocol(asyncio.Protocol): "http_version": f"{request.version[0]}.{request.version[1]}", "scheme": "wss" if request.scheme == "https" else "ws", "path": request.path, - "raw_path": request.path.encode("latin-1") if request.path else b"", + "raw_path": request.raw_path if request.raw_path else b"", "query_string": request.query.encode("latin-1") if request.query else b"", "root_path": self.cfg.root_path or "", "headers": headers, @@ -1457,7 +1457,7 @@ class ASGIProtocol(asyncio.Protocol): "method": request.method, "scheme": request.scheme, "path": request.path, - "raw_path": request.path.encode("latin-1") if request.path else b"", + "raw_path": getattr(request, 'raw_path', None) or (request.path.encode("latin-1") if request.path else b""), "query_string": request.query.encode("latin-1") if request.query else b"", "root_path": self.cfg.root_path or "", "headers": headers, diff --git a/tests/test_asgi_callback_parser.py b/tests/test_asgi_callback_parser.py index 9e034759..7bba3ea2 100644 --- a/tests/test_asgi_callback_parser.py +++ b/tests/test_asgi_callback_parser.py @@ -429,3 +429,90 @@ class TestCallbackBehavior: assert parser.is_complete assert body_chunks == [] # Body was skipped + + +class TestCallbackRequest: + """Test CallbackRequest building from parser state.""" + + def test_non_ascii_path_decoding(self, http_parser): + """Test that percent-encoded UTF-8 paths are decoded correctly. + + Per ASGI spec: + - path: percent-decoded UTF-8 string + - raw_path: original bytes as received + """ + from gunicorn.asgi.parser import CallbackRequest + + parser_class = get_parser_class(http_parser) + parser = parser_class() + + # ö = %C3%B6 in UTF-8 percent-encoded + parser.feed(b"GET /%C3%B6/ HTTP/1.1\r\nHost: test\r\n\r\n") + + request = CallbackRequest.from_parser(parser) + + # path should be percent-decoded UTF-8 string + assert request.path == "/\u00f6/" # /ö/ + # raw_path should be original bytes + assert request.raw_path == b"/%C3%B6/" + + def test_non_ascii_path_with_query(self, http_parser): + """Test percent-encoded path with query string.""" + from gunicorn.asgi.parser import CallbackRequest + + parser_class = get_parser_class(http_parser) + parser = parser_class() + + # Japanese: /日本/ = /%E6%97%A5%E6%9C%AC/ + parser.feed(b"GET /%E6%97%A5%E6%9C%AC/?q=test HTTP/1.1\r\nHost: test\r\n\r\n") + + request = CallbackRequest.from_parser(parser) + + assert request.path == "/\u65e5\u672c/" # /日本/ + assert request.raw_path == b"/%E6%97%A5%E6%9C%AC/" + assert request.query == "q=test" + + def test_invalid_utf8_path(self, http_parser): + """Test that invalid UTF-8 sequences use replacement character.""" + from gunicorn.asgi.parser import CallbackRequest + + parser_class = get_parser_class(http_parser) + parser = parser_class() + + # %FF is invalid UTF-8 + parser.feed(b"GET /%FF HTTP/1.1\r\nHost: test\r\n\r\n") + + request = CallbackRequest.from_parser(parser) + + # Should use replacement character for invalid bytes + assert "\ufffd" in request.path + assert request.raw_path == b"/%FF" + + def test_simple_ascii_path(self, http_parser): + """Test that simple ASCII paths work unchanged.""" + from gunicorn.asgi.parser import CallbackRequest + + parser_class = get_parser_class(http_parser) + parser = parser_class() + + parser.feed(b"GET /api/users HTTP/1.1\r\nHost: test\r\n\r\n") + + request = CallbackRequest.from_parser(parser) + + assert request.path == "/api/users" + assert request.raw_path == b"/api/users" + + def test_percent_encoded_ascii(self, http_parser): + """Test percent-encoded ASCII characters.""" + from gunicorn.asgi.parser import CallbackRequest + + parser_class = get_parser_class(http_parser) + parser = parser_class() + + # Space encoded as %20 + parser.feed(b"GET /hello%20world HTTP/1.1\r\nHost: test\r\n\r\n") + + request = CallbackRequest.from_parser(parser) + + assert request.path == "/hello world" + assert request.raw_path == b"/hello%20world"