diff options
author | sotech117 <michael_foiani@brown.edu> | 2025-07-31 17:27:24 -0400 |
---|---|---|
committer | sotech117 <michael_foiani@brown.edu> | 2025-07-31 17:27:24 -0400 |
commit | 5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch) | |
tree | 8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/werkzeug/urls.py | |
parent | b832d364da8c2efe09e3f75828caf73c50d01ce3 (diff) |
add code for analysis of data
Diffstat (limited to 'venv/lib/python3.8/site-packages/werkzeug/urls.py')
-rw-r--r-- | venv/lib/python3.8/site-packages/werkzeug/urls.py | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/venv/lib/python3.8/site-packages/werkzeug/urls.py b/venv/lib/python3.8/site-packages/werkzeug/urls.py new file mode 100644 index 0000000..5bffe39 --- /dev/null +++ b/venv/lib/python3.8/site-packages/werkzeug/urls.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import codecs +import re +import typing as t +import urllib.parse +from urllib.parse import quote +from urllib.parse import unquote +from urllib.parse import urlencode +from urllib.parse import urlsplit +from urllib.parse import urlunsplit + +from .datastructures import iter_multi_items + + +def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]: + """Used in :func:`uri_to_iri` after unquoting to re-quote any + invalid bytes. + """ + # the docs state that UnicodeError does have these attributes, + # but mypy isn't picking them up + out = quote(e.object[e.start : e.end], safe="") # type: ignore + return out, e.end # type: ignore + + +codecs.register_error("werkzeug.url_quote", _codec_error_url_quote) + + +def _make_unquote_part(name: str, chars: str) -> t.Callable[[str], str]: + """Create a function that unquotes all percent encoded characters except those + given. This allows working with unquoted characters if possible while not changing + the meaning of a given part of a URL. + """ + choices = "|".join(f"{ord(c):02X}" for c in sorted(chars)) + pattern = re.compile(f"((?:%(?:{choices}))+)", re.I) + + def _unquote_partial(value: str) -> str: + parts = iter(pattern.split(value)) + out = [] + + for part in parts: + out.append(unquote(part, "utf-8", "werkzeug.url_quote")) + out.append(next(parts, "")) + + return "".join(out) + + _unquote_partial.__name__ = f"_unquote_{name}" + return _unquote_partial + + +# characters that should remain quoted in URL parts +# based on https://url.spec.whatwg.org/#percent-encoded-bytes +# always keep all controls, space, and % quoted +_always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode() +_unquote_fragment = _make_unquote_part("fragment", _always_unsafe) +_unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#") +_unquote_path = _make_unquote_part("path", _always_unsafe + "/?#") +_unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#") + + +def uri_to_iri(uri: str) -> str: + """Convert a URI to an IRI. All valid UTF-8 characters are unquoted, + leaving all reserved and invalid characters quoted. If the URL has + a domain, it is decoded from Punycode. + + >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF") + 'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF' + + :param uri: The URI to convert. + + .. versionchanged:: 3.0 + Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters, + are removed. + + .. versionchanged:: 2.3 + Which characters remain quoted is specific to each part of the URL. + + .. versionchanged:: 0.15 + All reserved and invalid characters remain quoted. Previously, + only some reserved characters were preserved, and invalid bytes + were replaced instead of left quoted. + + .. versionadded:: 0.6 + """ + parts = urlsplit(uri) + path = _unquote_path(parts.path) + query = _unquote_query(parts.query) + fragment = _unquote_fragment(parts.fragment) + + if parts.hostname: + netloc = _decode_idna(parts.hostname) + else: + netloc = "" + + if ":" in netloc: + netloc = f"[{netloc}]" + + if parts.port: + netloc = f"{netloc}:{parts.port}" + + if parts.username: + auth = _unquote_user(parts.username) + + if parts.password: + password = _unquote_user(parts.password) + auth = f"{auth}:{password}" + + netloc = f"{auth}@{netloc}" + + return urlunsplit((parts.scheme, netloc, path, query, fragment)) + + +def iri_to_uri(iri: str) -> str: + """Convert an IRI to a URI. All non-ASCII and unsafe characters are + quoted. If the URL has a domain, it is encoded to Punycode. + + >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF') + 'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF' + + :param iri: The IRI to convert. + + .. versionchanged:: 3.0 + Passing a tuple or bytes, the ``charset`` and ``errors`` parameters, + and the ``safe_conversion`` parameter, are removed. + + .. versionchanged:: 2.3 + Which characters remain unquoted is specific to each part of the URL. + + .. versionchanged:: 0.15 + All reserved characters remain unquoted. Previously, only some reserved + characters were left unquoted. + + .. versionchanged:: 0.9.6 + The ``safe_conversion`` parameter was added. + + .. versionadded:: 0.6 + """ + parts = urlsplit(iri) + # safe = https://url.spec.whatwg.org/#url-path-segment-string + # as well as percent for things that are already quoted + path = quote(parts.path, safe="%!$&'()*+,/:;=@") + query = quote(parts.query, safe="%!$&'()*+,/:;=?@") + fragment = quote(parts.fragment, safe="%!#$&'()*+,/:;=?@") + + if parts.hostname: + netloc = parts.hostname.encode("idna").decode("ascii") + else: + netloc = "" + + if ":" in netloc: + netloc = f"[{netloc}]" + + if parts.port: + netloc = f"{netloc}:{parts.port}" + + if parts.username: + auth = quote(parts.username, safe="%!$&'()*+,;=") + + if parts.password: + password = quote(parts.password, safe="%!$&'()*+,;=") + auth = f"{auth}:{password}" + + netloc = f"{auth}@{netloc}" + + return urlunsplit((parts.scheme, netloc, path, query, fragment)) + + +# Python < 3.12 +# itms-services was worked around in previous iri_to_uri implementations, but +# we can tell Python directly that it needs to preserve the //. +if "itms-services" not in urllib.parse.uses_netloc: + urllib.parse.uses_netloc.append("itms-services") + + +def _decode_idna(domain: str) -> str: + try: + data = domain.encode("ascii") + except UnicodeEncodeError: + # If the domain is not ASCII, it's decoded already. + return domain + + try: + # Try decoding in one shot. + return data.decode("idna") + except UnicodeDecodeError: + pass + + # Decode each part separately, leaving invalid parts as punycode. + parts = [] + + for part in data.split(b"."): + try: + parts.append(part.decode("idna")) + except UnicodeDecodeError: + parts.append(part.decode("ascii")) + + return ".".join(parts) + + +def _urlencode(query: t.Mapping[str, str] | t.Iterable[tuple[str, str]]) -> str: + items = [x for x in iter_multi_items(query) if x[1] is not None] + # safe = https://url.spec.whatwg.org/#percent-encoded-bytes + return urlencode(items, safe="!$'()*,/:;?@") |