diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8f78943a..6670ad6f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.1 +current_version = 2.1.2 commit = True tag = True tag_name = v{new_version} diff --git a/.flake8 b/.flake8 index 268fd3a8..96c8f44d 100644 --- a/.flake8 +++ b/.flake8 @@ -10,4 +10,5 @@ ignore = W504, # black disagrees with flake8, and inserts whitespace - E203, # whitespace before ':' + # E203: whitespace before ':' + E203, diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ee3f8e52..a7d8f921 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,27 +15,27 @@ jobs: strategy: matrix: include: - - python-version: "3.7" + - python-version: "3.8" env: TOXENV: docs - - python-version: "3.10" + - python-version: "3.12" env: TOXENV: flake8 - - python-version: "3.10" + - python-version: "3.12" env: TOXENV: pylint - - python-version: "3.10" + - python-version: "3.12" env: TOXENV: security - - python-version: "3.10" + - python-version: "3.12" env: TOXENV: black - - python-version: "3.10" + - python-version: "3.12" env: TOXENV: typing steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f694f422..83fc5206 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,12 +10,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python 3.12 uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: 3.12 - name: Check Tag id: check-release-tag diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 031ee7b2..12acd0cc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,10 +14,10 @@ jobs: runs-on: 
ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.7"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..5ba0d2a3 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py + fail_on_warning: true +build: + os: ubuntu-22.04 + tools: + # For available versions, see: + # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python + python: "3.12" # Keep in sync with .github/workflows/build.yml +python: + install: + - requirements: docs/requirements.txt + - path: . diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..8044a257 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,132 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[opensource@zyte.com](mailto:opensource@zyte.com). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/NEWS b/NEWS index 6b2e426f..3846a427 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,78 @@ w3lib release notes =================== +2.1.2 (2023-08-03) +------------------ + +- Fix test failures on Python 3.11.4+ (#212, #213). +- Fix an incorrect type hint (#211). +- Add project URLs to setup.py (#215). + +2.1.1 (2022-12-09) +------------------ + +- :func:`~w3lib.url.safe_url_string`, :func:`~w3lib.url.safe_download_url` + and :func:`~w3lib.url.canonicalize_url` now strip whitespace and control + characters from URLs according to the URL living standard. + + +2.1.0 (2022-11-28) +------------------ + +- Dropped Python 3.6 support, and made Python 3.11 support official. (#195, + #200) + +- :func:`~w3lib.url.safe_url_string` now generates safer URLs. + + To make URLs safer for the `URL living standard`_: + + .. 
_URL living standard: https://url.spec.whatwg.org/ + + - ``;=`` are percent-encoded in the URL username. + + - ``;:=`` are percent-encoded in the URL password. + + - ``'`` is percent-encoded in the URL query if the URL scheme is `special + `__. + + To make URLs safer for `RFC 2396`_ and `RFC 3986`_, ``|[]`` are + percent-encoded in URL paths, queries, and fragments. + + .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt + .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt + + (#80, #203) + +- :func:`~w3lib.encoding.html_to_unicode` now checks for the `byte order + mark`_ before inspecting the ``Content-Type`` header when determining the + content encoding, in line with the `URL living standard`_. (#189, #191) + + .. _byte order mark: https://en.wikipedia.org/wiki/Byte_order_mark + +- :func:`~w3lib.url.canonicalize_url` now strips spaces from the input URL, + to be more in line with the `URL living standard`_. (#132, #136) + +- :func:`~w3lib.html.get_base_url` now ignores HTML comments. (#70, #77) + +- Fixed :func:`~w3lib.url.safe_url_string` re-encoding percent signs on + the URL username and password even when they were being used as part of an + escape sequence. (#187, #196) + +- Fixed :func:`~w3lib.http.basic_auth_header` using the wrong flavor of + base64 encoding, which could prevent authentication in rare cases. (#181, + #192) + +- Fixed :func:`~w3lib.html.replace_entities` raising :exc:`OverflowError` in + some cases due to `a bug in CPython + `__. (#199, #202) + +- Improved typing and fixed typing issues. (#190, #206) + +- Made CI and test improvements. (#197, #198) + +- Adopted a Code of Conduct. (#194) + + 2.0.1 (2022-08-11) ------------------ Minor documentation fix (release date is set in the changelog). @@ -130,8 +202,6 @@ Other improvements and bug fixes: - ``url_query_cleaner()``: support new ``keep_fragments`` argument (defaulting to ``False``) -.. 
_RFC 3986: https://tools.ietf.org/html/rfc3986#section-3.2 - 1.15.0 (2016-07-29) ------------------- diff --git a/README.rst b/README.rst index 56641830..00b447d4 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ This is a Python library of web-related functions, such as: Requirements ============ -Python 3.7+ +Python 3.8+ Install ======= diff --git a/docs/conf.py b/docs/conf.py index 27c1af70..cb57d425 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,7 +53,7 @@ # built documents. # # The full version, including alpha/beta/rc tags. -release = '2.0.1' +release = '2.1.2' # The short X.Y version. version = '.'.join(release.split('.')[:2]) @@ -96,7 +96,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/docs/index.rst b/docs/index.rst index aa1c851f..c30d6a59 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,7 +28,7 @@ Modules Requirements ============ -Python 3.7+ +Python 3.8+ Install ======= diff --git a/setup.py b/setup.py index 4e3be0f4..0cfae8a1 100644 --- a/setup.py +++ b/setup.py @@ -41,12 +41,17 @@ def no_cythonize(extensions, **_ignore): setup( name="w3lib", - version="2.0.1", + version="2.1.2", license="BSD", description="Library of web-related functions", author="Scrapy project", author_email="info@scrapy.org", url="https://github.com/scrapy/w3lib", + project_urls={ + "Documentation": "https://w3lib.readthedocs.io/en/latest/", + "Source Code": "https://github.com/scrapy/w3lib", + "Issue Tracker": "https://github.com/scrapy/w3lib/issues", + }, packages=find_packages(exclude=("tests", "tests.*")), package_data={ "w3lib": ["py.typed"], @@ -54,7 +59,7 @@ def no_cythonize(extensions, **_ignore): include_package_data=True, zip_safe=False, platforms=["Any"], - 
python_requires=">=3.7", + python_requires=">=3.8", install_requires=[ "idna", ], @@ -65,11 +70,11 @@ def no_cythonize(extensions, **_ignore): "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Internet :: WWW/HTTP", diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 6605cce0..58c98c3d 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -1,5 +1,6 @@ import codecs import unittest +from typing import Optional, Union, List, Any import pytest @@ -144,11 +145,11 @@ def test_invalid_utf8(self): self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3") -def ct(charset): +def ct(charset: Optional[str]) -> Optional[str]: return "Content-Type: text/html; charset=" + charset if charset else None -def norm_encoding(enc): +def norm_encoding(enc: str) -> str: return codecs.lookup(enc).name @@ -161,7 +162,13 @@ def test_unicode_body(self): self.assertTrue(isinstance(body_unicode, str)) self.assertEqual(body_unicode, unicode_string) - def _assert_encoding(self, content_type, body, expected_encoding, expected_unicode): + def _assert_encoding( + self, + content_type: Optional[str], + body: bytes, + expected_encoding: str, + expected_unicode: Union[str, List[str]], + ) -> None: assert not isinstance(body, str) encoding, body_unicode = html_to_unicode(ct(content_type), body) self.assertTrue(isinstance(body_unicode, str)) @@ -233,8 +240,12 @@ def test_replace_wrong_encoding(self): assert "value" in body_unicode, repr(body_unicode) def _assert_encoding_detected( - self, content_type, expected_encoding, 
body, **kwargs - ): + self, + content_type: Optional[str], + expected_encoding: str, + body: bytes, + **kwargs: Any, + ) -> None: assert not isinstance(body, str) encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs) self.assertTrue(isinstance(body_unicode, str)) diff --git a/tests/test_html.py b/tests/test_html.py index 1e637b0f..68abb2ee 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,5 +1,6 @@ import unittest +from w3lib._infra import _C0_CONTROL_OR_SPACE from w3lib.html import ( get_base_url, get_meta_refresh, @@ -65,6 +66,10 @@ def test_illegal_entities(self): self.assertEqual(replace_entities("x≤y"), "x\u2264y") self.assertEqual(replace_entities("xy"), "xy") self.assertEqual(replace_entities("xy", remove_illegal=False), "xy") + self.assertEqual(replace_entities("�"), "") + self.assertEqual( + replace_entities("�", remove_illegal=False), "�" + ) def test_browser_hack(self): # check browser hack for numeric character references in the 80-9F range @@ -156,12 +161,12 @@ def test_returns_unicode(self): assert isinstance(remove_tags(b"no tags"), str) assert isinstance(remove_tags(b"no tags", which_ones=("p",)), str) assert isinstance(remove_tags(b"

one tag

"), str) - assert isinstance(remove_tags(b"

one tag

", which_ones=("p")), str) + assert isinstance(remove_tags(b"

one tag

", which_ones=("p",)), str) assert isinstance(remove_tags(b"link", which_ones=("b",)), str) assert isinstance(remove_tags("no tags"), str) assert isinstance(remove_tags("no tags", which_ones=("p",)), str) assert isinstance(remove_tags("

one tag

"), str) - assert isinstance(remove_tags("

one tag

", which_ones=("p")), str) + assert isinstance(remove_tags("

one tag

", which_ones=("p",)), str) assert isinstance(remove_tags("link", which_ones=("b",)), str) def test_remove_tags_without_tags(self): diff --git a/tests/test_url.py b/tests/test_url.py index 94554542..2c0769b1 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1,8 +1,11 @@ import json import os +import sys import unittest +from inspect import isclass from pathlib import Path from timeit import timeit +from typing import Optional, Union, Type, Callable, Tuple, List from urllib.parse import urlparse import pytest @@ -13,6 +16,7 @@ _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE, ) +from w3lib._types import StrOrBytes from w3lib._url import ( _C0_CONTROL_PERCENT_ENCODE_SET, _domain_to_ascii, @@ -41,6 +45,411 @@ url_query_cleaner, ) +# Test cases for URL-to-safe-URL conversions with a URL and an encoding as +# input parameters. +# +# (encoding, input URL, output URL or exception) +SAFE_URL_ENCODING_CASES: List[ + Tuple[Optional[str], StrOrBytes, Union[str, Type[Exception]]] +] = [ + (None, "", ValueError), + (None, "https://example.com", "https://example.com"), + (None, "https://example.com/©", "https://example.com/%C2%A9"), + # Paths are always UTF-8-encoded. + ("iso-8859-1", "https://example.com/©", "https://example.com/%C2%A9"), + # Queries are UTF-8-encoded if the scheme is not special, ws or wss. + ("iso-8859-1", "a://example.com?©", "a://example.com?%C2%A9"), + *( + ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%C2%A9") + for scheme in ("ws", "wss") + ), + *( + ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%A9") + for scheme in _SPECIAL_SCHEMES + if scheme not in {"ws", "wss"} + ), + # Fragments are always UTF-8-encoded. + ("iso-8859-1", "https://example.com#©", "https://example.com#%C2%A9"), +] + +INVALID_SCHEME_FOLLOW_UPS = "".join( + chr(value) + for value in range(0x81) + if ( + chr(value) not in _ASCII_ALPHANUMERIC + and chr(value) not in "+-." 
+ and chr(value) not in _C0_CONTROL_OR_SPACE # stripped + and chr(value) != ":" # separator + ) +) + +SAFE_URL_URL_INVALID_SCHEME_CASES = tuple( + (f"{scheme}://example.com", ValueError) + for scheme in ( + # A scheme is required. + "", + # The first scheme letter must be an ASCII alpha. + # Note: 0x80 is included below to also test non-ASCII example. + *( + chr(value) + for value in range(0x81) + if ( + chr(value) not in _ASCII_ALPHA + and chr(value) not in _C0_CONTROL_OR_SPACE # stripped + and chr(value) != ":" # separator + ) + ), + # The follow-up scheme letters can also be ASCII numbers, plus, hyphen, + # or period. + f"a{INVALID_SCHEME_FOLLOW_UPS}", + ) +) + +SCHEME_NON_FIRST = _ASCII_ALPHANUMERIC + "+-." + +# Username and password characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: % +# Removed for the URL living standard: :;= +USERINFO_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + "&+$," +USERNAME_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in USERINFO_SAFE + and chr(value) not in ":/?#\\[]" + ) +) +USERNAME_ENCODED = "".join(f"%{ord(char):02X}" for char in USERNAME_TO_ENCODE) +PASSWORD_TO_ENCODE = USERNAME_TO_ENCODE + ":" +PASSWORD_ENCODED = "".join(f"%{ord(char):02X}" for char in PASSWORD_TO_ENCODE) + +# Path characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: %[\]^| +PATH_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" +PATH_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in PATH_SAFE + and chr(value) not in "?#\\" + ) +) +PATH_ENCODED = "".join(f"%{ord(char):02X}" for char in PATH_TO_ENCODE) + +# Query characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: %[\]^`{|} +# Removed for the URL living standard: ' (special) +QUERY_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?" 
+QUERY_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in QUERY_SAFE + and chr(value) not in "#" + ) +) +QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in QUERY_TO_ENCODE) +SPECIAL_QUERY_SAFE = QUERY_SAFE.replace("'", "") +SPECIAL_QUERY_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in SPECIAL_QUERY_SAFE + and chr(value) not in "#" + ) +) +SPECIAL_QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in SPECIAL_QUERY_TO_ENCODE) + +# Fragment characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: #%[\\]^{|} +FRAGMENT_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?" +FRAGMENT_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if (chr(value) not in _C0_CONTROL_OR_SPACE and chr(value) not in FRAGMENT_SAFE) +) +FRAGMENT_ENCODED = "".join(f"%{ord(char):02X}" for char in FRAGMENT_TO_ENCODE) + + +# Test cases for URL-to-safe-URL conversions with only a URL as input parameter +# (i.e. no encoding or base URL). +# +# (input URL, output URL or exception) +SAFE_URL_URL_CASES = ( + # Invalid input type + (1, Exception), + (object(), Exception), + # Empty string + ("", ValueError), + # Remove any leading and trailing C0 control or space from input. + *( + (f"{char}https://example.com{char}", "https://example.com") + for char in _C0_CONTROL_OR_SPACE + if char not in _ASCII_TAB_OR_NEWLINE + ), + # Remove all ASCII tab or newline from input. 
+ ( + ( + f"{_ASCII_TAB_OR_NEWLINE}h{_ASCII_TAB_OR_NEWLINE}ttps" + f"{_ASCII_TAB_OR_NEWLINE}:{_ASCII_TAB_OR_NEWLINE}/" + f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}:" + f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b" + f"{_ASCII_TAB_OR_NEWLINE}@{_ASCII_TAB_OR_NEWLINE}exam" + f"{_ASCII_TAB_OR_NEWLINE}ple.com{_ASCII_TAB_OR_NEWLINE}:" + f"{_ASCII_TAB_OR_NEWLINE}1{_ASCII_TAB_OR_NEWLINE}2" + f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}?" + f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b" + f"{_ASCII_TAB_OR_NEWLINE}#{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}" + ), + "https://ab:ab@example.com:12/ab?ab#ab", + ), + # Scheme + (f"{_ASCII_ALPHA}://example.com", f"{_ASCII_ALPHA.lower()}://example.com"), + ( + f"a{SCHEME_NON_FIRST}://example.com", + f"a{SCHEME_NON_FIRST.lower()}://example.com", + ), + *SAFE_URL_URL_INVALID_SCHEME_CASES, + # Authority + ("https://a@example.com", "https://a@example.com"), + ("https://a:@example.com", "https://a:@example.com"), + ("https://a:a@example.com", "https://a:a@example.com"), + ("https://a%3A@example.com", "https://a%3A@example.com"), + ( + f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com", + f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com", + ), + ( + f"https://{USERNAME_TO_ENCODE}:{PASSWORD_TO_ENCODE}@example.com", + f"https://{USERNAME_ENCODED}:{PASSWORD_ENCODED}@example.com", + ), + ("https://@\\example.com", ValueError), + ("https://\x80:\x80@example.com", "https://%C2%80:%C2%80@example.com"), + # Host + ("https://example.com", "https://example.com"), + ("https://.example", "https://.example"), + ("https://\x80.example", ValueError), + ("https://%80.example", ValueError), + # The 4 cases below test before and after crossing DNS length limits on + # domain name labels (63 characters) and the domain name as a whole (253 + # characters). 
However, all cases are expected to pass because the URL + # living standard does not require domain names to be within these limits. + (f"https://{'a' * 63}.example", f"https://{'a' * 63}.example"), + (f"https://{'a' * 64}.example", f"https://{'a' * 64}.example"), + ( + f"https://{'a' * 63}.{'a' * 63}.{'a' * 63}.{'a' * 53}.example", + f"https://{'a' * 63}.{'a' * 63}.{'a' * 63}.{'a' * 53}.example", + ), + ( + f"https://{'a' * 63}.{'a' * 63}.{'a' * 63}.{'a' * 54}.example", + f"https://{'a' * 63}.{'a' * 63}.{'a' * 63}.{'a' * 54}.example", + ), + ("https://ñ.example", "https://xn--ida.example"), + ("http://192.168.0.0", "http://192.168.0.0"), + ("http://192.168.0.256", ValueError), + ("http://192.168.0.0.0", ValueError), + ("http://[2a01:5cc0:1:2::4]", "http://[2a01:5cc0:1:2::4]"), + ("http://[2a01:5cc0:1:2:3:4]", ValueError), + # Port + ("https://example.com:", "https://example.com:"), + ("https://example.com:1", "https://example.com:1"), + ("https://example.com:443", "https://example.com:443"), + # Path + ("https://example.com/", "https://example.com/"), + ("https://example.com/a", "https://example.com/a"), + ("https://example.com\\a", "https://example.com/a"), + ("https://example.com/a\\b", "https://example.com/a/b"), + ( + f"https://example.com/{PATH_SAFE}", + f"https://example.com/{PATH_SAFE}", + ), + ( + f"https://example.com/{PATH_TO_ENCODE}", + f"https://example.com/{PATH_ENCODED}", + ), + ("https://example.com/ñ", "https://example.com/%C3%B1"), + ("https://example.com/ñ%C3%B1", "https://example.com/%C3%B1%C3%B1"), + # Query + ("https://example.com?", "https://example.com?"), + ("https://example.com/?", "https://example.com/?"), + ("https://example.com?a", "https://example.com?a"), + ("https://example.com?a=", "https://example.com?a="), + ("https://example.com?a=b", "https://example.com?a=b"), + ( + f"a://example.com?{QUERY_SAFE}", + f"a://example.com?{QUERY_SAFE}", + ), + ( + f"a://example.com?{QUERY_TO_ENCODE}", + f"a://example.com?{QUERY_ENCODED}", + ), + *( 
+ ( + f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}", + f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}", + ) + for scheme in _SPECIAL_SCHEMES + ), + *( + ( + f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}", + f"{scheme}://example.com?{SPECIAL_QUERY_ENCODED}", + ) + for scheme in _SPECIAL_SCHEMES + ), + ("https://example.com?ñ", "https://example.com?%C3%B1"), + ("https://example.com?ñ%C3%B1", "https://example.com?%C3%B1%C3%B1"), + # Fragment + ("https://example.com#", "https://example.com#"), + ("https://example.com/#", "https://example.com/#"), + ("https://example.com?#", "https://example.com?#"), + ("https://example.com/?#", "https://example.com/?#"), + ("https://example.com#a", "https://example.com#a"), + ( + f"a://example.com#{FRAGMENT_SAFE}", + f"a://example.com#{FRAGMENT_SAFE}", + ), + ( + f"a://example.com#{FRAGMENT_TO_ENCODE}", + f"a://example.com#{FRAGMENT_ENCODED}", + ), + ("https://example.com#ñ", "https://example.com#%C3%B1"), + ("https://example.com#ñ%C3%B1", "https://example.com#%C3%B1%C3%B1"), + # All fields, UTF-8 wherever possible. 
+ ( + "https://ñ:ñ@ñ.example:1/ñ?ñ#ñ", + "https://%C3%B1:%C3%B1@xn--ida.example:1/%C3%B1?%C3%B1#%C3%B1", + ), +) + + +def _test_safe_url_func( + url: StrOrBytes, + *, + encoding: Optional[str] = None, + output: Union[str, Type[Exception]], + func: Callable[..., str], +) -> None: + kwargs = {} + if encoding is not None: + kwargs["encoding"] = encoding + if isclass(output) and issubclass(output, Exception): + with pytest.raises(output): + func(url, **kwargs) + return + actual = func(url, **kwargs) + assert actual == output + assert func(actual, **kwargs) == output # Idempotency + + +def _test_safe_url_string( + url: StrOrBytes, + *, + encoding: Optional[str] = None, + output: Union[str, Type[Exception]], +) -> None: + return _test_safe_url_func( + url, + encoding=encoding, + output=output, + func=safe_url_string, + ) + + +KNOWN_SAFE_URL_STRING_ENCODING_ISSUES = { + (None, ""), # Invalid URL + # UTF-8 encoding is not enforced in non-special URLs, or in URLs with the + # ws or wss schemas. + ("iso-8859-1", "a://example.com?\xa9"), + ("iso-8859-1", "ws://example.com?\xa9"), + ("iso-8859-1", "wss://example.com?\xa9"), + # UTF-8 encoding is not enforced on the fragment. + ("iso-8859-1", "https://example.com#\xa9"), +} + + +@pytest.mark.parametrize( + "encoding,url,output", + tuple( + case + if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + for case in SAFE_URL_ENCODING_CASES + ), +) +def test_safe_url_string_encoding( + encoding: Optional[str], url: StrOrBytes, output: Union[str, Type[Exception]] +) -> None: + _test_safe_url_string(url, encoding=encoding, output=output) + + +KNOWN_SAFE_URL_STRING_URL_ISSUES = { + "", # Invalid URL + *(case[0] for case in SAFE_URL_URL_INVALID_SCHEME_CASES), + # Userinfo characters that the URL living standard requires escaping (:;=) + # are not escaped. 
+ "https://@\\example.com", # Invalid URL + "https://\x80.example", # Invalid domain name (non-visible character) + "https://%80.example", # Invalid domain name (non-visible character) + "http://192.168.0.256", # Invalid IP address + "http://192.168.0.0.0", # Invalid IP address / domain name + "http://[2a01:5cc0:1:2::4]", # https://github.com/scrapy/w3lib/issues/193 + "https://example.com:", # Removes the : + # Does not convert \ to / + "https://example.com\\a", + "https://example.com\\a\\b", + # Encodes \ and / after the first one in the path + "https://example.com/a/b", + "https://example.com/a\\b", + # Some path characters that RFC 2396 and RFC 3986 require escaping (%) + # are not escaped. + f"https://example.com/{PATH_TO_ENCODE}", + # ? is removed + "https://example.com?", + "https://example.com/?", + # Some query characters that RFC 2396 and RFC 3986 require escaping (%) + # are not escaped. + f"a://example.com?{QUERY_TO_ENCODE}", + # Some special query characters that RFC 2396 and RFC 3986 require escaping + # (%) are not escaped. + *( + f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}" + for scheme in _SPECIAL_SCHEMES + ), + # ? and # are removed + "https://example.com#", + "https://example.com/#", + "https://example.com?#", + "https://example.com/?#", + # Some fragment characters that RFC 2396 and RFC 3986 require escaping + # (%) are not escaped. 
+ f"a://example.com#{FRAGMENT_TO_ENCODE}", +} +if sys.version_info < (3, 11, 4): + KNOWN_SAFE_URL_STRING_URL_ISSUES.add("http://[2a01:5cc0:1:2:3:4]") # Invalid IPv6 + + +@pytest.mark.parametrize( + "url,output", + tuple( + case + if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + for case in SAFE_URL_URL_CASES + ), +) +def test_safe_url_string_url( + url: StrOrBytes, output: Union[str, Type[Exception]] +) -> None: + _test_safe_url_string(url, output=output) + TO_ASCII_TEST_DATA_FILE_PATH = Path(__file__).parent / "to-ascii-test-data.json" TO_ASCII_TEST_DATA_KNOWN_ISSUES = ( @@ -562,11 +971,7 @@ def test_safe_url_string_encoding(encoding, url, output): KNOWN_SAFE_URL_STRING_URL_ISSUES = { "", # Invalid URL - *(case[0] for case in SAFE_URL_URL_STRIP_CASES), *(case[0] for case in SAFE_URL_URL_INVALID_SCHEME_CASES), - # %3A gets decoded, going from a "a:" username to a "a" username with an - # empty password. - "https://a%3A@example.com", # Userinfo characters that the URL living standard requires escaping (:;=) # are not escaped. 
f"https://{USERNAME_TO_ENCODE}:{PASSWORD_TO_ENCODE}@example.com", @@ -657,6 +1062,56 @@ def test_safe_url_string_path_encoding(self): safeurl = safe_url_string("http://www.example.com/£", path_encoding="latin-1") self.assertTrue(isinstance(safeurl, str)) self.assertEqual(safeurl, "http://www.example.com/%A3") + self.assertTrue(isinstance(safe_url_string(b"http://example.com/"), str)) + + def test_safe_url_string_remove_ascii_tab_and_newlines(self): + self.assertEqual( + safe_url_string("http://example.com/test\n.html"), + "http://example.com/test.html", + ) + self.assertEqual( + safe_url_string("http://example.com/test\t.html"), + "http://example.com/test.html", + ) + self.assertEqual( + safe_url_string("http://example.com/test\r.html"), + "http://example.com/test.html", + ) + self.assertEqual( + safe_url_string("http://example.com/test\r.html\n"), + "http://example.com/test.html", + ) + self.assertEqual( + safe_url_string("http://example.com/test\r\n.html\t"), + "http://example.com/test.html", + ) + self.assertEqual( + safe_url_string("http://example.com/test\a\n.html"), + "http://example.com/test%07.html", + ) + + def test_safe_url_string_quote_path(self): + safeurl = safe_url_string('http://google.com/"hello"', quote_path=True) + self.assertEqual(safeurl, "http://google.com/%22hello%22") + + safeurl = safe_url_string('http://google.com/"hello"', quote_path=False) + self.assertEqual(safeurl, 'http://google.com/"hello"') + + safeurl = safe_url_string('http://google.com/"hello"') + self.assertEqual(safeurl, "http://google.com/%22hello%22") + + def test_safe_url_string_with_query(self): + safeurl = safe_url_string("http://www.example.com/£?unit=µ") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + safeurl = safe_url_string("http://www.example.com/£?unit=µ", encoding="utf-8") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5") + + 
safeurl = safe_url_string("http://www.example.com/£?unit=µ", encoding="latin-1") + self.assertTrue(isinstance(safeurl, str)) + self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5") safeurl = safe_url_string( "http://www.example.com/£?unit=µ", path_encoding="latin-1" diff --git a/tox.ini b/tox.ini index 60b12f38..56822019 100644 --- a/tox.ini +++ b/tox.ini @@ -4,14 +4,14 @@ # and then run "tox" from this directory. [tox] -envlist = py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black, typing +envlist = py38, py39, py310, py311, py312, pypy3, docs, security, flake8, pylint, black, typing [testenv] deps = pytest !=3.1.1, !=3.1.2 pytest-cov commands = - pytest \ + python -m pytest \ --doctest-modules \ --cov=w3lib --cov-report=term --cov-report=xml \ {posargs:w3lib tests} @@ -27,14 +27,14 @@ basepython = python3 deps = # mypy would error if pytest (or its sub) not found pytest - mypy==0.971 + mypy==1.0.0 commands = - mypy --show-error-codes {posargs: w3lib tests} + mypy --strict {posargs: w3lib tests} [testenv:flake8] basepython = python3 deps = - flake8 + flake8==6.1.0 commands = flake8 \ {posargs:w3lib tests setup.py} @@ -42,7 +42,7 @@ commands = [testenv:pylint] deps = {[testenv]deps} - pylint==2.14.2 + pylint==3.0.0 commands = pylint conftest.py docs setup.py tests w3lib @@ -52,12 +52,8 @@ deps = commands = black {posargs:--check conftest.py setup.py tests w3lib} -[docs] +[testenv:docs] changedir = docs deps = -rdocs/requirements.txt - -[testenv:docs] -changedir = {[docs]changedir} -deps = {[docs]deps} commands = sphinx-build -W -b html . 
{envtmpdir}/html diff --git a/w3lib/__init__.py b/w3lib/__init__.py index fb5f52cb..dd7a1fc7 100644 --- a/w3lib/__init__.py +++ b/w3lib/__init__.py @@ -1,2 +1,2 @@ -__version__ = "2.0.1" +__version__ = "2.1.2" version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split(".")) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 0879ead7..7d46d785 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -136,7 +136,7 @@ def _c18n_encoding(encoding: str) -> str: encoding aliases """ normed = encodings.normalize_encoding(encoding).lower() - return encodings.aliases.aliases.get(normed, normed) + return cast(str, encodings.aliases.aliases.get(normed, normed)) def resolve_encoding(encoding_alias: str) -> Optional[str]: diff --git a/w3lib/html.py b/w3lib/html.py index a31d42bd..f0f0184e 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -66,7 +66,7 @@ def replace_entities( """ - def convert_entity(m: Match) -> str: + def convert_entity(m: Match[str]) -> str: groups = m.groupdict() number = None if groups.get("dec"): @@ -91,7 +91,7 @@ def convert_entity(m: Match) -> str: return bytes((number,)).decode("cp1252") else: return chr(number) - except ValueError: + except (ValueError, OverflowError): pass return "" if remove_illegal and groups.get("semicolon") else m.group(0) @@ -205,7 +205,7 @@ def will_remove(tag: str) -> bool: else: return tag not in keep - def remove_tag(m: Match) -> str: + def remove_tag(m: Match[str]) -> str: tag = m.group(1) return "" if will_remove(tag) else m.group(0) @@ -278,7 +278,9 @@ def unquote_markup( """ - def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]: + def _get_fragments( + txt: str, pattern: Pattern[str] + ) -> Iterable[Union[str, Match[str]]]: offset = 0 for match in pattern.finditer(txt): match_s, match_e = match.span(1) @@ -326,8 +328,8 @@ def get_meta_refresh( baseurl: str = "", encoding: str = "utf-8", ignore_tags: Iterable[str] = ("script", "noscript"), -) -> Tuple[Optional[float], 
Optional[str]]: - """Return the http-equiv parameter of the HTML meta element from the given +) -> Union[Tuple[None, None], Tuple[float, str]]: + """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is an integer containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. diff --git a/w3lib/http.py b/w3lib/http.py index 10d16695..a3e4e174 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -2,7 +2,7 @@ from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping from w3lib.util import to_bytes, to_unicode -HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] +HeadersDictInput = Mapping[bytes, Union[Any, Sequence[bytes]]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] diff --git a/w3lib/url.py b/w3lib/url.py index 0c5faae7..c5c2fea3 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -36,6 +36,10 @@ from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname +from ._infra import ( + _ASCII_TAB_OR_NEWLINE, + _C0_CONTROL_OR_SPACE, +) from ._rfc2396 import ( _RFC2396_ABS_PATH_PERCENT_ENCODE_SET, _RFC2396_FRAGMENT_PERCENT_ENCODE_SET, @@ -56,6 +60,7 @@ _QUERY_PERCENT_ENCODE_SET, _serialize_url, _SPECIAL_QUERY_PERCENT_ENCODE_SET, + _SPECIAL_SCHEMES, _USERINFO_PERCENT_ENCODE_SET, ) from .util import to_unicode @@ -76,13 +81,25 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii") EXTRA_SAFE_CHARS = b"|" # see https://github.com/scrapy/w3lib/pull/25 +RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%" _path_safe_chars = _safe_chars.replace(b"#", b"") -RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" -_ascii_tab_newline_re = re.compile( - 
r"[\t\n\r]" -) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline +# Characters that are safe in all of: +# +# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class +# - RFC 3986 +# - The URL living standard +# +# NOTE: % is currently excluded from these lists of characters, due to +# limitations of the current safe_url_string implementation, but it should also +# be escaped as %25 when it is not already being used as part of an escape +# character. +_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=") +_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|") +_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS +_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'") +_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS _SAFE_USERINFO_PERCENT_ENCODE_SET = ( _USERINFO_PERCENT_ENCODE_SET @@ -158,15 +175,46 @@ def safe_url( return _serialize_url(url, canonicalize=False) -def safe_url_string( +_ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE = { + ord(char): None for char in _ASCII_TAB_OR_NEWLINE +} + + +def _strip(url: str) -> str: + return url.strip(_C0_CONTROL_OR_SPACE).translate( + _ASCII_TAB_OR_NEWLINE_TRANSLATION_TABLE + ) + + +def safe_url_string( # pylint: disable=too-many-locals url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8", quote_path: bool = True, ) -> str: - """Convert the given URL into a legal URL by escaping unsafe characters - according to RFC-3986. Also, ASCII tabs and newlines are removed - as per https://url.spec.whatwg.org/#url-parsing. + """Return a URL equivalent to *url* that a wide range of web browsers and + web servers consider valid. + + *url* is parsed according to the rules of the `URL living standard`_, + and during serialization additional characters are percent-encoded to make + the URL valid by additional URL standards. + + .. 
_URL living standard: https://url.spec.whatwg.org/ + + The returned URL should be valid by *all* of the following URL standards + known to be enforced by modern-day web browsers and web servers: + + - `URL living standard`_ + + - `RFC 3986`_ + + - `RFC 2396`_ and `RFC 2732`_, as interpreted by `Java 8’s java.net.URI + class`_. + + .. _Java 8’s java.net.URI class: https://docs.oracle.com/javase/8/docs/api/java/net/URI.html + .. _RFC 2396: https://www.ietf.org/rfc/rfc2396.txt + .. _RFC 2732: https://www.ietf.org/rfc/rfc2732.txt + .. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt If a bytes URL is given, it is first converted to `str` using the given encoding (which defaults to 'utf-8'). If quote_path is True (default), @@ -180,17 +228,15 @@ def safe_url_string( Calling this function on an already "safe" URL will return the URL unmodified. - - Always returns a native `str` (bytes in Python2, unicode in Python3). """ - # Python3's urlsplit() chokes on bytes input with non-ASCII chars, + # urlsplit() chokes on bytes input with non-ASCII chars, # so let's decode (to Unicode) using page encoding: # - it is assumed that a raw bytes input comes from a document # encoded with the supplied encoding (or UTF8 by default) # - if the supplied (or default) encoding chokes, # percent-encode offending bytes decoded = to_unicode(url, encoding=encoding, errors="percentencode") - parts = urlsplit(_ascii_tab_newline_re.sub("", decoded)) + parts = urlsplit(_strip(decoded)) username, password, hostname, port = ( parts.username, @@ -201,11 +247,11 @@ def safe_url_string( netloc_bytes = b"" if username is not None or password is not None: if username is not None: - safe_username = quote(unquote(username), RFC3986_USERINFO_SAFE_CHARS) + safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS) netloc_bytes += safe_username.encode(encoding) if password is not None: netloc_bytes += b":" - safe_password = quote(unquote(password), RFC3986_USERINFO_SAFE_CHARS) + safe_password = 
quote(unquote(password), _USERINFO_SAFEST_CHARS) netloc_bytes += safe_password.encode(encoding) netloc_bytes += b"@" if hostname is not None: @@ -223,17 +269,22 @@ def safe_url_string( # default encoding for path component SHOULD be UTF-8 if quote_path: - path = quote(parts.path.encode(path_encoding), _path_safe_chars) + path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS) else: path = parts.path + if parts.scheme in _SPECIAL_SCHEMES: + query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS) + else: + query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS) + return urlunsplit( ( parts.scheme, netloc, path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), + query, + quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS), ) ) @@ -373,7 +424,7 @@ def url_query_cleaner( url = "?".join([base, sep.join(querylist)]) if querylist else base if keep_fragments and fragment: url += "#" + fragment - return cast(str, url) + return url def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str: @@ -602,11 +653,8 @@ def canonicalize_url( ) -> str: r"""Canonicalize the given url by applying the following procedures: + - make the URL safe - sort query arguments, first by key, then by value - - percent encode paths ; non-ASCII characters are percent-encoded - using UTF-8 (RFC-3986) - - percent encode query arguments ; non-ASCII characters are percent-encoded - using passed `encoding` (UTF-8 by default) - normalize all spaces (in query arguments) '+' (plus symbol) - normalize percent encodings case (%2f -> %2F) - remove query arguments with blank values (unless `keep_blank_values` is True) @@ -634,7 +682,7 @@ def canonicalize_url( # so we should be covered regarding URL normalization, # if not for proper URL expected by remote website. 
if isinstance(url, str): - url = url.strip() + url = _strip(url) try: scheme, netloc, path, params, query, fragment = _safe_ParseResult( parse_url(url), encoding=encoding or "utf8" @@ -734,7 +782,7 @@ def parse_qsl_to_bytes( # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) # except for the unquote(s, encoding, errors) calls replaced # with unquote_to_bytes(s) - coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args) + coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args) qs, _coerce_result = coerce_args(qs) pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")] r = [] @@ -755,5 +803,5 @@ def parse_qsl_to_bytes( value: StrOrBytes = nv[1].replace("+", " ") value = unquote_to_bytes(value) value = _coerce_result(value) - r.append((cast(bytes, name), cast(bytes, value))) + r.append((name, value)) return r