Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a safe_url based on all standards #221

Draft
wants to merge 36 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
4798aaf
test_safe_url_idna: adjust test data to the test scope
Gallaecio Nov 3, 2022
86711dd
Fix pytest-cov warning
Gallaecio Nov 4, 2022
030a996
Implement safe_url and provide partial test coverage
Gallaecio Nov 8, 2022
6f0dc16
Merge remote-tracking branch 'upstream/master' into safer-url
Gallaecio Nov 8, 2022
17ee8d6
Progress on typing
Gallaecio Nov 8, 2022
aef0532
Address typing issues
Gallaecio Nov 9, 2022
abe4032
Fix unicode encoding in userinfo and IPv6 parsing
Gallaecio Nov 9, 2022
86435df
Progress on tests
Gallaecio Nov 11, 2022
79193bf
Address issues reported by tests and static checkers
Gallaecio Nov 15, 2022
82f6e66
Complete test coverage for safe_url
Gallaecio Nov 15, 2022
81fc830
Progress on passing upstream tests
Gallaecio Nov 15, 2022
7e72474
Solve issues previously marked as expected failures
Gallaecio Nov 16, 2022
2c62859
Validate host parsing according to upstream URL living standard tests
Gallaecio Nov 16, 2022
3623c35
Cover port and path in upstream tests
Gallaecio Nov 17, 2022
2513ea1
Cover query and fragment in upstream tests
Gallaecio Nov 17, 2022
7d92740
Add upstream tests for domain_to_ascii
Gallaecio Nov 17, 2022
6a080da
Add upstream tests for percent encoding
Gallaecio Nov 17, 2022
79eafeb
Add a performance test
Gallaecio Nov 17, 2022
3028165
Add idna.txt to source distribution
Gallaecio Nov 17, 2022
091905e
2x speed-up at the cost of mishandling invalid code points
Gallaecio Nov 17, 2022
fea33a6
_percent_encode_after_encoding: 0.2 speed-up
Gallaecio Nov 17, 2022
184e5a7
0.1 speed-up
Gallaecio Nov 17, 2022
4c21a0e
x1.2 speed up
Gallaecio Nov 17, 2022
c317f9e
x1.67 speed up
Gallaecio Nov 17, 2022
7152d0b
Remove unused code
Gallaecio Nov 17, 2022
9daca87
x1.125 speed up
Gallaecio Nov 17, 2022
88f32ae
Use urllib.parse.unquote
Gallaecio Nov 18, 2022
63a2f2f
Initial use of Cython (x7 speedup)
Gallaecio Feb 12, 2024
eefbb9e
Use Cython for additional files (x1.5 speedup)
Gallaecio Feb 12, 2024
8185cdc
Merge _encoding into _url and use uchar for state control
Gallaecio Feb 13, 2024
bd15eb9
Update test expectations
Gallaecio Feb 13, 2024
5b18c9a
Merge remote-tracking branch 'scrapy/master' into safer-url
Gallaecio Feb 13, 2024
4960889
WIP
Gallaecio Feb 13, 2024
25ecb97
Refactor _URL
Gallaecio Feb 13, 2024
66b9154
Minor improvements
Gallaecio Feb 13, 2024
8f1f809
Minor change
Gallaecio Feb 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .bandit.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
skips:
- B101 # Use of assert; we use them for mypy
- B107
1 change: 0 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
[run]
branch = true
include = w3lib/*
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ coverage.xml
/index.txt
.dmypy.json
.hypothesis/
*.so
*.html
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Include C and Cython files
include **/*.c **/*.pyx

# Include tests into distribution
recursive-include tests *.py *.txt

Expand All @@ -10,3 +13,4 @@ include NEWS
include README.rst
include pytest.ini
include tox.ini
include w3lib/idna.txt
47 changes: 46 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,47 @@
from setuptools import setup, find_packages
import os
from setuptools import setup, find_packages, Extension


# https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#distributing-cython-modules
def no_cythonize(extensions, **_ignore):
    """Retarget extensions at pre-generated C/C++ files instead of Cython sources.

    For every extension, each source ending in ``.pyx`` or ``.py`` is replaced
    by the same path with a ``.cpp`` suffix (when the extension language is
    ``"c++"``) or a ``.c`` suffix (otherwise). Other sources are kept as they
    are. The ``sources`` list of each extension is updated in place.

    Extra keyword arguments are accepted and ignored so that this function can
    be called with the same arguments as ``cythonize``.

    Returns the same ``extensions`` object that was passed in.
    """
    for extension in extensions:
        rewritten = []
        for source in extension.sources:
            stem, suffix = os.path.splitext(source)
            if suffix not in (".pyx", ".py"):
                # Not a Cython/Python source: keep the path untouched.
                rewritten.append(source)
                continue
            generated_suffix = ".cpp" if extension.language == "c++" else ".c"
            rewritten.append(stem + generated_suffix)
        # Slice assignment keeps the identity of the original list object.
        extension.sources[:] = rewritten
    return extensions

# One extension module per private Cython source file: each name below maps
# the source w3lib/_<name>.pyx to the importable module w3lib._<name>.
extensions = [
    Extension(f"w3lib._{name}", [f"w3lib/_{name}.pyx"])
    for name in (
        "infra",
        "rfc2396",
        "rfc3986",
        "rfc5892",
        "types",
        "url",
        "util",
        "utr46",
    )
]

# Build-mode switch driven by environment variables (each parsed as an
# integer, unset meaning 0):
#   CYTHONIZE      - non-zero: run Cython on the .pyx sources; otherwise build
#                    from the already-generated C/C++ files.
#   CYTHON_PROFILE - non-zero: enable the Cython "profile" compiler directive.
#   CYTHON_FORCE   - non-zero: pass force=True to cythonize().
if not bool(int(os.getenv("CYTHONIZE", 0))):
    extensions = no_cythonize(extensions)
else:
    # Imported lazily so that Cython is only needed when CYTHONIZE is enabled.
    from Cython.Build import cythonize

    compiler_directives = {
        "language_level": 3,
        "profile": bool(int(os.getenv("CYTHON_PROFILE", 0))),
    }
    extensions = cythonize(
        extensions,
        compiler_directives=compiler_directives,
        force=bool(int(os.getenv("CYTHON_FORCE", 0))),
    )

setup(
name="w3lib",
version="2.1.2",
Expand All @@ -22,6 +63,10 @@
zip_safe=False,
platforms=["Any"],
python_requires=">=3.8",
install_requires=[
"idna",
],
ext_modules=extensions,
classifiers=[
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: BSD License",
Expand Down
50 changes: 50 additions & 0 deletions tests/percent-encoding-test-data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
[
"Source: https://github.com/web-platform-tests/wpt/blob/77d95d52351ebf51bc80ee21eea455de31a5e356/url/resources/percent-encoding.json",

"Tests for percent-encoding.",
{
"input": "\u2020",
"output": {
"big5": "%26%238224%3B",
"euc-kr": "%A2%D3",
"utf-8": "%E2%80%A0",
"windows-1252": "%86"
}
},
"This uses a trailing A to prevent the URL parser from trimming the C0 control.",
{
"input": "\u000EA",
"output": {
"big5": "%0EA",
"iso-2022-jp": "%26%2365533%3BA",
"utf-8": "%0EA"
}
},
{
"input": "\u203E\u005C",
"output": {
"iso-2022-jp": "%1B(J~%1B(B\\",
"utf-8": "%E2%80%BE\\"
}
},
{
"input": "\uE5E5",
"output": {
"gb18030": "%26%2358853%3B",
"utf-8": "%EE%97%A5"
}
},
{
"input": "\u2212",
"output": {
"shift_jis": "%81|",
"utf-8": "%E2%88%92"
}
},
{
"input": "á|",
"output": {
"utf-8": "%C3%A1|"
}
}
]
23 changes: 23 additions & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import unittest
from typing import Optional, Union, List, Any

import pytest

from w3lib._encoding import _get_encoding, _LABEL_ENCODINGS
from w3lib.encoding import (
html_body_declared_encoding,
http_content_type_encoding,
Expand All @@ -11,6 +14,26 @@
to_unicode,
)

# Encodings from the spec that Python does not support.
# test_get_encoding_python skips its codecs.lookup() check for these names.
_UNSUPPORTED_ENCODINGS = {
    "iso-8859-8-i",  # https://bugs.python.org/msg213772
    "replacement",  # Not an actual encoding
    # "x-user-defined" is not supported by Python either.
    # We could bring support to it with the webencodings package.
    "x-user-defined",
}


@pytest.mark.parametrize("label,name", tuple(_LABEL_ENCODINGS.items()))
def test_get_encoding_python(label, name):
    """The encodings that _get_encoding can return must work as encoding
    aliases in Python.

    Runs once per ``(label, name)`` entry of ``_LABEL_ENCODINGS``: the label
    must resolve to its mapped name, and that name must be known to
    :func:`codecs.lookup` unless it is listed in ``_UNSUPPORTED_ENCODINGS``.
    """
    assert _get_encoding(label) == name
    if name not in _UNSUPPORTED_ENCODINGS:
        codecs.lookup(name)  # Raises LookupError if not found.


class RequestEncodingTests(unittest.TestCase):
utf8_fragments = [
Expand Down
1 change: 1 addition & 0 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest

from w3lib._infra import _C0_CONTROL_OR_SPACE
from w3lib.html import (
get_base_url,
get_meta_refresh,
Expand Down
Loading
Loading