diff --git a/docs/conf.py b/docs/conf.py
index cb57d42..0d6ec28 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -9,87 +9,88 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
-import sys, os
+import os
+import sys
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath(".."))
 
 # -- General configuration -----------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
-    'hoverxref.extension',
-    'notfound.extension',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.doctest',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.viewcode',
+    "hoverxref.extension",
+    "notfound.extension",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.doctest",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = 'w3lib'
-copyright = '2014, w3lib developers'
+project = "w3lib"
+copyright = "2014, w3lib developers"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The full version, including alpha/beta/rc tags.
-release = '2.1.2'
+release = "2.1.2"
 # The short X.Y version.
-version = '.'.join(release.split('.')[:2])
+version = ".".join(release.split(".")[:2])
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-#language = None
+# language = None
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
+# default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 
 # -- Options for HTML output ---------------------------------------------------
 
@@ -101,26 +102,26 @@
 # Theme options are theme-specific and customize the look and feel of a theme
 # further. For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 
 # The name for this set of Sphinx documents. If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
@@ -129,101 +130,95 @@
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it. The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'w3libdoc'
+htmlhelp_basename = "w3libdoc"
 
 
 # -- Options for LaTeX output --------------------------------------------------
 
 latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+    # The paper size ('letterpaper' or 'a4paper').
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    # 'preamble': '',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
-    ('index', 'w3lib.tex', 'w3lib Documentation',
-     'w3lib developers', 'manual'),
+    ("index", "w3lib.tex", "w3lib Documentation", "w3lib developers", "manual"),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 
 
 # -- Options for manual page output --------------------------------------------
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    ('index', 'w3lib', 'w3lib Documentation',
-     ['w3lib developers'], 1)
-]
+man_pages = [("index", "w3lib", "w3lib Documentation", ["w3lib developers"], 1)]
 
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 
 
 # -- Options for Texinfo output ------------------------------------------------
 
@@ -232,27 +227,33 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    ('index', 'w3lib', 'w3lib Documentation',
-     'w3lib developers', 'w3lib', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        "index",
+        "w3lib",
+        "w3lib Documentation",
+        "w3lib developers",
+        "w3lib",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'pytest': ('https://docs.pytest.org/en/latest', None),
-    'python': ('https://docs.python.org/3', None),
-    'scrapy': ('https://scrapy.readthedocs.io/en/latest', None),
-    'tox': ('https://tox.readthedocs.io/en/latest', None),
+    "pytest": ("https://docs.pytest.org/en/latest", None),
+    "python": ("https://docs.python.org/3", None),
+    "scrapy": ("https://scrapy.readthedocs.io/en/latest", None),
+    "tox": ("https://tox.readthedocs.io/en/latest", None),
 }
diff --git a/setup.py b/setup.py
index c8728de..9825882 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
-from setuptools import setup, find_packages
-
+from setuptools import find_packages, setup
 
 setup(
     name="w3lib",
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 0389e78..2f59e28 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -1,11 +1,11 @@
 import codecs
 import unittest
-from typing import Optional, Union, List, Any
+from typing import Any, List, Optional, Union
 
 from w3lib.encoding import (
     html_body_declared_encoding,
-    http_content_type_encoding,
     html_to_unicode,
+    http_content_type_encoding,
     read_bom,
     resolve_encoding,
     to_unicode,
@@ -39,17 +39,17 @@ def test_bom(self):
         utf32le = b"\xff\xfe\x00\x00\x34\x6c\x00\x00"
         for string in (utf16be, utf16le, utf32be, utf32le):
             bom_encoding, bom = read_bom(string)
-            assert bom_encoding is not None
-            assert bom is not None
+            self.assertIsNotNone(bom_encoding)
+            self.assertIsNotNone(bom)
             decoded = string[len(bom) :].decode(bom_encoding)
             self.assertEqual(water_unicode, decoded)
         # Body without BOM
         enc, bom = read_bom(b"foo")
-        self.assertEqual(enc, None)
-        self.assertEqual(bom, None)
+        self.assertIsNone(enc)
+        self.assertIsNone(bom)
         # Empty body
         enc, bom = read_bom(b"")
-        self.assertEqual(enc, None)
+        self.assertIsNone(enc)
         self.assertEqual(bom, None)
 
     def test_http_encoding_header(self):
@@ -146,7 +146,8 @@ def _assert_encoding(
         expected_encoding: str,
         expected_unicode: Union[str, List[str]],
     ) -> None:
-        assert not isinstance(body, str)
+        if isinstance(body, str):
+            raise AssertionError("body must not be an instance of str")
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
         self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
@@ -208,13 +209,13 @@ def test_replace_wrong_encoding(self):
         encoding, body_unicode = html_to_unicode(ct("utf-8"), b"PREFIX\xe3\xabSUFFIX")
         # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char ('\ufffd')
-        assert "\ufffd" in body_unicode, repr(body_unicode)
-        assert "PREFIX" in body_unicode, repr(body_unicode)
-        assert "SUFFIX" in body_unicode, repr(body_unicode)
+        self.assertIn("\ufffd", body_unicode)
+        self.assertIn("PREFIX", body_unicode)
+        self.assertIn("SUFFIX", body_unicode)
 
         # Do not destroy html tags due to encoding bugs
         encoding, body_unicode = html_to_unicode(ct("utf-8"), b"\xf0<span>value</span>")
-        assert "<span>value</span>" in body_unicode, repr(body_unicode)
+        self.assertIn("<span>value</span>", body_unicode)
 
     def _assert_encoding_detected(
         self,
@@ -223,7 +224,8 @@ def _assert_encoding_detected(
         body: bytes,
         **kwargs: Any,
     ) -> None:
-        assert not isinstance(body, str)
+        if isinstance(body, str):
+            raise AssertionError("body must not be an instance of str")
         encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
         self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
@@ -272,7 +274,7 @@ def test_python_crash(self):
         random.seed(42)
         buf = BytesIO()
         for i in range(150000):
-            buf.write(bytes([random.randint(0, 255)]))
+            buf.write(bytes([random.randint(0, 255)]))  # nosec
         to_unicode(buf.getvalue(), "utf-16-le")
         to_unicode(buf.getvalue(), "utf-16-be")
         to_unicode(buf.getvalue(), "utf-32-le")
diff --git a/tests/test_html.py b/tests/test_html.py
index cf4ec03..3fe42d6 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -16,10 +16,14 @@ class RemoveEntitiesTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_entities(b"no entities"), str)
-        assert isinstance(replace_entities(b"Price: &pound;100!"), str)
-        assert isinstance(replace_entities("no entities"), str)
-        assert isinstance(replace_entities("Price: &pound;100!"), str)
+        if not isinstance(replace_entities(b"no entities"), str):
+            raise AssertionError()
+        if not isinstance(replace_entities(b"Price: &pound;100!"), str):
+            raise AssertionError()
+        if not isinstance(replace_entities("no entities"), str):
+            raise AssertionError()
+        if not isinstance(replace_entities("Price: &pound;100!"), str):
+            raise AssertionError()
 
     def test_regular(self):
         # regular conversions
@@ -105,8 +109,8 @@ def test_encoding(self):
 class ReplaceTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_tags(b"no entities"), str)
-        assert isinstance(replace_tags("no entities"), str)
+        self.assertEqual(isinstance(replace_tags(b"no entities"), str), True)
+        self.assertEqual(isinstance(replace_tags("no entities"), str), True)
 
     def test_replace_tags(self):
         self.assertEqual(
@@ -127,10 +131,14 @@ def test_replace_tags_multiline(self):
 class RemoveCommentsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_comments(b"without comments"), str)
-        assert isinstance(remove_comments(b"<!-- with comments -->"), str)
-        assert isinstance(remove_comments("without comments"), str)
-        assert isinstance(remove_comments("<!-- with comments -->"), str)
+        self.assertEqual(isinstance(remove_comments(b"without comments"), str), True)
+        self.assertEqual(
+            isinstance(remove_comments(b"<!-- with comments -->"), str), True
+        )
+        self.assertEqual(isinstance(remove_comments("without comments"), str), True)
+        self.assertEqual(
+            isinstance(remove_comments("<!-- with comments -->"), str), True
+        )
 
     def test_no_comments(self):
         # text without comments
@@ -157,16 +165,28 @@ def test_remove_comments(self):
 class RemoveTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags(b"no tags"), str)
-        assert isinstance(remove_tags(b"no tags", which_ones=("p",)), str)
-        assert isinstance(remove_tags(b"<p>one tag</p>"), str)
-        assert isinstance(remove_tags(b"<p>one tag</p>", which_ones=("p",)), str)
-        assert isinstance(remove_tags(b"<a>link</a>", which_ones=("b",)), str)
-        assert isinstance(remove_tags("no tags"), str)
-        assert isinstance(remove_tags("no tags", which_ones=("p",)), str)
-        assert isinstance(remove_tags("<p>one tag</p>"), str)
-        assert isinstance(remove_tags("<p>one tag</p>", which_ones=("p",)), str)
-        assert isinstance(remove_tags("<a>link</a>", which_ones=("b",)), str)
+        self.assertEqual(isinstance(remove_tags(b"no tags"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags(b"no tags", which_ones=("p",)), str), True
+        )
+        self.assertEqual(isinstance(remove_tags(b"<p>one tag</p>"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags(b"<p>one tag</p>", which_ones=("p",)), str), True
+        )
+        self.assertEqual(
+            isinstance(remove_tags(b"<a>link</a>", which_ones=("b",)), str), True
+        )
+        self.assertEqual(isinstance(remove_tags("no tags"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags("no tags", which_ones=("p",)), str), True
+        )
+        self.assertEqual(isinstance(remove_tags("<p>one tag</p>"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags("<p>one tag</p>", which_ones=("p",)), str), True
+        )
+        self.assertEqual(
+            isinstance(remove_tags("<a>link</a>", which_ones=("b",)), str), True
+        )
 
     def test_remove_tags_without_tags(self):
         # text without tags
@@ -232,21 +252,37 @@ def test_uppercase_tags(self):
 class RemoveTagsWithContentTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags_with_content(b"no tags"), str)
-        assert isinstance(remove_tags_with_content(b"no tags", which_ones=("p",)), str)
-        assert isinstance(
-            remove_tags_with_content(b"<p>one tag</p>", which_ones=("p",)), str
+        self.assertEqual(isinstance(remove_tags_with_content(b"no tags"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags_with_content(b"no tags", which_ones=("p",)), str),
+            True,
+        )
+        self.assertEqual(
+            isinstance(
+                remove_tags_with_content(b"<p>one tag</p>", which_ones=("p",)), str
+            ),
+            True,
         )
-        assert isinstance(
-            remove_tags_with_content(b"<a>link</a>", which_ones=("b",)), str
+        self.assertEqual(
+            isinstance(
+                remove_tags_with_content(b"<a>link</a>", which_ones=("b",)), str
+            ),
+            True,
         )
-        assert isinstance(remove_tags_with_content("no tags"), str)
-        assert isinstance(remove_tags_with_content("no tags", which_ones=("p",)), str)
-        assert isinstance(
-            remove_tags_with_content("<p>one tag</p>", which_ones=("p",)), str
+        self.assertEqual(isinstance(remove_tags_with_content("no tags"), str), True)
+        self.assertEqual(
+            isinstance(remove_tags_with_content("no tags", which_ones=("p",)), str),
+            True,
+        )
+        self.assertEqual(
+            isinstance(
+                remove_tags_with_content("<p>one tag</p>", which_ones=("p",)), str
+            ),
+            True,
         )
-        assert isinstance(
-            remove_tags_with_content("<a>link</a>", which_ones=("b",)), str
+        self.assertEqual(
+            isinstance(remove_tags_with_content("<a>link</a>", which_ones=("b",)), str),
+            True,
         )
 
     def test_without_tags(self):
@@ -289,13 +325,25 @@ def test_tags_with_shared_prefix(self):
 class ReplaceEscapeCharsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(replace_escape_chars(b"no ec"), str)
-        assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
-        assert isinstance(replace_escape_chars(b"no ec", replace_by="str"), str)
-        assert isinstance(replace_escape_chars(b"no ec", which_ones=("\n", "\t")), str)
-        assert isinstance(replace_escape_chars("no ec"), str)
-        assert isinstance(replace_escape_chars("no ec", replace_by="str"), str)
-        assert isinstance(replace_escape_chars("no ec", which_ones=("\n", "\t")), str)
+        self.assertEqual(isinstance(replace_escape_chars(b"no ec"), str), True)
+        self.assertEqual(
+            isinstance(replace_escape_chars(b"no ec", replace_by="str"), str), True
+        )
+        self.assertEqual(
+            isinstance(replace_escape_chars(b"no ec", replace_by="str"), str), True
+        )
+        self.assertEqual(
+            isinstance(replace_escape_chars(b"no ec", which_ones=("\n", "\t")), str),
+            True,
+        )
+        self.assertEqual(isinstance(replace_escape_chars("no ec"), str), True)
+        self.assertEqual(
+            isinstance(replace_escape_chars("no ec", replace_by="str"), str), True
+        )
+        self.assertEqual(
+            isinstance(replace_escape_chars("no ec", which_ones=("\n", "\t")), str),
+            True,
+        )
 
     def test_without_escape_chars(self):
         # text without escape chars
@@ -325,7 +373,6 @@ def test_with_escape_chars(self):
 
 
 class UnquoteMarkupTest(unittest.TestCase):
-
     sample_txt1 = """<node1>hi, this is sample text with entities: &amp; &copy;</node1>"""
     sample_txt2 = (
@@ -335,8 +382,10 @@ class UnquoteMarkupTest(unittest.TestCase):
 
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(unquote_markup(self.sample_txt1.encode("latin-1")), str)
-        assert isinstance(unquote_markup(self.sample_txt2), str)
+        self.assertEqual(
+            isinstance(unquote_markup(self.sample_txt1.encode("latin-1")), str), True
+        )
+        self.assertEqual(isinstance(unquote_markup(self.sample_txt2), str), True)
 
     def test_unquote_markup(self):
         self.assertEqual(
diff --git a/tests/test_http.py b/tests/test_http.py
index 76a1ff1..2125e9e 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -1,5 +1,6 @@
 import unittest
 from collections import OrderedDict
+
 from w3lib.http import (
     HeadersDictInput,
     basic_auth_header,
diff --git a/tests/test_url.py b/tests/test_url.py
index 2960d5e..a6c01e6 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1,8 +1,8 @@
-import sys
 import os
+import sys
 import unittest
 from inspect import isclass
-from typing import Optional, Union, Type, Callable, Tuple, List
+from typing import Callable, List, Optional, Tuple, Type, Union
 from urllib.parse import urlparse
 
 import pytest
@@ -27,8 +27,8 @@
     path_to_file_uri,
     safe_download_url,
     safe_url_string,
-    url_query_parameter,
     url_query_cleaner,
+    url_query_parameter,
 )
 
 # Test cases for URL-to-safe-URL conversions with a URL and an encoding as
@@ -327,12 +327,18 @@ def _test_safe_url_func(
     if encoding is not None:
         kwargs["encoding"] = encoding
     if isclass(output) and issubclass(output, Exception):
-        with pytest.raises(output):
+        try:
             func(url, **kwargs)
+        except output:
+            pass
+        else:
+            raise AssertionError(f"Expected exception {output} was not raised.")
         return
     actual = func(url, **kwargs)
-    assert actual == output
-    assert func(actual, **kwargs) == output  # Idempotency
+    if actual != output:
+        raise AssertionError(f"Expected {output}, but got {actual}.")
+    if func(actual, **kwargs) != output:
+        raise AssertionError("Idempotency check failed.")
 
 
 def _test_safe_url_string(
@@ -364,9 +370,11 @@ def _test_safe_url_string(
 @pytest.mark.parametrize(
     "encoding,url,output",
     tuple(
-        case
-        if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES
-        else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        (
+            case
+            if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES
+            else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        )
         for case in SAFE_URL_ENCODING_CASES
     ),
 )
@@ -425,9 +433,11 @@ def test_safe_url_string_encoding(
 @pytest.mark.parametrize(
     "url,output",
     tuple(
-        case
-        if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES
-        else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        (
+            case
+            if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES
+            else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        )
         for case in SAFE_URL_URL_CASES
     ),
 )
@@ -713,7 +723,8 @@ def test_safe_url_string_preserve_nonfragment_hash(self):
 
     def test_safe_url_string_encode_idna_domain_with_port(self):
         self.assertEqual(
-            safe_url_string("http://新华网.中国:80"), "http://xn--xkrr14bows.xn--fiqs8s:80"
+            safe_url_string("http://新华网.中国:80"),
+            "http://xn--xkrr14bows.xn--fiqs8s:80",
         )
 
     def test_safe_url_string_encode_idna_domain_with_username_password_and_port_number(
@@ -1134,8 +1145,12 @@ def test_canonicalize_url(self):
         )
 
     def test_return_str(self):
-        assert isinstance(canonicalize_url("http://www.example.com"), str)
-        assert isinstance(canonicalize_url(b"http://www.example.com"), str)
+        self.assertEqual(
+            isinstance(canonicalize_url("http://www.example.com"), str), True
+        )
+        self.assertEqual(
+            isinstance(canonicalize_url(b"http://www.example.com"), str), True
+        )
 
     def test_append_missing_path(self):
         self.assertEqual(
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 7d46d78..c5c7526 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -1,13 +1,14 @@
 """
 Functions for handling encoding of web pages
 """
-import re
+
 import codecs
 import encodings
+import re
 from typing import Callable, Match, Optional, Tuple, Union, cast
 
-from w3lib._types import AnyUnicodeError, StrOrBytes
 import w3lib.util
+from w3lib._types import AnyUnicodeError, StrOrBytes
 
 _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)
@@ -51,7 +52,13 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 # pylint: disable=consider-using-f-string
 _BODY_ENCODING_PATTERN = (
     r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
-    % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
+    % (
+        _SKIP_ATTRS,
+        _HTTPEQUIV_RE,
+        _CONTENT_RE,
+        _CONTENT2_RE,
+        _XML_ENCODING_RE,
+    )
 )
 _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
 _BODY_ENCODING_BYTES_RE = re.compile(
diff --git a/w3lib/html.py b/w3lib/html.py
index f0f0184..760c0da 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -4,12 +4,12 @@
 
 import re
 from html.entities import name2codepoint
-from typing import Iterable, Match, AnyStr, Optional, Pattern, Tuple, Union
+from typing import AnyStr, Iterable, Match, Optional, Pattern, Tuple, Union
 from urllib.parse import urljoin
 
-from w3lib.util import to_unicode
-from w3lib.url import safe_url_string
 from w3lib._types import StrOrBytes
+from w3lib.url import safe_url_string
+from w3lib.util import to_unicode
 
 _ent_re = re.compile(
     r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
diff --git a/w3lib/http.py b/w3lib/http.py
index a3e4e17..bdb3f66 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,5 +1,6 @@
 from base64 import b64encode
-from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping
+from typing import Any, AnyStr, List, Mapping, MutableMapping, Optional, Sequence, Union
+
 from w3lib.util import to_bytes, to_unicode
 
 HeadersDictInput = Mapping[bytes, Union[Any, Sequence[bytes]]]
diff --git a/w3lib/url.py b/w3lib/url.py
index 485e694..52cf6ad 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -2,6 +2,7 @@
 This module contains general purpose URL functions not found in the standard
 library.
 """
+
 import base64
 import codecs
 import os
@@ -9,7 +10,6 @@
 import re
 import string
 from typing import (
-    cast,
     Callable,
     Dict,
     List,
@@ -18,12 +18,15 @@
     Sequence,
     Tuple,
     Union,
+    cast,
 )
+from urllib.parse import _coerce_args  # type: ignore
 from urllib.parse import (
+    ParseResult,
     parse_qs,
     parse_qsl,
-    ParseResult,
     quote,
+    unquote,
     unquote_to_bytes,
     urldefrag,
     urlencode,
@@ -31,15 +34,13 @@
     urlsplit,
     urlunparse,
     urlunsplit,
-    unquote,
 )
-from urllib.parse import _coerce_args  # type: ignore
 from urllib.request import pathname2url, url2pathname
 
-from .util import to_unicode
 from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE
 from ._types import AnyUnicodeError, StrOrBytes
 from ._url import _SPECIAL_SCHEMES
+from .util import to_unicode
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
diff --git a/w3lib/util.py b/w3lib/util.py
index 70f4ef5..61426e8 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,5 +1,5 @@
-from warnings import warn
 from typing import Optional
+from warnings import warn
 
 from w3lib._types import StrOrBytes
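
Note on the recurring test change above: bare `assert` statements are compiled away when Python runs with `-O`, and they are flagged by security linters such as bandit (check B101); `# nosec`, added on the `random.randint` line in test_encoding.py, is bandit's suppression marker. The diff therefore swaps bare asserts for unittest assertion methods or explicit `raise AssertionError`. A minimal sketch of the pattern, using a hypothetical `is_even` helper that is not part of w3lib:

    import unittest


    def is_even(n: int) -> bool:
        # Hypothetical helper, for illustration only.
        return n % 2 == 0


    class IsEvenTest(unittest.TestCase):
        def test_is_even(self) -> None:
            # Before: `assert is_even(4)` -- silently skipped under `python -O`.
            # After: an assertion method that survives -O and reports a
            # readable message on failure.
            self.assertTrue(is_even(4))


    if __name__ == "__main__":
        unittest.main()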