diff --git a/CHANGELOG.md b/CHANGELOG.md index 45248484..8cf5c6ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ## Changelog +### 0.6.2 +- performance and documentation improved + ### 0.6.1 - code base restructured - bugs fixed and further tests diff --git a/README.rst b/README.rst index 67a1e606..69c8bb13 100644 --- a/README.rst +++ b/README.rst @@ -68,7 +68,7 @@ On the command-line: Features -------- -*htmldate* finds original and updated publication dates of web pages. URLs, HTML files or HTML trees are given as input, the library outputs a date string in the desired format. It provides following ways to date a HTML document: +*htmldate* finds original and updated publication dates of web pages using heuristics on HTML code and linguistic patterns. URLs, HTML files or HTML trees are given as input, the library outputs a date string in the desired format. It provides the following ways to date an HTML document: 1. **Markup in header**: common patterns are used to identify relevant elements (e.g. ``link`` and ``meta`` elements) including `Open Graph protocol `_ attributes and a large number of CMS idiosyncracies 2. **HTML code**: The whole document is then searched for structural markers: ``abbr``/``time`` elements and a series of attributes (e.g. ``postmetadata``) @@ -252,5 +252,5 @@ Feel free to file bug reports on the `issues page `_, `ciso8601 `_, `lxml `_, `dateparser `_ +- `ciso8601 `_, `lxml `_, `dateparser `_ - A few patterns are derived from `python-goose `_, `metascraper `_, `newspaper `_ and `articleDateExtractor `_. This module extends their coverage and robustness significantly. 
diff --git a/htmldate/__init__.py b/htmldate/__init__.py index fe7c66a7..4bee8ed8 100644 --- a/htmldate/__init__.py +++ b/htmldate/__init__.py @@ -7,7 +7,7 @@ __author__ = 'Adrien Barbaresi' __license__ = 'GNU GPL v3' __copyright__ = 'Copyright 2017-2020, Adrien Barbaresi' -__version__ = '0.6.1' +__version__ = '0.6.2' import logging diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 55659675..7bb324f3 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -1,8 +1,8 @@ # pylint:disable-msg=E0611,I1101 """ -Custom parsers and X-Path expressions for date extraction +Custom parsers and XPath expressions for date extraction """ -## This file is available from https://github.com/adbar/trafilatura +## This file is available from https://github.com/adbar/htmldate ## under GNU GPL v3 license # standard diff --git a/setup.py b/setup.py index 24c5d964..9242215d 100644 --- a/setup.py +++ b/setup.py @@ -30,8 +30,8 @@ def readme(): setup( name='htmldate', - version='0.6.1', - description='Find the creation date of web pages using a combination of tree traversal, common structural patterns, text-based heuristics and robust date extraction.', + version='0.6.2', + description='Fast and robust extraction of original and updated publication dates from web pages.', long_description=readme(), classifiers=[ # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers