Initial commit with translated description

2026-03-29 13:18:46 +08:00
commit 37406a2ec4
51 changed files with 9545 additions and 0 deletions

View File

@@ -0,0 +1 @@
feedparser

View File

@@ -0,0 +1,48 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from .api import parse
from .datetimes import registerDateHandler
from .exceptions import *
from .util import FeedParserDict
__author__ = 'Kurt McKee <contactme@kurtmckee.org>'
__license__ = 'BSD 2-clause'
__version__ = '6.0.12'
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "feedparser/%s +https://github.com/kurtmckee/feedparser/" % __version__
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
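# Illustrative usage sketch (not part of the distributed module): the names
# below are the public API defined above; the in-memory feed bytes are a
# made-up minimal example.
#
#     >>> import io
#     >>> import feedparser
#     >>> feedparser.USER_AGENT = 'MyApp/1.0 +https://example.com/'  # hypothetical app
#     >>> result = feedparser.parse(io.BytesIO(b"<rss version='2.0'><channel><title>t</title></channel></rss>"))
#     >>> result.version
#     'rss20'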

View File

@@ -0,0 +1,277 @@
# The public API for feedparser
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import io
import urllib.error
import urllib.parse
import xml.sax
from .datetimes import registerDateHandler, _parse_date
from .encodings import convert_to_utf8
from .exceptions import *
from .html import _BaseHTMLProcessor
from . import http
from . import mixin
from .mixin import _FeedParserMixin
from .parsers.loose import _LooseFeedParser
from .parsers.strict import _StrictFeedParser
from .sanitizer import replace_doctype
from .sgml import *
from .urls import convert_to_idn, make_safe_absolute_uri
from .util import FeedParserDict
# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
_XML_AVAILABLE = True
SUPPORTED_VERSIONS = {
'': 'unknown',
'rss090': 'RSS 0.90',
'rss091n': 'RSS 0.91 (Netscape)',
'rss091u': 'RSS 0.91 (Userland)',
'rss092': 'RSS 0.92',
'rss093': 'RSS 0.93',
'rss094': 'RSS 0.94',
'rss20': 'RSS 2.0',
'rss10': 'RSS 1.0',
'rss': 'RSS (unknown version)',
'atom01': 'Atom 0.1',
'atom02': 'Atom 0.2',
'atom03': 'Atom 0.3',
'atom10': 'Atom 1.0',
'atom': 'Atom (unknown version)',
'cdf': 'CDF',
}
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result):
"""URL, filename, or string --> stream
This function lets you define parsers that take any input source
(URL, pathname to local or network file, or actual data as a string)
and deal with it in a uniform manner. Returned object is guaranteed
to have all the basic stdio read methods (read, readline, readlines).
Just .close() the object when you're done with it.
If the etag argument is supplied, it will be used as the value of an
If-None-Match request header.
If the modified argument is supplied, it can be a tuple of 9 integers
(as returned by gmtime() in the standard Python time module) or a date
string in any format supported by feedparser. Regardless, it MUST
be in GMT (Greenwich Mean Time). It will be reformatted into an
RFC 1123-compliant date and used as the value of an If-Modified-Since
request header.
If the agent argument is supplied, it will be used as the value of a
User-Agent request header.
If the referrer argument is supplied, it will be used as the value of a
Referer[sic] request header.
If handlers is supplied, it is a list of handlers used to build a
urllib.request opener.
If request_headers is supplied, it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
:return: A bytes object.
"""
if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string.read()
if isinstance(url_file_stream_or_string, str) \
and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
with open(url_file_stream_or_string, 'rb') as f:
data = f.read()
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by
# sys.getfilesystemencoding(), a UnicodeEncodeError
# will be thrown
# If url_file_stream_or_string is a string that contains NULL
# (such as an XML document encoded in UTF-32), TypeError will
# be thrown.
pass
else:
return data
# treat url_file_stream_or_string as string
if not isinstance(url_file_stream_or_string, bytes):
return url_file_stream_or_string.encode('utf-8')
return url_file_stream_or_string
LooseFeedParser = type(
'LooseFeedParser',
(_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
{},
)
StrictFeedParser = type(
'StrictFeedParser',
(_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
{},
)
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
"""Parse a feed from a URL, file, stream, or string.
:param url_file_stream_or_string:
File-like object, URL, file path, or string. Both byte and text strings
are accepted. If necessary, encoding will be derived from the response
headers or automatically detected.
Note that strings may trigger network I/O or filesystem access
depending on the value. Wrap an untrusted string in
a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
pass untrusted strings to this function.
When a URL is not passed, the feed location to use in relative URL
resolution should be passed in the ``Content-Location`` response header
(see ``response_headers`` below).
:param str etag: HTTP ``ETag`` value, sent as the ``If-None-Match``
request header.
:param modified: HTTP ``Last-Modified`` value, sent as the
``If-Modified-Since`` request header.
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
:class:`datetime.datetime`
:param str agent: HTTP ``User-Agent`` request header, which defaults to
the value of :data:`feedparser.USER_AGENT`.
:param referrer: HTTP ``Referer`` [sic] request header.
:param request_headers:
A mapping of HTTP header name to HTTP header value to add to the
request, overriding internally generated values.
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param response_headers:
A mapping of HTTP header name to HTTP header value. Multiple values may
be joined with a comma. If a HTTP request was made, these headers
override any matching headers in the response. Otherwise this specifies
the entirety of the response headers.
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param bool resolve_relative_uris:
Should feedparser attempt to resolve relative URIs to absolute ones
within HTML content? Defaults to the value of
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
:param bool sanitize_html:
Should feedparser sanitize potentially unsafe HTML content? Only
disable this if you know what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:return: A :class:`FeedParserDict`.
"""
if not agent or sanitize_html is None or resolve_relative_uris is None:
import feedparser
if not agent:
agent = feedparser.USER_AGENT
if sanitize_html is None:
sanitize_html = feedparser.SANITIZE_HTML
if resolve_relative_uris is None:
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
result = FeedParserDict(
bozo=False,
entries=[],
feed=FeedParserDict(),
headers={},
)
try:
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
except urllib.error.URLError as error:
result.update({
'bozo': True,
'bozo_exception': error,
})
return result
if not data:
return result
# overwrite existing headers using response_headers
result['headers'].update(response_headers or {})
data = convert_to_utf8(result['headers'], data, result)
use_strict_parser = bool(result['encoding'])
result['version'], data, entities = replace_doctype(data)
# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '')
href = result.get('href', '')
baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href
baselang = result['headers'].get('content-language', None)
if isinstance(baselang, bytes):
baselang = baselang.decode('utf-8', 'ignore')
if not _XML_AVAILABLE:
use_strict_parser = False
if use_strict_parser:
# initialize the SAX parser
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
try:
# disable downloading external doctype references, if possible
saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
except xml.sax.SAXNotSupportedException:
pass
saxparser.setContentHandler(feedparser)
saxparser.setErrorHandler(feedparser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(io.BytesIO(data))
try:
saxparser.parse(source)
except xml.sax.SAXException as e:
result['bozo'] = True
result['bozo_exception'] = feedparser.exc or e
use_strict_parser = False
if not use_strict_parser:
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
feedparser.feed(data.decode('utf-8', 'replace'))
result['feed'] = feedparser.feeddata
result['entries'] = feedparser.entries
result['version'] = result['version'] or feedparser.version
result['namespaces'] = feedparser.namespaces_in_use
return result
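# Illustrative sketch of the fallback above (hypothetical input): malformed
# XML makes the strict SAX parser fail, sets ``bozo``, and hands the data to
# the loose parser, which still extracts what it can.
#
#     >>> result = parse("<rss version='2.0'><channel><title>t</channel></rss>")
#     >>> result.bozo
#     True
#     >>> result.version
#     'rss20'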

View File

@@ -0,0 +1,70 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from .asctime import _parse_date_asctime
from .greek import _parse_date_greek
from .hungarian import _parse_date_hungarian
from .iso8601 import _parse_date_iso8601
from .korean import _parse_date_onblog, _parse_date_nate
from .perforce import _parse_date_perforce
from .rfc822 import _parse_date_rfc822
from .w3dtf import _parse_date_w3dtf
_date_handlers = []
def registerDateHandler(func):
"""Register a date handler function (takes string, returns 9-tuple date in GMT)"""
_date_handlers.insert(0, func)
def _parse_date(date_string):
"""Parses a variety of date formats into a 9-tuple in GMT"""
if not date_string:
return None
for handler in _date_handlers:
try:
date9tuple = handler(date_string)
except (KeyError, OverflowError, ValueError, AttributeError):
continue
if not date9tuple:
continue
if len(date9tuple) != 9:
continue
return date9tuple
return None
registerDateHandler(_parse_date_onblog)
registerDateHandler(_parse_date_nate)
registerDateHandler(_parse_date_greek)
registerDateHandler(_parse_date_hungarian)
registerDateHandler(_parse_date_perforce)
registerDateHandler(_parse_date_asctime)
registerDateHandler(_parse_date_iso8601)
registerDateHandler(_parse_date_rfc822)
registerDateHandler(_parse_date_w3dtf)
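# Illustrative sketch: a custom handler for a made-up compact format.
# Handlers registered last are tried first, ahead of the built-ins above.
#
#     >>> import time
#     >>> def parse_compact(date_string):
#     ...     # hypothetical '20040104T162906Z' format
#     ...     return time.strptime(date_string, '%Y%m%dT%H%M%SZ')
#     >>> registerDateHandler(parse_compact)
#     >>> _parse_date('20040104T162906Z')[:6]
#     (2004, 1, 4, 16, 29, 6)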

View File

@@ -0,0 +1,71 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from .rfc822 import _parse_date_rfc822
_months = [
'jan',
'feb',
'mar',
'apr',
'may',
'jun',
'jul',
'aug',
'sep',
'oct',
'nov',
'dec',
]
def _parse_date_asctime(dt):
"""Parse asctime-style dates.
Converts asctime to RFC822-compatible dates and uses the RFC822 parser
to do the actual parsing.
Supported formats (format is standardized to the first one listed):
* {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
* {weekday name} {month name} dd hh:mm:ss yyyy
"""
parts = dt.split()
# Insert a GMT timezone, if needed.
if len(parts) == 5:
parts.insert(4, '+0000')
# Exit if there are not six parts.
if len(parts) != 6:
return None
# Reassemble the parts in an RFC822-compatible order and parse them.
return _parse_date_rfc822(' '.join([
parts[0], parts[2], parts[1], parts[5], parts[3], parts[4],
]))
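# Illustrative walk-through: a five-part asctime string gains a '+0000'
# timezone, is reordered to 'Sun 4 Jan 2004 16:29:06 +0000', and is then
# parsed by the RFC 822 parser.
#
#     >>> _parse_date_asctime('Sun Jan  4 16:29:06 2004')[:6]
#     (2004, 1, 4, 16, 29, 6)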

View File

@@ -0,0 +1,86 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
from .rfc822 import _parse_date_rfc822
# Unicode strings for Greek date strings
_greek_months = {
'\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7
'\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7
'\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7
'\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7
'\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7
'\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7
'\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7
'\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7
'\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7
'\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7
'\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7
'\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7
'\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7
'\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7
'\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7
'\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7
'\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7
'\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7
'\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7
}
_greek_wdays = {
'\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7
'\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7
'\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7
'\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7
'\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7
'\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7
'\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7
}
_greek_date_format_re = re.compile(r'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
def _parse_date_greek(date_string):
"""Parse a string according to a Greek 8-bit date format."""
m = _greek_date_format_re.match(date_string)
if not m:
return
wday = _greek_wdays[m.group(1)]
month = _greek_months[m.group(3)]
rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
{
'wday': wday,
'day': m.group(2),
'month': month,
'year': m.group(4),
'hour': m.group(5),
'minute': m.group(6),
'second': m.group(7),
'zonediff': m.group(8),
}
return _parse_date_rfc822(rfc822date)
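# Illustrative example: the tables above rewrite a Greek date to the RFC 822
# string 'Sun, 04 Jan 2004 16:29:06 +0200' before parsing.
#
#     >>> _parse_date_greek('\u039a\u03c5\u03c1, 04 \u0399\u03b1\u03bd 2004 16:29:06 +0200')[:6]
#     (2004, 1, 4, 14, 29, 6)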

View File

@@ -0,0 +1,72 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
from .w3dtf import _parse_date_w3dtf
# Unicode strings for Hungarian date strings
_hungarian_months = {
'janu\u00e1r': '01', # e1 in iso-8859-2
'febru\u00e1ri': '02', # e1 in iso-8859-2
'm\u00e1rcius': '03', # e1 in iso-8859-2
'\u00e1prilis': '04', # e1 in iso-8859-2
'm\u00e1ujus': '05', # e1 in iso-8859-2
'j\u00fanius': '06', # fa in iso-8859-2
'j\u00falius': '07', # fa in iso-8859-2
'augusztus': '08',
'szeptember': '09',
'okt\u00f3ber': '10', # f3 in iso-8859-2
'november': '11',
'december': '12',
}
_hungarian_date_format_re = re.compile(r'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})([+-](\d{,2}:\d{2}))')
def _parse_date_hungarian(date_string):
"""Parse a string according to a Hungarian 8-bit date format."""
m = _hungarian_date_format_re.match(date_string)
if not m or m.group(2) not in _hungarian_months:
return None
month = _hungarian_months[m.group(2)]
day = m.group(3)
if len(day) == 1:
day = '0' + day
hour = m.group(4)
if len(hour) == 1:
hour = '0' + hour
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
{
'year': m.group(1),
'month': month,
'day': day,
'hour': hour,
'minute': m.group(5),
'zonediff': m.group(6),
}
return _parse_date_w3dtf(w3dtfdate)
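# Illustrative example: the month name is mapped to '10' and the one-digit
# day and hour are zero-padded, producing '2004-10-03T09:15+01:00' for the
# W3DTF parser.
#
#     >>> _parse_date_hungarian('2004-okt\u00f3ber-3T9:15+01:00')[:6]
#     (2004, 10, 3, 8, 15, 0)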

View File

@@ -0,0 +1,158 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
import time
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance, is 030104 the date
# 2003-01-04 or 0301-04-01?), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = [
'YYYY-?MM-?DD',
'YYYY-0MM?-?DD',
'YYYY-MM',
'YYYY-?OOO',
'YY-?MM-?DD',
'YY-?OOO',
'YYYY',
'-YY-?MM',
'-OOO',
'-YY',
'--MM-?DD',
'--MM',
'---DD',
'CC',
'',
]
_iso8601_re = [
tmpl.replace(
'YYYY', r'(?P<year>\d{4})').replace(
'YY', r'(?P<year>\d\d)').replace(
'MM', r'(?P<month>[01]\d)').replace(
'DD', r'(?P<day>[0123]\d)').replace(
'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
'CC', r'(?P<century>\d\d$)')
+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
+ r'(:(?P<second>\d{2}))?'
+ r'(\.(?P<fracsecond>\d+))?'
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
for tmpl in _iso8601_tmpl]
try:
del tmpl
except NameError:
pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
del regex
except NameError:
pass
def _parse_date_iso8601(date_string):
"""Parse a variety of ISO-8601-compatible formats like 20040105"""
m = None
for _iso8601_match in _iso8601_matches:
m = _iso8601_match(date_string)
if m:
break
if not m:
return
if m.span() == (0, 0):
return
params = m.groupdict()
ordinal = params.get('ordinal', 0)
if ordinal:
ordinal = int(ordinal)
else:
ordinal = 0
year = params.get('year', '--')
if not year or year == '--':
year = time.gmtime()[0]
elif len(year) == 2:
# ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
year = 100 * int(time.gmtime()[0] / 100) + int(year)
else:
year = int(year)
month = params.get('month', '-')
if not month or month == '-':
# ordinals are NOT normalized by mktime, we simulate them
# by setting month=1, day=ordinal
if ordinal:
month = 1
else:
month = time.gmtime()[1]
month = int(month)
day = params.get('day', 0)
if not day:
# see above
if ordinal:
day = ordinal
elif params.get('century', 0) or \
params.get('year', 0) or params.get('month', 0):
day = 1
else:
day = time.gmtime()[2]
else:
day = int(day)
# Special case for the century: is the first year of the 21st century
# 2000 or 2001? The debate goes on...
if 'century' in params:
year = (int(params['century']) - 1) * 100 + 1
# in ISO 8601 most fields are optional
for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
if not params.get(field, None):
params[field] = 0
hour = int(params.get('hour', 0))
minute = int(params.get('minute', 0))
second = int(float(params.get('second', 0)))
# weekday is normalized by mktime(), we can ignore it
weekday = 0
daylight_savings_flag = -1
tm = [year, month, day, hour, minute, second, weekday,
ordinal, daylight_savings_flag]
# ISO 8601 time zone adjustments
tz = params.get('tz')
if tz and tz != 'Z':
if tz[0] == '-':
tm[3] += int(params.get('tzhour', 0))
tm[4] += int(params.get('tzmin', 0))
elif tz[0] == '+':
tm[3] -= int(params.get('tzhour', 0))
tm[4] -= int(params.get('tzmin', 0))
else:
return None
# Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
# which is guaranteed to normalize d/m/y/h/m/s.
# Many implementations have bugs, but we'll pretend they don't.
return time.localtime(time.mktime(tuple(tm)))
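# Illustrative examples of the template-driven matching above:
#
#     >>> _parse_date_iso8601('20040105')[:3]
#     (2004, 1, 5)
#     >>> _parse_date_iso8601('2003-335')[:3]  # ordinal date: day 335 of 2003
#     (2003, 12, 1)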

View File

@@ -0,0 +1,83 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
from .w3dtf import _parse_date_w3dtf
# 8-bit date handling routines written by ytrewq1.
_korean_year = '\ub144' # b3e2 in euc-kr
_korean_month = '\uc6d4' # bff9 in euc-kr
_korean_day = '\uc77c' # c0cf in euc-kr
_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr
_korean_onblog_date_re = re.compile(
r'(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})'
% (_korean_year, _korean_month, _korean_day)
)
_korean_nate_date_re = re.compile(
r'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})'
% (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
"""Parse a string according to the OnBlog 8-bit date format"""
m = _korean_onblog_date_re.match(dateString)
if not m:
return
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),
'zonediff': '+09:00'}
return _parse_date_w3dtf(w3dtfdate)
def _parse_date_nate(dateString):
"""Parse a string according to the Nate 8-bit date format"""
m = _korean_nate_date_re.match(dateString)
if not m:
return
hour = int(m.group(5))
ampm = m.group(4)
if ampm == _korean_pm:
hour += 12
hour = str(hour)
if len(hour) == 1:
hour = '0' + hour
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
{
'year': m.group(1),
'month': m.group(2),
'day': m.group(3),
'hour': hour,
'minute': m.group(6),
'second': m.group(7),
'zonediff': '+09:00',
}
return _parse_date_w3dtf(w3dtfdate)
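# Illustrative example: the Nate date below uses the Korean PM marker, so
# the hour is shifted to 23 and the string becomes the W3DTF date
# '2004-05-25T23:23:17+09:00'.
#
#     >>> _parse_date_nate('2004-05-25 \uc624\ud6c4 11:23:17')[:6]
#     (2004, 5, 25, 14, 23, 17)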

View File

@@ -0,0 +1,46 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import email._parseaddr
import re
import time
def _parse_date_perforce(date_string):
"""parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
# Fri, 2006/09/15 08:19:53 EDT
_my_date_pattern = re.compile(r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
m = _my_date_pattern.search(date_string)
if m is None:
return None
dow, year, month, day, hour, minute, second, tz = m.groups()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
tm = email._parseaddr.parsedate_tz(new_date_string)
if tm:
return time.gmtime(email._parseaddr.mktime_tz(tm))
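# Illustrative example: the date is rewritten to
# 'Fri, 15 Sep 2006 08:19:53 EDT' and parsed with the email package's
# RFC 2822 date utilities.
#
#     >>> _parse_date_perforce('Fri, 2006/09/15 08:19:53 EDT')[:6]
#     (2006, 9, 15, 12, 19, 53)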

View File

@@ -0,0 +1,150 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import datetime
timezone_names = {
'ut': 0, 'gmt': 0, 'z': 0,
'adt': -3, 'ast': -4, 'at': -4,
'edt': -4, 'est': -5, 'et': -5,
'cdt': -5, 'cst': -6, 'ct': -6,
'mdt': -6, 'mst': -7, 'mt': -7,
'pdt': -7, 'pst': -8, 'pt': -8,
'a': -1, 'n': 1,
'm': -12, 'y': 12,
'met': 1, 'mest': 2,
}
day_names = {'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'}
months = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
def _parse_date_rfc822(date):
"""Parse RFC 822 dates and times
http://tools.ietf.org/html/rfc822#section-5
There are some formatting differences that are accounted for:
1. Years may be two or four digits.
2. The month and day can be swapped.
3. Additional timezone names are supported.
4. A default time and timezone are assumed if only a date is present.
:param str date: a date/time string that will be converted to a time tuple
:returns: a UTC time tuple, or None
:rtype: time.struct_time | None
"""
parts = date.lower().split()
if len(parts) < 5:
# Assume that the time and timezone are missing
parts.extend(('00:00:00', '0000'))
# Remove the day name
if parts[0][:3] in day_names:
parts = parts[1:]
if len(parts) < 5:
# If there are still fewer than five parts, there's not enough
# information to interpret this.
return None
# Handle the day and month name.
month = months.get(parts[1][:3])
try:
day = int(parts[0])
except ValueError:
# Check if the day and month are swapped.
if months.get(parts[0][:3]):
try:
day = int(parts[1])
except ValueError:
return None
month = months.get(parts[0][:3])
else:
return None
if not month:
return None
# Handle the year.
try:
year = int(parts[2])
except ValueError:
return None
# Normalize two-digit years:
# Anything in the 90's is interpreted as 1990 and on.
# Anything 89 or less is interpreted as 2089 or before.
if len(parts[2]) <= 2:
year += (1900, 2000)[year < 90]
# Handle the time (default to 00:00:00).
time_parts = parts[3].split(':')
time_parts.extend(('0',) * (3 - len(time_parts)))
try:
(hour, minute, second) = [int(i) for i in time_parts]
except ValueError:
return None
# Handle the timezone information, if any (default to +0000).
# Strip 'Etc/' from the timezone.
if parts[4].startswith('etc/'):
parts[4] = parts[4][4:]
# Normalize timezones that start with 'gmt':
# GMT-05:00 => -0500
# GMT => GMT
if parts[4].startswith('gmt'):
parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
# Handle timezones like '-0500', '+0500', and 'EST'
if parts[4] and parts[4][0] in ('-', '+'):
try:
if ':' in parts[4]:
timezone_hours = int(parts[4][1:3])
timezone_minutes = int(parts[4][4:])
else:
timezone_hours = int(parts[4][1:3])
timezone_minutes = int(parts[4][3:])
except ValueError:
return None
if parts[4].startswith('-'):
timezone_hours *= -1
timezone_minutes *= -1
else:
timezone_hours = timezone_names.get(parts[4], 0)
timezone_minutes = 0
# Create the datetime object and timezone delta objects
try:
stamp = datetime.datetime(year, month, day, hour, minute, second)
except ValueError:
return None
delta = datetime.timedelta(0, 0, 0, 0, timezone_minutes, timezone_hours)
# Return the date and timestamp in a UTC 9-tuple
try:
return (stamp - delta).utctimetuple()
except (OverflowError, ValueError):
# IronPython throws ValueErrors instead of OverflowErrors
return None
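# Illustrative examples of the deviations tolerated above:
#
#     >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     (2004, 1, 1, 19, 48, 21)
#     >>> _parse_date_rfc822('01 Jan 2004')[:6]  # date only
#     (2004, 1, 1, 0, 0, 0)
#     >>> _parse_date_rfc822('Thu, Jan 01 2004 19:48:21 EST')[:6]  # swapped day/month
#     (2004, 1, 2, 0, 48, 21)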

View File

@@ -0,0 +1,114 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import datetime
timezonenames = {
'ut': 0, 'gmt': 0, 'z': 0,
'adt': -3, 'ast': -4, 'at': -4,
'edt': -4, 'est': -5, 'et': -5,
'cdt': -5, 'cst': -6, 'ct': -6,
'mdt': -6, 'mst': -7, 'mt': -7,
'pdt': -7, 'pst': -8, 'pt': -8,
'a': -1, 'n': 1,
'm': -12, 'y': 12,
}
# W3 date and time format parser
# http://www.w3.org/TR/NOTE-datetime
# Also supports MSSQL-style datetimes as defined at:
# http://msdn.microsoft.com/en-us/library/ms186724.aspx
# (basically, allow a space as a date/time/timezone separator)
def _parse_date_w3dtf(datestr):
if not datestr.strip():
return None
parts = datestr.lower().split('t')
if len(parts) == 1:
# This may be a date only, or may be an MSSQL-style date
parts = parts[0].split()
if len(parts) == 1:
# Treat this as a date only
parts.append('00:00:00z')
elif len(parts) > 2:
return None
date = parts[0].split('-', 2)
if not date or len(date[0]) != 4:
return None
# Ensure that `date` has 3 elements. Using '1' sets the default
# month to January and the default day to the 1st of the month.
date.extend(['1'] * (3 - len(date)))
try:
year, month, day = [int(i) for i in date]
except ValueError:
# `date` may have more than 3 elements or may contain
# non-integer strings.
return None
if parts[1].endswith('z'):
parts[1] = parts[1][:-1]
parts.append('z')
# Append the numeric timezone offset, if any, to parts.
# If this is an MSSQL-style date then parts[2] already contains
# the timezone information, so `append()` will not affect it.
# Add 1 to each value so that if `find()` returns -1 it will be
# treated as False.
loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
loc = loc - 1
parts.append(parts[1][loc:])
parts[1] = parts[1][:loc]
time = parts[1].split(':', 2)
# Ensure that time has 3 elements. Using '0' means that the
# minutes and seconds, if missing, will default to 0.
time.extend(['0'] * (3 - len(time)))
if parts[2][:1] in ('-', '+'):
try:
tzhour = int(parts[2][1:3])
tzmin = int(parts[2][4:])
except ValueError:
return None
if parts[2].startswith('-'):
tzhour = tzhour * -1
tzmin = tzmin * -1
else:
tzhour = timezonenames.get(parts[2], 0)
tzmin = 0
try:
hour, minute, second = [int(float(i)) for i in time]
except ValueError:
return None
# Create the datetime object and timezone delta objects
try:
stamp = datetime.datetime(year, month, day, hour, minute, second)
except ValueError:
return None
delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
# Return the date and timestamp in a UTC 9-tuple
try:
return (stamp - delta).utctimetuple()
except (OverflowError, ValueError):
# IronPython throws ValueErrors instead of OverflowErrors
return None
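# Illustrative examples, including the MSSQL-style space separator noted
# above:
#
#     >>> _parse_date_w3dtf('2003-12-31T10:14:55-08:00')[:6]
#     (2003, 12, 31, 18, 14, 55)
#     >>> _parse_date_w3dtf('2003-12-31 10:14:55')[:6]
#     (2003, 12, 31, 10, 14, 55)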

View File

@@ -0,0 +1,303 @@
# Character encoding routines
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import codecs
import re
import typing as t
try:
try:
import cchardet as chardet
except ImportError:
import chardet
except ImportError:
chardet = None
lazy_chardet_encoding = None
else:
def lazy_chardet_encoding(data):
return chardet.detect(data)['encoding'] or ''
from .exceptions import (
CharacterEncodingOverride,
CharacterEncodingUnknown,
NonXMLContentType,
)
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
UTF32BE_MARKER = b'\x00\x00\x00\x3C'
UTF32LE_MARKER = b'\x3C\x00\x00\x00'
ZERO_BYTES = b'\x00\x00'
# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
# Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
def parse_content_type(line: str) -> t.Tuple[str, str]:
"""Parse an HTTP Content-Type header.
The return value will be a tuple of strings:
the MIME type, and the value of the "charset" (if any).
This is a custom replacement for Python's cgi.parse_header().
The cgi module will be removed in Python 3.13.
"""
chunks = line.split(";")
if not chunks:
return "", ""
mime_type = chunks[0].strip()
charset_value = ""
for chunk in chunks[1:]:
key, _, value = chunk.partition("=")
if key.strip().lower() == "charset":
charset_value = value.strip().strip("\"'")
return mime_type, charset_value
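# Illustrative examples:
#
#     >>> parse_content_type('text/xml; charset="utf-8"')
#     ('text/xml', 'utf-8')
#     >>> parse_content_type('application/atom+xml')
#     ('application/atom+xml', '')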
def convert_to_utf8(http_headers, data, result):
"""Detect and convert the character encoding to UTF-8.
http_headers is a dictionary
data is a bytes object, not a str"""
# This is so much trickier than it sounds, it's not even funny.
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
# is application/xml, application/*+xml,
# application/xml-external-parsed-entity, or application/xml-dtd,
# the encoding given in the charset parameter of the HTTP Content-Type
# takes precedence over the encoding given in the XML prefix within the
# document, and defaults to 'utf-8' if neither are specified. But, if
# the HTTP Content-Type is text/xml, text/*+xml, or
# text/xml-external-parsed-entity, the encoding given in the XML prefix
# within the document is ALWAYS IGNORED and only the encoding given in
# the charset parameter of the HTTP Content-Type header should be
# respected, and it defaults to 'us-ascii' if not specified.
# Furthermore, discussion on the atom-syntax mailing list with the
# author of RFC 3023 leads me to the conclusion that any document
# served with a Content-Type of text/* and no charset parameter
# must be treated as us-ascii. (We now do this.) And also that it
# must always be flagged as non-well-formed. (We now do this too.)
# If Content-Type is unspecified (input was local file or non-HTTP source)
# or unrecognized (server just got it totally wrong), then go by the
# encoding given in the XML prefix of the document and default to
# 'iso-8859-1' as per the HTTP specification (RFC 2616).
# Then, assuming we didn't find a character encoding in the HTTP headers
# (and the HTTP Content-type allowed us to look in the body), we need
# to sniff the first few bytes of the XML data and try to determine
# whether the encoding is ASCII-compatible. Section F of the XML
# specification shows the way here:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# If the sniffed encoding is not ASCII-compatible, we need to make it
# ASCII compatible so that we can sniff further into the XML declaration
# to find the encoding attribute, which will tell us the true encoding.
# Of course, none of this guarantees that we will be able to parse the
# feed in the declared character encoding (assuming it was declared
# correctly, which many are not). iconv_codec can help a lot;
# you should definitely install it if you can.
# http://cjkpython.i18n.org/
bom_encoding = ''
xml_encoding = ''
# Look at the first few bytes of the document to guess what
# its encoding may be. We only need to decode enough of the
# document that we can use an ASCII-compatible regular
# expression to search for an XML encoding declaration.
# The heuristic follows the XML specification, section F:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# Check for BOMs first.
if data[:4] == codecs.BOM_UTF32_BE:
bom_encoding = 'utf-32be'
data = data[4:]
elif data[:4] == codecs.BOM_UTF32_LE:
bom_encoding = 'utf-32le'
data = data[4:]
elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16be'
data = data[2:]
elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16le'
data = data[2:]
elif data[:3] == codecs.BOM_UTF8:
bom_encoding = 'utf-8'
data = data[3:]
# Check for the characters '<?xm' in several encodings.
elif data[:4] == EBCDIC_MARKER:
bom_encoding = 'cp037'
elif data[:4] == UTF16BE_MARKER:
bom_encoding = 'utf-16be'
elif data[:4] == UTF16LE_MARKER:
bom_encoding = 'utf-16le'
elif data[:4] == UTF32BE_MARKER:
bom_encoding = 'utf-32be'
elif data[:4] == UTF32LE_MARKER:
bom_encoding = 'utf-32le'
tempdata = data
try:
if bom_encoding:
tempdata = data.decode(bom_encoding).encode('utf-8')
except (UnicodeDecodeError, LookupError):
# feedparser recognizes UTF-32 encodings that may not be
# available in every Python build, so it's possible to
# encounter a LookupError during decoding.
xml_encoding_match = None
else:
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
if xml_encoding_match:
xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
# Normalize the xml_encoding if necessary.
if bom_encoding and (xml_encoding in (
'u16', 'utf-16', 'utf16', 'utf_16',
'u32', 'utf-32', 'utf32', 'utf_32',
'iso-10646-ucs-2', 'iso-10646-ucs-4',
'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
)):
xml_encoding = bom_encoding
# Find the HTTP Content-Type and, hopefully, a character
# encoding provided by the server. The Content-Type is used
# to choose the "correct" encoding among the BOM encoding,
# XML declaration encoding, and HTTP encoding, following the
# heuristic defined in RFC 3023.
http_content_type = http_headers.get('content-type') or ''
http_content_type, http_encoding = parse_content_type(http_content_type)
acceptable_content_type = 0
application_content_types = ('application/xml', 'application/xml-dtd',
'application/xml-external-parsed-entity')
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
if (
http_content_type in application_content_types
or (
http_content_type.startswith('application/')
and http_content_type.endswith('+xml')
)
):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
elif (
http_content_type in text_content_types
or (
http_content_type.startswith('text/')
and http_content_type.endswith('+xml')
)
):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or 'us-ascii'
elif http_content_type.startswith('text/'):
rfc3023_encoding = http_encoding or 'us-ascii'
elif http_headers and 'content-type' not in http_headers:
rfc3023_encoding = xml_encoding or 'iso-8859-1'
else:
rfc3023_encoding = xml_encoding or 'utf-8'
# gb18030 is a superset of gb2312, so always replace gb2312
# with gb18030 for greater compatibility.
if rfc3023_encoding.lower() == 'gb2312':
rfc3023_encoding = 'gb18030'
if xml_encoding.lower() == 'gb2312':
xml_encoding = 'gb18030'
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
error = None
if http_headers and (not acceptable_content_type):
if 'content-type' in http_headers:
msg = '%s is not an XML media type' % http_headers['content-type']
else:
msg = 'no Content-type specified'
error = NonXMLContentType(msg)
# determine character encoding
known_encoding = 0
tried_encodings = []
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
if callable(proposed_encoding):
proposed_encoding = proposed_encoding(data)
if not proposed_encoding:
continue
if proposed_encoding in tried_encodings:
continue
tried_encodings.append(proposed_encoding)
try:
data = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
pass
else:
known_encoding = 1
# Update the encoding in the opening XML processing instruction.
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
if RE_XML_DECLARATION.search(data):
data = RE_XML_DECLARATION.sub(new_declaration, data)
else:
data = new_declaration + '\n' + data
data = data.encode('utf-8')
break
# if still no luck, give up
if not known_encoding:
error = CharacterEncodingUnknown(
'document encoding unknown, I tried ' +
'%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
(rfc3023_encoding, xml_encoding))
rfc3023_encoding = ''
elif proposed_encoding != rfc3023_encoding:
error = CharacterEncodingOverride(
'document declared as %s, but parsed as %s' %
(rfc3023_encoding, proposed_encoding))
rfc3023_encoding = proposed_encoding
result['encoding'] = rfc3023_encoding
if error:
result['bozo'] = True
result['bozo_exception'] = error
return data
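# Illustrative sketch of the precedence rules above (hypothetical input):
# an application/xml Content-Type without a charset parameter defers to the
# XML declaration, and the returned document is re-encoded as UTF-8.
#
#     >>> result = {}
#     >>> data = convert_to_utf8(
#     ...     {'content-type': 'application/xml'},
#     ...     b"<?xml version='1.0' encoding='iso-8859-1'?><feed/>",
#     ...     result)
#     >>> result['encoding']
#     'iso-8859-1'
#     >>> data.startswith(b"<?xml version='1.0' encoding='utf-8'?>")
#     True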

View File

@@ -0,0 +1,55 @@
# Exceptions used throughout feedparser
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
__all__ = [
'ThingsNobodyCaresAboutButMe',
'CharacterEncodingOverride',
'CharacterEncodingUnknown',
'NonXMLContentType',
'UndeclaredNamespace',
]
class ThingsNobodyCaresAboutButMe(Exception):
pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
pass
class UndeclaredNamespace(Exception):
pass
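# Illustrative sketch: these exceptions are not raised; they are attached to
# the parse result as ``bozo_exception``. For example, serving a feed with a
# non-XML Content-Type flags the result:
#
#     >>> import feedparser
#     >>> result = feedparser.parse(
#     ...     "<feed xmlns='http://www.w3.org/2005/Atom'></feed>",
#     ...     response_headers={'content-type': 'text/plain'})
#     >>> isinstance(result.bozo_exception, NonXMLContentType)
#     True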

View File

@@ -0,0 +1,355 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import html.entities
import re
from .sgml import *
_cp1252 = {
128: '\u20ac', # euro sign
130: '\u201a', # single low-9 quotation mark
131: '\u0192', # latin small letter f with hook
132: '\u201e', # double low-9 quotation mark
133: '\u2026', # horizontal ellipsis
134: '\u2020', # dagger
135: '\u2021', # double dagger
136: '\u02c6', # modifier letter circumflex accent
137: '\u2030', # per mille sign
138: '\u0160', # latin capital letter s with caron
139: '\u2039', # single left-pointing angle quotation mark
140: '\u0152', # latin capital ligature oe
142: '\u017d', # latin capital letter z with caron
145: '\u2018', # left single quotation mark
146: '\u2019', # right single quotation mark
147: '\u201c', # left double quotation mark
148: '\u201d', # right double quotation mark
149: '\u2022', # bullet
150: '\u2013', # en dash
151: '\u2014', # em dash
152: '\u02dc', # small tilde
153: '\u2122', # trade mark sign
154: '\u0161', # latin small letter s with caron
155: '\u203a', # single right-pointing angle quotation mark
156: '\u0153', # latin small ligature oe
158: '\u017e', # latin small letter z with caron
159: '\u0178', # latin capital letter y with diaeresis
}
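# A minimal sketch of how this table is used: str.translate() accepts a
# dict keyed by code points, so mis-decoded Windows-1252 punctuation can
# be mapped to the proper Unicode characters in one pass (illustrative
# input):
#
#     >>> 'It\x92s \x93quoted\x94\x85'.translate(_cp1252)
#     'It’s “quoted”…'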
class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
special = re.compile("""[<>'"]""")
bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
elements_no_end_tag = {
'area',
'base',
'basefont',
'br',
'col',
'command',
'embed',
'frame',
'hr',
'img',
'input',
'isindex',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr',
}
def __init__(self, encoding=None, _type='application/xhtml+xml'):
if encoding:
self.encoding = encoding
self._type = _type
self.pieces = []
super(_BaseHTMLProcessor, self).__init__()
def reset(self):
self.pieces = []
super(_BaseHTMLProcessor, self).reset()
def _shorttag_replace(self, match):
"""
:type match: Match[str]
:rtype: str
"""
tag = match.group(1)
if tag in self.elements_no_end_tag:
return '<' + tag + ' />'
else:
return '<' + tag + '></' + tag + '>'
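# Illustrative examples (applied via the regex in feed() below): '<br/>'
# becomes '<br />' because 'br' never takes an end tag, while '<span/>'
# becomes '<span></span>' because 'span' requires one.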
# By declaring these methods and overriding their compiled code
# with the code from sgmllib, the original code will execute in
# feedparser's scope instead of sgmllib's. This means that the
# `tagfind` and `charref` regular expressions will be found as
# they're declared above, not as they're declared in sgmllib.
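# A minimal sketch of the same trick on hypothetical classes. A code
# object carries no globals of its own; a function's globals come from
# the function object, so the swapped-in bytecode resolves global names
# in the overriding module:
#
#     class Base:
#         def greet(self):
#             return WHO  # a global, resolved at call time
#
#     class Child(Base):
#         def greet(self):
#             raise NotImplementedError
#         greet.__code__ = Base.greet.__code__
#
#     # Child().greet() now runs Base's bytecode, but looks up WHO in
#     # Child's module, not Base's.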
def goahead(self, i):
raise NotImplementedError
# Replace goahead with SGMLParser's goahead() code object.
try:
goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
except AttributeError:
# Python 2
# noinspection PyUnresolvedReferences
goahead.func_code = sgmllib.SGMLParser.goahead.func_code
def __parse_starttag(self, i):
raise NotImplementedError
# Replace __parse_starttag with SGMLParser's parse_starttag() code object.
try:
__parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
except AttributeError:
# Python 2
# noinspection PyUnresolvedReferences
__parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
def parse_starttag(self, i):
j = self.__parse_starttag(i)
if self._type == 'application/xhtml+xml':
if j > 2 and self.rawdata[j-2:j] == '/>':
self.unknown_endtag(self.lasttag)
return j
def feed(self, data):
"""
:type data: str
:rtype: None
"""
data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
data = data.replace('&#39;', "'")
data = data.replace('&#34;', '"')
super(_BaseHTMLProcessor, self).feed(data)
super(_BaseHTMLProcessor, self).close()
@staticmethod
def normalize_attrs(attrs):
"""
:type attrs: List[Tuple[str, str]]
:rtype: List[Tuple[str, str]]
"""
if not attrs:
return attrs
# utility method to be called by descendants
# Collapse any duplicate attribute names and values by converting
# *attrs* into a dictionary, then convert it back to a list.
attrs_d = {k.lower(): v for k, v in attrs}
attrs = [
(k, k in ('rel', 'type') and v.lower() or v)
for k, v in attrs_d.items()
]
attrs.sort()
return attrs
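# A doctest-style sketch (hypothetical input): duplicate attribute names
# collapse to the last value seen, and 'rel'/'type' values are lowercased:
#
#     >>> _BaseHTMLProcessor.normalize_attrs(
#     ...     [('HREF', 'a.html'), ('Href', 'b.html'), ('REL', 'SELF')])
#     [('href', 'b.html'), ('rel', 'self')]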
def unknown_starttag(self, tag, attrs):
"""
:type tag: str
:type attrs: List[Tuple[str, str]]
:rtype: None
"""
# Called for each start tag
# attrs is a list of (attr, value) tuples
# e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
uattrs = []
strattrs = ''
if attrs:
for key, value in attrs:
value = value.replace('>', '&gt;')
value = value.replace('<', '&lt;')
value = value.replace('"', '&quot;')
value = self.bare_ampersand.sub("&amp;", value)
uattrs.append((key, value))
strattrs = ''.join(
' %s="%s"' % (key, value)
for key, value in uattrs
)
if tag in self.elements_no_end_tag:
self.pieces.append('<%s%s />' % (tag, strattrs))
else:
self.pieces.append('<%s%s>' % (tag, strattrs))
def unknown_endtag(self, tag):
"""
:type tag: str
:rtype: None
"""
# Called for each end tag, e.g. for </pre>, tag will be 'pre'
# Reconstruct the original end tag.
if tag not in self.elements_no_end_tag:
self.pieces.append("</%s>" % tag)
def handle_charref(self, ref):
"""
:type ref: str
:rtype: None
"""
# Called for each character reference, e.g. '&#160;' will extract '160'
# Reconstruct the original character reference.
ref = ref.lower()
if ref.startswith('x'):
value = int(ref[1:], 16)
else:
value = int(ref)
if value in _cp1252:
self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
else:
self.pieces.append('&#%s;' % ref)
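# Illustrative examples: handle_charref('149') appends '&#x2022;' (the
# real bullet code point) because 149 is a Windows-1252 extension, while
# handle_charref('65') appends '&#65;' unchanged.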
def handle_entityref(self, ref):
"""
:type ref: str
:rtype: None
"""
# Called for each entity reference, e.g. '&copy;' will extract 'copy'
# Reconstruct the original entity reference.
if ref in html.entities.name2codepoint or ref == 'apos':
self.pieces.append('&%s;' % ref)
else:
self.pieces.append('&amp;%s' % ref)
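# Illustrative examples: handle_entityref('copy') appends '&copy;'
# verbatim, while an unknown entity like handle_entityref('bogus')
# appends the escaped text '&amp;bogus'.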
def handle_data(self, text):
"""
:type text: str
:rtype: None
"""
# Called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references.
# Store the original text verbatim.
self.pieces.append(text)
def handle_comment(self, text):
"""
:type text: str
:rtype: None
"""
# Called for HTML comments, e.g. <!-- insert Javascript code here -->
# Reconstruct the original comment.
self.pieces.append('<!--%s-->' % text)
def handle_pi(self, text):
"""
:type text: str
:rtype: None
"""
# Called for each processing instruction, e.g. <?instruction>
# Reconstruct original processing instruction.
self.pieces.append('<?%s>' % text)
def handle_decl(self, text):
"""
:type text: str
:rtype: None
"""
# Called for the DOCTYPE, if present, e.g.
# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
# "http://www.w3.org/TR/html4/loose.dtd">
# Reconstruct original DOCTYPE
self.pieces.append('<!%s>' % text)
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
"""
:type i: int
:type declstartpos: int
:rtype: Tuple[Optional[str], int]
"""
rawdata = self.rawdata
n = len(rawdata)
if i == n:
return None, -1
m = self._new_declname_match(rawdata, i)
if m:
s = m.group()
name = s.strip()
if (i + len(s)) == n:
return None, -1 # end of buffer
return name.lower(), m.end()
else:
self.handle_data(rawdata)
# self.updatepos(declstartpos, i)
return None, -1
@staticmethod
def convert_charref(name):
"""
:type name: str
:rtype: str
"""
return '&#%s;' % name
@staticmethod
def convert_entityref(name):
"""
:type name: str
:rtype: str
"""
return '&%s;' % name
def output(self):
"""Return processed HTML as a single string.
:rtype: str
"""
return ''.join(self.pieces)
def parse_declaration(self, i):
"""
:type i: int
:rtype: int
"""
try:
return sgmllib.SGMLParser.parse_declaration(self, i)
except (AssertionError, sgmllib.SGMLParseError):
# Escape the doctype declaration and continue parsing.
self.handle_data('&lt;')
return i+1

View File

@@ -0,0 +1,227 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import base64
import datetime
import gzip
import io
import re
import struct
import urllib.parse
import urllib.request
import zlib
from .datetimes import _parse_date
from .urls import convert_to_idn
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, headers):
# The default implementation just raises HTTPError.
# Forget that.
fp.status = code
return fp
def http_error_301(self, req, fp, code, msg, hdrs):
result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
if not result:
return fp
result.status = code
result.newurl = result.geturl()
return result
# The default implementations in urllib.request.HTTPRedirectHandler
# are identical for these status codes, so aliasing them all to
# http_error_301 above won't affect anything.
http_error_300 = http_error_301
http_error_302 = http_error_301
http_error_303 = http_error_301
http_error_307 = http_error_301
def http_error_401(self, req, fp, code, msg, headers):
# Check if
# - the server requires digest auth, AND
# - we already tried (unsuccessfully) with basic auth.
# If both conditions hold, parse authentication information
# out of the Authorization header we sent the first time
# (for the username and password) and the WWW-Authenticate
# header the server sent back (for the realm) and retry
# the request with the appropriate digest auth headers instead.
# This evil genius hack has been brought to you by Aaron Swartz.
host = urllib.parse.urlparse(req.get_full_url())[1]
if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
return self.http_error_default(req, fp, code, msg, headers)
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
user, passw = auth.split(':')
realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
self.add_password(realm, host, user, passw)
retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
self.reset_retry_count()
return retry
def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):
request = urllib.request.Request(url)
request.add_header('User-Agent', agent)
if etag:
request.add_header('If-None-Match', etag)
if isinstance(modified, str):
modified = _parse_date(modified)
elif isinstance(modified, datetime.datetime):
modified = modified.utctimetuple()
if modified:
# format into an RFC 1123-compliant timestamp. We can't use
# time.strftime() since the %a and %b directives can be affected
# by the current locale, but RFC 2616 states that dates must be
# in English.
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
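# Illustrative result: for a struct_time of Sun, 06 Nov 1994 08:49:37 UTC
# (tm_wday == 6), the header value is 'Sun, 06 Nov 1994 08:49:37 GMT',
# regardless of the current locale.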
if referrer:
request.add_header('Referer', referrer)
request.add_header('Accept-encoding', 'gzip, deflate')
if auth:
request.add_header('Authorization', 'Basic %s' % auth)
if accept_header:
request.add_header('Accept', accept_header)
# Use this for whatever you need -- cookies, special headers, etc.,
# e.g. [('Cookie', 'Something'), ('x-special-header', 'Another Value')]
for header_name, header_value in request_headers.items():
request.add_header(header_name, header_value)
request.add_header('A-IM', 'feed') # RFC 3229 support
return request
def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):
if handlers is None:
handlers = []
elif not isinstance(handlers, list):
handlers = [handlers]
if request_headers is None:
request_headers = {}
# Deal with the feed URI scheme
if url.startswith('feed:http'):
url = url[5:]
elif url.startswith('feed:'):
url = 'http:' + url[5:]
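# Illustrative rewrites of the feed URI scheme handling above:
#     'feed:https://example.com/rss' -> 'https://example.com/rss'
#     'feed://example.com/rss'       -> 'http://example.com/rss'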
if not agent:
from . import USER_AGENT
agent = USER_AGENT
# Test for inline user:password credentials for HTTP basic auth
auth = None
if not url.startswith('ftp:'):
url_pieces = urllib.parse.urlparse(url)
if url_pieces.username:
new_pieces = list(url_pieces)
new_pieces[1] = url_pieces.hostname
if url_pieces.port:
new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
url = urllib.parse.urlunparse(new_pieces)
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
# iri support
if not isinstance(url, bytes):
url = convert_to_idn(url)
# Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
bits = []
for c in url:
try:
c.encode('ascii')
except UnicodeEncodeError:
bits.append(urllib.parse.quote(c))
else:
bits.append(c)
url = ''.join(bits)
# try to open with urllib2 (to use optional headers)
request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
f = opener.open(request)
data = f.read()
f.close()
# lowercase all of the HTTP headers for comparisons per RFC 2616
result['headers'] = {k.lower(): v for k, v in f.headers.items()}
# if feed is gzip-compressed, decompress it
if data and 'gzip' in result['headers'].get('content-encoding', ''):
try:
data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
except (EOFError, IOError, struct.error) as e:
# IOError can occur if the gzip header is bad.
# struct.error can occur if the data is damaged.
result['bozo'] = True
result['bozo_exception'] = e
if isinstance(e, struct.error):
# A gzip header was found but the data is corrupt.
# Ideally, we should re-request the feed without the
# 'Accept-encoding: gzip' header, but we don't.
data = None
elif data and 'deflate' in result['headers'].get('content-encoding', ''):
try:
data = zlib.decompress(data)
except zlib.error:
try:
# The data may have no headers and no checksum.
data = zlib.decompress(data, -15)
except zlib.error as e:
result['bozo'] = True
result['bozo_exception'] = e
# save HTTP headers
if 'etag' in result['headers']:
etag = result['headers'].get('etag', '')
if isinstance(etag, bytes):
etag = etag.decode('utf-8', 'ignore')
if etag:
result['etag'] = etag
if 'last-modified' in result['headers']:
modified = result['headers'].get('last-modified', '')
if modified:
result['modified'] = modified
result['modified_parsed'] = _parse_date(modified)
if isinstance(f.url, bytes):
result['href'] = f.url.decode('utf-8', 'ignore')
else:
result['href'] = f.url
result['status'] = getattr(f, 'status', None) or 200
# Stop processing if the server sent HTTP 304 Not Modified.
if getattr(f, 'code', 0) == 304:
result['version'] = ''
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
'so the server sent no data. This is a feature, not a bug!'
return data
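# A minimal usage sketch (hypothetical URL). `result` must be a mutable
# mapping; this function fills it in as a side effect:
#
#     result = {}
#     data = get('https://example.com/feed.xml', result=result)
#     result['status']   # e.g. 200, or 304 if etag/modified matched
#     result['headers']  # response headers, keys lowercased per RFC 2616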

View File

@@ -0,0 +1,785 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import base64
import binascii
import copy
import html.entities
import re
import xml.sax.saxutils
from .html import _cp1252
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
from .sanitizer import _sanitize_html, _HTMLSanitizer
from .util import FeedParserDict
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
class _FeedParserMixin(
_base.Namespace,
cc.Namespace,
dc.Namespace,
georss.Namespace,
itunes.Namespace,
mediarss.Namespace,
psc.Namespace,
):
namespaces = {
'': '',
'http://backend.userland.com/rss': '',
'http://blogs.law.harvard.edu/tech/rss': '',
'http://purl.org/rss/1.0/': '',
'http://my.netscape.com/rdf/simple/0.9/': '',
'http://example.com/newformat#': '',
'http://example.com/necho': '',
'http://purl.org/echo/': '',
'uri/of/echo/namespace#': '',
'http://purl.org/pie/': '',
'http://purl.org/atom/ns#': '',
'http://www.w3.org/2005/Atom': '',
'http://purl.org/rss/1.0/modules/rss091#': '',
'http://webns.net/mvcb/': 'admin',
'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
'http://media.tangent.org/rss/1.0/': 'audio',
'http://backend.userland.com/blogChannelModule': 'blogChannel',
'http://creativecommons.org/ns#license': 'cc',
'http://web.resource.org/cc/': 'cc',
'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativeCommons',
'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
'http://purl.org/rss/1.0/modules/company': 'co',
'http://purl.org/rss/1.0/modules/content/': 'content',
'http://my.theinfo.org/changed/1.0/rss/': 'cp',
'http://purl.org/dc/elements/1.1/': 'dc',
'http://purl.org/dc/terms/': 'dcterms',
'http://purl.org/rss/1.0/modules/email/': 'email',
'http://purl.org/rss/1.0/modules/event/': 'ev',
'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
'http://freshmeat.net/rss/fm/': 'fm',
'http://xmlns.com/foaf/0.1/': 'foaf',
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
'http://www.georss.org/georss': 'georss',
'http://www.opengis.net/gml': 'gml',
'http://postneo.com/icbm/': 'icbm',
'http://purl.org/rss/1.0/modules/image/': 'image',
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
'http://purl.org/rss/1.0/modules/link/': 'l',
'http://search.yahoo.com/mrss': 'media',
# Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
'http://search.yahoo.com/mrss/': 'media',
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
'http://purl.org/rss/1.0/modules/reference/': 'ref',
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
'http://purl.org/rss/1.0/modules/search/': 'search',
'http://purl.org/rss/1.0/modules/slash/': 'slash',
'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
'http://hacks.benhammersley.com/rss/streaming/': 'str',
'http://purl.org/rss/1.0/modules/subscription/': 'sub',
'http://purl.org/rss/1.0/modules/syndication/': 'sy',
'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
'http://purl.org/rss/1.0/modules/threading/': 'thr',
'http://purl.org/rss/1.0/modules/textinput/': 'ti',
'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
'http://wellformedweb.org/commentAPI/': 'wfw',
'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
'http://www.w3.org/1999/xhtml': 'xhtml',
'http://www.w3.org/1999/xlink': 'xlink',
'http://www.w3.org/XML/1998/namespace': 'xml',
'http://podlove.org/simple-chapters': 'psc',
}
_matchnamespaces = {}
can_be_relative_uri = {
'comments',
'docs',
'href',
'icon',
'id',
'link',
'logo',
'url',
'wfw_comment',
'wfw_commentrss',
}
can_contain_relative_uris = {
'content',
'copyright',
'description',
'info',
'rights',
'subtitle',
'summary',
'tagline',
'title',
}
can_contain_dangerous_markup = {
'content',
'copyright',
'description',
'info',
'rights',
'subtitle',
'summary',
'tagline',
'title',
}
html_types = {
'application/xhtml+xml',
'text/html',
}
def __init__(self):
if not self._matchnamespaces:
for k, v in self.namespaces.items():
self._matchnamespaces[k.lower()] = v
self.feeddata = FeedParserDict() # feed-level data
self.entries = [] # list of entry-level data
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
self.namespaces_in_use = {} # dictionary of namespaces defined by the feed
# the following are used internally to track state;
# this is really out of control and should be refactored
self.infeed = 0
self.inentry = 0
self.incontent = 0
self.intextinput = 0
self.inimage = 0
self.inauthor = 0
self.incontributor = 0
self.inpublisher = 0
self.insource = 0
self.sourcedata = FeedParserDict()
self.contentparams = FeedParserDict()
self._summaryKey = None
self.namespacemap = {}
self.elementstack = []
self.basestack = []
self.langstack = []
self.svgOK = 0
self.title_depth = -1
self.depth = 0
self.hasContent = 0
if self.lang:
self.feeddata['language'] = self.lang.replace('_', '-')
# A map of the following form:
# {
# object_that_value_is_set_on: {
# property_name: depth_of_node_property_was_extracted_from,
# other_property: depth_of_node_property_was_extracted_from,
# },
# }
self.property_depth_map = {}
super(_FeedParserMixin, self).__init__()
def _normalize_attributes(self, kv):
raise NotImplementedError
def unknown_starttag(self, tag, attrs):
# increment depth counter
self.depth += 1
# normalize attrs
attrs = [self._normalize_attributes(attr) for attr in attrs]
# track xml:base and xml:lang
attrs_d = dict(attrs)
baseuri = attrs_d.get('xml:base', attrs_d.get('base')) or self.baseuri
if isinstance(baseuri, bytes):
baseuri = baseuri.decode(self.encoding, 'ignore')
# ensure that self.baseuri is always an absolute URI that
# uses a whitelisted URI scheme (e.g. not `javascript:`)
if self.baseuri:
self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
else:
self.baseuri = _urljoin(self.baseuri, baseuri)
lang = attrs_d.get('xml:lang', attrs_d.get('lang'))
if lang == '':
# xml:lang could be explicitly set to ''; we need to capture that
lang = None
elif lang is None:
# if no xml:lang is specified, use parent lang
lang = self.lang
if lang:
if tag in ('feed', 'rss', 'rdf:RDF'):
self.feeddata['language'] = lang.replace('_', '-')
self.lang = lang
self.basestack.append(self.baseuri)
self.langstack.append(lang)
# track namespaces
for prefix, uri in attrs:
if prefix.startswith('xmlns:'):
self.track_namespace(prefix[6:], uri)
elif prefix == 'xmlns':
self.track_namespace(None, uri)
# track inline content
if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'):
if tag in ('xhtml:div', 'div'):
return # typepad does this 10/2007
# element declared itself as escaped markup, but it isn't really
self.contentparams['type'] = 'application/xhtml+xml'
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
if tag.find(':') != -1:
prefix, tag = tag.split(':', 1)
namespace = self.namespaces_in_use.get(prefix, '')
if tag == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
attrs.append(('xmlns', namespace))
if tag == 'svg' and namespace == 'http://www.w3.org/2000/svg':
attrs.append(('xmlns', namespace))
if tag == 'svg':
self.svgOK += 1
return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
# match namespaces
if tag.find(':') != -1:
prefix, suffix = tag.split(':', 1)
else:
prefix, suffix = '', tag
prefix = self.namespacemap.get(prefix, prefix)
if prefix:
prefix = prefix + '_'
# Special hack for better tracking of empty textinput/image elements in
# ill-formed feeds.
if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
self.intextinput = 0
if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
self.inimage = 0
# call special handler (if defined) or default handler
methodname = '_start_' + prefix + suffix
try:
method = getattr(self, methodname)
return method(attrs_d)
except AttributeError:
# Since there's no handler, or something has gone wrong, we
# explicitly add the element and its attributes.
unknown_tag = prefix + suffix
if len(attrs_d) == 0:
# No attributes so merge it into the enclosing dictionary
return self.push(unknown_tag, 1)
else:
# Has attributes so create it in its own dictionary
context = self._get_context()
context[unknown_tag] = attrs_d
def unknown_endtag(self, tag):
# match namespaces
if tag.find(':') != -1:
prefix, suffix = tag.split(':', 1)
else:
prefix, suffix = '', tag
prefix = self.namespacemap.get(prefix, prefix)
if prefix:
prefix = prefix + '_'
if suffix == 'svg' and self.svgOK:
self.svgOK -= 1
# call special handler (if defined) or default handler
methodname = '_end_' + prefix + suffix
try:
if self.svgOK:
raise AttributeError()
method = getattr(self, methodname)
method()
except AttributeError:
self.pop(prefix + suffix)
# track inline content
if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'):
# element declared itself as escaped markup, but it isn't really
if tag in ('xhtml:div', 'div'):
return # typepad does this 10/2007
self.contentparams['type'] = 'application/xhtml+xml'
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
tag = tag.split(':')[-1]
self.handle_data('</%s>' % tag, escape=0)
# track xml:base and xml:lang going out of scope
if self.basestack:
self.basestack.pop()
if self.basestack and self.basestack[-1]:
self.baseuri = self.basestack[-1]
if self.langstack:
self.langstack.pop()
if self.langstack: # and (self.langstack[-1] is not None):
self.lang = self.langstack[-1]
self.depth -= 1
def handle_charref(self, ref):
# Called for each character reference, e.g. for '&#160;', ref is '160'
if not self.elementstack:
return
ref = ref.lower()
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
text = '&#%s;' % ref
else:
if ref[0] == 'x':
c = int(ref[1:], 16)
else:
c = int(ref)
text = chr(c).encode('utf-8')
self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
# Called for each entity reference, e.g. for '&copy;', ref is 'copy'
if not self.elementstack:
return
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
text = '&%s;' % ref
elif ref in self.entities:
text = self.entities[ref]
if text.startswith('&#') and text.endswith(';'):
return self.handle_entityref(text)
else:
try:
html.entities.name2codepoint[ref]
except KeyError:
text = '&%s;' % ref
else:
text = chr(html.entities.name2codepoint[ref]).encode('utf-8')
self.elementstack[-1][2].append(text)
def handle_data(self, text, escape=1):
# Called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
if not self.elementstack:
return
if escape and self.contentparams.get('type') == 'application/xhtml+xml':
text = xml.sax.saxutils.escape(text)
self.elementstack[-1][2].append(text)
def handle_comment(self, text):
# Called for each comment, e.g. <!-- insert message here -->
pass
def handle_pi(self, text):
# Called for each processing instruction, e.g. <?instruction>
pass
def handle_decl(self, text):
pass
def parse_declaration(self, i):
# Override internal declaration handler to handle CDATA blocks.
if self.rawdata[i:i+9] == '<![CDATA[':
k = self.rawdata.find(']]>', i)
if k == -1:
# CDATA block began but didn't finish
k = len(self.rawdata)
return k
self.handle_data(xml.sax.saxutils.escape(self.rawdata[i+9:k]), 0)
return k+3
else:
k = self.rawdata.find('>', i)
if k >= 0:
return k+1
else:
# The declaration is incomplete; return -1 so the caller waits for more data.
return k
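# Illustrative example: with rawdata containing '<![CDATA[<b> & ]]>' at
# position i, the inner text is escaped and emitted as
# '&lt;b&gt; &amp; ', and the returned index points just past ']]>'.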
@staticmethod
def map_content_type(content_type):
content_type = content_type.lower()
if content_type == 'text' or content_type == 'plain':
content_type = 'text/plain'
elif content_type == 'html':
content_type = 'text/html'
elif content_type == 'xhtml':
content_type = 'application/xhtml+xml'
return content_type
def track_namespace(self, prefix, uri):
loweruri = uri.lower()
if not self.version:
if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
self.version = 'rss090'
elif loweruri == 'http://purl.org/rss/1.0/':
self.version = 'rss10'
elif loweruri == 'http://www.w3.org/2005/atom':
self.version = 'atom10'
if loweruri.find('backend.userland.com/rss') != -1:
# match any backend.userland.com namespace
uri = 'http://backend.userland.com/rss'
loweruri = uri
if loweruri in self._matchnamespaces:
self.namespacemap[prefix] = self._matchnamespaces[loweruri]
self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
else:
self.namespaces_in_use[prefix or ''] = uri
def resolve_uri(self, uri):
return _urljoin(self.baseuri or '', uri)
@staticmethod
def decode_entities(element, data):
return data
@staticmethod
def strattrs(attrs):
return ''.join(
' %s="%s"' % (t[0], xml.sax.saxutils.escape(t[1], {'"': '&quot;'}))
for t in attrs
)
def push(self, element, expecting_text):
self.elementstack.append([element, expecting_text, []])
def pop(self, element, strip_whitespace=1):
if not self.elementstack:
return
if self.elementstack[-1][0] != element:
return
element, expecting_text, pieces = self.elementstack.pop()
# Ensure each piece is a str for Python 3
for (i, v) in enumerate(pieces):
if isinstance(v, bytes):
pieces[i] = v.decode('utf-8')
if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml':
# remove enclosing child element, but only if it is a <div> and
# only if all the remaining content is nested underneath it.
# This means that the divs would be retained in the following:
# <div>foo</div><div>bar</div>
while pieces and len(pieces) > 1 and not pieces[-1].strip():
del pieces[-1]
while pieces and len(pieces) > 1 and not pieces[0].strip():
del pieces[0]
if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1] == '</div>':
depth = 0
for piece in pieces[:-1]:
if piece.startswith('</'):
depth -= 1
if depth == 0:
break
elif piece.startswith('<') and not piece.endswith('/>'):
depth += 1
else:
pieces = pieces[1:-1]
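# Illustrative example: pieces like ['<div>', 'foo', '</div>'] collapse
# to ['foo'], while ['<div>', 'a', '</div>', '<div>', 'b', '</div>'] is
# left intact because the first <div> closes before the end of the
# content.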
output = ''.join(pieces)
if strip_whitespace:
output = output.strip()
if not expecting_text:
return output
# decode base64 content
if base64 and self.contentparams.get('base64', 0):
try:
output = base64.decodebytes(output.encode('utf8')).decode('utf8')
except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
pass
# resolve relative URIs
if (element in self.can_be_relative_uri) and output:
# do not resolve guid elements with isPermalink="false"
if not element == 'id' or self.guidislink:
output = self.resolve_uri(output)
# decode entities within embedded markup
if not self.contentparams.get('base64', 0):
output = self.decode_entities(element, output)
# some feed formats require consumers to guess
# whether the content is html or plain text
if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain':
if self.looks_like_html(output):
self.contentparams['type'] = 'text/html'
# remove temporary cruft from contentparams
try:
del self.contentparams['mode']
except KeyError:
pass
try:
del self.contentparams['base64']
except KeyError:
pass
is_htmlish = self.map_content_type(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
if is_htmlish and self.resolve_relative_uris:
if element in self.can_contain_relative_uris:
output = resolve_relative_uris(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
# sanitize embedded markup
if is_htmlish and self.sanitize_html:
if element in self.can_contain_dangerous_markup:
output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
if self.encoding and isinstance(output, bytes):
output = output.decode(self.encoding, 'ignore')
# address common error where people take data that is already
# utf-8, presume that it is iso-8859-1, and re-encode it.
if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and not isinstance(output, bytes):
try:
output = output.encode('iso-8859-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
pass
# map win-1252 extensions to the proper code points
if not isinstance(output, bytes):
output = output.translate(_cp1252)
# categories/tags/keywords/whatever are handled in _end_category or
# _end_tags or _end_itunes_keywords
if element in ('category', 'tags', 'itunes_keywords'):
return output
if element == 'title' and -1 < self.title_depth <= self.depth:
return output
# store output in appropriate place(s)
if self.inentry and not self.insource:
if element == 'content':
self.entries[-1].setdefault(element, [])
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element].append(contentparams)
elif element == 'link':
if not self.inimage:
# Query variables in URLs in link elements are improperly
# converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
# unhandled character references. Fix this special case.
output = output.replace('&amp;', '&')
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
self.entries[-1][element] = output
if output:
self.entries[-1]['links'][-1]['href'] = output
else:
if element == 'description':
element = 'summary'
old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
if old_value_depth is None or self.depth <= old_value_depth:
self.property_depth_map[self.entries[-1]][element] = self.depth
self.entries[-1][element] = output
if self.incontent:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element + '_detail'] = contentparams
elif self.infeed or self.insource: # and (not self.intextinput) and (not self.inimage):
context = self._get_context()
if element == 'description':
element = 'subtitle'
context[element] = output
if element == 'link':
# fix query variables; see above for the explanation
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
context[element] = output
context['links'][-1]['href'] = output
elif self.incontent:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
context[element + '_detail'] = contentparams
return output
def push_content(self, tag, attrs_d, default_content_type, expecting_text):
self.incontent += 1
if self.lang:
self.lang = self.lang.replace('_', '-')
self.contentparams = FeedParserDict({
'type': self.map_content_type(attrs_d.get('type', default_content_type)),
'language': self.lang,
'base': self.baseuri})
self.contentparams['base64'] = self._is_base64(attrs_d, self.contentparams)
self.push(tag, expecting_text)
def pop_content(self, tag):
value = self.pop(tag)
self.incontent -= 1
self.contentparams.clear()
return value
# A number of elements across RSS variants are nominally plain
# text, but this is routinely ignored. This is an attempt to detect
# the most common cases. As false positives often result in silent
# data loss, this function errs on the conservative side.
@staticmethod
def looks_like_html(s):
"""
:type s: str
:rtype: bool
"""
# must have a close tag or an entity reference to qualify
if not (re.search(r'</(\w+)>', s) or re.search(r'&#?\w+;', s)):
return False
# all tags must be in a restricted subset of valid HTML tags
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)):
return False
# all entities must have been defined as valid HTML entities
if any((e for e in re.findall(r'&(\w+);', s) if e not in html.entities.entitydefs)):
return False
return True
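# Doctest-style sketches (illustrative inputs):
#
#     >>> _FeedParserMixin.looks_like_html('plain text, no markup')
#     False
#     >>> _FeedParserMixin.looks_like_html('<p>hello &amp; goodbye</p>')
#     True
#     >>> _FeedParserMixin.looks_like_html('<script>alert(1)</script>')
#     False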
def _map_to_standard_prefix(self, name):
colonpos = name.find(':')
if colonpos != -1:
prefix = name[:colonpos]
suffix = name[colonpos+1:]
prefix = self.namespacemap.get(prefix, prefix)
name = prefix + ':' + suffix
return name
def _get_attribute(self, attrs_d, name):
return attrs_d.get(self._map_to_standard_prefix(name))
def _is_base64(self, attrs_d, contentparams):
if attrs_d.get('mode', '') == 'base64':
return 1
if self.contentparams['type'].startswith('text/'):
return 0
if self.contentparams['type'].endswith('+xml'):
return 0
if self.contentparams['type'].endswith('/xml'):
return 0
return 1
@staticmethod
def _enforce_href(attrs_d):
href = attrs_d.get('url', attrs_d.get('uri', attrs_d.get('href', None)))
if href:
try:
del attrs_d['url']
except KeyError:
pass
try:
del attrs_d['uri']
except KeyError:
pass
attrs_d['href'] = href
return attrs_d
def _save(self, key, value, overwrite=False):
context = self._get_context()
if overwrite:
context[key] = value
else:
context.setdefault(key, value)
def _get_context(self):
if self.insource:
context = self.sourcedata
elif self.inimage and 'image' in self.feeddata:
context = self.feeddata['image']
elif self.intextinput:
context = self.feeddata['textinput']
elif self.inentry:
context = self.entries[-1]
else:
context = self.feeddata
return context
def _save_author(self, key, value, prefix='author'):
context = self._get_context()
context.setdefault(prefix + '_detail', FeedParserDict())
context[prefix + '_detail'][key] = value
self._sync_author_detail()
context.setdefault('authors', [FeedParserDict()])
context['authors'][-1][key] = value
def _save_contributor(self, key, value):
context = self._get_context()
context.setdefault('contributors', [FeedParserDict()])
context['contributors'][-1][key] = value
def _sync_author_detail(self, key='author'):
context = self._get_context()
detail = context.get('%ss' % key, [FeedParserDict()])[-1]
if detail:
name = detail.get('name')
email = detail.get('email')
if name and email:
context[key] = '%s (%s)' % (name, email)
elif name:
context[key] = name
elif email:
context[key] = email
else:
author, email = context.get(key), None
if not author:
return
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
if emailmatch:
email = emailmatch.group(0)
# There's probably a better way to do the following, but it
# passes all the tests.
author = author.replace(email, '')
author = author.replace('()', '')
author = author.replace('<>', '')
author = author.replace('&lt;&gt;', '')
author = author.strip()
if author and (author[0] == '('):
author = author[1:]
if author and (author[-1] == ')'):
author = author[:-1]
author = author.strip()
if author or email:
context.setdefault('%s_detail' % key, detail)
if author:
detail['name'] = author
if email:
detail['email'] = email
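# Illustrative example: given context['author'] == 'Jane Doe (jane@example.com)'
# and no name/email parsed yet, the branch above splits it into
# detail == {'name': 'Jane Doe', 'email': 'jane@example.com'} and stores
# the detail dict under context['author_detail'].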
def _add_tag(self, term, scheme, label):
context = self._get_context()
tags = context.setdefault('tags', [])
if (not term) and (not scheme) and (not label):
return
value = FeedParserDict(term=term, scheme=scheme, label=label)
if value not in tags:
tags.append(value)
def _start_tags(self, attrs_d):
# This is a completely made-up element. Its semantics are determined
# only by a single feed that precipitated bug report 392 on Google Code.
# In short, this is junk code.
self.push('tags', 1)
def _end_tags(self):
for term in self.pop('tags').split(','):
self._add_tag(term.strip(), None, None)

View File

@@ -0,0 +1,506 @@
# Support for the Atom, RSS, RDF, and CDF feed formats
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import copy
from ..datetimes import _parse_date
from ..urls import make_safe_absolute_uri
from ..util import FeedParserDict
class Namespace(object):
"""Support for the Atom, RSS, RDF, and CDF feed formats.
The feed formats all share common elements, some of which have conflicting
interpretations. For simplicity, all of the base feed format support is
collected here.
"""
supported_namespaces = {
'': '',
'http://backend.userland.com/rss': '',
'http://blogs.law.harvard.edu/tech/rss': '',
'http://purl.org/rss/1.0/': '',
'http://my.netscape.com/rdf/simple/0.9/': '',
'http://example.com/newformat#': '',
'http://example.com/necho': '',
'http://purl.org/echo/': '',
'uri/of/echo/namespace#': '',
'http://purl.org/pie/': '',
'http://purl.org/atom/ns#': '',
'http://www.w3.org/2005/Atom': '',
'http://purl.org/rss/1.0/modules/rss091#': '',
}
def _start_rss(self, attrs_d):
versionmap = {
'0.91': 'rss091u',
'0.92': 'rss092',
'0.93': 'rss093',
'0.94': 'rss094',
}
# If we're here then this is an RSS feed.
# If we don't have a version or have a version that starts with something
# other than RSS then there's been a mistake. Correct it.
if not self.version or not self.version.startswith('rss'):
attr_version = attrs_d.get('version', '')
version = versionmap.get(attr_version)
if version:
self.version = version
elif attr_version.startswith('2.'):
self.version = 'rss20'
else:
self.version = 'rss'
def _start_channel(self, attrs_d):
self.infeed = 1
self._cdf_common(attrs_d)
def _cdf_common(self, attrs_d):
if 'lastmod' in attrs_d:
self._start_modified({})
self.elementstack[-1][-1] = attrs_d['lastmod']
self._end_modified()
if 'href' in attrs_d:
self._start_link({})
self.elementstack[-1][-1] = attrs_d['href']
self._end_link()
def _start_feed(self, attrs_d):
self.infeed = 1
versionmap = {'0.1': 'atom01',
'0.2': 'atom02',
'0.3': 'atom03'}
if not self.version:
attr_version = attrs_d.get('version')
version = versionmap.get(attr_version)
if version:
self.version = version
else:
self.version = 'atom'
def _end_channel(self):
self.infeed = 0
_end_feed = _end_channel
def _start_image(self, attrs_d):
context = self._get_context()
if not self.inentry:
context.setdefault('image', FeedParserDict())
self.inimage = 1
self.title_depth = -1
self.push('image', 0)
def _end_image(self):
self.pop('image')
self.inimage = 0
def _start_textinput(self, attrs_d):
context = self._get_context()
context.setdefault('textinput', FeedParserDict())
self.intextinput = 1
self.title_depth = -1
self.push('textinput', 0)
_start_textInput = _start_textinput
def _end_textinput(self):
self.pop('textinput')
self.intextinput = 0
_end_textInput = _end_textinput
def _start_author(self, attrs_d):
self.inauthor = 1
self.push('author', 1)
# Append a new FeedParserDict when expecting an author
context = self._get_context()
context.setdefault('authors', [])
context['authors'].append(FeedParserDict())
_start_managingeditor = _start_author
def _end_author(self):
self.pop('author')
self.inauthor = 0
self._sync_author_detail()
_end_managingeditor = _end_author
def _start_contributor(self, attrs_d):
self.incontributor = 1
context = self._get_context()
context.setdefault('contributors', [])
context['contributors'].append(FeedParserDict())
self.push('contributor', 0)
def _end_contributor(self):
self.pop('contributor')
self.incontributor = 0
def _start_name(self, attrs_d):
self.push('name', 0)
def _end_name(self):
value = self.pop('name')
if self.inpublisher:
self._save_author('name', value, 'publisher')
elif self.inauthor:
self._save_author('name', value)
elif self.incontributor:
self._save_contributor('name', value)
elif self.intextinput:
context = self._get_context()
context['name'] = value
def _start_width(self, attrs_d):
self.push('width', 0)
def _end_width(self):
value = self.pop('width')
try:
value = int(value)
except ValueError:
value = 0
if self.inimage:
context = self._get_context()
context['width'] = value
def _start_height(self, attrs_d):
self.push('height', 0)
def _end_height(self):
value = self.pop('height')
try:
value = int(value)
except ValueError:
value = 0
if self.inimage:
context = self._get_context()
context['height'] = value
def _start_url(self, attrs_d):
self.push('href', 1)
_start_homepage = _start_url
_start_uri = _start_url
def _end_url(self):
value = self.pop('href')
if self.inauthor:
self._save_author('href', value)
elif self.incontributor:
self._save_contributor('href', value)
_end_homepage = _end_url
_end_uri = _end_url
def _start_email(self, attrs_d):
self.push('email', 0)
def _end_email(self):
value = self.pop('email')
if self.inpublisher:
self._save_author('email', value, 'publisher')
elif self.inauthor:
self._save_author('email', value)
elif self.incontributor:
self._save_contributor('email', value)
def _start_subtitle(self, attrs_d):
self.push_content('subtitle', attrs_d, 'text/plain', 1)
_start_tagline = _start_subtitle
def _end_subtitle(self):
self.pop_content('subtitle')
_end_tagline = _end_subtitle
def _start_rights(self, attrs_d):
self.push_content('rights', attrs_d, 'text/plain', 1)
_start_copyright = _start_rights
def _end_rights(self):
self.pop_content('rights')
_end_copyright = _end_rights
def _start_item(self, attrs_d):
self.entries.append(FeedParserDict())
self.push('item', 0)
self.inentry = 1
self.guidislink = 0
self.title_depth = -1
id = self._get_attribute(attrs_d, 'rdf:about')
if id:
context = self._get_context()
context['id'] = id
self._cdf_common(attrs_d)
_start_entry = _start_item
def _end_item(self):
self.pop('item')
self.inentry = 0
self.hasContent = 0
_end_entry = _end_item
def _start_language(self, attrs_d):
self.push('language', 1)
def _end_language(self):
self.lang = self.pop('language')
def _start_webmaster(self, attrs_d):
self.push('publisher', 1)
def _end_webmaster(self):
self.pop('publisher')
self._sync_author_detail('publisher')
def _start_published(self, attrs_d):
self.push('published', 1)
_start_issued = _start_published
_start_pubdate = _start_published
def _end_published(self):
value = self.pop('published')
self._save('published_parsed', _parse_date(value), overwrite=True)
_end_issued = _end_published
_end_pubdate = _end_published
def _start_updated(self, attrs_d):
self.push('updated', 1)
_start_modified = _start_updated
_start_lastbuilddate = _start_updated
def _end_updated(self):
value = self.pop('updated')
parsed_value = _parse_date(value)
self._save('updated_parsed', parsed_value, overwrite=True)
_end_modified = _end_updated
_end_lastbuilddate = _end_updated
def _start_created(self, attrs_d):
self.push('created', 1)
def _end_created(self):
value = self.pop('created')
self._save('created_parsed', _parse_date(value), overwrite=True)
def _start_expirationdate(self, attrs_d):
self.push('expired', 1)
def _end_expirationdate(self):
self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
def _start_category(self, attrs_d):
term = attrs_d.get('term')
scheme = attrs_d.get('scheme', attrs_d.get('domain'))
label = attrs_d.get('label')
self._add_tag(term, scheme, label)
self.push('category', 1)
_start_keywords = _start_category
def _end_category(self):
value = self.pop('category')
if not value:
return
context = self._get_context()
tags = context['tags']
if value and len(tags) and not tags[-1]['term']:
tags[-1]['term'] = value
else:
self._add_tag(value, None, None)
_end_keywords = _end_category
def _start_cloud(self, attrs_d):
self._get_context()['cloud'] = FeedParserDict(attrs_d)
def _start_link(self, attrs_d):
attrs_d.setdefault('rel', 'alternate')
if attrs_d['rel'] == 'self':
attrs_d.setdefault('type', 'application/atom+xml')
else:
attrs_d.setdefault('type', 'text/html')
context = self._get_context()
attrs_d = self._enforce_href(attrs_d)
if 'href' in attrs_d:
attrs_d['href'] = self.resolve_uri(attrs_d['href'])
expecting_text = self.infeed or self.inentry or self.insource
context.setdefault('links', [])
if not (self.inentry and self.inimage):
context['links'].append(FeedParserDict(attrs_d))
if 'href' in attrs_d:
if (
attrs_d.get('rel') == 'alternate'
and self.map_content_type(attrs_d.get('type')) in self.html_types
):
context['link'] = attrs_d['href']
else:
self.push('link', expecting_text)
def _end_link(self):
self.pop('link')
def _start_guid(self, attrs_d):
self.guidislink = (attrs_d.get('ispermalink', 'true') == 'true')
self.push('id', 1)
_start_id = _start_guid
def _end_guid(self):
value = self.pop('id')
self._save('guidislink', self.guidislink and 'link' not in self._get_context())
if self.guidislink:
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
# and only if the item doesn't already have a link element
self._save('link', value)
_end_id = _end_guid
def _start_title(self, attrs_d):
if self.svgOK:
return self.unknown_starttag('title', list(attrs_d.items()))
self.push_content('title', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
def _end_title(self):
if self.svgOK:
return
value = self.pop_content('title')
if not value:
return
self.title_depth = self.depth
def _start_description(self, attrs_d):
context = self._get_context()
if 'summary' in context and not self.hasContent:
self._summaryKey = 'content'
self._start_content(attrs_d)
else:
self.push_content('description', attrs_d, 'text/html', self.infeed or self.inentry or self.insource)
def _start_abstract(self, attrs_d):
self.push_content('description', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource)
def _end_description(self):
if self._summaryKey == 'content':
self._end_content()
else:
self.pop_content('description')
self._summaryKey = None
_end_abstract = _end_description
def _start_info(self, attrs_d):
self.push_content('info', attrs_d, 'text/plain', 1)
_start_feedburner_browserfriendly = _start_info
def _end_info(self):
self.pop_content('info')
_end_feedburner_browserfriendly = _end_info
def _start_generator(self, attrs_d):
if attrs_d:
attrs_d = self._enforce_href(attrs_d)
if 'href' in attrs_d:
attrs_d['href'] = self.resolve_uri(attrs_d['href'])
self._get_context()['generator_detail'] = FeedParserDict(attrs_d)
self.push('generator', 1)
def _end_generator(self):
value = self.pop('generator')
context = self._get_context()
if 'generator_detail' in context:
context['generator_detail']['name'] = value
def _start_summary(self, attrs_d):
context = self._get_context()
if 'summary' in context and not self.hasContent:
self._summaryKey = 'content'
self._start_content(attrs_d)
else:
self._summaryKey = 'summary'
self.push_content(self._summaryKey, attrs_d, 'text/plain', 1)
def _end_summary(self):
if self._summaryKey == 'content':
self._end_content()
else:
self.pop_content(self._summaryKey or 'summary')
self._summaryKey = None
def _start_enclosure(self, attrs_d):
attrs_d = self._enforce_href(attrs_d)
context = self._get_context()
attrs_d['rel'] = 'enclosure'
context.setdefault('links', []).append(FeedParserDict(attrs_d))
def _start_source(self, attrs_d):
if 'url' in attrs_d:
# This means that we're processing a source element from an RSS 2.0 feed
self.sourcedata['href'] = attrs_d['url']
self.push('source', 1)
self.insource = 1
self.title_depth = -1
def _end_source(self):
self.insource = 0
value = self.pop('source')
if value:
self.sourcedata['title'] = value
self._get_context()['source'] = copy.deepcopy(self.sourcedata)
self.sourcedata.clear()
def _start_content(self, attrs_d):
self.hasContent = 1
self.push_content('content', attrs_d, 'text/plain', 1)
src = attrs_d.get('src')
if src:
self.contentparams['src'] = src
self.push('content', 1)
def _start_body(self, attrs_d):
self.push_content('content', attrs_d, 'application/xhtml+xml', 1)
_start_xhtml_body = _start_body
def _start_content_encoded(self, attrs_d):
self.hasContent = 1
self.push_content('content', attrs_d, 'text/html', 1)
_start_fullitem = _start_content_encoded
def _end_content(self):
copyToSummary = self.map_content_type(self.contentparams.get('type')) in ({'text/plain'} | self.html_types)
value = self.pop_content('content')
if copyToSummary:
self._save('summary', value)
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
def _start_newlocation(self, attrs_d):
self.push('newlocation', 1)
def _end_newlocation(self):
url = self.pop('newlocation')
context = self._get_context()
# don't set newlocation if the context isn't right
if context is not self.feeddata:
return
context['newlocation'] = make_safe_absolute_uri(self.baseuri, url.strip())

View File

@@ -0,0 +1,53 @@
# Support for the administrative elements extension
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..util import FeedParserDict
class Namespace(object):
# RDF Site Summary 1.0 Modules: Administrative
# http://web.resource.org/rss/1.0/modules/admin/
supported_namespaces = {
'http://webns.net/mvcb/': 'admin',
}
def _start_admin_generatoragent(self, attrs_d):
self.push('generator', 1)
value = self._get_attribute(attrs_d, 'rdf:resource')
if value:
self.elementstack[-1][2].append(value)
self.pop('generator')
self._get_context()['generator_detail'] = FeedParserDict({'href': value})
def _start_admin_errorreportsto(self, attrs_d):
self.push('errorreportsto', 1)
value = self._get_attribute(attrs_d, 'rdf:resource')
if value:
self.elementstack[-1][2].append(value)
self.pop('errorreportsto')
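
A hedged sketch of the admin extension in an RSS 1.0 (RDF) feed; the feed content is invented, and the results assume feedparser's usual RDF handling:

import feedparser

rdf = (
    '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"'
    ' xmlns="http://purl.org/rss/1.0/"'
    ' xmlns:admin="http://webns.net/mvcb/">'
    '<channel rdf:about="http://example.org/">'
    '<title>demo</title>'
    '<admin:generatorAgent rdf:resource="http://example.org/generator/"/>'
    '</channel></rdf:RDF>'
)
d = feedparser.parse(rdf)
print(d.feed.generator)         # http://example.org/generator/
print(d.feed.generator_detail)  # {'href': 'http://example.org/generator/'}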

View File

@@ -0,0 +1,69 @@
# Support for the Creative Commons licensing extensions
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..util import FeedParserDict
class Namespace(object):
supported_namespaces = {
# RDF-based namespace
'http://creativecommons.org/ns#license': 'cc',
# Old RDF-based namespace
'http://web.resource.org/cc/': 'cc',
# RSS-based namespace
'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativecommons',
# Old RSS-based namespace
'http://backend.userland.com/creativeCommonsRssModule': 'creativecommons',
}
def _start_cc_license(self, attrs_d):
context = self._get_context()
value = self._get_attribute(attrs_d, 'rdf:resource')
attrs_d = FeedParserDict()
attrs_d['rel'] = 'license'
if value:
attrs_d['href'] = value
context.setdefault('links', []).append(attrs_d)
def _start_creativecommons_license(self, attrs_d):
self.push('license', 1)
_start_creativeCommons_license = _start_creativecommons_license
def _end_creativecommons_license(self):
value = self.pop('license')
context = self._get_context()
attrs_d = FeedParserDict()
attrs_d['rel'] = 'license'
if value:
attrs_d['href'] = value
context.setdefault('links', []).append(attrs_d)
del context['license']
_end_creativeCommons_license = _end_creativecommons_license
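
For example, a creativeCommons:license element in an RSS 2.0 feed ends up as a rel='license' link (sketch with an invented feed):

import feedparser

rss = (
    '<rss version="2.0" xmlns:creativeCommons='
    '"http://backend.userland.com/creativeCommonsRssModule">'
    '<channel><title>demo</title>'
    '<creativeCommons:license>'
    'https://creativecommons.org/licenses/by/4.0/'
    '</creativeCommons:license>'
    '</channel></rss>'
)
d = feedparser.parse(rss)
print(d.feed.links)    # [{'rel': 'license', 'href': 'https://creativecommons.org/licenses/by/4.0/'}]
print(d.feed.license)  # FeedParserDict maps 'license' to the rel='license' href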

View File

@@ -0,0 +1,134 @@
# Support for the Dublin Core metadata extensions
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..datetimes import _parse_date
from ..util import FeedParserDict
class Namespace(object):
supported_namespaces = {
'http://purl.org/dc/elements/1.1/': 'dc',
'http://purl.org/dc/terms/': 'dcterms',
}
def _end_dc_author(self):
self._end_author()
def _end_dc_creator(self):
self._end_author()
def _end_dc_date(self):
self._end_updated()
def _end_dc_description(self):
self._end_description()
def _end_dc_language(self):
self._end_language()
def _end_dc_publisher(self):
self._end_webmaster()
def _end_dc_rights(self):
self._end_rights()
def _end_dc_subject(self):
self._end_category()
def _end_dc_title(self):
self._end_title()
def _end_dcterms_created(self):
self._end_created()
def _end_dcterms_issued(self):
self._end_published()
def _end_dcterms_modified(self):
self._end_updated()
def _start_dc_author(self, attrs_d):
self._start_author(attrs_d)
def _start_dc_creator(self, attrs_d):
self._start_author(attrs_d)
def _start_dc_date(self, attrs_d):
self._start_updated(attrs_d)
def _start_dc_description(self, attrs_d):
self._start_description(attrs_d)
def _start_dc_language(self, attrs_d):
self._start_language(attrs_d)
def _start_dc_publisher(self, attrs_d):
self._start_webmaster(attrs_d)
def _start_dc_rights(self, attrs_d):
self._start_rights(attrs_d)
def _start_dc_subject(self, attrs_d):
self._start_category(attrs_d)
def _start_dc_title(self, attrs_d):
self._start_title(attrs_d)
def _start_dcterms_created(self, attrs_d):
self._start_created(attrs_d)
def _start_dcterms_issued(self, attrs_d):
self._start_published(attrs_d)
def _start_dcterms_modified(self, attrs_d):
self._start_updated(attrs_d)
def _start_dcterms_valid(self, attrs_d):
self.push('validity', 1)
def _end_dcterms_valid(self):
for validity_detail in self.pop('validity').split(';'):
if '=' in validity_detail:
key, value = validity_detail.split('=', 1)
if key == 'start':
self._save('validity_start', value, overwrite=True)
self._save('validity_start_parsed', _parse_date(value), overwrite=True)
elif key == 'end':
self._save('validity_end', value, overwrite=True)
self._save('validity_end_parsed', _parse_date(value), overwrite=True)
def _start_dc_contributor(self, attrs_d):
self.incontributor = 1
context = self._get_context()
context.setdefault('contributors', [])
context['contributors'].append(FeedParserDict())
self.push('name', 0)
def _end_dc_contributor(self):
self._end_name()
self.incontributor = 0
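
A worked example of the dcterms:valid handling above (invented feed; the dates are illustrative):

import feedparser

rss = (
    '<rss version="2.0" xmlns:dcterms="http://purl.org/dc/terms/">'
    '<channel><title>demo</title><item>'
    '<dcterms:valid>start=2004-05-01T00:00:00Z;end=2004-06-01T00:00:00Z</dcterms:valid>'
    '</item></channel></rss>'
)
d = feedparser.parse(rss)
print(d.entries[0].validity_start)       # 2004-05-01T00:00:00Z
print(d.entries[0].validity_end_parsed)  # a time.struct_time, or None if unparseable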

View File

@@ -0,0 +1,278 @@
# Support for the GeoRSS format
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# Required for Python 3.6 compatibility.
from __future__ import generator_stop
from ..util import FeedParserDict
class Namespace(object):
supported_namespaces = {
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
'http://www.georss.org/georss': 'georss',
'http://www.opengis.net/gml': 'gml',
}
def __init__(self):
self.ingeometry = 0
super(Namespace, self).__init__()
def _start_georssgeom(self, attrs_d):
self.push('geometry', 0)
context = self._get_context()
context['where'] = FeedParserDict()
_start_georss_point = _start_georssgeom
_start_georss_line = _start_georssgeom
_start_georss_polygon = _start_georssgeom
_start_georss_box = _start_georssgeom
def _save_where(self, geometry):
context = self._get_context()
context['where'].update(geometry)
def _end_georss_point(self):
geometry = _parse_georss_point(self.pop('geometry'))
if geometry:
self._save_where(geometry)
def _end_georss_line(self):
geometry = _parse_georss_line(self.pop('geometry'))
if geometry:
self._save_where(geometry)
def _end_georss_polygon(self):
this = self.pop('geometry')
geometry = _parse_georss_polygon(this)
if geometry:
self._save_where(geometry)
def _end_georss_box(self):
geometry = _parse_georss_box(self.pop('geometry'))
if geometry:
self._save_where(geometry)
def _start_where(self, attrs_d):
self.push('where', 0)
context = self._get_context()
context['where'] = FeedParserDict()
_start_georss_where = _start_where
def _parse_srs_attrs(self, attrs_d):
srs_name = attrs_d.get('srsname')
try:
srs_dimension = int(attrs_d.get('srsdimension', '2'))
except ValueError:
srs_dimension = 2
context = self._get_context()
if 'where' not in context:
context['where'] = {}
context['where']['srsName'] = srs_name
context['where']['srsDimension'] = srs_dimension
def _start_gml_point(self, attrs_d):
self._parse_srs_attrs(attrs_d)
self.ingeometry = 1
self.push('geometry', 0)
def _start_gml_linestring(self, attrs_d):
self._parse_srs_attrs(attrs_d)
self.ingeometry = 'linestring'
self.push('geometry', 0)
def _start_gml_polygon(self, attrs_d):
self._parse_srs_attrs(attrs_d)
self.push('geometry', 0)
def _start_gml_exterior(self, attrs_d):
self.push('geometry', 0)
def _start_gml_linearring(self, attrs_d):
self.ingeometry = 'polygon'
self.push('geometry', 0)
def _start_gml_pos(self, attrs_d):
self.push('pos', 0)
def _end_gml_pos(self):
this = self.pop('pos')
context = self._get_context()
srs_name = context['where'].get('srsName')
srs_dimension = context['where'].get('srsDimension', 2)
swap = True
if srs_name and "EPSG" in srs_name:
epsg = int(srs_name.split(":")[-1])
swap = bool(epsg in _geogCS)
geometry = _parse_georss_point(this, swap=swap, dims=srs_dimension)
if geometry:
self._save_where(geometry)
def _start_gml_poslist(self, attrs_d):
self.push('pos', 0)
def _end_gml_poslist(self):
this = self.pop('pos')
context = self._get_context()
srs_name = context['where'].get('srsName')
srs_dimension = context['where'].get('srsDimension', 2)
swap = True
if srs_name and "EPSG" in srs_name:
epsg = int(srs_name.split(":")[-1])
swap = bool(epsg in _geogCS)
geometry = _parse_poslist(
this, self.ingeometry, swap=swap, dims=srs_dimension)
if geometry:
self._save_where(geometry)
def _end_geom(self):
self.ingeometry = 0
self.pop('geometry')
_end_gml_point = _end_geom
_end_gml_linestring = _end_geom
_end_gml_linearring = _end_geom
_end_gml_exterior = _end_geom
_end_gml_polygon = _end_geom
def _end_where(self):
self.pop('where')
_end_georss_where = _end_where
# GeoRSS geometry parsers. Each returns a dict with 'type' and 'coordinates'
# items, or None in the case of a parsing error.
def _parse_poslist(value, geom_type, swap=True, dims=2):
if geom_type == 'linestring':
return _parse_georss_line(value, swap, dims)
elif geom_type == 'polygon':
ring = _parse_georss_line(value, swap, dims)
return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)}
else:
return None
def _gen_georss_coords(value, swap=True, dims=2):
    # A generator of coordinate pairs from a string of encoded GeoRSS
    # coordinates. Converts values to floats and, when swap is true,
    # reorders each (lat, lon) pair to (lon, lat).
latlons = (float(ll) for ll in value.replace(',', ' ').split())
while True:
try:
t = [next(latlons), next(latlons)][::swap and -1 or 1]
if dims == 3:
t.append(next(latlons))
yield tuple(t)
except StopIteration:
return
def _parse_georss_point(value, swap=True, dims=2):
# A point contains a single latitude-longitude pair, separated by
# whitespace. We'll also handle comma separators.
try:
coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'Point', 'coordinates': coords[0]}
except (IndexError, ValueError):
return None
def _parse_georss_line(value, swap=True, dims=2):
    # A line contains a whitespace-separated list of latitude-longitude
    # pairs in the WGS84 coordinate reference system. There must be at
    # least two pairs.
try:
coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'LineString', 'coordinates': coords}
except (IndexError, ValueError):
return None
def _parse_georss_polygon(value, swap=True, dims=2):
    # A polygon contains a whitespace-separated list of latitude-longitude
    # pairs. There must be at least four pairs, with the last being identical
    # to the first (so a polygon has a minimum of three distinct points).
try:
ring = list(_gen_georss_coords(value, swap, dims))
except (IndexError, ValueError):
return None
if len(ring) < 4:
return None
return {'type': 'Polygon', 'coordinates': (ring,)}
def _parse_georss_box(value, swap=True, dims=2):
    # A bounding box is a rectangular region, often used to define the extents
    # of a map or a rough area of interest. A box contains two whitespace-
    # separated latitude-longitude pairs: the first pair is the lower corner,
    # the second is the upper corner.
try:
coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'Box', 'coordinates': tuple(coords)}
except (IndexError, ValueError):
return None
# The list of EPSG codes for geographic (latitude/longitude) coordinate
# systems to support decoding of GeoRSS GML profiles.
_geogCS = [
3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979,
]
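
A worked example of the geometry parsers (private helpers, called directly here only to show the wire format; the module path is assumed from the package layout):

from feedparser.namespaces.georss import (
    _parse_georss_box,
    _parse_georss_line,
    _parse_georss_point,
)

print(_parse_georss_point('45.256 -71.92'))
# {'type': 'Point', 'coordinates': (-71.92, 45.256)}  -- note the (lat, lon) -> (lon, lat) swap
print(_parse_georss_line('45.256 -110.45 46.46 -109.48'))
# {'type': 'LineString', 'coordinates': [(-110.45, 45.256), (-109.48, 46.46)]}
print(_parse_georss_box('42.943 -71.032 43.039 -69.856'))
# {'type': 'Box', 'coordinates': ((-71.032, 42.943), (-69.856, 43.039))}
print(_parse_georss_point('not numbers'))
# None -- parsing errors never raise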

View File

@@ -0,0 +1,109 @@
# Support for the iTunes format
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..util import FeedParserDict
class Namespace(object):
supported_namespaces = {
# Canonical namespace
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
# Extra namespace
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
}
def _start_itunes_author(self, attrs_d):
self._start_author(attrs_d)
def _end_itunes_author(self):
self._end_author()
def _end_itunes_category(self):
self._end_category()
def _start_itunes_name(self, attrs_d):
self._start_name(attrs_d)
def _end_itunes_name(self):
self._end_name()
def _start_itunes_email(self, attrs_d):
self._start_email(attrs_d)
def _end_itunes_email(self):
self._end_email()
def _start_itunes_subtitle(self, attrs_d):
self._start_subtitle(attrs_d)
def _end_itunes_subtitle(self):
self._end_subtitle()
def _start_itunes_summary(self, attrs_d):
self._start_summary(attrs_d)
def _end_itunes_summary(self):
self._end_summary()
def _start_itunes_owner(self, attrs_d):
self.inpublisher = 1
self.push('publisher', 0)
def _end_itunes_owner(self):
self.pop('publisher')
self.inpublisher = 0
self._sync_author_detail('publisher')
def _end_itunes_keywords(self):
for term in self.pop('itunes_keywords').split(','):
if term.strip():
self._add_tag(term.strip(), 'http://www.itunes.com/', None)
def _start_itunes_category(self, attrs_d):
self._add_tag(attrs_d.get('text'), 'http://www.itunes.com/', None)
self.push('category', 1)
def _start_itunes_image(self, attrs_d):
self.push('itunes_image', 0)
if attrs_d.get('href'):
self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('href')})
elif attrs_d.get('url'):
self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('url')})
_start_itunes_link = _start_itunes_image
def _end_itunes_block(self):
value = self.pop('itunes_block', 0)
self._get_context()['itunes_block'] = (value == 'yes' or value == 'Yes') and 1 or 0
def _end_itunes_explicit(self):
value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' -> False, and any other value -> None.
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
self._get_context()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
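
The tuple-indexing expression above is terse; here is the same tri-state mapping as a standalone sketch (the helper name is invented for illustration):

def itunes_explicit_value(value):
    # 'yes' selects index 2 (True); 'clean' selects index True == 1 (False);
    # anything else selects index 0 (None).
    return (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]

assert itunes_explicit_value('yes') is True
assert itunes_explicit_value('clean') is False
assert itunes_explicit_value('anything else') is None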

View File

@@ -0,0 +1,141 @@
# Support for the Media RSS format
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..util import FeedParserDict
class Namespace(object):
supported_namespaces = {
# Canonical namespace
'http://search.yahoo.com/mrss/': 'media',
# Old namespace (no trailing slash)
'http://search.yahoo.com/mrss': 'media',
}
def _start_media_category(self, attrs_d):
attrs_d.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
self._start_category(attrs_d)
def _end_media_category(self):
self._end_category()
def _end_media_keywords(self):
for term in self.pop('media_keywords').split(','):
if term.strip():
self._add_tag(term.strip(), None, None)
def _start_media_title(self, attrs_d):
self._start_title(attrs_d)
def _end_media_title(self):
title_depth = self.title_depth
self._end_title()
self.title_depth = title_depth
def _start_media_group(self, attrs_d):
# don't do anything, but don't break the enclosed tags either
pass
def _start_media_rating(self, attrs_d):
context = self._get_context()
context.setdefault('media_rating', attrs_d)
self.push('rating', 1)
def _end_media_rating(self):
rating = self.pop('rating')
if rating is not None and rating.strip():
context = self._get_context()
context['media_rating']['content'] = rating
def _start_media_credit(self, attrs_d):
context = self._get_context()
context.setdefault('media_credit', [])
context['media_credit'].append(attrs_d)
self.push('credit', 1)
def _end_media_credit(self):
credit = self.pop('credit')
if credit is not None and credit.strip():
context = self._get_context()
context['media_credit'][-1]['content'] = credit
def _start_media_description(self, attrs_d):
self._start_description(attrs_d)
def _end_media_description(self):
self._end_description()
def _start_media_restriction(self, attrs_d):
context = self._get_context()
context.setdefault('media_restriction', attrs_d)
self.push('restriction', 1)
def _end_media_restriction(self):
restriction = self.pop('restriction')
if restriction is not None and restriction.strip():
context = self._get_context()
context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]
def _start_media_license(self, attrs_d):
context = self._get_context()
context.setdefault('media_license', attrs_d)
self.push('license', 1)
def _end_media_license(self):
license_ = self.pop('license')
if license_ is not None and license_.strip():
context = self._get_context()
context['media_license']['content'] = license_
def _start_media_content(self, attrs_d):
context = self._get_context()
context.setdefault('media_content', [])
context['media_content'].append(attrs_d)
def _start_media_thumbnail(self, attrs_d):
context = self._get_context()
context.setdefault('media_thumbnail', [])
        self.push('url', 1)  # element text, if any, is a fallback thumbnail URL
context['media_thumbnail'].append(attrs_d)
def _end_media_thumbnail(self):
url = self.pop('url')
context = self._get_context()
if url is not None and url.strip():
if 'url' not in context['media_thumbnail'][-1]:
context['media_thumbnail'][-1]['url'] = url
def _start_media_player(self, attrs_d):
self.push('media_player', 0)
self._get_context()['media_player'] = FeedParserDict(attrs_d)
def _end_media_player(self):
value = self.pop('media_player')
context = self._get_context()
context['media_player']['content'] = value
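
A usage sketch for the Media RSS handlers above (invented feed and URLs):

import feedparser

rss = (
    '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">'
    '<channel><title>demo</title><item>'
    '<media:content url="http://example.org/movie.mp4" type="video/mp4"/>'
    '<media:thumbnail url="http://example.org/thumb.jpg"/>'
    '<media:credit role="author">Jane Doe</media:credit>'
    '</item></channel></rss>'
)
d = feedparser.parse(rss)
entry = d.entries[0]
print(entry.media_content[0]['url'])     # http://example.org/movie.mp4
print(entry.media_thumbnail[0]['url'])   # http://example.org/thumb.jpg
print(entry.media_credit[0]['content'])  # Jane Doe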

View File

@@ -0,0 +1,74 @@
# Support for the Podlove Simple Chapters format
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import datetime
import re
from .. import util
class Namespace(object):
supported_namespaces = {
'http://podlove.org/simple-chapters': 'psc',
}
def __init__(self):
# chapters will only be captured while psc_chapters_flag is True.
self.psc_chapters_flag = False
super(Namespace, self).__init__()
def _start_psc_chapters(self, attrs_d):
context = self._get_context()
if 'psc_chapters' not in context:
self.psc_chapters_flag = True
attrs_d['chapters'] = []
context['psc_chapters'] = util.FeedParserDict(attrs_d)
def _end_psc_chapters(self):
self.psc_chapters_flag = False
def _start_psc_chapter(self, attrs_d):
if self.psc_chapters_flag:
start = self._get_attribute(attrs_d, 'start')
attrs_d['start_parsed'] = _parse_psc_chapter_start(start)
context = self._get_context()['psc_chapters']
context['chapters'].append(util.FeedParserDict(attrs_d))
format_ = re.compile(r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$')
def _parse_psc_chapter_start(start):
    match = format_.match(start)
    if match is None:
        return None
    _, h, m, s, _, ms = match.groups()
    h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
    return datetime.timedelta(0, h * 60 * 60 + m * 60 + s, ms * 1000)
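
A few worked values for _parse_psc_chapter_start (the module path is assumed from the package layout):

import datetime

from feedparser.namespaces.psc import _parse_psc_chapter_start

assert _parse_psc_chapter_start('01:02:03.500') == datetime.timedelta(
    hours=1, minutes=2, seconds=3, milliseconds=500)
assert _parse_psc_chapter_start('02:03') == datetime.timedelta(minutes=2, seconds=3)
assert _parse_psc_chapter_start('1:02:03') is None  # hours must be two digits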

View File

@@ -0,0 +1,77 @@
# The loose feed parser that interfaces with an SGML parsing library
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
class _LooseFeedParser(object):
contentparams = None
def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
self.baseuri = baseuri or ''
self.lang = baselang or None
self.encoding = encoding or 'utf-8' # character encoding
self.entities = entities or {}
super(_LooseFeedParser, self).__init__()
@staticmethod
def _normalize_attributes(kv):
k = kv[0].lower()
v = k in ('rel', 'type') and kv[1].lower() or kv[1]
# the sgml parser doesn't handle entities in attributes, nor
# does it pass the attribute values through as unicode, while
# strict xml parsers do -- account for this difference
v = v.replace('&amp;', '&')
return k, v
def decode_entities(self, element, data):
data = data.replace('&#60;', '&lt;')
data = data.replace('&#x3c;', '&lt;')
data = data.replace('&#x3C;', '&lt;')
data = data.replace('&#62;', '&gt;')
data = data.replace('&#x3e;', '&gt;')
data = data.replace('&#x3E;', '&gt;')
data = data.replace('&#38;', '&amp;')
data = data.replace('&#x26;', '&amp;')
data = data.replace('&#34;', '&quot;')
data = data.replace('&#x22;', '&quot;')
data = data.replace('&#39;', '&apos;')
data = data.replace('&#x27;', '&apos;')
if not self.contentparams.get('type', 'xml').endswith('xml'):
data = data.replace('&lt;', '<')
data = data.replace('&gt;', '>')
data = data.replace('&amp;', '&')
data = data.replace('&quot;', '"')
data = data.replace('&apos;', "'")
data = data.replace('&#x2f;', '/')
data = data.replace('&#x2F;', '/')
return data
@staticmethod
def strattrs(attrs):
return ''.join(
' %s="%s"' % (n, v.replace('"', '&quot;'))
for n, v in attrs
)
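
A quick illustration of the attribute normalization above (_normalize_attributes is a staticmethod, so it can be exercised directly; the module path is assumed from the package layout):

from feedparser.parsers.loose import _LooseFeedParser

print(_LooseFeedParser._normalize_attributes(('HREF', 'http://example.org/?a=1&amp;b=2')))
# ('href', 'http://example.org/?a=1&b=2')
print(_LooseFeedParser._normalize_attributes(('REL', 'ALTERNATE')))
# ('rel', 'alternate')  -- 'rel' and 'type' values are case-folded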

View File

@@ -0,0 +1,135 @@
# The strict feed parser that interfaces with an XML parsing library
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from ..exceptions import UndeclaredNamespace
class _StrictFeedParser(object):
def __init__(self, baseuri, baselang, encoding):
self.bozo = 0
self.exc = None
self.decls = {}
self.baseuri = baseuri or ''
self.lang = baselang
self.encoding = encoding
super(_StrictFeedParser, self).__init__()
@staticmethod
def _normalize_attributes(kv):
k = kv[0].lower()
v = k in ('rel', 'type') and kv[1].lower() or kv[1]
return k, v
def startPrefixMapping(self, prefix, uri):
if not uri:
return
# Jython uses '' instead of None; standardize on None
prefix = prefix or None
self.track_namespace(prefix, uri)
if prefix and uri == 'http://www.w3.org/1999/xlink':
self.decls['xmlns:' + prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
lowernamespace = str(namespace or '').lower()
if lowernamespace.find('backend.userland.com/rss') != -1:
# match any backend.userland.com namespace
namespace = 'http://backend.userland.com/rss'
lowernamespace = namespace
if qname and qname.find(':') > 0:
givenprefix = qname.split(':')[0]
else:
givenprefix = None
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespaces_in_use:
raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
localname = str(localname).lower()
# qname implementation is horribly broken in Python 2.1 (it
# doesn't report any), and slightly broken in Python 2.2 (it
# doesn't report the xml: namespace). So we match up namespaces
# with a known list first, and then possibly override them with
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
attrsD, self.decls = self.decls, {}
if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
attrsD['xmlns'] = namespace
if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg':
attrsD['xmlns'] = namespace
if prefix:
localname = prefix.lower() + ':' + localname
elif namespace and not qname: # Expat
for name, value in self.namespaces_in_use.items():
if name and value == namespace:
localname = name + ':' + localname
break
for (namespace, attrlocalname), attrvalue in attrs.items():
lowernamespace = (namespace or '').lower()
prefix = self._matchnamespaces.get(lowernamespace, '')
if prefix:
attrlocalname = prefix + ':' + attrlocalname
attrsD[str(attrlocalname).lower()] = attrvalue
for qname in attrs.getQNames():
attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
localname = str(localname).lower()
self.unknown_starttag(localname, list(attrsD.items()))
def characters(self, text):
self.handle_data(text)
def endElementNS(self, name, qname):
namespace, localname = name
lowernamespace = str(namespace or '').lower()
if qname and qname.find(':') > 0:
givenprefix = qname.split(':')[0]
else:
givenprefix = ''
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if prefix:
localname = prefix + ':' + localname
elif namespace and not qname: # Expat
for name, value in self.namespaces_in_use.items():
if name and value == namespace:
localname = name + ':' + localname
break
localname = str(localname).lower()
self.unknown_endtag(localname)
def error(self, exc):
self.bozo = 1
self.exc = exc
# drv_libxml2 calls warning() in some cases
warning = error
def fatalError(self, exc):
self.error(exc)
raise exc

View File

@@ -0,0 +1,950 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
from .html import _BaseHTMLProcessor
from .urls import make_safe_absolute_uri
class _HTMLSanitizer(_BaseHTMLProcessor):
acceptable_elements = {
'a',
'abbr',
'acronym',
'address',
'area',
'article',
'aside',
'audio',
'b',
'big',
'blockquote',
'br',
'button',
'canvas',
'caption',
'center',
'cite',
'code',
'col',
'colgroup',
'command',
'datagrid',
'datalist',
'dd',
'del',
'details',
'dfn',
'dialog',
'dir',
'div',
'dl',
'dt',
'em',
'event-source',
'fieldset',
'figcaption',
'figure',
'font',
'footer',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'header',
'hr',
'i',
'img',
'input',
'ins',
'kbd',
'keygen',
'label',
'legend',
'li',
'm',
'map',
'menu',
'meter',
'multicol',
'nav',
'nextid',
'noscript',
'ol',
'optgroup',
'option',
'output',
'p',
'pre',
'progress',
'q',
's',
'samp',
'section',
'select',
'small',
'sound',
'source',
'spacer',
'span',
'strike',
'strong',
'sub',
'sup',
'table',
'tbody',
'td',
'textarea',
'tfoot',
'th',
'thead',
'time',
'tr',
'tt',
'u',
'ul',
'var',
'video',
}
acceptable_attributes = {
'abbr',
'accept',
'accept-charset',
'accesskey',
'action',
'align',
'alt',
'autocomplete',
'autofocus',
'axis',
'background',
'balance',
'bgcolor',
'bgproperties',
'border',
'bordercolor',
'bordercolordark',
'bordercolorlight',
'bottompadding',
'cellpadding',
'cellspacing',
'ch',
'challenge',
'char',
'charoff',
'charset',
'checked',
'choff',
'cite',
'class',
'clear',
'color',
'cols',
'colspan',
'compact',
'contenteditable',
'controls',
'coords',
'data',
'datafld',
'datapagesize',
'datasrc',
'datetime',
'default',
'delay',
'dir',
'disabled',
'draggable',
'dynsrc',
'enctype',
'end',
'face',
'for',
'form',
'frame',
'galleryimg',
'gutter',
'headers',
'height',
'hidden',
'hidefocus',
'high',
'href',
'hreflang',
'hspace',
'icon',
'id',
'inputmode',
'ismap',
'keytype',
'label',
'lang',
'leftspacing',
'list',
'longdesc',
'loop',
'loopcount',
'loopend',
'loopstart',
'low',
'lowsrc',
'max',
'maxlength',
'media',
'method',
'min',
'multiple',
'name',
'nohref',
'noshade',
'nowrap',
'open',
'optimum',
'pattern',
'ping',
'point-size',
'poster',
'pqg',
'preload',
'prompt',
'radiogroup',
'readonly',
'rel',
'repeat-max',
'repeat-min',
'replace',
'required',
'rev',
'rightspacing',
'rows',
'rowspan',
'rules',
'scope',
'selected',
'shape',
'size',
'span',
'src',
'start',
'step',
'style',
'summary',
'suppress',
'tabindex',
'target',
'template',
'title',
'toppadding',
'type',
'unselectable',
'urn',
'usemap',
'valign',
'value',
'variable',
'volume',
'vrml',
'vspace',
'width',
'wrap',
'xml:lang',
}
unacceptable_elements_with_end_tag = {
'applet',
'script',
'style',
}
acceptable_css_properties = {
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
}
# survey of common keywords found in feeds
acceptable_css_keywords = {
'!important',
'aqua',
'auto',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'italic',
'left',
'lime',
'maroon',
'medium',
'navy',
'none',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'silver',
'solid',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
}
valid_css_values = re.compile(
r'^('
r'#[0-9a-f]+' # Hex values
r'|rgb\(\d+%?,\d*%?,?\d*%?\)?' # RGB values
r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?' # Sizes/widths
r')$'
)
mathml_elements = {
'annotation',
'annotation-xml',
'maction',
'maligngroup',
'malignmark',
'math',
'menclose',
'merror',
'mfenced',
'mfrac',
'mglyph',
'mi',
'mlabeledtr',
'mlongdiv',
'mmultiscripts',
'mn',
'mo',
'mover',
'mpadded',
'mphantom',
'mprescripts',
'mroot',
'mrow',
'ms',
'mscarries',
'mscarry',
'msgroup',
'msline',
'mspace',
'msqrt',
'msrow',
'mstack',
'mstyle',
'msub',
'msubsup',
'msup',
'mtable',
'mtd',
'mtext',
'mtr',
'munder',
'munderover',
'none',
'semantics',
}
mathml_attributes = {
'accent',
'accentunder',
'actiontype',
'align',
'alignmentscope',
'altimg',
'altimg-height',
'altimg-valign',
'altimg-width',
'alttext',
'bevelled',
'charalign',
'close',
'columnalign',
'columnlines',
'columnspacing',
'columnspan',
'columnwidth',
'crossout',
'decimalpoint',
'denomalign',
'depth',
'dir',
'display',
'displaystyle',
'edge',
'encoding',
'equalcolumns',
'equalrows',
'fence',
'fontstyle',
'fontweight',
'form',
'frame',
'framespacing',
'groupalign',
'height',
'href',
'id',
'indentalign',
'indentalignfirst',
'indentalignlast',
'indentshift',
'indentshiftfirst',
'indentshiftlast',
'indenttarget',
'infixlinebreakstyle',
'largeop',
'length',
'linebreak',
'linebreakmultchar',
'linebreakstyle',
'lineleading',
'linethickness',
'location',
'longdivstyle',
'lquote',
'lspace',
'mathbackground',
'mathcolor',
'mathsize',
'mathvariant',
'maxsize',
'minlabelspacing',
'minsize',
'movablelimits',
'notation',
'numalign',
'open',
'other',
'overflow',
'position',
'rowalign',
'rowlines',
'rowspacing',
'rowspan',
'rquote',
'rspace',
'scriptlevel',
'scriptminsize',
'scriptsizemultiplier',
'selection',
'separator',
'separators',
'shift',
'side',
'src',
'stackalign',
'stretchy',
'subscriptshift',
'superscriptshift',
'symmetric',
'voffset',
'width',
'xlink:href',
'xlink:show',
'xlink:type',
'xmlns',
'xmlns:xlink',
}
# svgtiny - foreignObject + linearGradient + radialGradient + stop
svg_elements = {
'a',
'animate',
'animateColor',
'animateMotion',
'animateTransform',
'circle',
'defs',
'desc',
'ellipse',
'font-face',
'font-face-name',
'font-face-src',
'foreignObject',
'g',
'glyph',
'hkern',
'line',
'linearGradient',
'marker',
'metadata',
'missing-glyph',
'mpath',
'path',
'polygon',
'polyline',
'radialGradient',
'rect',
'set',
'stop',
'svg',
'switch',
'text',
'title',
'tspan',
'use',
}
# svgtiny + class + opacity + offset + xmlns + xmlns:xlink
svg_attributes = {
'accent-height',
'accumulate',
'additive',
'alphabetic',
'arabic-form',
'ascent',
'attributeName',
'attributeType',
'baseProfile',
'bbox',
'begin',
'by',
'calcMode',
'cap-height',
'class',
'color',
'color-rendering',
'content',
'cx',
'cy',
'd',
'descent',
'display',
'dur',
'dx',
'dy',
'end',
'fill',
'fill-opacity',
'fill-rule',
'font-family',
'font-size',
'font-stretch',
'font-style',
'font-variant',
'font-weight',
'from',
'fx',
'fy',
'g1',
'g2',
'glyph-name',
'gradientUnits',
'hanging',
'height',
'horiz-adv-x',
'horiz-origin-x',
'id',
'ideographic',
'k',
'keyPoints',
'keySplines',
'keyTimes',
'lang',
'marker-end',
'marker-mid',
'marker-start',
'markerHeight',
'markerUnits',
'markerWidth',
'mathematical',
'max',
'min',
'name',
'offset',
'opacity',
'orient',
'origin',
'overline-position',
'overline-thickness',
'panose-1',
'path',
'pathLength',
'points',
'preserveAspectRatio',
'r',
'refX',
'refY',
'repeatCount',
'repeatDur',
'requiredExtensions',
'requiredFeatures',
'restart',
'rotate',
'rx',
'ry',
'slope',
'stemh',
'stemv',
'stop-color',
'stop-opacity',
'strikethrough-position',
'strikethrough-thickness',
'stroke',
'stroke-dasharray',
'stroke-dashoffset',
'stroke-linecap',
'stroke-linejoin',
'stroke-miterlimit',
'stroke-opacity',
'stroke-width',
'systemLanguage',
'target',
'text-anchor',
'to',
'transform',
'type',
'u1',
'u2',
'underline-position',
'underline-thickness',
'unicode',
'unicode-range',
'units-per-em',
'values',
'version',
'viewBox',
'visibility',
'width',
'widths',
'x',
'x-height',
'x1',
'x2',
'xlink:actuate',
'xlink:arcrole',
'xlink:href',
'xlink:role',
'xlink:show',
'xlink:title',
'xlink:type',
'xml:base',
'xml:lang',
'xml:space',
'xmlns',
'xmlns:xlink',
'y',
'y1',
'y2',
'zoomAndPan',
}
svg_attr_map = None
svg_elem_map = None
acceptable_svg_properties = {
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
'stroke-width',
}
def __init__(self, encoding=None, _type='application/xhtml+xml'):
super(_HTMLSanitizer, self).__init__(encoding, _type)
self.unacceptablestack = 0
self.mathmlOK = 0
self.svgOK = 0
def reset(self):
super(_HTMLSanitizer, self).reset()
self.unacceptablestack = 0
self.mathmlOK = 0
self.svgOK = 0
def unknown_starttag(self, tag, attrs):
acceptable_attributes = self.acceptable_attributes
keymap = {}
if tag not in self.acceptable_elements or self.svgOK:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
# add implicit namespaces to html5 inline svg/mathml
if self._type.endswith('html'):
if not dict(attrs).get('xmlns'):
if tag == 'svg':
attrs.append(('xmlns', 'http://www.w3.org/2000/svg'))
if tag == 'math':
attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML'))
# not otherwise acceptable, perhaps it is MathML or SVG?
if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
self.mathmlOK += 1
if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
self.svgOK += 1
        # choose acceptable attributes based on tag class, else bail
if self.mathmlOK and tag in self.mathml_elements:
acceptable_attributes = self.mathml_attributes
elif self.svgOK and tag in self.svg_elements:
# For most vocabularies, lowercasing is a good idea. Many
# svg elements, however, are camel case.
if not self.svg_attr_map:
lower = [attr.lower() for attr in self.svg_attributes]
mix = [a for a in self.svg_attributes if a not in lower]
self.svg_attributes = lower
self.svg_attr_map = {a.lower(): a for a in mix}
lower = [attr.lower() for attr in self.svg_elements]
mix = [a for a in self.svg_elements if a not in lower]
self.svg_elements = lower
self.svg_elem_map = {a.lower(): a for a in mix}
acceptable_attributes = self.svg_attributes
tag = self.svg_elem_map.get(tag, tag)
keymap = self.svg_attr_map
elif tag not in self.acceptable_elements:
return
# declare xlink namespace, if needed
if self.mathmlOK or self.svgOK:
            if any(a[0].startswith('xlink:') for a in attrs):
                if ('xmlns:xlink', 'http://www.w3.org/1999/xlink') not in attrs:
attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))
clean_attrs = []
for key, value in self.normalize_attrs(attrs):
if key == 'style' and 'style' in acceptable_attributes:
clean_value = self.sanitize_style(value)
if clean_value:
clean_attrs.append((key, clean_value))
elif key in acceptable_attributes:
key = keymap.get(key, key)
# make sure the uri uses an acceptable uri scheme
if key == 'href':
value = make_safe_absolute_uri(value)
clean_attrs.append((key, value))
super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
def unknown_endtag(self, tag):
if tag not in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack -= 1
if self.mathmlOK and tag in self.mathml_elements:
if tag == 'math' and self.mathmlOK:
self.mathmlOK -= 1
elif self.svgOK and tag in self.svg_elements:
tag = self.svg_elem_map.get(tag, tag)
if tag == 'svg' and self.svgOK:
self.svgOK -= 1
else:
return
super(_HTMLSanitizer, self).unknown_endtag(tag)
def handle_pi(self, text):
pass
def handle_decl(self, text):
pass
def handle_data(self, text):
if not self.unacceptablestack:
super(_HTMLSanitizer, self).handle_data(text)
def sanitize_style(self, style):
# disallow urls
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
        # gauntlet: reject the entire style value if it contains any character
        # outside this conservative allowlist
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
# This replaced a regexp that used re.match and was prone to
# pathological back-tracking.
if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
return ''
clean = []
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.acceptable_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
for keyword in value.split():
if (
keyword not in self.acceptable_css_keywords
and not self.valid_css_values.match(keyword)
):
break
else:
clean.append(prop + ': ' + value + ';')
elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
def parse_comment(self, i, report=1):
ret = super(_HTMLSanitizer, self).parse_comment(i, report)
if ret >= 0:
return ret
# if ret == -1, this may be a malicious attempt to circumvent
# sanitization, or a page-destroying unclosed comment
match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
if match:
return match.end()
# unclosed comment; deliberately fail to handle_data()
return len(self.rawdata)
def _sanitize_html(html_source, encoding, _type):
p = _HTMLSanitizer(encoding, _type)
html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
p.feed(html_source)
data = p.output()
data = data.strip().replace('\r\n', '\n')
return data
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(br'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
# Match safe entity declarations.
# This will allow numeric character references through (decimal or hex),
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
def replace_doctype(data):
"""Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
rss_version may be 'rss091n' or None
stripped_data is the same XML document with a replaced DOCTYPE
"""
# Divide the document into two groups by finding the location
# of the first element that doesn't begin with '<?' or '<!'.
start = re.search(br'<\w', data)
start = start and start.start() or -1
head, data = data[:start+1], data[start+1:]
# Save and then remove all of the ENTITY declarations.
entity_results = RE_ENTITY_PATTERN.findall(head)
head = RE_ENTITY_PATTERN.sub(b'', head)
# Find the DOCTYPE declaration and check the feed type.
doctype_results = RE_DOCTYPE_PATTERN.findall(head)
doctype = doctype_results and doctype_results[0] or b''
if b'netscape' in doctype.lower():
version = 'rss091n'
else:
version = None
# Re-insert the safe ENTITY declarations if a DOCTYPE was found.
replacement = b''
if len(doctype_results) == 1 and entity_results:
safe_entities = [
e
for e in entity_results
if RE_SAFE_ENTITY_PATTERN.match(e)
]
if safe_entities:
replacement = b'<!DOCTYPE feed [\n<!ENTITY' \
+ b'>\n<!ENTITY '.join(safe_entities) \
+ b'>\n]>'
data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
# Precompute the safe entities for the loose parser.
safe_entities = {
k.decode('utf-8'): v.decode('utf-8')
for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
}
return version, data, safe_entities
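
Usage sketches for the two entry points above (the module path is assumed from the package layout; outputs assume the default acceptable-element lists):

from feedparser.sanitizer import _sanitize_html, replace_doctype

print(_sanitize_html('<p onclick="evil()">hi<script>alert(1)</script></p>',
                     'utf-8', 'text/html'))
# <p>hi</p>  -- unsafe attributes are dropped and script bodies suppressed

print(replace_doctype(b'<!DOCTYPE rss SYSTEM "http://my.netscape.com/rss-0.91.dtd"><rss/>'))
# ('rss091n', b'<rss/>', {})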

View File

@@ -0,0 +1,98 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
import sgmllib
__all__ = [
'sgmllib',
'charref',
'tagfind',
'attrfind',
'entityref',
'incomplete',
'interesting',
'shorttag',
'shorttagopen',
'starttagopen',
'endbracket',
]
# sgmllib defines a number of module-level regular expressions that are
# insufficient for the XML parsing feedparser needs. Rather than modify
# the variables directly in sgmllib, they're defined here using the same
# names, and the compiled code objects of several sgmllib.SGMLParser
# methods are copied into _BaseHTMLProcessor so that they execute in
# feedparser's scope instead of sgmllib's scope.
charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);')
tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*')
attrfind = re.compile(
r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*"""
r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?"""
)
# Unfortunately, these must be copied over to prevent NameError exceptions
entityref = sgmllib.entityref
incomplete = sgmllib.incomplete
interesting = sgmllib.interesting
shorttag = sgmllib.shorttag
shorttagopen = sgmllib.shorttagopen
starttagopen = sgmllib.starttagopen
class _EndBracketRegEx:
def __init__(self):
# Overriding the built-in sgmllib.endbracket regex allows the
# parser to find angle brackets embedded in element attributes.
self.endbracket = re.compile(
r'('
r"""[^'"<>]"""
r"""|"[^"]*"(?=>|/|\s|\w+=)"""
r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])"""
r"""|.*?(?=[<>]"""
r')'
)
def search(self, target, index=0):
match = self.endbracket.match(target, index)
if match is not None:
            # Returning a new object in the calling thread's context
            # resolves a thread-safety issue.
return EndBracketMatch(match)
return None
class EndBracketMatch:
def __init__(self, match):
self.match = match
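    # sgmllib expects search(...).start(0) to be the index of the bracket;
    # because the pattern above matches everything *up to* the bracket, that
    # index is this match's end().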
def start(self, n):
return self.match.end(n)
endbracket = _EndBracketRegEx()
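
A self-contained check of why charref is overridden -- the stock sgmllib pattern only accepts decimal references:

import re

charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);')

assert charref.match('&#60;')    # decimal character reference
assert charref.match('&#x3C;')   # hexadecimal reference; sgmllib.charref rejects this
assert not charref.match('&#;')  # no digits, no match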

View File

@@ -0,0 +1,155 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
import urllib.parse
from .html import _BaseHTMLProcessor
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
# http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
'wais',
# Additional common-but-unofficial schemes
'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
uri = _urifixer.sub(r'\1\3', uri)
try:
uri = urllib.parse.urljoin(base, uri)
except ValueError:
uri = ''
return uri
def convert_to_idn(url):
"""Convert a URL to IDN notation"""
# this function should only be called with a unicode string
# strategy: if the host cannot be encoded in ascii, then
# it'll be necessary to encode it in idn form
parts = list(urllib.parse.urlsplit(url))
try:
parts[1].encode('ascii')
except UnicodeEncodeError:
# the url needs to be converted to idn notation
host = parts[1].rsplit(':', 1)
newhost = []
port = ''
if len(host) == 2:
port = host.pop()
for h in host[0].split('.'):
newhost.append(h.encode('idna').decode('utf-8'))
parts[1] = '.'.join(newhost)
if port:
parts[1] += ':' + port
return urllib.parse.urlunsplit(parts)
else:
return url
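# A rough sketch of the conversion above (the hostname is illustrative,
# not a real feed host); ASCII-only hosts pass through unchanged:
#
#     >>> convert_to_idn('http://b\u00fccher.example/feed')
#     'http://xn--bcher-kva.example/feed'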
def make_safe_absolute_uri(base, rel=None):
# bail if ACCEPTABLE_URI_SCHEMES is empty
if not ACCEPTABLE_URI_SCHEMES:
return _urljoin(base, rel or '')
if not base:
return rel or ''
if not rel:
try:
scheme = urllib.parse.urlparse(base)[0]
except ValueError:
return ''
if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
return base
return ''
uri = _urljoin(base, rel)
if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
return ''
return uri
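# Sketch of the scheme filtering above (URLs are illustrative): relative
# references resolve against the base, while absolute references whose
# scheme falls outside ACCEPTABLE_URI_SCHEMES collapse to an empty string:
#
#     >>> make_safe_absolute_uri('http://a.example/feed/', 'item.html')
#     'http://a.example/feed/item.html'
#     >>> make_safe_absolute_uri('http://a.example/feed/', 'javascript:alert(1)')
#     ''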
class RelativeURIResolver(_BaseHTMLProcessor):
relative_uris = {
('a', 'href'),
('applet', 'codebase'),
('area', 'href'),
('audio', 'src'),
('blockquote', 'cite'),
('body', 'background'),
('del', 'cite'),
('form', 'action'),
('frame', 'longdesc'),
('frame', 'src'),
('iframe', 'longdesc'),
('iframe', 'src'),
('head', 'profile'),
('img', 'longdesc'),
('img', 'src'),
('img', 'usemap'),
('input', 'src'),
('input', 'usemap'),
('ins', 'cite'),
('link', 'href'),
('object', 'classid'),
('object', 'codebase'),
('object', 'data'),
('object', 'usemap'),
('q', 'cite'),
('script', 'src'),
('source', 'src'),
('video', 'poster'),
('video', 'src'),
}
def __init__(self, baseuri, encoding, _type):
_BaseHTMLProcessor.__init__(self, encoding, _type)
self.baseuri = baseuri
def resolve_uri(self, uri):
return make_safe_absolute_uri(self.baseuri, uri.strip())
def unknown_starttag(self, tag, attrs):
attrs = self.normalize_attrs(attrs)
        attrs = [
            (key, ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value)
            for key, value in attrs
        ]
super(RelativeURIResolver, self).unknown_starttag(tag, attrs)
def resolve_relative_uris(html_source, base_uri, encoding, type_):
p = RelativeURIResolver(base_uri, encoding, type_)
p.feed(html_source)
return p.output()

View File

@@ -0,0 +1,163 @@
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import warnings
class FeedParserDict(dict):
keymap = {
'channel': 'feed',
'items': 'entries',
'guid': 'id',
'date': 'updated',
'date_parsed': 'updated_parsed',
'description': ['summary', 'subtitle'],
'description_detail': ['summary_detail', 'subtitle_detail'],
'url': ['href'],
'modified': 'updated',
'modified_parsed': 'updated_parsed',
'issued': 'published',
'issued_parsed': 'published_parsed',
'copyright': 'rights',
'copyright_detail': 'rights_detail',
'tagline': 'subtitle',
'tagline_detail': 'subtitle_detail',
}
def __getitem__(self, key):
"""
:return: A :class:`FeedParserDict`.
"""
if key == 'category':
try:
return dict.__getitem__(self, 'tags')[0]['term']
except IndexError:
raise KeyError("object doesn't have key 'category'")
elif key == 'enclosures':
norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
return [
norel(link)
for link in dict.__getitem__(self, 'links')
if link['rel'] == 'enclosure'
]
elif key == 'license':
for link in dict.__getitem__(self, 'links'):
if link['rel'] == 'license' and 'href' in link:
return link['href']
elif key == 'updated':
# Temporarily help developers out by keeping the old
# broken behavior that was reported in issue 310.
# This fix was proposed in issue 328.
if (
not dict.__contains__(self, 'updated')
and dict.__contains__(self, 'published')
):
warnings.warn(
"To avoid breaking existing software while "
"fixing issue 310, a temporary mapping has been created "
"from `updated` to `published` if `updated` doesn't "
"exist. This fallback will be removed in a future version "
"of feedparser.",
DeprecationWarning,
)
return dict.__getitem__(self, 'published')
return dict.__getitem__(self, 'updated')
elif key == 'updated_parsed':
if (
not dict.__contains__(self, 'updated_parsed')
and dict.__contains__(self, 'published_parsed')
):
warnings.warn(
"To avoid breaking existing software while "
"fixing issue 310, a temporary mapping has been created "
"from `updated_parsed` to `published_parsed` if "
"`updated_parsed` doesn't exist. This fallback will be "
"removed in a future version of feedparser.",
DeprecationWarning,
)
return dict.__getitem__(self, 'published_parsed')
return dict.__getitem__(self, 'updated_parsed')
else:
realkey = self.keymap.get(key, key)
if isinstance(realkey, list):
for k in realkey:
if dict.__contains__(self, k):
return dict.__getitem__(self, k)
elif dict.__contains__(self, realkey):
return dict.__getitem__(self, realkey)
return dict.__getitem__(self, key)
def __contains__(self, key):
if key in ('updated', 'updated_parsed'):
# Temporarily help developers out by keeping the old
# broken behavior that was reported in issue 310.
# This fix was proposed in issue 328.
return dict.__contains__(self, key)
try:
self.__getitem__(key)
except KeyError:
return False
else:
return True
has_key = __contains__
def get(self, key, default=None):
"""
:return: A :class:`FeedParserDict`.
"""
try:
return self.__getitem__(key)
except KeyError:
return default
def __setitem__(self, key, value):
key = self.keymap.get(key, key)
if isinstance(key, list):
key = key[0]
return dict.__setitem__(self, key, value)
def setdefault(self, k, default):
if k not in self:
self[k] = default
return default
return self[k]
def __getattr__(self, key):
# __getattribute__() is called first; this will be called
# only if an attribute was not already found
try:
return self.__getitem__(key)
except KeyError:
raise AttributeError("object has no attribute '%s'" % key)
def __hash__(self):
# This is incorrect behavior -- dictionaries shouldn't be hashable.
# Note to self: remove this behavior in the future.
return id(self)
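# Illustrative use of the keymap aliases above (values are made up):
#
#     >>> d = FeedParserDict()
#     >>> d['channel'] = 'feed-value'   # stored under the canonical key 'feed'
#     >>> d['feed']
#     'feed-value'
#     >>> d.feed                        # attribute access falls through __getattr__
#     'feed-value'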

547
scripts/_vendor/sgmllib.py Normal file
View File

@@ -0,0 +1,547 @@
"""A parser for SGML, using the derived class as a static DTD."""
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
import _markupbase
import re
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing
interesting = re.compile('[&<]')
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
'<([a-zA-Z][^<>]*|'
'/([a-zA-Z][^<>]*)?|'
'![^<>]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
"""Exception raised for all parse errors."""
pass
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser(_markupbase.ParserBase):
# Definition of entities -- derived classes may override
entity_or_charref = re.compile('&(?:'
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
')(;?)')
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
self.verbose = verbose
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.__starttag_text = None
self.rawdata = ''
self.stack = []
self.lasttag = '???'
self.nomoretags = 0
self.literal = 0
_markupbase.ParserBase.reset(self)
def setnomoretags(self):
"""Enter literal mode (CDATA) till EOF.
Intended for derived classes only.
"""
self.nomoretags = self.literal = 1
def setliteral(self, *args):
"""Enter literal mode (CDATA).
Intended for derived classes only.
"""
self.literal = 1
def feed(self, data):
"""Feed some data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n'). (This just saves the text,
all the processing is done by goahead().)
"""
self.rawdata = self.rawdata + data
self.goahead(0)
def close(self):
"""Handle the remaining data."""
self.goahead(1)
def error(self, message):
raise SGMLParseError(message)
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
break
match = interesting.search(rawdata, i)
if match: j = match.start()
else: j = n
if i < j:
self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
if starttagopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_starttag(i)
if k < 0: break
i = k
continue
if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: break
i = k
self.literal = 0
continue
if self.literal:
if n > (i + 1):
self.handle_data("<")
i = i+1
else:
# incomplete
break
continue
if rawdata.startswith("<!--", i):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: break
i = k
continue
if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: break
i = i+k
continue
if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
k = self.parse_declaration(i)
if k < 0: break
i = k
continue
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
else:
self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
if not match:
self.handle_data(rawdata[i])
i = i+1
continue
j = match.end(0)
if j == n:
break # Really incomplete
self.handle_data(rawdata[i:j])
i = j
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
i = n
self.rawdata = rawdata[i:]
# XXX if end: check for empty stack
# Extensions for the DOCTYPE scanner:
_decl_otherchars = '='
# Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
if rawdata[i:i+2] != '<?':
self.error('unexpected call to parse_pi()')
match = piclose.search(rawdata, i+2)
if not match:
return -1
j = match.start(0)
self.handle_pi(rawdata[i+2: j])
j = match.end(0)
return j-i
def get_starttag_text(self):
return self.__starttag_text
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
start_pos = i
rawdata = self.rawdata
if shorttagopen.match(rawdata, i):
# SGML shorthand: <tag/data/ == <tag>data</tag>
# XXX Can data contain &... (entity or char refs)?
# XXX Can data contain < or > (tag characters)?
# XXX Can there be whitespace before the first /?
match = shorttag.match(rawdata, i)
if not match:
return -1
tag, data = match.group(1, 2)
self.__starttag_text = '<%s/' % tag
tag = tag.lower()
k = match.end(0)
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
# XXX The following should skip matching quotes (' or ")
# As a shortcut way to exit, this isn't so bad, but shouldn't
# be used to locate the actual end of the start tag since the
# < or > characters may be embedded in an attribute value.
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
# SGML shorthand: <> == <last open tag seen>
k = j
tag = self.lasttag
else:
match = tagfind.match(rawdata, i+1)
if not match:
self.error('unexpected call to parse_starttag')
k = match.end(0)
tag = rawdata[i+1:k].lower()
self.lasttag = tag
while k < j:
match = attrfind.match(rawdata, k)
if not match: break
attrname, rest, attrvalue = match.group(1, 2, 3)
if not rest:
attrvalue = attrname
else:
if (attrvalue[:1] == "'" == attrvalue[-1:] or
attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
attrvalue = self.entity_or_charref.sub(
self._convert_ref, attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
j = j+1
self.__starttag_text = rawdata[start_pos:j]
self.finish_starttag(tag, attrs)
return j
# Internal -- convert entity or character reference
def _convert_ref(self, match):
if match.group(2):
return self.convert_charref(match.group(2)) or \
'&#%s%s' % match.groups()[1:]
elif match.group(3):
return self.convert_entityref(match.group(1)) or \
'&%s;' % match.group(1)
else:
return '&%s' % match.group(1)
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
self.finish_endtag(tag)
return j
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
self.finish_starttag(tag, [])
self.handle_data(data)
self.finish_endtag(tag)
# Internal -- finish processing of start tag
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
self.unknown_starttag(tag, attrs)
return -1
else:
self.handle_starttag(tag, method, attrs)
return 0
else:
self.stack.append(tag)
self.handle_starttag(tag, method, attrs)
return 1
# Internal -- finish processing of end tag
def finish_endtag(self, tag):
if not tag:
found = len(self.stack) - 1
if found < 0:
self.unknown_endtag(tag)
return
else:
if tag not in self.stack:
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
self.unknown_endtag(tag)
else:
self.report_unbalanced(tag)
return
found = len(self.stack)
for i in range(found):
if self.stack[i] == tag: found = i
while len(self.stack) > found:
tag = self.stack[-1]
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
method = None
if method:
self.handle_endtag(tag, method)
else:
self.unknown_endtag(tag)
del self.stack[-1]
# Overridable -- handle start tag
def handle_starttag(self, tag, method, attrs):
method(attrs)
# Overridable -- handle end tag
def handle_endtag(self, tag, method):
method()
# Example -- report an unbalanced </...> tag.
def report_unbalanced(self, tag):
if self.verbose:
print('*** Unbalanced </' + tag + '>')
print('*** Stack:', self.stack)
def convert_charref(self, name):
"""Convert character reference, may be overridden."""
try:
n = int(name)
except ValueError:
return
if not 0 <= n <= 127:
return
return self.convert_codepoint(n)
def convert_codepoint(self, codepoint):
return chr(codepoint)
def handle_charref(self, name):
"""Handle character reference, no need to override."""
replacement = self.convert_charref(name)
if replacement is None:
self.unknown_charref(name)
else:
self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
def convert_entityref(self, name):
"""Convert entity references.
        As an alternative to overriding this method, one can tailor the
results by setting up the self.entitydefs mapping appropriately.
"""
table = self.entitydefs
if name in table:
return table[name]
else:
return
def handle_entityref(self, name):
"""Handle entity references, no need to override."""
replacement = self.convert_entityref(name)
if replacement is None:
self.unknown_entityref(name)
else:
self.handle_data(replacement)
# Example -- handle data, should be overridden
def handle_data(self, data):
pass
# Example -- handle comment, could be overridden
def handle_comment(self, data):
pass
# Example -- handle declaration, could be overridden
def handle_decl(self, decl):
pass
# Example -- handle processing instruction, could be overridden
def handle_pi(self, data):
pass
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs): pass
def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
def __init__(self, verbose=0):
self.testdata = ""
SGMLParser.__init__(self, verbose)
def handle_data(self, data):
self.testdata = self.testdata + data
if len(repr(self.testdata)) >= 70:
self.flush()
def flush(self):
data = self.testdata
if data:
self.testdata = ""
print('data:', repr(data))
def handle_comment(self, data):
self.flush()
r = repr(data)
if len(r) > 68:
r = r[:32] + '...' + r[-32:]
print('comment:', r)
def unknown_starttag(self, tag, attrs):
self.flush()
if not attrs:
print('start tag: <' + tag + '>')
else:
print('start tag: <' + tag, end=' ')
for name, value in attrs:
print(name + '=' + '"' + value + '"', end=' ')
print('>')
def unknown_endtag(self, tag):
self.flush()
print('end tag: </' + tag + '>')
def unknown_entityref(self, ref):
self.flush()
print('*** unknown entity ref: &' + ref + ';')
def unknown_charref(self, ref):
self.flush()
print('*** unknown char ref: &#' + ref + ';')
def unknown_decl(self, data):
self.flush()
print('*** unknown decl: [' + data + ']')
def close(self):
SGMLParser.close(self)
self.flush()
def test(args=None):
import sys
if args is None:
args = sys.argv[1:]
if args and args[0] == '-s':
args = args[1:]
klass = SGMLParser
else:
klass = TestSGMLParser
if args:
file = args[0]
else:
file = 'test.html'
if file == '-':
f = sys.stdin
else:
try:
f = open(file, 'r')
except IOError as msg:
print(file, ":", msg)
sys.exit(1)
data = f.read()
if f is not sys.stdin:
f.close()
x = klass()
for c in data:
x.feed(c)
x.close()
if __name__ == '__main__':
test()

View File

@@ -0,0 +1 @@
sgmllib

263
scripts/config.py Normal file
View File

@@ -0,0 +1,263 @@
#!/usr/bin/env python3
"""
Configuration loader for proactive-research skill.
"""
import json
import os
from pathlib import Path
from typing import Dict, List, Optional
SKILL_DIR = Path(__file__).parent.parent
CONFIG_FILE = SKILL_DIR / "config.json"
# State files: configurable via TOPIC_MONITOR_DATA_DIR env, defaults to skill-local .data/
MEMORY_DIR = Path(os.environ.get(
    "TOPIC_MONITOR_DATA_DIR",
    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".data"),
))
STATE_FILE = MEMORY_DIR / "topic-monitor-state.json"
FINDINGS_DIR = MEMORY_DIR / "findings"
ALERTS_QUEUE = MEMORY_DIR / "alerts-queue.json"
def ensure_memory_dir():
"""Ensure memory directory structure exists."""
MEMORY_DIR.mkdir(parents=True, exist_ok=True)
FINDINGS_DIR.mkdir(parents=True, exist_ok=True)
def load_config() -> Dict:
"""Load configuration from config.json."""
if not CONFIG_FILE.exists():
raise FileNotFoundError(
f"Config file not found: {CONFIG_FILE}\n"
"Copy config.example.json to config.json and customize it."
)
with open(CONFIG_FILE) as f:
return json.load(f)
def save_config(config: Dict):
"""Save configuration to config.json."""
with open(CONFIG_FILE, 'w') as f:
json.dump(config, f, indent=2)
def load_state() -> Dict:
"""Load state from topic-monitor-state.json in memory/monitors/."""
ensure_memory_dir()
if STATE_FILE.exists():
with open(STATE_FILE) as f:
return json.load(f)
return {
"topics": {},
"deduplication": {"url_hash_map": {}},
"learning": {"interactions": []},
"feeds": {},
"sentiment": {}
}
def save_state(state: Dict):
"""Save state to topic-monitor-state.json in memory/monitors/."""
ensure_memory_dir()
with open(STATE_FILE, 'w') as f:
json.dump(state, f, indent=2)
def get_topics() -> List[Dict]:
"""Get all topics from config."""
config = load_config()
return config.get("topics", [])
def get_topic(topic_id: str) -> Optional[Dict]:
"""Get a specific topic by ID."""
topics = get_topics()
for topic in topics:
if topic.get("id") == topic_id:
return topic
return None
def get_settings() -> Dict:
"""Get global settings."""
config = load_config()
return config.get("settings", {})
def get_channel_config(channel: str) -> Dict:
"""Get channel-specific configuration."""
config = load_config()
channels = config.get("channels", {})
return channels.get(channel, {})
def ensure_findings_dir():
"""Ensure findings directory exists in memory/monitors/."""
ensure_memory_dir()
FINDINGS_DIR.mkdir(exist_ok=True)
def get_findings_file(topic_id: str, date_str: str) -> Path:
"""Get path to findings file for topic and date."""
ensure_findings_dir()
return FINDINGS_DIR / f"{date_str}_{topic_id}.json"
def save_finding(topic_id: str, date_str: str, finding: Dict):
"""Save a finding to the findings directory."""
findings_file = get_findings_file(topic_id, date_str)
# Load existing findings
findings = []
if findings_file.exists():
with open(findings_file) as f:
findings = json.load(f)
# Append new finding
findings.append(finding)
# Save
with open(findings_file, 'w') as f:
json.dump(findings, f, indent=2)
def load_findings(topic_id: str, date_str: str) -> List[Dict]:
"""Load findings for a topic and date."""
findings_file = get_findings_file(topic_id, date_str)
if findings_file.exists():
with open(findings_file) as f:
return json.load(f)
return []
# ============================================================================
# ALERTS QUEUE - For real-time alerting via OpenClaw agent
# ============================================================================
def queue_alert(alert: Dict):
"""
Queue an alert for delivery by the OpenClaw agent.
Alert format:
{
"id": "unique-id",
"timestamp": "ISO timestamp",
"priority": "high|medium|low",
"channel": "telegram|discord|email",
"topic_id": "topic-id",
"topic_name": "Topic Name",
"title": "Result title",
"snippet": "Result snippet",
"url": "https://...",
"score": 0.75,
"reason": "scoring reason",
"sent": false
}
"""
ensure_memory_dir()
# Load existing queue
queue = []
if ALERTS_QUEUE.exists():
try:
with open(ALERTS_QUEUE) as f:
queue = json.load(f)
except (json.JSONDecodeError, IOError):
queue = []
# Add alert with unique ID
import hashlib
from datetime import datetime
alert_id = hashlib.md5(
f"{alert.get('url', '')}{alert.get('timestamp', '')}".encode()
).hexdigest()[:12]
alert["id"] = alert_id
alert["sent"] = False
if "timestamp" not in alert:
alert["timestamp"] = datetime.now().isoformat()
# Avoid duplicates
existing_ids = {a.get("id") for a in queue}
if alert_id not in existing_ids:
queue.append(alert)
# Save queue
with open(ALERTS_QUEUE, 'w') as f:
json.dump(queue, f, indent=2)
return alert_id
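# Minimal sketch of queueing an alert (field values are hypothetical); the
# return value is the generated 12-character ID:
#
#     >>> queue_alert({
#     ...     "priority": "high",
#     ...     "channel": "telegram",
#     ...     "topic_id": "example-topic",
#     ...     "title": "Example finding",
#     ...     "url": "https://example.com/post",
#     ... })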
def get_pending_alerts() -> List[Dict]:
"""Get all unsent alerts from the queue."""
ensure_memory_dir()
if not ALERTS_QUEUE.exists():
return []
try:
with open(ALERTS_QUEUE) as f:
queue = json.load(f)
except (json.JSONDecodeError, IOError):
return []
return [a for a in queue if not a.get("sent", False)]
def mark_alert_sent(alert_id: str):
"""Mark an alert as sent."""
ensure_memory_dir()
if not ALERTS_QUEUE.exists():
return
try:
with open(ALERTS_QUEUE) as f:
queue = json.load(f)
except (json.JSONDecodeError, IOError):
return
    for alert in queue:
        if alert.get("id") == alert_id:
            from datetime import datetime
            alert["sent"] = True
            alert["sent_at"] = datetime.now().isoformat()
            break
with open(ALERTS_QUEUE, 'w') as f:
json.dump(queue, f, indent=2)
def clear_old_alerts(max_age_hours: int = 168):
"""Clear alerts older than max_age_hours (default 7 days)."""
ensure_memory_dir()
if not ALERTS_QUEUE.exists():
return
from datetime import datetime, timedelta
try:
with open(ALERTS_QUEUE) as f:
queue = json.load(f)
except (json.JSONDecodeError, IOError):
return
cutoff = datetime.now() - timedelta(hours=max_age_hours)
new_queue = []
for alert in queue:
try:
ts = datetime.fromisoformat(alert.get("timestamp", ""))
if ts > cutoff:
new_queue.append(alert)
except (ValueError, TypeError):
# Keep alerts with invalid timestamps (let them be manually reviewed)
new_queue.append(alert)
with open(ALERTS_QUEUE, 'w') as f:
json.dump(new_queue, f, indent=2)

215
scripts/digest.py Normal file
View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
Generate and send weekly research digest.
Compiles medium-priority findings from the week into a readable report.
"""
import sys
import json
import argparse
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict
sys.path.insert(0, str(Path(__file__).parent))
from config import load_config, get_topic, FINDINGS_DIR
def get_week_range(offset_weeks: int = 0) -> tuple[datetime, datetime]:
"""Get start and end of current week (or offset)."""
    today = datetime.now()
    # Find most recent Sunday, normalized to midnight so that findings files
    # dated on the boundary Sunday are included in the range check
    days_since_sunday = (today.weekday() + 1) % 7
    sunday = (today - timedelta(days=days_since_sunday)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    # Apply offset
    start = sunday - timedelta(weeks=offset_weeks)
    end = start + timedelta(days=6, hours=23, minutes=59, seconds=59)
    return start, end
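# Example: if run on Wednesday 2025-01-15, get_week_range(0) covers Sunday
# 2025-01-12 00:00:00 through Saturday 2025-01-18 23:59:59, and
# get_week_range(1) shifts both endpoints back by exactly seven days.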
def load_week_findings(start: datetime, end: datetime) -> dict:
"""Load all findings from the week."""
if not FINDINGS_DIR.exists():
return {}
findings_by_topic = defaultdict(list)
# Scan findings directory
for findings_file in FINDINGS_DIR.glob("*.json"):
# Parse filename: YYYY-MM-DD_topic-id.json
parts = findings_file.stem.split("_", 1)
if len(parts) != 2:
continue
date_str, topic_id = parts
try:
file_date = datetime.strptime(date_str, "%Y-%m-%d")
except ValueError:
continue
# Check if in week range
if not (start <= file_date <= end):
continue
# Load findings
with open(findings_file) as f:
findings = json.load(f)
findings_by_topic[topic_id].extend(findings)
return dict(findings_by_topic)
def generate_digest(findings_by_topic: dict, start: datetime, end: datetime) -> str:
"""Generate digest markdown."""
config = load_config()
# Header
digest = f"# 📊 Weekly Research Digest\n\n"
digest += f"**{start.strftime('%B %d')} - {end.strftime('%B %d, %Y')}**\n\n"
digest += "---\n\n"
# Summary stats
total_findings = sum(len(f) for f in findings_by_topic.values())
topic_count = len(findings_by_topic)
if total_findings == 0:
digest += "No new findings this week.\n"
return digest
digest += f"📈 **Summary:** {total_findings} findings across {topic_count} topic(s)\n\n"
digest += "---\n\n"
# Highlights (highest scored findings)
all_findings = []
for topic_id, findings in findings_by_topic.items():
topic = get_topic(topic_id)
if not topic:
continue
for finding in findings:
all_findings.append({
"topic": topic,
"finding": finding
})
# Sort by score
all_findings.sort(key=lambda x: x["finding"].get("score", 0), reverse=True)
if len(all_findings) >= 3:
digest += "## 🔥 Top Highlights\n\n"
for item in all_findings[:3]:
topic = item["topic"]
finding = item["finding"]
result = finding.get("result", {})
score = finding.get("score", 0)
digest += f"### {topic.get('name')} ({score:.2f})\n\n"
digest += f"**{result.get('title', 'Untitled')}**\n\n"
digest += f"{result.get('snippet', '')}\n\n"
digest += f"🔗 [{result.get('url', '')}]({result.get('url', '')})\n\n"
digest += "---\n\n"
# Findings by topic
digest += "## 📚 Findings by Topic\n\n"
for topic_id, findings in sorted(findings_by_topic.items()):
topic = get_topic(topic_id)
if not topic:
continue
topic_name = topic.get("name", topic_id)
topic_emoji = topic.get("emoji", "📌")
digest += f"### {topic_emoji} {topic_name}\n\n"
digest += f"**{len(findings)} finding(s) this week**\n\n"
# Sort findings by score
sorted_findings = sorted(findings, key=lambda x: x.get("score", 0), reverse=True)
for finding in sorted_findings[:5]: # Top 5 per topic
result = finding.get("result", {})
score = finding.get("score", 0)
reason = finding.get("reason", "")
sentiment = finding.get("sentiment", "")
digest += f"- **{result.get('title', 'Untitled')}** ({score:.2f})\n"
digest += f" {result.get('snippet', '')[:150]}...\n"
digest += f" 🔗 {result.get('url', '')}\n"
if reason:
digest += f" _Reason: {reason}_\n"
if sentiment:
digest += f" _Sentiment: {sentiment}_\n"
digest += "\n"
if len(sorted_findings) > 5:
digest += f"_...and {len(sorted_findings) - 5} more_\n\n"
digest += "\n"
# Recommendations (future enhancement)
digest += "---\n\n"
digest += "## 💡 Recommendations\n\n"
digest += "_Feature coming soon: AI-powered topic suggestions based on your findings_\n\n"
return digest
def send_digest(digest: str, dry_run: bool = False):
"""Send digest via configured channels."""
config = load_config()
settings = config.get("settings", {})
# For now, just print (would integrate with message tool in real environment)
if dry_run:
print("\n" + "="*60)
print("DIGEST PREVIEW:")
print("="*60 + "\n")
print(digest)
print("\n" + "="*60)
else:
# Would send via Telegram, Discord, Email
print("📧 Sending digest...")
print(digest)
print("\n✅ Digest sent")
def main():
parser = argparse.ArgumentParser(description="Generate weekly research digest")
parser.add_argument("--preview", action="store_true", help="Preview without sending")
parser.add_argument("--send", action="store_true", help="Generate and send")
parser.add_argument("--week-offset", type=int, default=0,
help="Week offset (0=current, 1=last week, etc.)")
args = parser.parse_args()
# Get week range
start, end = get_week_range(args.week_offset)
print(f"📊 Generating digest for {start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}")
# Load findings
findings_by_topic = load_week_findings(start, end)
if not findings_by_topic:
print("⚠️ No findings for this period")
return
# Generate digest
digest = generate_digest(findings_by_topic, start, end)
# Send or preview
if args.send:
send_digest(digest, dry_run=False)
else:
send_digest(digest, dry_run=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
AI-powered importance scoring for research findings.
Scores findings as:
- HIGH: Immediate alert
- MEDIUM: Include in digest
- LOW: Ignore
Also returns a coarse sentiment label for alert rendering and shift tracking.
"""
import re
from typing import Dict, Tuple
from datetime import datetime, timedelta
class ImportanceScorer:
"""Score research findings for importance."""
SENTIMENT_LABELS = ("positive", "negative", "neutral", "mixed")
def __init__(self, topic: Dict, settings: Dict):
self.topic = topic
self.settings = settings
self.learning_enabled = settings.get("learning_enabled", False)
def score(self, result: Dict) -> Tuple[str, float, str, str]:
"""
Score a result.
Returns:
(priority, score, reason, sentiment)
"""
title = result.get("title", "")
snippet = result.get("snippet", "")
url = result.get("url", "")
published = result.get("published_date", "")
content = f"{title} {snippet}".lower()
signals = []
total_score = 0.0
keyword_score, keyword_reason = self._score_keywords(content)
signals.append(("keyword_match", keyword_score, keyword_reason))
total_score += keyword_score
freshness_score, freshness_reason = self._score_freshness(published)
signals.append(("freshness", freshness_score, freshness_reason))
total_score += freshness_score
source_score, source_reason = self._score_source(url)
signals.append(("source_quality", source_score, source_reason))
total_score += source_score
condition_score, condition_reason = self._score_conditions(content, title)
signals.append(("alert_conditions", condition_score, condition_reason))
total_score += condition_score
# Clamp into a sane range after penalties.
total_score = max(0.0, min(1.0, total_score))
threshold = self.topic.get("importance_threshold", "medium")
if threshold == "high":
if total_score >= 0.8:
priority = "high"
elif total_score >= 0.5:
priority = "medium"
else:
priority = "low"
elif threshold == "medium":
if total_score >= 0.6:
priority = "high"
elif total_score >= 0.3:
priority = "medium"
else:
priority = "low"
else:
if total_score >= 0.4:
priority = "high"
elif total_score >= 0.1:
priority = "medium"
else:
priority = "low"
top_signals = sorted(signals, key=lambda x: x[1], reverse=True)[:2]
reason_parts = [s[2] for s in top_signals if s[2]]
reason = " + ".join(reason_parts) if reason_parts else "low_relevance"
sentiment = self._score_sentiment(title, snippet)
return priority, total_score, reason, sentiment
def _score_keywords(self, content: str) -> Tuple[float, str]:
keywords = self.topic.get("keywords", [])
if not keywords:
return 0.0, ""
matches = 0
exact_matches = 0
for keyword in keywords:
keyword_lower = keyword.lower().strip()
if not keyword_lower:
continue
if keyword_lower.startswith("-"):
negative_keyword = keyword_lower[1:]
if negative_keyword and negative_keyword in content:
return 0.0, f"contains_excluded_{negative_keyword}"
continue
if re.search(r'\b' + re.escape(keyword_lower) + r'\b', content):
exact_matches += 1
matches += 1
elif keyword_lower in content:
matches += 1
if exact_matches >= 2:
return 0.3, f"exact_match_{exact_matches}_keywords"
if exact_matches == 1:
return 0.2, "exact_match_1_keyword"
if matches >= 2:
return 0.15, f"partial_match_{matches}_keywords"
if matches == 1:
return 0.1, "partial_match_1_keyword"
return 0.0, "no_keyword_match"
def _score_freshness(self, published: str) -> Tuple[float, str]:
if not published:
return 0.0, ""
try:
if "T" in published:
pub_date = datetime.fromisoformat(published.replace("Z", "+00:00"))
else:
pub_date = datetime.strptime(published, "%Y-%m-%d")
age = datetime.now() - pub_date.replace(tzinfo=None)
if age < timedelta(hours=6):
return 0.2, "very_fresh_<6h"
if age < timedelta(days=1):
return 0.15, "fresh_<24h"
if age < timedelta(days=3):
return 0.1, "recent_<3d"
return 0.05, "older_>3d"
except Exception:
return 0.0, ""
def _score_source(self, url: str) -> Tuple[float, str]:
boost_sources = self.topic.get("boost_sources", [])
for source in boost_sources:
if source and source in url:
return 0.2, f"boosted_source_{source}"
ignore_sources = self.topic.get("ignore_sources", [])
for source in ignore_sources:
if source and source in url:
return -1.0, f"ignored_source_{source}"
trusted = [
"github.com",
"arxiv.org",
"news.ycombinator.com",
"techcrunch.com",
"theverge.com",
"arstechnica.com",
]
for source in trusted:
if source in url:
return 0.15, f"trusted_source_{source}"
return 0.05, "standard_source"
def _score_conditions(self, content: str, title: str) -> Tuple[float, str]:
alert_on = self.topic.get("alert_on", [])
for condition in alert_on:
if condition == "price_change_10pct":
if self._detect_price_change(content, threshold=0.10):
return 0.3, "price_change_>10%"
elif condition == "keyword_exact_match":
for kw in self.topic.get("keywords", []):
if kw and re.search(r'\b' + re.escape(kw.lower()) + r'\b', content):
return 0.2, "exact_keyword_in_condition"
elif condition == "major_paper":
if "arxiv" in content or "paper" in title.lower():
return 0.25, "academic_paper_detected"
elif condition == "model_release":
if re.search(r'(release|launch|announce).*\b(model|gpt|llm)\b', content, re.I):
return 0.3, "model_release_detected"
elif condition == "patch_release":
if re.search(r'(patch|update|version|release).*\d+\.\d+', content, re.I):
return 0.25, "patch_release_detected"
elif condition == "major_bug_fix":
if re.search(r'(fix|patch|solve).*(critical|major|bug)', content, re.I):
return 0.2, "major_bug_fix_detected"
elif condition == "github_release":
if "/releases/tag/" in content or "release" in title.lower():
return 0.25, "github_release_detected"
return 0.0, ""
def _detect_price_change(self, content: str, threshold: float = 0.10) -> bool:
matches = re.findall(r'(\d+(?:\.\d+)?)\s*%', content)
for match in matches:
if float(match) >= threshold * 100:
return True
for keyword in ["surge", "plunge", "jump", "drop", "spike", "crash"]:
if keyword in content:
return True
return False
def _score_sentiment(self, title: str, snippet: str) -> str:
text = f"{title} {snippet}".lower()
positive_terms = [
"launch", "released", "release", "improved", "improvement", "wins",
"record", "growth", "surge", "upgrade", "success", "stable",
"available", "general availability", "fast", "faster", "secure",
]
negative_terms = [
"breach", "incident", "critical", "severe", "failure", "fails",
"outage", "downtime", "vulnerability", "cve", "warning", "recall",
"delay", "delayed", "lawsuit", "drop", "crash", "exploit", "bug",
]
pos = sum(1 for term in positive_terms if term in text)
neg = sum(1 for term in negative_terms if term in text)
if pos and neg:
return "mixed"
if neg > 0:
return "negative"
if pos > 0:
return "positive"
return "neutral"
def score_result(result: Dict, topic: Dict, settings: Dict) -> Tuple[str, float, str, str]:
"""Convenience function for scoring without creating scorer instance."""
scorer = ImportanceScorer(topic, settings)
return scorer.score(result)
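# Sketch of scoring a finding (topic and result are hypothetical):
#
#     >>> topic = {"keywords": ["feedparser"], "importance_threshold": "medium"}
#     >>> result = {"title": "feedparser 6.1 released",
#     ...           "snippet": "New release of the feedparser library",
#     ...           "url": "https://github.com/kurtmckee/feedparser"}
#     >>> priority, score, reason, sentiment = score_result(result, topic, {})
#
# The exact keyword match (0.2) plus the trusted github.com source (0.15)
# would be expected to land in the "medium" band, and the "released" wording
# yields a "positive" sentiment label.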

353
scripts/manage_topics.py Normal file
View File

@@ -0,0 +1,353 @@
#!/usr/bin/env python3
"""
Topic management CLI for topic-monitor.
Usage:
python3 manage_topics.py add "Topic Name" --query "search" --keywords "a,b,c"
python3 manage_topics.py list
python3 manage_topics.py edit <id> --frequency hourly
python3 manage_topics.py remove <id>
python3 manage_topics.py test <id>
python3 manage_topics.py discover-feed https://example.com/blog
python3 manage_topics.py import-opml feeds.opml
"""
import sys
import argparse
import re
import json
import xml.etree.ElementTree as ET
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import load_config, save_config, get_topic, load_state, get_settings
from monitor import monitor_topic, discover_feed_urls, github_release_feed_url
def generate_id(name: str) -> str:
topic_id = name.lower()
topic_id = re.sub(r'[^\w\s-]', '', topic_id)
topic_id = re.sub(r'[-\s]+', '-', topic_id)
return topic_id.strip('-')
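# Illustrative slug generation: generate_id("AI Safety News!") lowercases,
# strips punctuation, and hyphenates, returning "ai-safety-news".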
def split_csv(value: str):
if not value:
return []
return [item.strip() for item in value.split(",") if item.strip()]
def ensure_topic_defaults(topic: dict) -> dict:
topic.setdefault("keywords", [])
topic.setdefault("feeds", [])
topic.setdefault("github_repos", [])
topic.setdefault("exclude_keywords", [])
topic.setdefault("required_keywords", [])
topic.setdefault("alert_on_sentiment_shift", False)
topic.setdefault("frequency", "daily")
topic.setdefault("importance_threshold", "medium")
topic.setdefault("channels", ["telegram"])
topic.setdefault("context", "")
topic.setdefault("alert_on", [])
topic.setdefault("ignore_sources", [])
topic.setdefault("boost_sources", [])
return topic
def add_topic(args):
config = load_config()
topic_id = args.id or generate_id(args.name)
existing_ids = [t.get("id") for t in config.get("topics", [])]
if topic_id in existing_ids:
print(f"❌ Topic with ID '{topic_id}' already exists", file=sys.stderr)
sys.exit(1)
feeds = split_csv(args.feeds)
if args.discover_feeds:
discovered = []
for candidate in split_csv(args.discover_feeds):
discovered.extend(discover_feed_urls(candidate))
feeds = list(dict.fromkeys(feeds + discovered))
github_repos = split_csv(args.github_repos)
alert_on = split_csv(args.alert_on)
if github_repos and "github_release" not in alert_on:
alert_on.append("github_release")
topic = ensure_topic_defaults({
"id": topic_id,
"name": args.name,
"query": args.query or "",
"keywords": split_csv(args.keywords),
"feeds": feeds,
"github_repos": github_repos,
"exclude_keywords": split_csv(args.exclude_keywords),
"required_keywords": split_csv(args.required_keywords),
"frequency": args.frequency,
"importance_threshold": args.importance,
"channels": split_csv(args.channels) if args.channels else ["telegram"],
"context": args.context or "",
"alert_on": alert_on,
"alert_on_sentiment_shift": args.alert_on_sentiment_shift,
"ignore_sources": [],
"boost_sources": [],
})
config.setdefault("topics", []).append(topic)
save_config(config)
print(f"✅ Added topic: {args.name} ({topic_id})")
if topic.get("query"):
print(f" Query: {topic['query']}")
if topic.get("feeds"):
print(f" Feeds: {len(topic['feeds'])}")
if topic.get("github_repos"):
print(f" GitHub repos: {', '.join(topic['github_repos'])}")
if topic.get("required_keywords"):
print(f" Required keywords: {', '.join(topic['required_keywords'])}")
if topic.get("exclude_keywords"):
print(f" Exclude keywords: {', '.join(topic['exclude_keywords'])}")
def list_topics(args):
config = load_config()
topics = config.get("topics", [])
if not topics:
print("No topics configured")
return
print(f"\n📋 Configured Topics ({len(topics)})\n")
for topic in topics:
topic = ensure_topic_defaults(topic)
print(f"{'='*60}")
print(f"ID: {topic.get('id')}")
print(f"Name: {topic.get('name')}")
print(f"Query: {topic.get('query') or ''}")
print(f"Keywords: {', '.join(topic.get('keywords', [])) or ''}")
print(f"Feeds: {', '.join(topic.get('feeds', [])) or ''}")
print(f"GitHub: {', '.join(topic.get('github_repos', [])) or ''}")
print(f"Required: {', '.join(topic.get('required_keywords', [])) or ''}")
print(f"Excluded: {', '.join(topic.get('exclude_keywords', [])) or ''}")
print(f"Frequency: {topic.get('frequency')}")
print(f"Importance: {topic.get('importance_threshold')}")
print(f"Channels: {', '.join(topic.get('channels', []))}")
print(f"Sentiment shift alerts: {topic.get('alert_on_sentiment_shift')}")
if topic.get('context'):
print(f"Context: {topic.get('context')}")
print()
def edit_topic(args):
config = load_config()
topics = config.get("topics", [])
topic_idx = None
for idx, topic in enumerate(topics):
if topic.get("id") == args.topic_id:
topic_idx = idx
break
if topic_idx is None:
print(f"❌ Topic '{args.topic_id}' not found", file=sys.stderr)
sys.exit(1)
topic = ensure_topic_defaults(topics[topic_idx])
if args.name:
topic["name"] = args.name
if args.query is not None:
topic["query"] = args.query
if args.keywords is not None:
topic["keywords"] = split_csv(args.keywords)
if args.feeds is not None:
topic["feeds"] = split_csv(args.feeds)
if args.github_repos is not None:
topic["github_repos"] = split_csv(args.github_repos)
if topic["github_repos"] and "github_release" not in topic["alert_on"]:
topic["alert_on"].append("github_release")
if args.exclude_keywords is not None:
topic["exclude_keywords"] = split_csv(args.exclude_keywords)
if args.required_keywords is not None:
topic["required_keywords"] = split_csv(args.required_keywords)
if args.frequency:
topic["frequency"] = args.frequency
if args.importance:
topic["importance_threshold"] = args.importance
if args.channels:
topic["channels"] = split_csv(args.channels)
if args.context is not None:
topic["context"] = args.context
if args.alert_on is not None:
topic["alert_on"] = split_csv(args.alert_on)
if args.alert_on_sentiment_shift is not None:
topic["alert_on_sentiment_shift"] = args.alert_on_sentiment_shift
if args.discover_feeds:
discovered = []
for candidate in split_csv(args.discover_feeds):
discovered.extend(discover_feed_urls(candidate))
topic["feeds"] = list(dict.fromkeys(topic.get("feeds", []) + discovered))
topics[topic_idx] = topic
config["topics"] = topics
save_config(config)
print(f"✅ Updated topic: {topic.get('name')} ({args.topic_id})")
def remove_topic(args):
config = load_config()
topics = config.get("topics", [])
new_topics = [t for t in topics if t.get("id") != args.topic_id]
if len(new_topics) == len(topics):
print(f"❌ Topic '{args.topic_id}' not found", file=sys.stderr)
sys.exit(1)
config["topics"] = new_topics
save_config(config)
print(f"✅ Removed topic: {args.topic_id}")
def test_topic(args):
topic = get_topic(args.topic_id)
if not topic:
print(f"❌ Topic '{args.topic_id}' not found", file=sys.stderr)
sys.exit(1)
print(f"🧪 Testing topic: {topic.get('name')}\n")
state = load_state()
settings = get_settings()
monitor_topic(topic, state, settings, dry_run=True, verbose=True)
def discover_feed(args):
urls = discover_feed_urls(args.url)
if not urls:
print("No feeds discovered", file=sys.stderr)
sys.exit(1)
for url in urls:
print(url)
def import_opml(args):
config = load_config()
config.setdefault("topics", [])
tree = ET.parse(args.opml_file)
root = tree.getroot()
outlines = root.findall(".//outline")
added = 0
for outline in outlines:
xml_url = outline.attrib.get("xmlUrl") or outline.attrib.get("xmlurl")
title = outline.attrib.get("title") or outline.attrib.get("text") or outline.attrib.get("xmlUrl")
html_url = outline.attrib.get("htmlUrl") or outline.attrib.get("htmlurl")
if not xml_url:
continue
topic_name = title or xml_url
topic_id = generate_id(topic_name)
existing = next((t for t in config["topics"] if t.get("id") == topic_id), None)
feed_list = [xml_url]
if html_url:
feed_list.extend([u for u in discover_feed_urls(html_url) if u != xml_url])
if existing:
merged = list(dict.fromkeys(existing.get("feeds", []) + feed_list))
existing["feeds"] = merged
if html_url and not existing.get("query"):
existing["query"] = html_url
else:
topic = ensure_topic_defaults({
"id": topic_id,
"name": topic_name,
"query": html_url or "",
"keywords": [],
"feeds": list(dict.fromkeys(feed_list)),
"github_repos": [],
"exclude_keywords": [],
"required_keywords": [],
"frequency": args.frequency,
"importance_threshold": args.importance,
"channels": split_csv(args.channels) if args.channels else ["telegram"],
"context": args.context or "Imported from OPML",
"alert_on": [],
"alert_on_sentiment_shift": False,
"ignore_sources": [],
"boost_sources": [],
})
config["topics"].append(topic)
added += 1
save_config(config)
print(f"✅ Imported OPML: added {added} topic(s), updated matching topics where needed")
def main():
parser = argparse.ArgumentParser(description="Manage research topics")
subparsers = parser.add_subparsers(dest="command", required=True)
add_parser = subparsers.add_parser("add", help="Add a new topic")
add_parser.add_argument("name", help="Topic name")
add_parser.add_argument("--id", help="Custom topic ID")
add_parser.add_argument("--query", help="Search query")
add_parser.add_argument("--keywords", help="Comma-separated keywords")
add_parser.add_argument("--feeds", help="Comma-separated RSS/Atom feed URLs")
add_parser.add_argument("--discover-feeds", help="Comma-separated web URLs to auto-discover feeds from")
add_parser.add_argument("--github-repos", help="Comma-separated owner/repo values for GitHub release monitoring")
add_parser.add_argument("--exclude-keywords", help="Comma-separated keywords to filter out before scoring")
add_parser.add_argument("--required-keywords", help="Comma-separated keywords that must all appear")
add_parser.add_argument("--frequency", choices=["hourly", "daily", "weekly"], default="daily")
add_parser.add_argument("--importance", choices=["high", "medium", "low"], default="medium")
add_parser.add_argument("--channels", default="telegram", help="Comma-separated channels")
add_parser.add_argument("--context", help="Why this topic matters to you")
add_parser.add_argument("--alert-on", help="Comma-separated alert conditions")
add_parser.add_argument("--alert-on-sentiment-shift", action="store_true", help="Alert when sentiment changes from previous findings")
add_parser.set_defaults(func=add_topic)
list_parser = subparsers.add_parser("list", help="List all topics")
list_parser.set_defaults(func=list_topics)
edit_parser = subparsers.add_parser("edit", help="Edit a topic")
edit_parser.add_argument("topic_id", help="Topic ID to edit")
edit_parser.add_argument("--name", help="New name")
edit_parser.add_argument("--query", help="New query (empty string allowed)")
edit_parser.add_argument("--keywords", help="New keywords")
edit_parser.add_argument("--feeds", help="New feed URLs")
edit_parser.add_argument("--discover-feeds", help="Discover and append feeds from these URLs")
edit_parser.add_argument("--github-repos", help="New GitHub repos")
edit_parser.add_argument("--exclude-keywords", help="New exclude keywords")
edit_parser.add_argument("--required-keywords", help="New required keywords")
edit_parser.add_argument("--frequency", choices=["hourly", "daily", "weekly"])
edit_parser.add_argument("--importance", choices=["high", "medium", "low"])
edit_parser.add_argument("--channels", help="New channels")
edit_parser.add_argument("--context", help="New context")
edit_parser.add_argument("--alert-on", help="New alert conditions")
edit_parser.add_argument("--alert-on-sentiment-shift", action=argparse.BooleanOptionalAction, default=None, help="Toggle sentiment shift alerts")
edit_parser.set_defaults(func=edit_topic)
remove_parser = subparsers.add_parser("remove", help="Remove a topic")
remove_parser.add_argument("topic_id", help="Topic ID to remove")
remove_parser.set_defaults(func=remove_topic)
test_parser = subparsers.add_parser("test", help="Test a topic")
test_parser.add_argument("topic_id", help="Topic ID to test")
test_parser.set_defaults(func=test_topic)
discover_parser = subparsers.add_parser("discover-feed", help="Discover feed URLs from a webpage URL")
discover_parser.add_argument("url", help="Webpage URL to inspect")
discover_parser.set_defaults(func=discover_feed)
opml_parser = subparsers.add_parser("import-opml", help="Import RSS/Atom feeds from an OPML file")
opml_parser.add_argument("opml_file", help="Path to OPML file")
opml_parser.add_argument("--frequency", choices=["hourly", "daily", "weekly"], default="daily")
opml_parser.add_argument("--importance", choices=["high", "medium", "low"], default="medium")
opml_parser.add_argument("--channels", default="telegram", help="Comma-separated channels")
opml_parser.add_argument("--context", help="Context applied to imported topics")
opml_parser.set_defaults(func=import_opml)
args = parser.parse_args()
try:
args.func(args)
except FileNotFoundError as e:
print(f"{e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()

672
scripts/monitor.py Normal file
View File

@@ -0,0 +1,672 @@
#!/usr/bin/env python3
"""
Proactive Research Monitor
Checks topics due for monitoring, scores findings, and sends alerts.
Run via cron for automated monitoring.
"""
import os
import sys
import json
import hashlib
import argparse
import re
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from html.parser import HTMLParser
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urljoin
from urllib.request import Request, urlopen
SCRIPT_DIR = Path(__file__).parent
VENDOR_DIR = SCRIPT_DIR / "_vendor"
sys.path.insert(0, str(SCRIPT_DIR))
if VENDOR_DIR.exists():
sys.path.insert(0, str(VENDOR_DIR))
from config import (
load_config, load_state, save_state, get_settings,
save_finding, queue_alert
)
from importance_scorer import score_result
try:
import feedparser
except ImportError: # pragma: no cover - handled at runtime
feedparser = None
class FeedDiscoveryParser(HTMLParser):
"""Extract RSS/Atom alternate links from HTML."""
def __init__(self):
super().__init__()
self.feed_links = []
def handle_starttag(self, tag, attrs):
if tag.lower() != "link":
return
attr_map = {k.lower(): v for k, v in attrs}
rel = (attr_map.get("rel") or "").lower()
href = attr_map.get("href")
content_type = (attr_map.get("type") or "").lower()
if not href:
return
if "alternate" in rel and content_type in (
"application/rss+xml",
"application/atom+xml",
"application/xml",
"text/xml",
):
self.feed_links.append(href)
def hash_url(url: str) -> str:
return hashlib.md5(url.encode()).hexdigest()
def normalize_text_list(values: Optional[List[str]]) -> List[str]:
if not values:
return []
out = []
for value in values:
if value is None:
continue
text = str(value).strip()
if text:
out.append(text)
return out
def normalize_feed_url(url: str) -> str:
url = (url or "").strip()
if not url:
return ""
if "github.com" in url and "/releases" not in url and url.count("/") >= 4:
parts = url.rstrip("/").split("/")
if len(parts) >= 5:
owner = parts[-2]
repo = parts[-1]
return f"https://github.com/{owner}/{repo}/releases.atom"
return url
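# Example: a bare GitHub repository URL is rewritten to its releases feed,
# e.g. normalize_feed_url("https://github.com/kurtmckee/feedparser") returns
# "https://github.com/kurtmckee/feedparser/releases.atom"; any other URL
# passes through unchanged.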
def github_release_feed_url(repo: str) -> str:
repo = repo.strip().strip("/")
return f"https://github.com/{repo}/releases.atom"
def parse_http_date(value: Optional[str]) -> Optional[str]:
if not value:
return None
try:
return parsedate_to_datetime(value).astimezone(timezone.utc).isoformat()
except Exception:
return None
def discover_feed_urls(url: str, timeout: int = 15) -> List[str]:
"""Try to discover RSS/Atom feeds from a regular webpage URL."""
url = (url or "").strip()
if not url:
return []
if url.endswith((".rss", ".xml", ".atom")) or url.endswith("/feed"):
return [url]
request = Request(
url,
headers={
"User-Agent": "topic-monitor/1.5 (+feed-discovery)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
)
try:
with urlopen(request, timeout=timeout) as response:
content_type = response.headers.get("Content-Type", "")
final_url = response.geturl()
body = response.read(250_000)
except Exception:
return []
lowered_type = content_type.lower()
if "rss" in lowered_type or "atom" in lowered_type or "xml" in lowered_type:
return [final_url]
try:
html = body.decode("utf-8", errors="ignore")
except Exception:
return []
parser = FeedDiscoveryParser()
parser.feed(html)
discovered = [urljoin(final_url, href) for href in parser.feed_links]
common_guesses = [
urljoin(final_url, "/feed"),
urljoin(final_url, "/rss"),
urljoin(final_url, "/rss.xml"),
urljoin(final_url, "/atom.xml"),
urljoin(final_url, "/feeds/posts/default"),
]
seen = set()
ordered = []
for candidate in discovered + common_guesses:
candidate = candidate.strip()
if candidate and candidate not in seen:
seen.add(candidate)
ordered.append(candidate)
return ordered
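# Illustrative usage via the CLI flag wired up in main() below:
#   python3 scripts/monitor.py --discover-feed https://example.com/blog
# prints <link rel="alternate"> feeds first, then common guesses like /feed and /rss.xml.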
def is_duplicate(url: str, state: Dict, dedup_hours: int = 72) -> bool:
url_hash = hash_url(url)
dedup_map = state.get("deduplication", {}).get("url_hash_map", {})
if url_hash not in dedup_map:
return False
last_seen = datetime.fromisoformat(dedup_map[url_hash])
return (datetime.now() - last_seen) < timedelta(hours=dedup_hours)
def mark_as_seen(url: str, state: Dict):
if "deduplication" not in state:
state["deduplication"] = {"url_hash_map": {}}
state["deduplication"]["url_hash_map"][hash_url(url)] = datetime.now().isoformat()
def get_topic_state(state: Dict, topic_id: str) -> Dict:
if "topics" not in state:
state["topics"] = {}
if topic_id not in state["topics"]:
state["topics"][topic_id] = {}
return state["topics"][topic_id]
def get_feed_state(state: Dict, topic_id: str) -> Dict:
topic_state = get_topic_state(state, topic_id)
if "feeds" not in topic_state:
topic_state["feeds"] = {}
return topic_state["feeds"]
def feed_cache_headers(state: Dict, topic_id: str, feed_url: str) -> Dict[str, str]:
feed_state = get_feed_state(state, topic_id).get(feed_url, {})
headers = {"User-Agent": "topic-monitor/1.5 (+feedparser)"}
if feed_state.get("etag"):
headers["If-None-Match"] = feed_state["etag"]
if feed_state.get("modified"):
headers["If-Modified-Since"] = feed_state["modified"]
return headers
def update_feed_cache(state: Dict, topic_id: str, feed_url: str, parsed_feed):
cache = get_feed_state(state, topic_id).setdefault(feed_url, {})
if getattr(parsed_feed, "etag", None):
cache["etag"] = parsed_feed.etag
response_headers = getattr(parsed_feed, "headers", {}) or {}
if response_headers.get("etag"):
cache["etag"] = response_headers.get("etag")
if response_headers.get("last-modified"):
cache["modified"] = response_headers.get("last-modified")
cache["last_checked"] = datetime.now().isoformat()
if getattr(parsed_feed, "status", None) is not None:
cache["last_status"] = parsed_feed.status
href = getattr(parsed_feed.feed, "link", None) if getattr(parsed_feed, "feed", None) else None
if href:
cache["site_url"] = href
title = getattr(parsed_feed.feed, "title", None) if getattr(parsed_feed, "feed", None) else None
if title:
cache["feed_title"] = title
def iso_from_struct_time(struct_time_value) -> Optional[str]:
if not struct_time_value:
return None
try:
return datetime(*struct_time_value[:6], tzinfo=timezone.utc).isoformat()
except Exception:
return None
def entry_to_result(entry: Dict, feed_url: str, topic: Dict, feed_title: str = "") -> Dict:
title = entry.get("title", "Untitled")
summary = entry.get("summary", "") or entry.get("description", "")
link = entry.get("link", feed_url)
published = (
iso_from_struct_time(entry.get("published_parsed"))
or iso_from_struct_time(entry.get("updated_parsed"))
or entry.get("published")
or entry.get("updated")
or ""
)
tags = ", ".join(tag.get("term", "") for tag in entry.get("tags", []) if tag.get("term"))
source_label = feed_title or topic.get("name", "Feed")
result = {
"title": title,
"url": link,
"snippet": re.sub(r"\s+", " ", summary).strip(),
"published_date": published,
"source": "feed",
"feed_url": feed_url,
"feed_title": feed_title,
"source_label": source_label,
"tags": tags,
}
if "github.com" in feed_url and "/releases.atom" in feed_url:
repo = feed_url.split("github.com/")[-1].split("/releases.atom")[0]
result["source"] = "github_release"
result["github_repo"] = repo
result["title"] = f"{repo} release: {title}"
if not result["snippet"]:
result["snippet"] = f"New GitHub release published for {repo}."
return result
def search_topic(topic: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
query = topic.get("query", "")
if not query:
return []
web_search_plus = Path(os.environ.get(
"WEB_SEARCH_PLUS_PATH",
os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
"web-search-plus",
"scripts",
"search.py",
),
))
if web_search_plus.exists():
import subprocess
try:
safe_query = re.sub(r'[\x00-\x1f\x7f]', '', query)[:500]
if verbose:
print(f" 🔍 Searching via web-search-plus: {safe_query}")
result = subprocess.run(
["python3", str(web_search_plus), "--query", safe_query, "--max-results", "5"],
capture_output=True,
text=True,
timeout=45,
env={k: v for k, v in os.environ.items() if k in (
"PATH", "HOME", "LANG", "TERM",
"SERPER_API_KEY", "TAVILY_API_KEY", "EXA_API_KEY",
"YOU_API_KEY", "SEARXNG_INSTANCE_URL", "WSP_CACHE_DIR",
)},
)
if result.returncode == 0:
stdout = result.stdout.strip()
json_start = stdout.find('{')
if json_start >= 0:
data = json.loads(stdout[json_start:])
results = data.get("results", [])
if verbose:
print(f" ✅ Got {len(results)} search results from {data.get('provider', 'unknown')}")
for item in results:
item.setdefault("source", "web_search")
return results
elif verbose and result.stderr:
print(f" ⚠️ web-search-plus error: {result.stderr[:200]}", file=sys.stderr)
except subprocess.TimeoutExpired:
print(f"⚠️ web-search-plus timed out for query: {query}", file=sys.stderr)
except Exception as e:
print(f"⚠️ web-search-plus failed: {e}", file=sys.stderr)
elif verbose:
print(f" ⚠️ web-search-plus not found at {web_search_plus}", file=sys.stderr)
if dry_run:
return [{
"title": f"[Mock] Result for: {query}",
"url": f"https://example.com/mock-{hashlib.md5(query.encode()).hexdigest()[:8]}",
"snippet": f"This is a mock/test result for query: {query}. Run without --dry-run to use real search.",
"published_date": datetime.now().isoformat(),
"source": "web_search",
}]
return []
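# Assumed web-search-plus output shape, inferred from the JSON parsing above:
#   {"provider": "serper", "results": [{"title": "...", "url": "...", "snippet": "..."}]}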
def fetch_feed_results(topic: Dict, state: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
if feedparser is None:
if verbose:
print(" ⚠️ feedparser is not installed; skipping feeds", file=sys.stderr)
return []
topic_id = topic.get("id")
explicit_feeds = [normalize_feed_url(url) for url in normalize_text_list(topic.get("feeds", []))]
github_feeds = [github_release_feed_url(repo) for repo in normalize_text_list(topic.get("github_repos", []))]
all_feeds = []
for feed in explicit_feeds + github_feeds:
if feed and feed not in all_feeds:
all_feeds.append(feed)
if not all_feeds:
return []
results = []
for feed_url in all_feeds:
headers = feed_cache_headers(state, topic_id, feed_url)
if verbose:
print(f" 📰 Fetching feed: {feed_url}")
parsed = feedparser.parse(feed_url, request_headers=headers)
update_feed_cache(state, topic_id, feed_url, parsed)
status = getattr(parsed, "status", None)
if status == 304:
if verbose:
print(" ⏭️ Feed not modified (304)")
continue
if getattr(parsed, "bozo", False) and not getattr(parsed, "entries", []):
if verbose:
print(f" ⚠️ feed parse issue for {feed_url}: {getattr(parsed, 'bozo_exception', 'unknown')}", file=sys.stderr)
continue
feed_title = parsed.feed.get("title", "") if getattr(parsed, "feed", None) else ""
entries = getattr(parsed, "entries", [])[:10]
if verbose:
print(f" ✅ Got {len(entries)} feed entries")
for entry in entries:
results.append(entry_to_result(entry, feed_url, topic, feed_title=feed_title))
return results
def collect_results(topic: Dict, state: Dict, dry_run: bool = False, verbose: bool = False) -> List[Dict]:
results = []
results.extend(search_topic(topic, dry_run=dry_run, verbose=verbose))
results.extend(fetch_feed_results(topic, state, dry_run=dry_run, verbose=verbose))
return results
def passes_topic_filters(result: Dict, topic: Dict) -> Tuple[bool, str]:
title = result.get("title", "")
snippet = result.get("snippet", "")
content = f"{title}\n{snippet}".lower()
exclude_keywords = [k.lower() for k in normalize_text_list(topic.get("exclude_keywords", []))]
for keyword in exclude_keywords:
if keyword in content:
return False, f"excluded_by_{keyword}"
required_keywords = [k.lower() for k in normalize_text_list(topic.get("required_keywords", []))]
missing = [keyword for keyword in required_keywords if keyword not in content]
if missing:
return False, f"missing_required_{','.join(missing)}"
return True, ""
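# Illustrative: with exclude_keywords=["rumor"], a result titled "Rumor: new model"
# yields (False, "excluded_by_rumor"); with required_keywords=["gpu"] absent from
# the title and snippet, it yields (False, "missing_required_gpu").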
def should_check_topic(topic: Dict, state: Dict, force: bool = False) -> bool:
if force:
return True
topic_state = state.get("topics", {}).get(topic.get("id"), {})
last_check_str = topic_state.get("last_check")
if not last_check_str:
return True
last_check = datetime.fromisoformat(last_check_str)
now = datetime.now()
frequency = topic.get("frequency", "daily")
if frequency == "hourly":
return (now - last_check) >= timedelta(hours=1)
if frequency == "daily":
return (now - last_check) >= timedelta(days=1)
if frequency == "weekly":
return (now - last_check) >= timedelta(weeks=1)
return False
def check_rate_limits(topic: Dict, state: Dict, settings: Dict) -> bool:
topic_id = topic.get("id")
max_per_day = settings.get("max_alerts_per_day", 5)
max_per_topic_per_day = settings.get("max_alerts_per_topic_per_day", 2)
topic_state = state.get("topics", {}).get(topic_id, {})
alerts_today = topic_state.get("alerts_today", 0)
if alerts_today >= max_per_topic_per_day:
return False
total_alerts_today = sum(s.get("alerts_today", 0) for s in state.get("topics", {}).values())
return total_alerts_today < max_per_day
def sentiment_shifted(topic: Dict, state: Dict, new_sentiment: str) -> bool:
if not topic.get("alert_on_sentiment_shift"):
return False
history = get_topic_state(state, topic.get("id")).get("sentiment_history", [])
if not history:
return False
previous = history[-1].get("sentiment")
return bool(previous and previous != new_sentiment)
def record_sentiment(topic: Dict, state: Dict, result: Dict, sentiment: str):
topic_state = get_topic_state(state, topic.get("id"))
history = topic_state.setdefault("sentiment_history", [])
history.append({
"timestamp": datetime.now().isoformat(),
"sentiment": sentiment,
"url": result.get("url", ""),
"title": result.get("title", ""),
})
if len(history) > 50:
del history[:-50]
topic_state["last_sentiment"] = sentiment
def build_alert_message(topic: Dict, result: Dict, priority: str, score: float, reason: str, sentiment: str, sentiment_shift: bool) -> str:
emoji_map = {"high": "🔥", "medium": "📌", "low": "📝"}
source = result.get("source", "web_search")
source_label = {
"web_search": "🌐 Web",
"feed": "📰 Feed",
"github_release": "🚀 GitHub Release",
}.get(source, "🌐 Web")
lines = [f"{emoji_map.get(priority, '📌')} **{topic.get('name', 'Research Alert')}** {topic.get('emoji', '🔍')}", ""]
lines.append(f"**{result.get('title', 'Untitled')}**")
lines.append("")
snippet = result.get("snippet", "")
if snippet:
if len(snippet) > 320:
snippet = snippet[:317] + "..."
lines.append(snippet)
lines.append("")
context = topic.get("context", "")
if context:
lines.append(f"💡 _Context: {context}_")
lines.append("")
lines.append(f"{source_label}: {result.get('feed_title') or result.get('source_label') or result.get('url', '')}")
if result.get("url"):
lines.append(f"🔗 {result['url']}")
lines.append(f"📊 _Score: {score:.2f} | {reason}_")
lines.append(f"🙂 _Sentiment: {sentiment}_")
if sentiment_shift:
lines.append("🔄 _Sentiment shift detected_")
return "\n".join(lines)
def send_alert(topic: Dict, result: Dict, priority: str, score: float, reason: str, sentiment: str, sentiment_shift: bool = False, dry_run: bool = False):
channels = topic.get("channels", [])
message = build_alert_message(topic, result, priority, score, reason, sentiment, sentiment_shift)
if dry_run:
print(f"\n{'='*60}")
print("DRY RUN - Would send alert:")
print(f"Channels: {', '.join(channels)}")
print(f"Priority: {priority.upper()}")
print()
print(message)
print(f"{'='*60}\n")
return None
alert_ids = []
for channel in channels:
alert_data = {
"timestamp": datetime.now().isoformat(),
"priority": priority,
"channel": channel,
"topic_id": topic.get("id"),
"topic_name": topic.get("name"),
"title": result.get("title", ""),
"snippet": result.get("snippet", ""),
"url": result.get("url", ""),
"score": score,
"reason": reason,
"message": message,
"context": topic.get("context", ""),
"sentiment": sentiment,
"sentiment_shift": sentiment_shift,
"source": result.get("source", "web_search"),
"feed_url": result.get("feed_url", ""),
"github_repo": result.get("github_repo", ""),
}
alert_id = queue_alert(alert_data)
alert_ids.append(alert_id)
print(f"📢 ALERT_QUEUED: {json.dumps({'id': alert_id, 'channel': channel, 'priority': priority, 'topic': topic.get('name'), 'sentiment': sentiment})}")
return alert_ids
def monitor_topic(topic: Dict, state: Dict, settings: Dict, dry_run: bool = False, verbose: bool = False):
topic_id = topic.get("id")
topic_name = topic.get("name")
if verbose:
print(f"\n🔍 Checking topic: {topic_name} ({topic_id})")
results = collect_results(topic, state, dry_run=dry_run, verbose=verbose)
if verbose:
print(f" Found {len(results)} total results across all sources")
dedup_hours = settings.get("deduplication_window_hours", 72)
high_priority = []
medium_priority = []
for result in results:
url = result.get("url", "") or result.get("feed_url", "")
if not url:
continue
if is_duplicate(url, state, dedup_hours):
if verbose:
print(f" ⏭️ Skipping duplicate: {url}")
continue
passes, filter_reason = passes_topic_filters(result, topic)
if not passes:
if verbose:
print(f" 🚫 Filtered out: {filter_reason} - {result.get('title', '')[:60]}")
mark_as_seen(url, state)
continue
priority, score, reason, sentiment = score_result(result, topic, settings)
sentiment_shift = sentiment_shifted(topic, state, sentiment)
if sentiment_shift and priority == "medium":
priority = "high"
reason = f"{reason} + sentiment_shift"
elif sentiment_shift and priority == "low":
priority = "medium"
reason = f"{reason} + sentiment_shift"
if verbose:
print(f" {priority.upper():6} ({score:.2f}) [{sentiment}] - {result.get('title', '')[:55]}...")
if priority == "high":
high_priority.append((result, score, reason, sentiment, sentiment_shift))
elif priority == "medium":
medium_priority.append((result, score, reason, sentiment, sentiment_shift))
mark_as_seen(url, state)
if not dry_run:
record_sentiment(topic, state, result, sentiment)
for result, score, reason, sentiment, sentiment_shift in high_priority:
if check_rate_limits(topic, state, settings):
send_alert(topic, result, "high", score, reason, sentiment, sentiment_shift, dry_run=dry_run)
if not dry_run:
topic_state = get_topic_state(state, topic_id)
topic_state["alerts_today"] = topic_state.get("alerts_today", 0) + 1
elif verbose:
print(" ⚠️ Rate limit reached, skipping alert")
date_str = datetime.now().strftime("%Y-%m-%d")
for result, score, reason, sentiment, sentiment_shift in medium_priority:
if not dry_run:
save_finding(topic_id, date_str, {
"result": result,
"score": score,
"reason": reason,
"timestamp": datetime.now().isoformat(),
"sentiment": sentiment,
"sentiment_shift": sentiment_shift,
})
if verbose:
print(f" 💾 Saved to digest: {result.get('title', '')[:50]}...")
if not dry_run:
topic_state = get_topic_state(state, topic_id)
topic_state["last_check"] = datetime.now().isoformat()
topic_state["last_results_count"] = len(results)
topic_state["findings_count"] = topic_state.get("findings_count", 0) + len(medium_priority)
def main():
parser = argparse.ArgumentParser(description="Monitor research topics")
parser.add_argument("--dry-run", action="store_true", help="Don't send alerts or save state")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
parser.add_argument("--topic", help="Check specific topic by ID")
parser.add_argument("--force", action="store_true", help="Force check even if not due")
parser.add_argument("--frequency", choices=["hourly", "daily", "weekly"], help="Only check topics with this frequency")
parser.add_argument("--discover-feed", metavar="URL", help="Discover RSS/Atom feed links for a URL and exit")
args = parser.parse_args()
if args.discover_feed:
for item in discover_feed_urls(args.discover_feed):
print(item)
return
try:
config = load_config()
except FileNotFoundError as e:
print(f"{e}", file=sys.stderr)
sys.exit(1)
state = load_state()
settings = get_settings()
topics = config.get("topics", [])
if not topics:
print("⚠️ No topics configured", file=sys.stderr)
sys.exit(0)
topics_to_check = []
for topic in topics:
if args.topic and topic.get("id") != args.topic:
continue
if args.frequency and topic.get("frequency") != args.frequency:
continue
if should_check_topic(topic, state, force=args.force):
topics_to_check.append(topic)
if not topics_to_check:
if args.verbose:
print("✅ No topics due for checking")
sys.exit(0)
print(f"🔍 Monitoring {len(topics_to_check)} topic(s)...")
for topic in topics_to_check:
try:
monitor_topic(topic, state, settings, dry_run=args.dry_run, verbose=args.verbose)
except Exception as e:
print(f"❌ Error monitoring {topic.get('name')}: {e}", file=sys.stderr)
if args.verbose:
import traceback
traceback.print_exc()
if not args.dry_run:
save_state(state)
print("✅ State saved")
print("✅ Monitoring complete")
if __name__ == "__main__":
main()

174
scripts/process_alerts.py Normal file
View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Process pending alerts from the queue.
This script outputs alerts in a format that the OpenClaw agent can parse
and send via the message tool.
Usage:
python3 process_alerts.py # Show pending alerts
python3 process_alerts.py --json # Output as JSON for agent processing
python3 process_alerts.py --mark-sent # Mark all as sent (after agent sends them)
"""
import os
import sys
import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional
sys.path.insert(0, str(Path(__file__).parent))
from config import get_pending_alerts, mark_alert_sent, clear_old_alerts
def format_alert_message(alert: dict) -> str:
"""Format an alert as a nice message."""
if alert.get("message"):
return alert["message"]
# Build message from components
emoji_map = {"high": "🔥", "medium": "📌", "low": "📝"}
emoji = emoji_map.get(alert.get("priority", "medium"), "📌")
lines = [
f"{emoji} **{alert.get('topic_name', 'Alert')}**",
"",
f"**{alert.get('title', 'Untitled')}**",
"",
]
snippet = alert.get("snippet", "")
if snippet:
if len(snippet) > 300:
snippet = snippet[:297] + "..."
lines.append(snippet)
lines.append("")
if alert.get("context"):
lines.append(f"💡 _Context: {alert['context']}_")
lines.append("")
if alert.get("url"):
lines.append(f"🔗 {alert['url']}")
lines.append("")
lines.append(f"📊 _Score: {alert.get('score', 0):.2f} | {alert.get('reason', '')}_")
if alert.get("sentiment"):
lines.append(f"🙂 _Sentiment: {alert['sentiment']}_")
if alert.get("sentiment_shift"):
lines.append("🔄 _Sentiment shift detected_")
return "\n".join(lines)
def show_pending_alerts():
"""Show pending alerts in human-readable format."""
alerts = get_pending_alerts()
if not alerts:
print("✅ No pending alerts")
return
print(f"\n📬 Pending Alerts: {len(alerts)}\n")
for i, alert in enumerate(alerts, 1):
print(f"{'='*60}")
print(f"Alert #{i} - {alert.get('id', 'unknown')}")
print(f"{'='*60}")
print(f"Priority: {alert.get('priority', 'unknown').upper()}")
print(f"Channel: {alert.get('channel', 'telegram')}")
print(f"Topic: {alert.get('topic_name', 'unknown')}")
print(f"Title: {alert.get('title', 'untitled')[:60]}")
print(f"URL: {alert.get('url', '')}")
print(f"Score: {alert.get('score', 0):.2f}")
print(f"Timestamp: {alert.get('timestamp', '')}")
print()
print("Message Preview:")
print("-" * 40)
print(format_alert_message(alert))
print()
def output_json_for_agent():
"""Output alerts as JSON for agent processing."""
alerts = get_pending_alerts()
if not alerts:
print(json.dumps({"alerts": [], "count": 0}))
return
# Format alerts for agent
formatted = []
for alert in alerts:
formatted.append({
"id": alert.get("id"),
"priority": alert.get("priority", "medium"),
"channel": alert.get("channel", "telegram"),
"target": os.environ.get("TOPIC_MONITOR_TELEGRAM_ID", ""),
"topic_name": alert.get("topic_name"),
"title": alert.get("title"),
"url": alert.get("url"),
"score": alert.get("score", 0),
"message": format_alert_message(alert)
})
output = {
"alerts": formatted,
"count": len(formatted),
"timestamp": datetime.now().isoformat()
}
print(json.dumps(output, indent=2))
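# Sample output (illustrative, fields abridged):
#   {"alerts": [{"id": "a1b2", "priority": "high", "channel": "telegram",
#                "message": "🔥 **AI Models** ..."}], "count": 1, "timestamp": "..."}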
def mark_all_sent(alert_ids: Optional[list] = None):
"""Mark alerts as sent."""
alerts = get_pending_alerts()
if alert_ids:
# Mark specific alerts
for aid in alert_ids:
mark_alert_sent(aid)
print(f"✅ Marked as sent: {aid}")
else:
# Mark all pending
for alert in alerts:
mark_alert_sent(alert.get("id"))
print(f"✅ Marked as sent: {alert.get('id')}")
print(f"\n✅ Marked {len(alert_ids) if alert_ids else len(alerts)} alert(s) as sent")
def main():
parser = argparse.ArgumentParser(description="Process topic monitor alerts")
parser.add_argument("--json", action="store_true", help="Output as JSON for agent")
parser.add_argument("--mark-sent", nargs="*", metavar="ID",
help="Mark alert(s) as sent (all if no IDs given)")
parser.add_argument("--clear-old", type=int, metavar="HOURS",
help="Clear alerts older than HOURS (default: 168 = 7 days)")
args = parser.parse_args()
if args.clear_old:
clear_old_alerts(args.clear_old)
print(f"✅ Cleared alerts older than {args.clear_old} hours")
return
if args.mark_sent is not None:
# --mark-sent with or without IDs
if args.mark_sent:
mark_all_sent(args.mark_sent)
else:
mark_all_sent()
return
if args.json:
output_json_for_agent()
else:
show_pending_alerts()
if __name__ == "__main__":
main()

129
scripts/quick.py Normal file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Quick Start - One-liner topic monitoring setup.
"""
import sys
import argparse
import re
import json
from pathlib import Path
from datetime import datetime
sys.path.insert(0, str(Path(__file__).parent))
try:
from config import load_config, save_config, CONFIG_FILE
except ImportError:
CONFIG_FILE = Path(__file__).parent.parent / "config.json"
def load_config():
if CONFIG_FILE.exists():
with open(CONFIG_FILE) as f:
return json.load(f)
return {"topics": [], "settings": {}}
def save_config(config):
with open(CONFIG_FILE, 'w') as f:
json.dump(config, f, indent=2)
def generate_id(name: str) -> str:
topic_id = name.lower()
topic_id = re.sub(r'[^\w\s-]', '', topic_id)
topic_id = re.sub(r'[-\s]+', '-', topic_id)
return topic_id.strip('-')[:30]
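# Illustrative: generate_id("AI Models & Tools!") -> "ai-models-tools"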
def split_csv(value: str):
if not value:
return []
return [item.strip() for item in value.split(",") if item.strip()]
def quick_add(args):
config = load_config()
topic_id = generate_id(args.topic)
existing_ids = [t.get("id") for t in config.get("topics", [])]
if topic_id in existing_ids:
print(f"⚠️ Topic '{topic_id}' already exists. Use manage_topics.py to edit.")
sys.exit(1)
query = args.query or f"{args.topic} news updates"
if args.keywords:
keywords = split_csv(args.keywords)
else:
words = re.findall(r'\b[A-Za-z]{3,}\b', args.topic)
keywords = list(dict.fromkeys(words))[:5]
topic = {
"id": topic_id,
"name": args.topic,
"query": query,
"keywords": keywords,
"feeds": split_csv(args.feeds),
"github_repos": split_csv(args.github_repos),
"exclude_keywords": split_csv(args.exclude_keywords),
"required_keywords": split_csv(args.required_keywords),
"frequency": args.frequency,
"importance_threshold": args.importance,
"channels": [args.channel],
"context": args.context or f"Monitoring {args.topic}",
"alert_on": ["github_release"] if args.github_repos else [],
"alert_on_sentiment_shift": args.alert_on_sentiment_shift,
"created": datetime.now().isoformat(),
}
config.setdefault("topics", []).append(topic)
config.setdefault("settings", {
"digest_day": "sunday",
"digest_time": "18:00",
"max_alerts_per_day": 5,
"deduplication_window_hours": 72,
})
save_config(config)
print()
print("✅ Topic created!")
print()
print(f" 📌 {args.topic}")
print(f" 🔍 Query: {query or ''}")
print(f" 🏷️ Keywords: {', '.join(keywords) or ''}")
if topic['feeds']:
print(f" 📰 Feeds: {', '.join(topic['feeds'])}")
if topic['github_repos']:
print(f" 🚀 GitHub repos: {', '.join(topic['github_repos'])}")
print(f" ⏰ Frequency: {args.frequency}")
print(f" 🔔 Alert threshold: {args.importance}")
print(f" 📱 Channel: {args.channel}")
print()
print("Next steps:")
print(f" • Test: python3 scripts/monitor.py --topic {topic_id} --dry-run --verbose")
print(f" • Run: python3 scripts/monitor.py --topic {topic_id}")
print(f" • Edit: python3 scripts/manage_topics.py edit {topic_id} --frequency hourly")
print(f" • Remove: python3 scripts/manage_topics.py remove {topic_id}")
print()
return topic_id
def main():
parser = argparse.ArgumentParser(description="Quick Start - Add a topic to monitor in one command")
parser.add_argument("topic", help="Topic name to monitor")
parser.add_argument("--query", "-q", help="Custom search query")
parser.add_argument("--keywords", "-k", help="Comma-separated keywords to watch for")
parser.add_argument("--feeds", help="Comma-separated RSS/Atom feed URLs")
parser.add_argument("--github-repos", help="Comma-separated owner/repo values for release monitoring")
parser.add_argument("--exclude-keywords", help="Comma-separated keywords to filter out")
parser.add_argument("--required-keywords", help="Comma-separated keywords that must all appear")
parser.add_argument("--frequency", "-f", choices=["hourly", "daily", "weekly"], default="daily")
parser.add_argument("--importance", "-i", choices=["high", "medium", "low"], default="medium")
parser.add_argument("--channel", "-c", default="telegram", help="Where to send alerts")
parser.add_argument("--context", help="Why this topic matters to you")
parser.add_argument("--alert-on-sentiment-shift", action="store_true", help="Alert when sentiment changes")
args = parser.parse_args()
quick_add(args)
if __name__ == "__main__":
main()

240
scripts/setup.py Normal file
View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Interactive onboarding wizard for topic-monitor skill.
Runs on first use when no config.json exists.
"""
import json
import sys
from pathlib import Path
SKILL_DIR = Path(__file__).parent.parent
CONFIG_FILE = SKILL_DIR / "config.json"
def print_welcome():
print()
print("=" * 55)
print(" 🔍 Topic Monitor - Setup Wizard")
print("=" * 55)
print()
print("Welcome! Let's set up your personal topic monitoring.")
print("You can mix web search, RSS/Atom feeds, and GitHub release feeds.")
print()
def prompt(question: str, default: str = None) -> str:
question = f"{question} [{default}]: " if default else f"{question}: "
response = input(question).strip()
return response if response else (default or "")
def prompt_yes_no(question: str, default: bool = True) -> bool:
default_hint = "Y/n" if default else "y/N"
response = input(f"{question} ({default_hint}): ").strip().lower()
if not response:
return default
return response in ('y', 'yes', 'ja', 'si', 'oui')
def prompt_choice(question: str, choices: list, default: str = None) -> str:
print(f"\n{question}")
for i, choice in enumerate(choices, 1):
marker = " *" if choice == default else ""
print(f" {i}. {choice}{marker}")
while True:
response = input(f"\nEnter number or value [{default or choices[0]}]: ").strip()
if not response:
return default or choices[0]
try:
idx = int(response)
if 1 <= idx <= len(choices):
return choices[idx - 1]
except ValueError:
pass
for choice in choices:
if choice.lower() == response.lower():
return choice
print(f" Please enter a number 1-{len(choices)} or a valid option.")
def prompt_multiline(question: str, hint: str = None) -> list:
print(f"\n{question}")
if hint:
print(f" {hint}")
print(" (Enter each item on a new line. Empty line when done)")
print()
items = []
while True:
line = input(" > ").strip()
if not line:
break
items.append(line)
return items
def prompt_csv(question: str) -> list:
response = input(f" {question}: ").strip()
if not response:
return []
return [item.strip() for item in response.split(",") if item.strip()]
def create_topic_id(name: str) -> str:
topic_id = name.lower().replace(" ", "-")
topic_id = "".join(c for c in topic_id if c.isalnum() or c == "-")
topic_id = "-".join(filter(None, topic_id.split("-")))
return topic_id[:30]
def gather_topics() -> list:
topics = []
print("-" * 55)
print("📋 STEP 1: Topics to Monitor")
print("-" * 55)
topic_names = prompt_multiline(
"What topics do you want to monitor?",
"Examples: AI Models, Security Alerts, Product Updates"
)
if not topic_names:
print("\n⚠️ No topics entered. You can add them later with manage_topics.py")
return []
for i, name in enumerate(topic_names, 1):
print(f"\n--- Topic {i}/{len(topic_names)}: {name} ---")
query = prompt(f" Search query for '{name}'", f"{name} news updates")
keywords = prompt_csv(f"Keywords for '{name}' (comma-separated)")
feeds = prompt_csv(f"RSS/Atom feeds for '{name}' (comma-separated, optional)")
github_repos = prompt_csv("GitHub repos for release monitoring (owner/repo, optional)")
required_keywords = prompt_csv("Required keywords (all must appear, optional)")
exclude_keywords = prompt_csv("Exclude keywords (filter out if present, optional)")
alert_on_sentiment_shift = prompt_yes_no("Alert on sentiment shift?", default=False)
topic = {
"id": create_topic_id(name),
"name": name,
"query": query,
"keywords": keywords,
"feeds": feeds,
"github_repos": github_repos,
"required_keywords": required_keywords,
"exclude_keywords": exclude_keywords,
"frequency": None,
"importance_threshold": None,
"channels": ["telegram"],
"context": "",
"alert_on": ["keyword_exact_match"] + (["github_release"] if github_repos else []),
"alert_on_sentiment_shift": alert_on_sentiment_shift,
"ignore_sources": [],
"boost_sources": [],
}
topics.append(topic)
return topics
def gather_settings() -> dict:
print()
print("-" * 55)
print("⚙️ STEP 2: Monitoring Settings")
print("-" * 55)
frequency = prompt_choice("How often should I check for updates?", ["hourly", "daily", "weekly"], default="daily")
importance = prompt_choice("Importance threshold for alerts?", ["low", "medium", "high"], default="medium")
digest_enabled = prompt_yes_no("Enable weekly digest?", default=True)
digest_day = "sunday"
if digest_enabled:
digest_day = prompt_choice("Which day should I send the digest?", ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"], default="sunday")
return {
"frequency": frequency,
"importance_threshold": importance,
"digest_enabled": digest_enabled,
"digest_day": digest_day,
"digest_time": "18:00",
"max_alerts_per_day": 5,
"max_alerts_per_topic_per_day": 2,
"deduplication_window_hours": 72,
"learning_enabled": True,
"quiet_hours": {"enabled": False, "start": "22:00", "end": "08:00"},
}
def build_config(topics: list, settings: dict) -> dict:
frequency = settings.pop("frequency")
importance = settings.pop("importance_threshold")
for topic in topics:
topic["frequency"] = frequency
topic["importance_threshold"] = importance
return {
"topics": topics,
"settings": settings,
"channels": {
"telegram": {
"enabled": True,
"chat_id": None,
"silent": False,
"effects": {"high_importance": "🔥", "medium_importance": "📌"},
},
"discord": {"enabled": False, "webhook_url": None, "username": "Topic Monitor", "avatar_url": None},
"email": {
"enabled": False,
"to": None,
"from": "monitor@yourdomain.com",
"smtp_server": "smtp.gmail.com",
"smtp_port": 587,
"smtp_user": None,
"smtp_password": None,
},
},
}
def save_config(config: dict):
with open(CONFIG_FILE, 'w') as f:
json.dump(config, f, indent=2)
def print_summary(config: dict):
print()
print("=" * 55)
print(" ✅ Setup Complete!")
print("=" * 55)
print()
for topic in config.get("topics", []):
print(f"{topic['name']}")
print(f" Query: {topic['query'] or ''}")
print(f" Keywords: {', '.join(topic.get('keywords', [])) or ''}")
print(f" Feeds: {', '.join(topic.get('feeds', [])) or ''}")
print(f" GitHub repos: {', '.join(topic.get('github_repos', [])) or ''}")
print(f" Required keywords: {', '.join(topic.get('required_keywords', [])) or ''}")
print(f" Exclude keywords: {', '.join(topic.get('exclude_keywords', [])) or ''}")
print(f" Sentiment shift alerts: {topic.get('alert_on_sentiment_shift')}")
print()
print("Test with: python3 scripts/monitor.py --dry-run --verbose")
print()
def main():
if CONFIG_FILE.exists():
print("\n⚠️ config.json already exists!\n")
if not prompt_yes_no("Do you want to start fresh and overwrite it?", default=False):
print("\nKeeping existing config. Use manage_topics.py to edit topics.")
sys.exit(0)
print_welcome()
try:
topics = gather_topics()
settings = gather_settings()
config = build_config(topics, settings)
save_config(config)
print_summary(config)
except KeyboardInterrupt:
print("\n\n⚠️ Setup cancelled. No changes made.")
sys.exit(1)
except EOFError:
print("\n\n⚠️ Input ended. No changes made.")
sys.exit(1)
if __name__ == "__main__":
main()

86
scripts/setup_cron.py Normal file
View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Generate cron job configuration for proactive research monitoring.
Outputs JSON that the agent can use with OpenClaw's cron tool.
Does NOT modify crontab directly.
"""
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from config import load_config, get_settings
SKILL_DIR = Path(__file__).parent.parent
MONITOR_SCRIPT = SKILL_DIR / "scripts" / "monitor.py"
DIGEST_SCRIPT = SKILL_DIR / "scripts" / "digest.py"
def generate_cron_config(settings: dict) -> dict:
"""Generate cron configuration as JSON for the agent."""
digest_day = settings.get("digest_day", "sunday")
digest_time = settings.get("digest_time", "18:00")
hour, minute = digest_time.split(":")
day_map = {
"sunday": "0", "monday": "1", "tuesday": "2", "wednesday": "3",
"thursday": "4", "friday": "5", "saturday": "6"
}
day_num = day_map.get(digest_day.lower(), "0")
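# e.g. digest_day="sunday", digest_time="18:00" -> cron schedule "0 18 * * 0"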
return {
"jobs": [
{
"name": "topic-monitor-hourly",
"description": "Hourly topic check",
"schedule": "0 * * * *",
"command": f"cd {SKILL_DIR} && python3 {MONITOR_SCRIPT} --frequency hourly"
},
{
"name": "topic-monitor-daily",
"description": "Daily topic check (9 AM)",
"schedule": "0 9 * * *",
"command": f"cd {SKILL_DIR} && python3 {MONITOR_SCRIPT} --frequency daily"
},
{
"name": "topic-monitor-weekly",
"description": "Weekly topic check (Sunday 9 AM)",
"schedule": "0 9 * * 0",
"command": f"cd {SKILL_DIR} && python3 {MONITOR_SCRIPT} --frequency weekly"
},
{
"name": "topic-monitor-digest",
"description": f"Weekly digest ({digest_day} {digest_time})",
"schedule": f"{minute} {hour} * * {day_num}",
"command": f"cd {SKILL_DIR} && python3 {DIGEST_SCRIPT} --send"
}
],
"note": "Use these with OpenClaw's cron tool. Do NOT run setup_cron.py --auto."
}
def main():
import argparse
parser = argparse.ArgumentParser(description="Generate cron config for topic monitoring")
parser.add_argument("--json", action="store_true", default=True, help="Output as JSON (default)")
parser.parse_args()
try:
config = load_config()
settings = get_settings()
except FileNotFoundError as e:
print(json.dumps({"error": str(e)}))
sys.exit(1)
result = generate_cron_config(settings)
print(json.dumps(result, indent=2))
if __name__ == "__main__":
main()