import io
import itertools
import re
import bs4
import bs4.element
class BeautifulSoup(bs4.BeautifulSoup):
"""A wrapper of the original BeautifulSoup class, with convenience methods added."""
# Tag name -> attribute name for the tags handled as images.
# NOTE(review): presumably the attribute that carries the image URL for
# that tag -- confirm against the code that reads this mapping.
IMAGE_ATTRS = {'img': 'src', 'video': 'poster'}

# Tags whose strings are never counted as body text (see _body_text).
NON_BODY_TEXT_TAGS = frozenset(('img', 'video'))

# Matches the tail of a string that ends a sentence: one of '.', '?' or '!'
# followed only by optional whitespace and non-word characters (for example
# a closing quote or bracket) up to the end of the string.
SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')
def __init__(self, src, parser='html5lib'):
    """Parse ``src`` with the named parser.

    Both arguments are handed straight to ``bs4.BeautifulSoup``: ``src``
    as the markup and ``parser`` as the parser selection.
    """
    # WARNING: 'lxml' looks like the better default for speed, but it does
    # not work in our web application.  On Debian stretch, at least, using
    # lxml sends the web server's WSGI application into an infinite loop.
    super().__init__(markup=src, features=parser)
def _body_text(self, root):
# "Body text" is all the strings under the root element, in order,
# except:
# * strings inside NON_BODY_TEXT_TAGS
# * strings inside containers of NON_BODY_TEXT_TAGS. A container is
# an element that has a NON_BODY_TEXT_TAGS element as its first child.
# For example, in <div><video ...>...</video> caption</div>, none of the
# div's strings
# are included in the body text, because it's considered to be a
#