website/conservancy/bsoup.py

181 lines
6.3 KiB
Python

import io
import itertools
import re
import bs4
import bs4.element
class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added."""

    # Maps each tag name treated as an "image" to the attribute that holds
    # its URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }
    # Tags whose own contents are never descended into for body text.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])
    # Matches a string that ends with '.', '?' or '!', optionally followed
    # by whitespace and non-word characters.
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Parse `src` using the parser named by `parser`."""
        # WARNING! It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application.  On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super().__init__(src, parser)

    def _body_text(self, root):
        """Yield the body-text strings found under `root`, in document order.

        "Body text" is all the strings under the root element, in order,
        except:

        * strings inside NON_BODY_TEXT_TAGS
        * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
          an element that has a NON_BODY_TEXT_TAGS element as its first
          child.  For example, in <div> <video …> … </div>, none of the
          div's strings are included in the body text, because it's
          considered to be a <video> container, and any strings are
          probably a caption, fallback text, or other non-body text.

        As implemented, "first child" means: a NON_BODY_TEXT_TAGS element
        is reached before any non-whitespace string that is a *direct*
        child of `root`.  Strings yielded from nested elements do not count
        towards that check.  Leading whitespace-only strings are skipped.
        """
        started = False
        for child in root.children:
            if isinstance(child, bs4.element.Tag):
                if child.name not in self.NON_BODY_TEXT_TAGS:
                    yield from self._body_text(child)
                elif not started:
                    # `root` is a container of non-body content: drop the
                    # rest of its children.
                    break
                # Otherwise this is a non-body tag after text has already
                # started: skip just that tag and carry on.
            # It's not worth it to use isinstance here, because elements
            # that don't have body text like Comments and CDATA are
            # subclasses of NavigableString.
            elif type(child) is bs4.element.NavigableString:
                if started or not child.isspace():
                    yield child
                    started = True

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters.  If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them.  We can implement something smarter later if needed.
        char_count = 0
        for s in self.body_text():
            yield s
            char_count += len(s)
            # `>=` so that hitting the target exactly counts as "at least".
            if char_count >= char_target and self.SENTENCE_END.search(s):
                break

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> whose parent is a <video>.

        Objects lacking `name` or `parent.name` (AttributeError) are
        reported as False rather than raising.
        """
        try:
            return elem.name == 'source' and elem.parent.name == 'video'
        except AttributeError:
            return False

    def iter_images(self):
        """Return an iterator of all image elements in this document.

        Images include <img> and <video> with a poster attribute.  An
        element is only yielded when it actually carries the URL attribute
        listed for its tag in IMAGE_ATTRS.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS)):
            if elem.has_attr(self.IMAGE_ATTRS[elem.name]):
                yield elem

    def iter_videos(self):
        """Return all video source elements in this document.

        These are the elements accepted by is_video_source that also have
        a `src` attribute.  The result of find_all() is returned as-is.
        """
        return self.find_all(self.is_video_source, src=True)
class SoupModelMixin:
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public
    methods are usable.

    Image and video elements are ordered by the integer value of their
    `OG_PREVIEW_ATTR` attribute.  Elements where it is missing or not an
    integer sort last, and elements whose value is not positive are left out.
    """

    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
        """Return the parsed document, building and caching it on first use.

        The HTML is the concatenation, in `SOUP_ATTRS` order, of the named
        attributes of this object.
        """
        if not hasattr(self, '_soup'):
            buffer = io.StringIO()
            for name in self.SOUP_ATTRS:
                buffer.write(getattr(self, name))
            # Rewind so the parser reads from the start.
            buffer.seek(0)
            self._soup = BeautifulSoup(buffer)
        return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        """Return a sort-key function for elements.

        The key is `getvalue(elem[attr_name])`, or `fallback` when the
        attribute is missing (KeyError) or cannot be converted (ValueError).
        As a side effect the key is written back to `elem[attr_name]`.
        """
        def elem_sort_key(elem):
            try:
                rank = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                rank = fallback
            # Store the normalised value on the element for later reads.
            elem[attr_name] = rank
            return rank
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        """Return a predicate that applies `test` to `elem[attr_name]`."""
        return lambda elem: test(elem[attr_name])

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        """Sort `elem_seq` by `elem_key`, keep elements passing `pred`.

        The sort runs in full before any filtering.  When `slice_args` are
        given they are passed to itertools.islice over the filtered result.
        """
        ordered = sorted(elem_seq, key=elem_key)
        kept = filter(pred, ordered)
        return itertools.islice(kept, *slice_args) if slice_args else kept

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        pieces = self._get_soup().some_body_text()
        return ''.join(pieces)

    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> sources and <video> poster attributes.
        Optional `slice_args` are forwarded to _sort_and_slice_elems.
        """
        images = self._sort_and_slice_elems(
            self._get_soup().iter_images(),
            self._elem_key(),
            self._elem_pred(),
            *slice_args
        )
        for image in images:
            url_attr = BeautifulSoup.IMAGE_ATTRS[image.name]
            yield image[url_attr]

    def get_one_image_url(self):
        """Return an iterator that yields at most one image URL."""
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML.

        Optional `slice_args` are forwarded to _sort_and_slice_elems.
        """
        videos = self._sort_and_slice_elems(
            self._get_soup().iter_videos(),
            self._elem_key(),
            self._elem_pred(),
            *slice_args
        )
        for video in videos:
            yield video['src']

    def get_one_video_url(self):
        """Return an iterator that yields at most one video URL."""
        return self.get_video_urls(1)