2017-11-07 16:17:33 +00:00
|
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
|
2021-11-26 02:49:40 +00:00
|
|
|
from builtins import object
|
2017-11-07 16:17:33 +00:00
|
|
|
import io
|
2018-09-21 14:57:14 +00:00
|
|
|
import itertools
|
2017-11-07 16:17:33 +00:00
|
|
|
import re
|
|
|
|
|
|
|
|
import bs4
|
|
|
|
import bs4.element
|
|
|
|
|
|
|
|
class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added."""

    # Maps an image-bearing tag name to the attribute that holds its URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }

    # Tags whose contained strings are never part of the body text.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])

    # Matches a string that ends like a sentence (terminal punctuation,
    # possibly followed by whitespace and closing punctuation).
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        # WARNING! It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application. On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        # "Body text" is all the strings under the root element, in order,
        # except:
        # * strings inside NON_BODY_TEXT_TAGS
        # * strings inside containers of NON_BODY_TEXT_TAGS. A container is
        #   an element that has a NON_BODY_TEXT_TAGS element as its first
        #   child. For example, in <div> <video …> … </div>, none of the
        #   div's strings are included in the body text, because it's
        #   considered to be a <video> container, and any strings are
        #   probably a caption, fallback text, or other non-body text.
        seen_text = False
        for node in root.children:
            node_type = type(node)
            if issubclass(node_type, bs4.element.Tag):
                if node.name in self.NON_BODY_TEXT_TAGS:
                    # A non-body tag before any text marks this root as a
                    # container: abandon it entirely. After text has
                    # started, just skip the tag.
                    if not seen_text:
                        break
                    continue
                for piece in self._body_text(node):
                    yield piece
            elif node_type is bs4.element.NavigableString:
                # Exact type comparison on purpose: subclasses like
                # Comment and CData are not body text, so issubclass
                # would be wrong here.
                if seen_text:
                    yield node
                elif not node.isspace():
                    # First non-whitespace string: body text has begun.
                    seen_text = True
                    yield node

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters. If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them. We can implement something smarter later if needed.
        yielded_chars = 0
        for text in self.body_text():
            yield text
            yielded_chars += len(text)
            if (yielded_chars > char_target) and self.SENTENCE_END.search(text):
                return

    @staticmethod
    def is_video_source(elem):
        # True for a <source> element directly inside a <video> element.
        try:
            return elem.parent.name == 'video' and elem.name == 'source'
        except AttributeError:
            # Not tag-like (no .name / no .parent): not a video source.
            return False

    def iter_images(self):
        """Return an iterator of all image elements in this document.

        Images include <img> and <video> with a poster attribute.
        """
        image_tag_names = list(self.IMAGE_ATTRS.keys())
        for candidate in self.find_all(image_tag_names):
            url_attr = self.IMAGE_ATTRS[candidate.name]
            # Only elements that actually carry their URL attribute count.
            if candidate.has_attr(url_attr):
                yield candidate

    def iter_videos(self):
        """Return an iterator of all video source elements in this document."""
        return self.find_all(self.is_video_source, src=True)
|
2017-11-07 16:17:33 +00:00
|
|
|
|
|
|
|
|
2021-11-26 02:49:40 +00:00
|
|
|
class SoupModelMixin(object):
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them. After that, all the public methods
    are usable.
    """

    # Attribute elements use to declare their Open Graph preview priority.
    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
        # Lazily build and cache one soup from the concatenated HTML of
        # every attribute named in SOUP_ATTRS.
        try:
            return self._soup
        except AttributeError:
            html = io.StringIO()
            for attr_name in self.SOUP_ATTRS:
                html.write(getattr(self, attr_name))
            html.seek(0)
            self._soup = BeautifulSoup(html)
            return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        # Return a sort-key function that reads `attr_name` from an element
        # via `getvalue`. Elements missing the attribute, or whose value
        # `getvalue` rejects, sort last with `fallback`. The parsed key is
        # written back onto the element so a later predicate (see
        # _elem_pred) can read it without re-parsing.
        def elem_sort_key(elem):
            try:
                sort_key = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                sort_key = fallback
            elem[attr_name] = sort_key
            return sort_key
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        # Return a predicate applying `test` to the element's `attr_name`
        # value. Assumes the value was already normalized by the sort key
        # from _elem_key (so the attribute is present and numeric).
        def elem_pred(elem):
            return test(elem[attr_name])
        return elem_pred

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        # Sort by `elem_key`, keep only elements passing `pred`, and
        # optionally slice with islice-style `slice_args`.
        #
        # BUG FIX: this previously called itertools.ifilter, which exists
        # only in Python 2 and raises AttributeError on Python 3. The
        # built-in filter() is the compatible replacement (lazy on
        # Python 3; the eager Python 2 list is still a valid iterable for
        # every caller here).
        seq = filter(pred, sorted(elem_seq, key=elem_key))
        if slice_args:
            return itertools.islice(seq, *slice_args)
        else:
            return seq

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> sources and <video> poster attributes.
        """
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_images(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]

    def get_one_image_url(self):
        """Return an iterator yielding at most one image URL string."""
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML."""
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_videos(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem['src']

    def get_one_video_url(self):
        """Return an iterator yielding at most one video URL string."""
        return self.get_video_urls(1)
|