# -*- encoding: utf-8 -*-
from builtins import filter
from builtins import object

import io
import itertools
import re

import bs4
import bs4.element


class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience
    methods added."""

    # Maps tag name -> attribute that holds the tag's image URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }

    # Tags whose string contents never count as body text (see _body_text).
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])

    # Heuristic for "this string ends a sentence": sentence-final
    # punctuation, optionally followed by closing quotes/brackets and
    # trailing whitespace.
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Parse `src` (markup string or file-like object) using `parser`.

        WARNING! It seems like it would be ideal to use the 'lxml' parser
        for speed, but that doesn't work in our web application.  On
        Debian stretch, at least, using lxml causes the web server WSGI
        application to go into an infinite loop.
        """
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        """Yield the body text strings under `root`, in document order.

        "Body text" is all the strings under the root element, in order,
        except:
        * strings inside NON_BODY_TEXT_TAGS
        * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
          an element that has a NON_BODY_TEXT_TAGS element as its first
          child.  For example, in <div> <video ...> ... </div>, none of the
          div's strings are included in the body text, because it's
          considered to be a <video> container, and any strings are
          probably a caption, fallback text, or other non-body text.
        """
        started = False
        for child in root.children:
            child_type = type(child)
            if issubclass(child_type, bs4.element.Tag):
                if child.name in self.NON_BODY_TEXT_TAGS:
                    if not started:
                        # A non-body tag appeared before any text: treat
                        # `root` as a container and yield nothing from it.
                        break
                else:
                    for s in self._body_text(child):
                        yield s
            # It's not worth it to use issubclass here, because elements
            # that don't have body text like Comments and CDATA are
            # subclasses of NavigableString.
            elif child_type is bs4.element.NavigableString:
                if started:
                    yield child
                elif child.isspace():
                    # Skip leading whitespace-only strings.
                    pass
                else:
                    yield child
                    started = True

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters.  If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them.  We can implement something smarter later if needed.
        char_count = 0
        for s in self.body_text():
            yield s
            char_count += len(s)
            if (char_count > char_target) and self.SENTENCE_END.search(s):
                break

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> element directly inside <video>."""
        try:
            return elem.name == 'source' and elem.parent.name == 'video'
        except AttributeError:
            # Not a tag, or it has no parent.
            return False

    def iter_images(self):
        """Return an iterator of all image elements in this document.

        Images include <img> and <video> with a poster attribute.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
            try:
                elem[self.IMAGE_ATTRS[elem.name]]
            except KeyError:
                # Element has no source/poster attribute; not a usable image.
                pass
            else:
                yield elem

    def iter_videos(self):
        """Return an iterator of all video source elements in this document."""
        return self.find_all(self.is_video_source, src=True)


class SoupModelMixin(object):
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public
    methods are usable.
    """

    # Element attribute that carries an explicit preview sort order.
    OG_PREVIEW_ATTR = 'data-ogpreview'

    # Override in subclasses: names of attributes containing HTML.
    SOUP_ATTRS = []

    def _get_soup(self):
        """Return a BeautifulSoup of this object's SOUP_ATTRS HTML, cached
        on the instance as `self._soup`."""
        try:
            return self._soup
        except AttributeError:
            # Concatenating the attributes' HTML once is simpler than the
            # previous StringIO write/seek dance and parses identically.
            html = u''.join(
                getattr(self, attr_name) for attr_name in self.SOUP_ATTRS)
            self._soup = BeautifulSoup(html)
            return self._soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int,
                  fallback=999999):
        """Return a sort key function reading elements' `attr_name`.

        The returned function also writes the computed key back onto the
        element (missing or unparseable values become `fallback`).  Because
        sorted() calls the key on every element before filtering happens,
        predicates from _elem_pred can then read the attribute without any
        error handling.
        """
        def elem_sort_key(elem):
            try:
                sort_key = getvalue(elem[attr_name])
            except (KeyError, ValueError):
                sort_key = fallback
            elem[attr_name] = sort_key
            return sort_key
        return elem_sort_key

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        """Return a predicate on elements' normalized `attr_name` value.

        With the defaults, elements explicitly marked with a non-positive
        `data-ogpreview` are excluded, while unmarked elements get the
        large fallback value from _elem_key and pass.  Assumes _elem_key
        has already normalized the attribute (see _sort_and_slice_elems).
        """
        def elem_pred(elem):
            return test(elem[attr_name])
        return elem_pred

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        """Sort `elem_seq` by `elem_key`, filter by `pred`, and slice.

        `slice_args`, if given, are passed to itertools.islice.  Sorting
        runs `elem_key` on every element first, which normalizes the
        attribute `pred` reads.
        """
        seq = filter(pred, sorted(elem_seq, key=elem_key))
        if slice_args:
            return itertools.islice(seq, *slice_args)
        else:
            return seq

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> sources and <video> poster attributes.
        """
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_images(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]

    def get_one_image_url(self):
        """Return an iterator of at most one image URL string.

        NOTE(review): despite the name, this returns an iterator (of the
        top-ranked image URL), not a plain string — callers must iterate.
        """
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML."""
        for elem in self._sort_and_slice_elems(
                self._get_soup().iter_videos(),
                self._elem_key(),
                self._elem_pred(),
                *slice_args
        ):
            yield elem['src']

    def get_one_video_url(self):
        """Return an iterator of at most one video URL string.

        NOTE(review): like get_one_image_url, this returns an iterator,
        not a plain string.
        """
        return self.get_video_urls(1)