website/www/conservancy/bsoup.py

# -*- encoding: utf-8 -*-

from builtins import filter
from builtins import object
import io
import itertools
import re

import bs4
import bs4.element

class BeautifulSoup(bs4.BeautifulSoup):
    """bs4.BeautifulSoup subclass with helpers for body text, images, and videos."""

    # Maps the name of each image-bearing tag to the attribute that holds
    # its image URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }
    # Tags whose contents are never treated as body text.
    NON_BODY_TEXT_TAGS = frozenset((
        'img',
        'video',
    ))
    # Sentence-ending punctuation at the very end of a string, optionally
    # followed by whitespace and non-word characters.
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Parse `src` with the named parser (html5lib unless overridden)."""
        # WARNING!  The 'lxml' parser looks like the ideal choice for speed,
        # but it doesn't work in our web application.  On Debian stretch, at
        # least, using lxml sends the web server WSGI application into an
        # infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        """Yield the body-text strings found under `root`, in document order.

        Body text is every string under `root` except:
        * strings inside NON_BODY_TEXT_TAGS elements
        * strings inside containers of NON_BODY_TEXT_TAGS elements.  An
          element is handled as a container when one of those tags turns up
          among its children before any non-whitespace string of its own.
          For example, in <div> <video …> … </div>, none of the div's
          strings are yielded: the div is taken to be a <video> container,
          and its strings are probably a caption, fallback text, or other
          non-body text.
        """
        # NOTE(review): only strings that are direct children of `root` set
        # `seen_text`.  Text yielded from nested tags does not, so a media
        # tag that follows nothing but nested-tag text still ends iteration
        # for this root.  Confirm whether that is intended.
        seen_text = False
        for node in root.children:
            if isinstance(node, bs4.element.Tag):
                if node.name not in self.NON_BODY_TEXT_TAGS:
                    for text in self._body_text(node):
                        yield text
                elif not seen_text:
                    # Media tag before any text: `root` is a container, so
                    # nothing else under it is body text.
                    return
            # An exact type check is used on purpose: Comments, CDATA, and
            # similar nodes without body text are subclasses of
            # NavigableString, so isinstance would let them through.
            elif type(node) is bs4.element.NavigableString:
                # Leading whitespace-only strings are skipped; once real
                # text has been seen, every string is passed along.
                if seen_text or not node.isspace():
                    yield node
                    seen_text = True

    def body_text(self):
        """Return an iterator over the strings making up the document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator over a leading portion of the document's body text.

        Strings come from body_text.  After each one is yielded, iteration
        stops if more than `char_target` characters have been yielded in
        total and that string looks like the end of a sentence.
        """
        # Expect this to overshoot `char_target` by a fair amount: only the
        # end of each yielded string is inspected, never its interior.
        # Something smarter can be written later if needed.
        total = 0
        for text in self.body_text():
            yield text
            total += len(text)
            if total > char_target and self.SENTENCE_END.search(text):
                return

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> whose parent is a <video>."""
        try:
            if elem.name != 'source':
                return False
            return elem.parent.name == 'video'
        except AttributeError:
            # `elem` (or its parent) lacks the attributes of a tag.
            return False

    def iter_images(self):
        """Return an iterator of all image elements in this document.

        An image element is an <img> with a src attribute or a <video> with
        a poster attribute.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS)):
            url_attr = self.IMAGE_ATTRS.get(elem.name)
            if url_attr is not None and elem.has_attr(url_attr):
                yield elem

    def iter_videos(self):
        """Return the find_all result of <source> elements that sit directly
        under a <video> and have a src attribute."""
        matcher = self.is_video_source
        return self.find_all(matcher, src=True)


class SoupModelMixin(object):
    """Mixin that lets a model derive previews from its HTML attributes.

    A class using this mixin sets `SOUP_ATTRS` to a list of the names of its
    attributes that contain HTML.  The public methods then work on those
    attributes' values, concatenated in order and parsed with BeautifulSoup.
    """

    # Element attribute used, by default, to order and filter elements:
    # lower integer values sort first, and values that are not above zero
    # are filtered out.
    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
        """Return the parsed HTML, building and caching it on first use."""
        try:
            soup = self._soup
        except AttributeError:
            buf = io.StringIO()
            buf.writelines(getattr(self, name) for name in self.SOUP_ATTRS)
            buf.seek(0)
            soup = self._soup = BeautifulSoup(buf)
        return soup

    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
        """Return a sort-key function for elements.

        The key is `getvalue(elem[attr_name])`, or `fallback` when the
        attribute is missing or `getvalue` raises ValueError.  As a side
        effect the computed key is stored back into `elem[attr_name]`, so
        later code can read the converted value directly.
        """
        def priority_of(element):
            try:
                priority = getvalue(element[attr_name])
            except (KeyError, ValueError):
                priority = fallback
            # Write the converted value back onto the element.
            element[attr_name] = priority
            return priority
        return priority_of

    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
        """Return a predicate that applies `test` to `elem[attr_name]`.

        The attribute is read as-is, without conversion or fallback.
        """
        return lambda element: test(element[attr_name])

    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
        """Sort `elem_seq` by `elem_key`, then keep the elements passing `pred`.

        Sorting happens first, so `elem_key` has been called on every
        element before `pred` sees any of them.  When `slice_args` are
        given they are handed to itertools.islice to limit the result.
        """
        ordered = sorted(elem_seq, key=elem_key)
        kept = filter(pred, ordered)
        if not slice_args:
            return kept
        return itertools.islice(kept, *slice_args)

    def get_description(self):
        """Return a string holding a short excerpt of the HTML's body text."""
        excerpt = self._get_soup().some_body_text()
        return u''.join(excerpt)

    def get_image_urls(self, *slice_args):
        """Return an iterator of the source URL strings of images in the HTML.

        Images are <img> sources and <video> poster attributes.  Any
        `slice_args` limit the result as in itertools.islice.
        """
        soup = self._get_soup()
        chosen = self._sort_and_slice_elems(
            soup.iter_images(),
            self._elem_key(),
            self._elem_pred(),
            *slice_args
        )
        for image in chosen:
            yield image[BeautifulSoup.IMAGE_ATTRS[image.name]]

    def get_one_image_url(self):
        """Return an iterator that yields at most one image URL: the first
        one get_image_urls would produce."""
        return self.get_image_urls(1)

    def get_video_urls(self, *slice_args):
        """Return an iterator of the source URL strings of videos in the HTML.

        Any `slice_args` limit the result as in itertools.islice.
        """
        soup = self._get_soup()
        chosen = self._sort_and_slice_elems(
            soup.iter_videos(),
            self._elem_key(),
            self._elem_pred(),
            *slice_args
        )
        for video in chosen:
            yield video['src']

    def get_one_video_url(self):
        """Return an iterator that yields at most one video URL: the first
        one get_video_urls would produce."""
        return self.get_video_urls(1)
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`# -*- encoding: utf-8 -*-`

Apply `futurize --stage-2` Python 2/3 compatibility transformations. These changes specifically require the use of the "future" library. 2021-11-29 20:55:45 +00:00			`from builtins import filter`
Apply "object" changes only from `futurize --stage2`. 2021-11-26 02:49:40 +00:00			`from builtins import object`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`import io`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`import itertools`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`import re`

			`import bs4`
			`import bs4.element`

			`class BeautifulSoup(bs4.BeautifulSoup):`
			`"""A wrapper of the original BeautifulSoup class, with convenience methods added."""`

			`IMAGE_ATTRS = {`
			`'img': 'src',`
			`'video': 'poster',`
			`}`
			`NON_BODY_TEXT_TAGS = frozenset([`
			`'img',`
			`'video',`
			`])`
			`SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')`

			`def __init__(self, src, parser='html5lib'):`
			`# WARNING! It seems like it would be ideal to use the 'lxml' parser`
			`# for speed, but that doesn't work in our web application. On`
			`# Debian stretch, at least, using lxml causes the web server WSGI`
			`# application to go into an infinite loop.`
			`super(BeautifulSoup, self).__init__(src, parser)`

			`def _body_text(self, root):`
			`# "Body text" is all the strings under the root element, in order,`
			`# except:`
			`# * strings inside NON_BODY_TEXT_TAGS`
			`# * strings inside containers of NON_BODY_TEXT_TAGS. A container is`
			`# an element that has a NON_BODY_TEXT_TAGS element as its first child.`
			`# For example, in <div> <video …> … </div>, none of the div's strings`
			`# are included in the body text, because it's considered to be a`
			`# <video> container, and any strings are probably a caption, fallback`
			`# text, or other non-body text.`
			`started = False`
			`for child in root.children:`
			`child_type = type(child)`
			`if issubclass(child_type, bs4.element.Tag):`
			`if child.name in self.NON_BODY_TEXT_TAGS:`
			`if not started:`
			`break`
			`else:`
			`for s in self._body_text(child):`
			`yield s`
			`# It's not worth it to use issubclass here, because elements that`
			`# don't have body text like Comments and CDATA are subclasses of`
			`# NavigableString.`
			`elif child_type is bs4.element.NavigableString:`
			`if started:`
			`yield child`
			`elif child.isspace():`
			`pass`
			`else:`
			`yield child`
			`started = True`

			`def body_text(self):`
			`"""Return an iterator of strings comprising this document's body text."""`
			`return self._body_text(self)`

			`def some_body_text(self, char_target=300):`
			`"""Return an iterator of strings with some of this document's body text.`

			`This is the same as body_text, except after it yields a string that`
			`looks like the end of a sentence, it checks whether it has yielded`
			at least `char_target` characters. If so, the iterator stops.
			`"""`
			# This implementation is likely to overshoot `char_target` a lot,
			`# because it doesn't look inside the strings it yields, just at the`
			`# end of them. We can implement something smarter later if needed.`
			`char_count = 0`
			`for s in self.body_text():`
			`yield s`
			`char_count += len(s)`
			`if (char_count > char_target) and self.SENTENCE_END.search(s):`
			`break`

			`@staticmethod`
			`def is_video_source(elem):`
			`try:`
			`return elem.name == 'source' and elem.parent.name == 'video'`
			`except AttributeError:`
			`return False`

Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`def iter_images(self):`
			`"""Return an iterator of all image elements in this document.`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`Images include <img> and <video> with a poster attribute.`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`"""`
			`for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):`
			`try:`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`elem[self.IMAGE_ATTRS[elem.name]]`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`except KeyError:`
			`pass`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`else:`
			`yield elem`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`def iter_videos(self):`
			`"""Return an iterator of all video source elements in this document."""`
			`return self.find_all(self.is_video_source, src=True)`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00

Apply "object" changes only from `futurize --stage2`. 2021-11-26 02:49:40 +00:00			`class SoupModelMixin(object):`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`"""Mixin for models to parse HTML with BeautifulSoup.`

			Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
			`that name attributes with HTML in them. After that, all the public methods`
			`are usable.`
			`"""`

Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`OG_PREVIEW_ATTR = 'data-ogpreview'`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`SOUP_ATTRS = []`

			`def _get_soup(self):`
			`try:`
			`return self._soup`
			`except AttributeError:`
			`html = io.StringIO()`
			`for attr_name in self.SOUP_ATTRS:`
			`html.write(getattr(self, attr_name))`
			`html.seek(0)`
			`self._soup = BeautifulSoup(html)`
			`return self._soup`

Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):`
			`def elem_sort_key(elem):`
			`try:`
			`sort_key = getvalue(elem[attr_name])`
			`except (KeyError, ValueError):`
			`sort_key = fallback`
			`elem[attr_name] = sort_key`
			`return sort_key`
			`return elem_sort_key`

			`def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):`
			`def elem_pred(elem):`
			`return test(elem[attr_name])`
			`return elem_pred`

			`def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):`
Apply `futurize --stage-2` Python 2/3 compatibility transformations. These changes specifically require the use of the "future" library. 2021-11-29 20:55:45 +00:00			`seq = filter(pred, sorted(elem_seq, key=elem_key))`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`if slice_args:`
			`return itertools.islice(seq, *slice_args)`
			`else:`
			`return seq`

blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`def get_description(self):`
			`"""Return a string with a brief excerpt of body text from the HTML."""`
			`return u''.join(self._get_soup().some_body_text())`

Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`def get_image_urls(self, *slice_args):`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`"""Return an iterator of source URL strings of all images in the HTML.`

Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`Images include <img> sources and <video> poster attributes.`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`"""`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`for elem in self._sort_and_slice_elems(`
			`self._get_soup().iter_images(),`
			`self._elem_key(),`
			`self._elem_pred(),`
			`*slice_args`
			`):`
			`yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]`

			`def get_one_image_url(self):`
			`return self.get_image_urls(1)`

			`def get_video_urls(self, *slice_args):`
blog/news: Add Open Graph metadata to entry pages. This helps other social media sites generate nice previews for these pages. 2017-11-07 16:17:33 +00:00			`"""Return an iterator of source URL strings of all videos in the HTML."""`
Blogs/news only include a single OG image/video. For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding data-ogpreview to image, video, and source elements. data-ogpreview=0 excludes the element from being included in the preview. Positive numbers set the preview priority. The lowest value found is chosen first. 2018-09-21 14:57:14 +00:00			`for elem in self._sort_and_slice_elems(`
			`self._get_soup().iter_videos(),`
			`self._elem_key(),`
			`self._elem_pred(),`
			`*slice_args`
			`):`
			`yield elem['src']`

			`def get_one_video_url(self):`
			`return self.get_video_urls(1)`