Blog and news pages include only a single OG image/video.

For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding a data-ogpreview attribute to image, video, and source elements. Setting data-ogpreview=0 excludes the element from the preview. Positive numbers set the preview priority: the element with the lowest value is chosen first.
2018-09-21 10:57:14 -04:00 · 2018-09-21 10:57:14 -04:00 · 703df9c8e9
commit 703df9c8e9
parent 5c75decd30
3 changed files with 60 additions and 21 deletions
--- a/www/conservancy/bsoup.py
+++ b/www/conservancy/bsoup.py
@ -1,6 +1,7 @@
 # -*- encoding: utf-8 -*-

 import io
+import itertools
 import re

 import bs4
@ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup):
        except AttributeError:
            return False

-    def iter_attr(self, tag, attr_name, **kwargs):
-        kwargs[attr_name] = True
-        for elem in self.find_all(tag, **kwargs):
-            yield elem[attr_name]
+    def iter_images(self):
+        """Return an iterator of all image elements in this document.

-    def iter_image_urls(self):
-        """Return an iterator of source URL strings of all images in this document.
-
-        Images include <img> tags and <video> poster attributes.
+        Images include <img> and <video> with a poster attribute.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
            try:
-                yield elem[self.IMAGE_ATTRS[elem.name]]
+                elem[self.IMAGE_ATTRS[elem.name]]
            except KeyError:
                pass
+            else:
+                yield elem

-    def iter_video_urls(self):
-        """Return an iterator of source URL strings of all videos in this document."""
-        return self.iter_attr(self.is_video_source, 'src')
+    def iter_videos(self):
+        """Return an iterator of all video source elements in this document."""
+        return self.find_all(self.is_video_source, src=True)


 class SoupModelMixin:
@ -115,6 +113,7 @@ class SoupModelMixin:
    are usable.
    """

+    OG_PREVIEW_ATTR = 'data-ogpreview'
    SOUP_ATTRS = []

    def _get_soup(self):
@ -128,17 +127,57 @@ class SoupModelMixin:
            self._soup = BeautifulSoup(html)
            return self._soup

+    def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
+        def elem_sort_key(elem):
+            try:
+                sort_key = getvalue(elem[attr_name])
+            except (KeyError, ValueError):
+                sort_key = fallback
+            elem[attr_name] = sort_key
+            return sort_key
+        return elem_sort_key
+
+    def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
+        def elem_pred(elem):
+            return test(elem[attr_name])
+        return elem_pred
+
+    def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
+        seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key))
+        if slice_args:
+            return itertools.islice(seq, *slice_args)
+        else:
+            return seq
+
    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

-    def get_image_urls(self):
+    def get_image_urls(self, *slice_args):
        """Return an iterator of source URL strings of all images in the HTML.

-        Images include <img> tags and <video> poster attributes.
+        Images include <img> sources and <video> poster attributes.
        """
-        return self._get_soup().iter_image_urls()
+        for elem in self._sort_and_slice_elems(
+                self._get_soup().iter_images(),
+                self._elem_key(),
+                self._elem_pred(),
+                *slice_args
+        ):
+            yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]

-    def get_video_urls(self):
+    def get_one_image_url(self):
+        return self.get_image_urls(1)
+
+    def get_video_urls(self, *slice_args):
        """Return an iterator of source URL strings of all videos in the HTML."""
-        return self._get_soup().iter_video_urls()
+        for elem in self._sort_and_slice_elems(
+                self._get_soup().iter_videos(),
+                self._elem_key(),
+                self._elem_pred(),
+                *slice_args
+        ):
+            yield elem['src']
+
+    def get_one_video_url(self):
+        return self.get_video_urls(1)
--- a/www/conservancy/templates/blog/entry_detail.html
+++ b/www/conservancy/templates/blog/entry_detail.html
@ -2,8 +2,8 @@

 {% block head %}
 {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
-{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
-{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
+{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
+{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
 {% endblock %}

 {% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
--- a/www/conservancy/templates/news/pressrelease_detail.html
+++ b/www/conservancy/templates/news/pressrelease_detail.html
@ -2,8 +2,8 @@

 {% block head %}
 {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
-{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
-{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
+{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
+{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
 {% endblock %}

 {% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}