Blogs/news only include a single OG image/video.

For now, this gives us more assurance that other sites will choose the
preview we want.

You can control the selection by adding a data-ogpreview attribute to image,
video, and source elements.  data-ogpreview=0 excludes the element from the
preview.  Positive numbers set the preview priority: the element with the
lowest value is chosen first.
This commit is contained in:
Brett Smith 2018-09-21 10:57:14 -04:00
parent 5c75decd30
commit 703df9c8e9
3 changed files with 60 additions and 21 deletions

View file

@ -1,6 +1,7 @@
# -*- encoding: utf-8 -*-
import io
import itertools
import re
import bs4
@ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup):
except AttributeError:
return False
def iter_attr(self, tag, attr_name, **kwargs):
    """Yield the attr_name value of every matching element that has it.

    tag and any extra keyword arguments are handed to find_all();
    attr_name=True is added to those keyword arguments before the search,
    and each result is then indexed by attr_name.
    """
    kwargs[attr_name] = True
    matches = self.find_all(tag, **kwargs)
    for match in matches:
        yield match[attr_name]
# NOTE(review): two versions of this method are shown interleaved here
# (iter_images and iter_image_urls), so the def, docstring, and lookup lines
# each appear twice.  Both walk every tag named by a key of IMAGE_ATTRS and
# skip elements that raise KeyError for the mapped attribute; iter_images
# yields the element itself, iter_image_urls yields the attribute value.
def iter_images(self):
"""Return an iterator of all image elements in this document.
def iter_image_urls(self):
"""Return an iterator of source URL strings of all images in this document.
Images include <img> tags and <video> poster attributes.
Images include <img> and <video> with a poster attribute.
"""
for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
try:
yield elem[self.IMAGE_ATTRS[elem.name]]
elem[self.IMAGE_ATTRS[elem.name]]
except KeyError:
pass
else:
yield elem
def iter_video_urls(self):
    """Return an iterator of the 'src' URL strings of all videos in this document.

    Elements are selected with the is_video_source predicate and read
    through iter_attr().
    """
    video_filter = self.is_video_source
    return self.iter_attr(video_filter, 'src')
def iter_videos(self):
    """Return all video source elements in this document that have a src.

    The result is whatever find_all() returns for the is_video_source
    predicate combined with src=True.
    """
    video_filter = self.is_video_source
    return self.find_all(video_filter, src=True)
class SoupModelMixin:
@ -115,6 +113,7 @@ class SoupModelMixin:
are usable.
"""
OG_PREVIEW_ATTR = 'data-ogpreview'
SOUP_ATTRS = []
def _get_soup(self):
@ -128,17 +127,57 @@ class SoupModelMixin:
self._soup = BeautifulSoup(html)
return self._soup
def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
def elem_sort_key(elem):
try:
sort_key = getvalue(elem[attr_name])
except (KeyError, ValueError):
sort_key = fallback
elem[attr_name] = sort_key
return sort_key
return elem_sort_key
def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
def elem_pred(elem):
return test(elem[attr_name])
return elem_pred
def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key))
if slice_args:
return itertools.islice(seq, *slice_args)
else:
return seq
def get_description(self):
    """Return a string with a brief excerpt of body text from the HTML.

    The excerpt is the concatenation of the pieces produced by the soup's
    some_body_text().
    """
    soup = self._get_soup()
    text_pieces = soup.some_body_text()
    return u''.join(text_pieces)
# NOTE(review): two versions of get_image_urls are shown interleaved here, so
# the def, docstring, and body lines appear twice.  The variant taking
# *slice_args passes the iter_images() elements through _sort_and_slice_elems
# with the default _elem_key() and _elem_pred() and yields, for each surviving
# element, the attribute that IMAGE_ATTRS maps its tag name to.
def get_image_urls(self):
def get_image_urls(self, *slice_args):
"""Return an iterator of source URL strings of all images in the HTML.
Images include <img> tags and <video> poster attributes.
Images include <img> sources and <video> poster attributes.
"""
return self._get_soup().iter_image_urls()
for elem in self._sort_and_slice_elems(
self._get_soup().iter_images(),
self._elem_key(),
self._elem_pred(),
*slice_args
):
yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]
def get_video_urls(self):
def get_one_image_url(self):
    """Return get_image_urls(1): the image URL iterator called with a limit of 1."""
    limit = 1
    return self.get_image_urls(limit)
# NOTE(review): the line returning iter_video_urls() looks like the earlier
# body of this method shown alongside the newer loop; confirm.  The loop
# passes the iter_videos() elements through _sort_and_slice_elems with the
# default _elem_key() and _elem_pred() and yields each element's 'src' value.
def get_video_urls(self, *slice_args):
"""Return an iterator of source URL strings of all videos in the HTML."""
return self._get_soup().iter_video_urls()
for elem in self._sort_and_slice_elems(
self._get_soup().iter_videos(),
self._elem_key(),
self._elem_pred(),
*slice_args
):
yield elem['src']
def get_one_video_url(self):
    """Return get_video_urls(1): the video URL iterator called with a limit of 1."""
    limit = 1
    return self.get_video_urls(limit)

View file

@ -2,8 +2,8 @@
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
{% endblock %}
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}

View file

@ -2,8 +2,8 @@
{% block head %}
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
{% endblock %}
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}