Blogs/news only include a single OG image/video.
For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding a data-ogpreview attribute to image, video, and source elements. data-ogpreview=0 excludes the element from the preview. Positive numbers set the preview priority; the element with the lowest positive value is chosen first.
This commit is contained in:
parent
5c75decd30
commit
703df9c8e9
3 changed files with 60 additions and 21 deletions
|
@ -1,6 +1,7 @@
|
|||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import io
|
||||
import itertools
|
||||
import re
|
||||
|
||||
import bs4
|
||||
|
@ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup):
|
|||
except AttributeError:
|
||||
return False
|
||||
|
||||
def iter_attr(self, tag, attr_name, **kwargs):
    """Yield the `attr_name` value of each element matched by `find_all`.

    `tag` and any extra keyword filters are forwarded to `self.find_all`.
    `attr_name=True` is added to those filters, which (per Beautiful
    Soup's filter semantics) restricts matches to elements that carry
    the attribute, so the lookup below does not fail.
    """
    filters = dict(kwargs, **{attr_name: True})
    for match in self.find_all(tag, **filters):
        yield match[attr_name]
|
||||
def iter_images(self):
|
||||
"""Return an iterator of all image elements in this document.
|
||||
|
||||
def iter_image_urls(self):
|
||||
"""Return an iterator of source URL strings of all images in this document.
|
||||
|
||||
Images include <img> tags and <video> poster attributes.
|
||||
Images include <img> and <video> with a poster attribute.
|
||||
"""
|
||||
for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
|
||||
try:
|
||||
yield elem[self.IMAGE_ATTRS[elem.name]]
|
||||
elem[self.IMAGE_ATTRS[elem.name]]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
yield elem
|
||||
|
||||
def iter_video_urls(self):
    """Return an iterator of source URL strings of all videos in this document.

    Delegates to `iter_attr`, selecting elements with `is_video_source`
    and reading their `src` attribute.
    """
    url_attr = 'src'
    return self.iter_attr(self.is_video_source, url_attr)
|
||||
def iter_videos(self):
    """Return an iterator of all video source elements in this document.

    Only elements accepted by `is_video_source` that also have a `src`
    attribute are requested from `find_all`.
    """
    video_sources = self.find_all(self.is_video_source, src=True)
    return video_sources
|
||||
|
||||
|
||||
class SoupModelMixin:
|
||||
|
@ -115,6 +113,7 @@ class SoupModelMixin:
|
|||
are usable.
|
||||
"""
|
||||
|
||||
OG_PREVIEW_ATTR = 'data-ogpreview'
|
||||
SOUP_ATTRS = []
|
||||
|
||||
def _get_soup(self):
|
||||
|
@ -128,17 +127,57 @@ class SoupModelMixin:
|
|||
self._soup = BeautifulSoup(html)
|
||||
return self._soup
|
||||
|
||||
def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
    """Build a sort-key function that ranks elements by `attr_name`.

    The returned callable converts the element's `attr_name` attribute
    with `getvalue`. A missing attribute (KeyError) or an unconvertible
    value (ValueError) ranks as `fallback`.

    Side effect: the computed key is written back onto the element, so
    code that later reads `elem[attr_name]` (such as the predicate built
    by `_elem_pred`) sees the converted value rather than the raw one.
    """
    def elem_sort_key(elem):
        try:
            raw_value = elem[attr_name]
            priority = getvalue(raw_value)
        except (KeyError, ValueError):
            priority = fallback
        # Store the normalized priority on the element itself.
        elem[attr_name] = priority
        return priority
    return elem_sort_key
|
||||
|
||||
def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
    """Build a predicate that applies `test` to an element's `attr_name`.

    The returned callable reads `elem[attr_name]` directly, so the
    attribute must already be present on the element. With the default
    `test`, an element passes when that value is greater than zero.
    """
    def elem_pred(elem):
        current_value = elem[attr_name]
        return test(current_value)
    return elem_pred
|
||||
|
||||
def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
    """Sort elements, keep those accepted by `pred`, and optionally slice.

    `elem_seq` is sorted by `elem_key` first, then filtered lazily with
    `pred`. When `slice_args` are given they are passed straight to
    `itertools.islice` (stop, or start/stop[/step]) and the sliced
    iterator is returned; otherwise the filtered iterator is returned.
    """
    # Sort eagerly, before any filtering: the key function runs on every
    # element here, so its side effects are complete before `pred` sees
    # the first element.
    ordered = sorted(elem_seq, key=elem_key)
    # itertools.ifilter only exists on Python 2. On Python 3 the builtin
    # filter is already lazy, so use it as the drop-in equivalent.
    lazy_filter = getattr(itertools, 'ifilter', filter)
    seq = lazy_filter(pred, ordered)
    if slice_args:
        return itertools.islice(seq, *slice_args)
    return seq
|
||||
|
||||
def get_description(self):
    """Return a string with a brief excerpt of body text from the HTML.

    The text pieces produced by the soup's `some_body_text()` are joined
    with no separator.
    """
    soup = self._get_soup()
    text_pieces = soup.some_body_text()
    return u''.join(text_pieces)
|
||||
|
||||
def get_image_urls(self):
|
||||
def get_image_urls(self, *slice_args):
|
||||
"""Return an iterator of source URL strings of all images in the HTML.
|
||||
|
||||
Images include <img> tags and <video> poster attributes.
|
||||
Images include <img> sources and <video> poster attributes.
|
||||
"""
|
||||
return self._get_soup().iter_image_urls()
|
||||
for elem in self._sort_and_slice_elems(
|
||||
self._get_soup().iter_images(),
|
||||
self._elem_key(),
|
||||
self._elem_pred(),
|
||||
*slice_args
|
||||
):
|
||||
yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]
|
||||
|
||||
def get_video_urls(self):
|
||||
def get_one_image_url(self):
    """Return the result of `get_image_urls` limited by a slice stop of 1."""
    max_results = 1
    return self.get_image_urls(max_results)
|
||||
|
||||
def get_video_urls(self, *slice_args):
|
||||
"""Return an iterator of source URL strings of all videos in the HTML."""
|
||||
return self._get_soup().iter_video_urls()
|
||||
for elem in self._sort_and_slice_elems(
|
||||
self._get_soup().iter_videos(),
|
||||
self._elem_key(),
|
||||
self._elem_pred(),
|
||||
*slice_args
|
||||
):
|
||||
yield elem['src']
|
||||
|
||||
def get_one_video_url(self):
    """Return the result of `get_video_urls` limited by a slice stop of 1."""
    max_results = 1
    return self.get_video_urls(max_results)
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
{% block head %}
|
||||
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
|
||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
|
||||
{% endblock %}
|
||||
|
||||
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
{% block head %}
|
||||
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
|
||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
|
||||
{% endblock %}
|
||||
|
||||
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
|
||||
|
|
Loading…
Reference in a new issue