Blog and news pages now include only a single Open Graph (OG) image/video.
For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding a data-ogpreview attribute to image, video, and source elements. Setting data-ogpreview=0 excludes the element from the preview. Positive numbers set the preview priority: the element with the lowest value is chosen first, and elements without the attribute are considered last.
This commit is contained in:
parent
5c75decd30
commit
703df9c8e9
3 changed files with 60 additions and 21 deletions
|
@ -1,6 +1,7 @@
|
||||||
# -*- encoding: utf-8 -*-
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import itertools
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
|
@ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup):
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def iter_attr(self, tag, attr_name, **kwargs):
|
def iter_images(self):
|
||||||
kwargs[attr_name] = True
|
"""Return an iterator of all image elements in this document.
|
||||||
for elem in self.find_all(tag, **kwargs):
|
|
||||||
yield elem[attr_name]
|
|
||||||
|
|
||||||
def iter_image_urls(self):
|
Images include <img> and <video> with a poster attribute.
|
||||||
"""Return an iterator of source URL strings of all images in this document.
|
|
||||||
|
|
||||||
Images include <img> tags and <video> poster attributes.
|
|
||||||
"""
|
"""
|
||||||
for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
|
for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
|
||||||
try:
|
try:
|
||||||
yield elem[self.IMAGE_ATTRS[elem.name]]
|
elem[self.IMAGE_ATTRS[elem.name]]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
else:
|
||||||
|
yield elem
|
||||||
|
|
||||||
def iter_video_urls(self):
|
def iter_videos(self):
|
||||||
"""Return an iterator of source URL strings of all videos in this document."""
|
"""Return an iterator of all video source elements in this document."""
|
||||||
return self.iter_attr(self.is_video_source, 'src')
|
return self.find_all(self.is_video_source, src=True)
|
||||||
|
|
||||||
|
|
||||||
class SoupModelMixin:
|
class SoupModelMixin:
|
||||||
|
@ -115,6 +113,7 @@ class SoupModelMixin:
|
||||||
are usable.
|
are usable.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
OG_PREVIEW_ATTR = 'data-ogpreview'
|
||||||
SOUP_ATTRS = []
|
SOUP_ATTRS = []
|
||||||
|
|
||||||
def _get_soup(self):
|
def _get_soup(self):
|
||||||
|
@ -128,17 +127,57 @@ class SoupModelMixin:
|
||||||
self._soup = BeautifulSoup(html)
|
self._soup = BeautifulSoup(html)
|
||||||
return self._soup
|
return self._soup
|
||||||
|
|
||||||
|
def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999):
|
||||||
|
def elem_sort_key(elem):
|
||||||
|
try:
|
||||||
|
sort_key = getvalue(elem[attr_name])
|
||||||
|
except (KeyError, ValueError):
|
||||||
|
sort_key = fallback
|
||||||
|
elem[attr_name] = sort_key
|
||||||
|
return sort_key
|
||||||
|
return elem_sort_key
|
||||||
|
|
||||||
|
def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0):
|
||||||
|
def elem_pred(elem):
|
||||||
|
return test(elem[attr_name])
|
||||||
|
return elem_pred
|
||||||
|
|
||||||
|
def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args):
|
||||||
|
seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key))
|
||||||
|
if slice_args:
|
||||||
|
return itertools.islice(seq, *slice_args)
|
||||||
|
else:
|
||||||
|
return seq
|
||||||
|
|
||||||
def get_description(self):
|
def get_description(self):
|
||||||
"""Return a string with a brief excerpt of body text from the HTML."""
|
"""Return a string with a brief excerpt of body text from the HTML."""
|
||||||
return u''.join(self._get_soup().some_body_text())
|
return u''.join(self._get_soup().some_body_text())
|
||||||
|
|
||||||
def get_image_urls(self):
|
def get_image_urls(self, *slice_args):
|
||||||
"""Return an iterator of source URL strings of all images in the HTML.
|
"""Return an iterator of source URL strings of all images in the HTML.
|
||||||
|
|
||||||
Images include <img> tags and <video> poster attributes.
|
Images include <img> sources and <video> poster attributes.
|
||||||
"""
|
"""
|
||||||
return self._get_soup().iter_image_urls()
|
for elem in self._sort_and_slice_elems(
|
||||||
|
self._get_soup().iter_images(),
|
||||||
|
self._elem_key(),
|
||||||
|
self._elem_pred(),
|
||||||
|
*slice_args
|
||||||
|
):
|
||||||
|
yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]]
|
||||||
|
|
||||||
def get_video_urls(self):
|
def get_one_image_url(self):
|
||||||
|
return self.get_image_urls(1)
|
||||||
|
|
||||||
|
def get_video_urls(self, *slice_args):
|
||||||
"""Return an iterator of source URL strings of all videos in the HTML."""
|
"""Return an iterator of source URL strings of all videos in the HTML."""
|
||||||
return self._get_soup().iter_video_urls()
|
for elem in self._sort_and_slice_elems(
|
||||||
|
self._get_soup().iter_videos(),
|
||||||
|
self._elem_key(),
|
||||||
|
self._elem_pred(),
|
||||||
|
*slice_args
|
||||||
|
):
|
||||||
|
yield elem['src']
|
||||||
|
|
||||||
|
def get_one_video_url(self):
|
||||||
|
return self.get_video_urls(1)
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
|
|
||||||
{% block head %}
|
{% block head %}
|
||||||
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
|
||||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
|
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
|
|
||||||
{% block head %}
|
{% block head %}
|
||||||
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||||
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %}
|
||||||
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %}
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
||||||
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
|
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
|
||||||
|
|
Loading…
Reference in a new issue