blog/news: Add Open Graph metadata to entry pages.

This helps other social media sites generate nice previews for these pages.
2017-11-07 11:17:33 -05:00 · 2017-11-07 11:17:33 -05:00 · 3b2ed8397d
commit 3b2ed8397d
parent 87961c6cee
7 changed files with 228 additions and 2 deletions
--- a/www/conservancy/apps/blog/models.py
+++ b/www/conservancy/apps/blog/models.py
@ -1,5 +1,6 @@
 from django.db import models
 from django.conf import settings
+from conservancy import bsoup
 from conservancy.apps.staff.models import Person
 from datetime import datetime, timedelta

@ -18,7 +19,7 @@ class EntryTag(models.Model):
    def get_absolute_url(self):
        return u"/blog/?tag=%s" % self.slug

-class Entry(models.Model):
+class Entry(models.Model, bsoup.SoupModelMixin):
    """Blog entry"""

    headline = models.CharField(max_length=200)
@ -38,6 +39,8 @@ class Entry(models.Model):
        ordering = ('-pub_date',)
        get_latest_by = 'pub_date'

+    SOUP_ATTRS = ['body']
+
    def __unicode__(self):
        return self.headline

--- a/www/conservancy/apps/news/models.py
+++ b/www/conservancy/apps/news/models.py
@ -1,11 +1,12 @@
 from django.db import models
 from django.conf import settings
+from conservancy import bsoup
 from conservancy.apps.staff.models import Person
 from conservancy.apps.events.models import Event
 from django.contrib.sites.models import Site
 from datetime import datetime, timedelta

-class PressRelease(models.Model):
+class PressRelease(models.Model, bsoup.SoupModelMixin):
    """News release model"""

    headline = models.CharField(max_length=300)
@ -24,6 +25,8 @@ class PressRelease(models.Model):
        ordering = ("-pub_date",)
        get_latest_by = "pub_date"

+    SOUP_ATTRS = ['summary', 'body']
+
    def __unicode__(self):
        return self.headline

--- a/www/conservancy/bsoup.py
+++ b/www/conservancy/bsoup.py
@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+
+import io
+import re
+
+import bs4
+import bs4.element
+
+class BeautifulSoup(bs4.BeautifulSoup):
+    """A wrapper of the original BeautifulSoup class, with convenience methods added."""
+
+    IMAGE_ATTRS = {
+        'img': 'src',
+        'video': 'poster',
+    }
+    NON_BODY_TEXT_TAGS = frozenset([
+        'img',
+        'video',
+    ])
+    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')
+
+    def __init__(self, src, parser='html5lib'):
+        # WARNING!  It seems like it would be ideal to use the 'lxml' parser
+        # for speed, but that doesn't work in our web application.  On
+        # Debian stretch, at least, using lxml causes the web server WSGI
+        # application to go into an infinite loop.
+        super(BeautifulSoup, self).__init__(src, parser)
+
+    def _body_text(self, root):
+        # "Body text" is all the strings under the root element, in order,
+        # except:
+        # * strings inside NON_BODY_TEXT_TAGS
+        # * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
+        #   an element that has a NON_BODY_TEXT_TAGS element as its first child.
+        #   For example, in <div> <video …> … </div>, none of the div's strings
+        #   are included in the body text, because it's considered to be a
+        #   <video> container, and any strings are probably a caption, fallback
+        #   text, or other non-body text.
+        started = False
+        for child in root.children:
+            child_type = type(child)
+            if issubclass(child_type, bs4.element.Tag):
+                if child.name in self.NON_BODY_TEXT_TAGS:
+                    if not started:
+                        break
+                else:
+                    for s in self._body_text(child):
+                        yield s
+            # It's not worth it to use issubclass here, because elements that
+            # don't have body text like Comments and CDATA are subclasses of
+            # NavigableString.
+            elif child_type is bs4.element.NavigableString:
+                if started:
+                    yield child
+                elif child.isspace():
+                    pass
+                else:
+                    yield child
+                    started = True
+
+    def body_text(self):
+        """Return an iterator of strings comprising this document's body text."""
+        return self._body_text(self)
+
+    def some_body_text(self, char_target=300):
+        """Return an iterator of strings with some of this document's body text.
+
+        This is the same as body_text, except after it yields a string that
+        looks like the end of a sentence, it checks whether it has yielded
+        at least `char_target` characters.  If so, the iterator stops.
+        """
+        # This implementation is likely to overshoot `char_target` a lot,
+        # because it doesn't look inside the strings it yields, just at the
+        # end of them.  We can implement something smarter later if needed.
+        char_count = 0
+        for s in self.body_text():
+            yield s
+            char_count += len(s)
+            if (char_count > char_target) and self.SENTENCE_END.search(s):
+                break
+
+    @staticmethod
+    def is_video_source(elem):
+        try:
+            return elem.name == 'source' and elem.parent.name == 'video'
+        except AttributeError:
+            return False
+
+    def iter_attr(self, tag, attr_name, **kwargs):
+        kwargs[attr_name] = True
+        for elem in self.find_all(tag, **kwargs):
+            yield elem[attr_name]
+
+    def iter_image_urls(self):
+        """Return an iterator of source URL strings of all images in this document.
+
+        Images include <img> tags and <video> poster attributes.
+        """
+        for elem in self.find_all(list(self.IMAGE_ATTRS.keys())):
+            try:
+                yield elem[self.IMAGE_ATTRS[elem.name]]
+            except KeyError:
+                pass
+
+    def iter_video_urls(self):
+        """Return an iterator of source URL strings of all videos in this document."""
+        return self.iter_attr(self.is_video_source, 'src')
+
+
+class SoupModelMixin:
+    """Mixin for models to parse HTML with BeautifulSoup.
+
+    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
+    that name attributes with HTML in them.  After that, all the public methods
+    are usable.
+    """
+
+    SOUP_ATTRS = []
+
+    def _get_soup(self):
+        try:
+            return self._soup
+        except AttributeError:
+            html = io.StringIO()
+            for attr_name in self.SOUP_ATTRS:
+                html.write(getattr(self, attr_name))
+            html.seek(0)
+            self._soup = BeautifulSoup(html)
+            return self._soup
+
+    def get_description(self):
+        """Return a string with a brief excerpt of body text from the HTML."""
+        return u''.join(self._get_soup().some_body_text())
+
+    def get_image_urls(self):
+        """Return an iterator of source URL strings of all images in the HTML.
+
+        Images include <img> tags and <video> poster attributes.
+        """
+        return self._get_soup().iter_image_urls()
+
+    def get_video_urls(self):
+        """Return an iterator of source URL strings of all videos in the HTML."""
+        return self._get_soup().iter_video_urls()
--- a/www/conservancy/templates/blog/entry_detail.html
+++ b/www/conservancy/templates/blog/entry_detail.html
@ -1,5 +1,11 @@
 {% extends "base_blog.html" %}

+{% block head %}
+{# get_description returns plain text, but the partial renders its description as HTML-safe, so escape it here to keep quotes and ampersands from breaking the content attribute. #}
+{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description|force_escape %}
+{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
+{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
+{% endblock %}
+
 {% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}

 {% block content %}
--- a/www/conservancy/templates/news/pressrelease_detail.html
+++ b/www/conservancy/templates/news/pressrelease_detail.html
@ -1,5 +1,11 @@
 {% extends "base_news.html" %}

+{% block head %}
+{# get_description returns plain text, but the partial renders its description as HTML-safe, so escape it here to keep quotes and ampersands from breaking the content attribute. #}
+{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description|force_escape %}
+{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
+{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
+{% endblock %}
+
 {% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}

 {% block content %}
--- a/www/conservancy/templates/opengraph_partial.html
+++ b/www/conservancy/templates/opengraph_partial.html
@ -0,0 +1,38 @@
+{% comment %}
+
+Include this partial in a head section to include basic Open Graph metadata.
+Pass a variable `NAME` to give a value for the `og:NAME` property.
+
+These properties are only listed if you give a value for them:
+
+* url: A URL string that includes at least an absolute path.  This partial
+  passes it through the `fill_url` filter with `host_url` from the template
+  context to fill in a default scheme and host if needed.
+* title: A string.  Tags are stripped, then the rest is assumed HTML-safe.
+* description: A string.  Tags are stripped, then the rest is assumed
+  HTML-safe.
+
+These properties are always included.  You can override them but you
+normally shouldn't need to:
+
+* type: Default "website".
+* locale: Default "en_US".
+* site_name: Default "Software Freedom Conservancy"
+
+{% endcomment %}
+
+<meta property="og:type" content="{{ type|default:"website" }}">
+<meta property="og:locale" content="{{ locale|default:"en_US" }}">
+<meta property="og:site_name" content="{{ site_name|default:"Software Freedom Conservancy" }}">
+
+{% if url %}
+{% load fill_url %}
+<meta property="og:url" content="{{ url|fill_url:host_url }}">
+{% endif %}
+
+{% if title %}
+<meta property="og:title" content="{{ title|striptags|safe }}">
+{% endif %}
+
+{% if description %}
+<meta property="og:description" content="{{ description|striptags|safe }}">
+{% endif %}
--- a/www/conservancy/templates/opengraph_urllist_partial.html
+++ b/www/conservancy/templates/opengraph_urllist_partial.html
@ -0,0 +1,26 @@
+{% comment %}
+
+Include this partial in a head section to include a series of URLs for a
+given property, like og:image or og:video.
+
+You must pass the following variables:
+
+* property: A string with the name of the property, like 'image' or 'video'.
+* urls: A sequence of URL strings.  Each should include at least an absolute
+  path.  This partial passes each one through the `fill_url` filter with
+  `host_url` from the template context to fill in a scheme and host if needed.
+
+You may also pass:
+
+* fallback: A URL string, following the same rules as in `urls`.  This URL
+  will be used if `urls` is empty.
+
+{% endcomment %}
+
+{% load fill_url %}
+{% for url in urls %}
+<meta property="og:{{ property }}" content="{{ url|fill_url:host_url }}">
+{% empty %}
+{% if fallback %}
+<meta property="og:{{ property }}" content="{{ fallback|fill_url:host_url }}">
+{% endif %}
+{% endfor %}