blog/news: Add Open Graph metadata to entry pages.
This helps other social media sites generate nice previews for these pages.
This commit is contained in:
parent
87961c6cee
commit
3b2ed8397d
7 changed files with 228 additions and 2 deletions
|
@ -1,5 +1,6 @@
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from conservancy import bsoup
|
||||||
from conservancy.apps.staff.models import Person
|
from conservancy.apps.staff.models import Person
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
@ -18,7 +19,7 @@ class EntryTag(models.Model):
|
||||||
def get_absolute_url(self):
|
def get_absolute_url(self):
|
||||||
return u"/blog/?tag=%s" % self.slug
|
return u"/blog/?tag=%s" % self.slug
|
||||||
|
|
||||||
class Entry(models.Model):
|
class Entry(models.Model, bsoup.SoupModelMixin):
|
||||||
"""Blog entry"""
|
"""Blog entry"""
|
||||||
|
|
||||||
headline = models.CharField(max_length=200)
|
headline = models.CharField(max_length=200)
|
||||||
|
@ -38,6 +39,8 @@ class Entry(models.Model):
|
||||||
ordering = ('-pub_date',)
|
ordering = ('-pub_date',)
|
||||||
get_latest_by = 'pub_date'
|
get_latest_by = 'pub_date'
|
||||||
|
|
||||||
|
SOUP_ATTRS = ['body']
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return self.headline
|
return self.headline
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from conservancy import bsoup
|
||||||
from conservancy.apps.staff.models import Person
|
from conservancy.apps.staff.models import Person
|
||||||
from conservancy.apps.events.models import Event
|
from conservancy.apps.events.models import Event
|
||||||
from django.contrib.sites.models import Site
|
from django.contrib.sites.models import Site
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
class PressRelease(models.Model):
|
class PressRelease(models.Model, bsoup.SoupModelMixin):
|
||||||
"""News release model"""
|
"""News release model"""
|
||||||
|
|
||||||
headline = models.CharField(max_length=300)
|
headline = models.CharField(max_length=300)
|
||||||
|
@ -24,6 +25,8 @@ class PressRelease(models.Model):
|
||||||
ordering = ("-pub_date",)
|
ordering = ("-pub_date",)
|
||||||
get_latest_by = "pub_date"
|
get_latest_by = "pub_date"
|
||||||
|
|
||||||
|
SOUP_ATTRS = ['summary', 'body']
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return self.headline
|
return self.headline
|
||||||
|
|
||||||
|
|
144
www/conservancy/bsoup.py
Normal file
144
www/conservancy/bsoup.py
Normal file
|
@ -0,0 +1,144 @@
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
|
||||||
|
import bs4
|
||||||
|
import bs4.element
|
||||||
|
|
||||||
|
class BeautifulSoup(bs4.BeautifulSoup):
    """A wrapper of the original BeautifulSoup class, with convenience methods added.

    The extra methods pull out an excerpt of the document's "body text" and
    the URLs of the images and videos embedded in it.
    """

    # Maps each tag that can reference an image to the attribute holding
    # the image URL.
    IMAGE_ATTRS = {
        'img': 'src',
        'video': 'poster',
    }
    # Tags whose contents never count as body text.
    NON_BODY_TEXT_TAGS = frozenset([
        'img',
        'video',
    ])
    # Matches a string that ends with sentence punctuation, optionally
    # followed by trailing non-word characters (whitespace, quotes, etc.).
    SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')

    def __init__(self, src, parser='html5lib'):
        """Parse `src` (markup string or file-like object) with `parser`."""
        # WARNING!  It seems like it would be ideal to use the 'lxml' parser
        # for speed, but that doesn't work in our web application.  On
        # Debian stretch, at least, using lxml causes the web server WSGI
        # application to go into an infinite loop.
        super(BeautifulSoup, self).__init__(src, parser)

    def _body_text(self, root):
        """Yield the body text strings found under `root`, in document order."""
        # "Body text" is all the strings under the root element, in order,
        # except:
        # * strings inside NON_BODY_TEXT_TAGS
        # * strings inside containers of NON_BODY_TEXT_TAGS.  A container is
        #   an element that has a NON_BODY_TEXT_TAGS element as its first child.
        #   For example, in <div> <video ...> ... </div>, none of the div's
        #   strings are included in the body text, because it's considered to
        #   be a <video> container, and any strings are probably a caption,
        #   fallback text, or other non-body text.
        started = False
        for child in root.children:
            if isinstance(child, bs4.element.Tag):
                if child.name not in self.NON_BODY_TEXT_TAGS:
                    for s in self._body_text(child):
                        # Text found inside a nested element means a later
                        # <img>/<video> is not this element's first content,
                        # so root must not be treated as a media container.
                        # Without this, <p>a</p><img><p>b</p> lost "b".
                        started = True
                        yield s
                elif not started:
                    # A media tag before any text: root is a container, so
                    # none of its remaining strings are body text.
                    break
            # It's not worth it to use isinstance here, because elements that
            # don't have body text like Comments and CDATA are subclasses of
            # NavigableString.
            elif type(child) is bs4.element.NavigableString:
                if started:
                    yield child
                elif not child.isspace():
                    # Leading whitespace-only strings are skipped; the first
                    # string with real content starts the body text.
                    started = True
                    yield child

    def body_text(self):
        """Return an iterator of strings comprising this document's body text."""
        return self._body_text(self)

    def some_body_text(self, char_target=300):
        """Return an iterator of strings with some of this document's body text.

        This is the same as body_text, except after it yields a string that
        looks like the end of a sentence, it checks whether it has yielded
        at least `char_target` characters.  If so, the iterator stops.
        """
        # This implementation is likely to overshoot `char_target` a lot,
        # because it doesn't look inside the strings it yields, just at the
        # end of them.  We can implement something smarter later if needed.
        char_count = 0
        for s in self.body_text():
            yield s
            char_count += len(s)
            # `>=` so that reaching the target exactly counts as "at least".
            if char_count >= char_target and self.SENTENCE_END.search(s):
                break

    @staticmethod
    def is_video_source(elem):
        """Return True if `elem` is a <source> element directly inside a <video>.

        Objects without a `name`, or without a parent that has one, are
        reported as False instead of raising.
        """
        try:
            return elem.name == 'source' and elem.parent.name == 'video'
        except AttributeError:
            return False

    def iter_attr(self, tag, attr_name, **kwargs):
        """Yield the `attr_name` value of each element matched by `tag`.

        `tag` and `kwargs` are passed to find_all().  Only elements that
        actually have the attribute are matched.
        """
        # In find_all(), an attribute filter of True matches any element
        # that has that attribute, whatever its value.
        kwargs[attr_name] = True
        for elem in self.find_all(tag, **kwargs):
            yield elem[attr_name]

    def iter_image_urls(self):
        """Return an iterator of source URL strings of all images in this document.

        Images include <img> tags and <video> poster attributes.  Elements
        that lack the relevant attribute are skipped.
        """
        for elem in self.find_all(list(self.IMAGE_ATTRS)):
            url = elem.get(self.IMAGE_ATTRS[elem.name])
            if url is not None:
                yield url

    def iter_video_urls(self):
        """Return an iterator of source URL strings of all videos in this document."""
        return self.iter_attr(self.is_video_source, 'src')
|
||||||
|
|
||||||
|
|
||||||
|
class SoupModelMixin(object):
    """Mixin for models to parse HTML with BeautifulSoup.

    Classes that use this mixin must define `SOUP_ATTRS`, a list of strings
    that name attributes with HTML in them.  After that, all the public methods
    are usable.

    The parsed document is built on first use and cached on the instance as
    `_soup`; later changes to the HTML attributes are not reflected in it.
    """

    SOUP_ATTRS = []

    def _get_soup(self):
        """Return the BeautifulSoup document for this object's HTML attributes.

        The values of the attributes named in `SOUP_ATTRS` are concatenated
        in order and parsed once; the result is cached in `self._soup`.
        """
        try:
            return self._soup
        except AttributeError:
            # Not parsed yet.  Build the soup outside this handler so a
            # parser error is not chained onto this AttributeError.
            pass
        html = io.StringIO()
        for attr_name in self.SOUP_ATTRS:
            value = getattr(self, attr_name)
            # An unset (None) or empty attribute contributes no markup;
            # writing None to a StringIO would raise TypeError.
            if value:
                html.write(value)
        html.seek(0)
        self._soup = BeautifulSoup(html)
        return self._soup

    def get_description(self):
        """Return a string with a brief excerpt of body text from the HTML."""
        return u''.join(self._get_soup().some_body_text())

    def get_image_urls(self):
        """Return an iterator of source URL strings of all images in the HTML.

        Images include <img> tags and <video> poster attributes.
        """
        return self._get_soup().iter_image_urls()

    def get_video_urls(self):
        """Return an iterator of source URL strings of all videos in the HTML."""
        return self._get_soup().iter_video_urls()
|
|
@ -1,5 +1,11 @@
|
||||||
{% extends "base_blog.html" %}
|
{% extends "base_blog.html" %}
|
||||||
|
|
||||||
|
{% block head %}
|
||||||
|
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||||
|
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
||||||
|
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
|
{% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %}
|
||||||
|
|
||||||
{% block content %}
|
{% block content %}
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
{% extends "base_news.html" %}
|
{% extends "base_news.html" %}
|
||||||
|
|
||||||
|
{% block head %}
|
||||||
|
{% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %}
|
||||||
|
{% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %}
|
||||||
|
{% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %}
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
|
{% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %}
|
||||||
|
|
||||||
{% block content %}
|
{% block content %}
|
||||||
|
|
38
www/conservancy/templates/opengraph_partial.html
Normal file
38
www/conservancy/templates/opengraph_partial.html
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
{% comment %}
|
||||||
|
|
||||||
|
Include this partial in a head section to include basic Open Graph metadata.
|
||||||
|
Pass a variable `NAME` to give a value for the `og:NAME` property.
|
||||||
|
|
||||||
|
These properties are only listed if you give a value for them:
|
||||||
|
|
||||||
|
* url: A URL string that includes at least an absolute path. This partial
|
||||||
|
will fill in a default scheme and host if needed.
|
||||||
|
* title: A string. Tags are stripped, then the rest is assumed HTML-safe.
|
||||||
|
* description: A string. Tags are stripped, then the rest is assumed
|
||||||
|
HTML-safe.
|
||||||
|
|
||||||
|
These properties are always included. You can override them but you
|
||||||
|
normally shouldn't need to:
|
||||||
|
|
||||||
|
* type: Default "website".
|
||||||
|
* locale: Default "en_US".
|
||||||
|
* site_name: Default "Software Freedom Conservancy".
|
||||||
|
|
||||||
|
{% endcomment %}
|
||||||
|
|
||||||
|
<meta property="og:type" content="{{ type|default:"website" }}">
|
||||||
|
<meta property="og:locale" content="{{ locale|default:"en_US" }}">
|
||||||
|
<meta property="og:site_name" content="{{ site_name|default:"Software Freedom Conservancy" }}">
|
||||||
|
|
||||||
|
{% if url %}
|
||||||
|
{% load fill_url %}
|
||||||
|
<meta property="og:url" content="{{ url|fill_url:host_url }}">
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if title %}
|
||||||
|
<meta property="og:title" content="{{ title|striptags|safe }}">
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if description %}
|
||||||
|
<meta property="og:description" content="{{ description|striptags|safe }}">
|
||||||
|
{% endif %}
|
26
www/conservancy/templates/opengraph_urllist_partial.html
Normal file
26
www/conservancy/templates/opengraph_urllist_partial.html
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
{% comment %}
|
||||||
|
|
||||||
|
Include this partial in a head section to include a series of URLs for a
|
||||||
|
given property, like og:image or og:video.
|
||||||
|
|
||||||
|
You must pass the following variables:
|
||||||
|
|
||||||
|
* property: A string with the name of the property, like 'image' or 'video'.
|
||||||
|
* urls: A sequence of URL strings. Each should include at least an absolute
|
||||||
|
path. This partial will fill in a scheme and host if needed.
|
||||||
|
|
||||||
|
You may also pass:
|
||||||
|
|
||||||
|
* fallback: A URL string, following the same rules as in `urls`. This URL
|
||||||
|
will be used if `urls` is empty.
|
||||||
|
|
||||||
|
{% endcomment %}
|
||||||
|
|
||||||
|
{% load fill_url %}
|
||||||
|
{% for url in urls %}
|
||||||
|
<meta property="og:{{ property }}" content="{{ url|fill_url:host_url }}">
|
||||||
|
{% empty %}
|
||||||
|
{% if fallback %}
|
||||||
|
<meta property="og:{{ property }}" content="{{ fallback|fill_url:host_url }}">
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
Loading…
Reference in a new issue