diff --git a/www/conservancy/apps/blog/models.py b/www/conservancy/apps/blog/models.py
index 68b99caa..85dea531 100644
--- a/www/conservancy/apps/blog/models.py
+++ b/www/conservancy/apps/blog/models.py
@@ -1,5 +1,6 @@
from django.db import models
from django.conf import settings
+from conservancy import bsoup
from conservancy.apps.staff.models import Person
from datetime import datetime, timedelta
@@ -18,7 +19,7 @@ class EntryTag(models.Model):
def get_absolute_url(self):
return u"/blog/?tag=%s" % self.slug
-class Entry(models.Model):
+class Entry(models.Model, bsoup.SoupModelMixin):
"""Blog entry"""
headline = models.CharField(max_length=200)
@@ -38,6 +39,8 @@ class Entry(models.Model):
ordering = ('-pub_date',)
get_latest_by = 'pub_date'
+ SOUP_ATTRS = ['body']
+
def __unicode__(self):
return self.headline
diff --git a/www/conservancy/apps/news/models.py b/www/conservancy/apps/news/models.py
index 89e0cc4c..4fc5e3d9 100644
--- a/www/conservancy/apps/news/models.py
+++ b/www/conservancy/apps/news/models.py
@@ -1,11 +1,12 @@
from django.db import models
from django.conf import settings
+from conservancy import bsoup
from conservancy.apps.staff.models import Person
from conservancy.apps.events.models import Event
from django.contrib.sites.models import Site
from datetime import datetime, timedelta
-class PressRelease(models.Model):
+class PressRelease(models.Model, bsoup.SoupModelMixin):
"""News release model"""
headline = models.CharField(max_length=300)
@@ -24,6 +25,8 @@ class PressRelease(models.Model):
ordering = ("-pub_date",)
get_latest_by = "pub_date"
+ SOUP_ATTRS = ['summary', 'body']
+
def __unicode__(self):
return self.headline
diff --git a/www/conservancy/bsoup.py b/www/conservancy/bsoup.py
new file mode 100644
index 00000000..fb0ef6cb
--- /dev/null
+++ b/www/conservancy/bsoup.py
@@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+
+import io
+import re
+
+import bs4
+import bs4.element
+
+class BeautifulSoup(bs4.BeautifulSoup):
+ """A wrapper of the original BeautifulSoup class, with convenience methods added."""
+
+ IMAGE_ATTRS = {
+ 'img': 'src',
+ 'video': 'poster',
+ }
+ NON_BODY_TEXT_TAGS = frozenset([
+ 'img',
+ 'video',
+ ])
+ SENTENCE_END = re.compile(r'[.?!]\s*\W*\s*$')
+
+ def __init__(self, src, parser='html5lib'):
+ # WARNING! It seems like it would be ideal to use the 'lxml' parser
+ # for speed, but that doesn't work in our web application. On
+ # Debian stretch, at least, using lxml causes the web server WSGI
+ # application to go into an infinite loop.
+ super(BeautifulSoup, self).__init__(src, parser)
+
+ def _body_text(self, root):
+ # "Body text" is all the strings under the root element, in order,
+ # except:
+ # * strings inside NON_BODY_TEXT_TAGS
+ # * strings inside containers of NON_BODY_TEXT_TAGS. A container is
+ # an element that has a NON_BODY_TEXT_TAGS element as its first child.
+ # For example, in <div><img src="..."> Caption text</div>, none of the div's strings
+ # are included in the body text, because it's considered to be a
+ #