From 0652471164ecad89ce35c06a025669631141a5ec Mon Sep 17 00:00:00 2001
From: Sachi King <nakato@nakato.io>
Date: Fri, 21 Apr 2017 10:34:48 +1000
Subject: [PATCH] Sanitize user input on markdown fields

Rendering unsanitized user markdown is an XSS vulnerability.

This also blocks a number of HTML tags that a user might attempt to
use in markdown.

The following are the allowed tags.

['a', 'abbr', 'acronym', 'b', 'blockquote', 'code', 'em', 'i', 'li',
'ol', 'p', 'pre', 'strong', 'ul']

I believe this to be acceptable, as honestly, a speaker using H1 is
going to stomp all over the page and make it harder for the reviewer to
parse.

UX-wise, it's less than great.  A user can write "# title" and be left
with a literal <h1> tag shown as text in the sanitized output.
---
 requirements/base.txt        |  2 +-
 symposion/markdown_parser.py | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/requirements/base.txt b/requirements/base.txt
index 5a955d0d..830c9b29 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -6,7 +6,7 @@ django-sitetree>=1.7.0
 django-taggit==0.18.0
 django-timezone-field>=2.0
 easy-thumbnails==2.3
-html5lib==0.9999999
+bleach
 markdown==2.6.5
 pytz==2015.7
 django-ical==1.4
diff --git a/symposion/markdown_parser.py b/symposion/markdown_parser.py
index b3eaa53c..d92a5020 100644
--- a/symposion/markdown_parser.py
+++ b/symposion/markdown_parser.py
@@ -1,17 +1,14 @@
 from __future__ import unicode_literals
 
+import bleach
 import markdown
 
 
+tags = bleach.sanitizer.ALLOWED_TAGS[:]
+tags.extend(['p', 'pre'])
+
+
 def parse(text):
-
-    # First run through the Markdown parser
-    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)
-
-    # Sanitize using html5lib
-    # bits = []
-    # parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
-    # for token in parser.parseFragment(text).childNodes:
-    #     bits.append(token.toxml())
-    # return "".join(bits)
+    md = markdown.markdown(text, extensions=['extra'])
+    text = bleach.clean(md, tags=tags)
     return text