Blogs/news include only a single OG image and a single OG video.
For now, this gives us more assurance that other sites will choose the preview we want. You can control the selection by adding a data-ogpreview attribute to image, video, and source elements. data-ogpreview=0 excludes the element from the preview. Positive numbers set the preview priority: the element with the lowest value is chosen first.
This commit is contained in:
		
							parent
							
								
									5c75decd30
								
							
						
					
					
						commit
						703df9c8e9
					
				
					 3 changed files with 60 additions and 21 deletions
				
			
		|  | @ -1,6 +1,7 @@ | ||||||
| # -*- encoding: utf-8 -*- | # -*- encoding: utf-8 -*- | ||||||
| 
 | 
 | ||||||
| import io | import io | ||||||
|  | import itertools | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| import bs4 | import bs4 | ||||||
|  | @ -86,25 +87,22 @@ class BeautifulSoup(bs4.BeautifulSoup): | ||||||
|         except AttributeError: |         except AttributeError: | ||||||
|             return False |             return False | ||||||
| 
 | 
 | ||||||
|     def iter_attr(self, tag, attr_name, **kwargs): |     def iter_images(self): | ||||||
|         kwargs[attr_name] = True |         """Return an iterator of all image elements in this document. | ||||||
|         for elem in self.find_all(tag, **kwargs): |  | ||||||
|             yield elem[attr_name] |  | ||||||
| 
 | 
 | ||||||
|     def iter_image_urls(self): |         Images include <img> and <video> with a poster attribute. | ||||||
|         """Return an iterator of source URL strings of all images in this document. |  | ||||||
| 
 |  | ||||||
|         Images include <img> tags and <video> poster attributes. |  | ||||||
|         """ |         """ | ||||||
|         for elem in self.find_all(list(self.IMAGE_ATTRS.keys())): |         for elem in self.find_all(list(self.IMAGE_ATTRS.keys())): | ||||||
|             try: |             try: | ||||||
|                 yield elem[self.IMAGE_ATTRS[elem.name]] |                 elem[self.IMAGE_ATTRS[elem.name]] | ||||||
|             except KeyError: |             except KeyError: | ||||||
|                 pass |                 pass | ||||||
|  |             else: | ||||||
|  |                 yield elem | ||||||
| 
 | 
 | ||||||
|     def iter_video_urls(self): |     def iter_videos(self): | ||||||
|         """Return an iterator of source URL strings of all videos in this document.""" |         """Return an iterator of all video source elements in this document.""" | ||||||
|         return self.iter_attr(self.is_video_source, 'src') |         return self.find_all(self.is_video_source, src=True) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SoupModelMixin: | class SoupModelMixin: | ||||||
|  | @ -115,6 +113,7 @@ class SoupModelMixin: | ||||||
|     are usable. |     are usable. | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|  |     OG_PREVIEW_ATTR = 'data-ogpreview' | ||||||
|     SOUP_ATTRS = [] |     SOUP_ATTRS = [] | ||||||
| 
 | 
 | ||||||
|     def _get_soup(self): |     def _get_soup(self): | ||||||
|  | @ -128,17 +127,57 @@ class SoupModelMixin: | ||||||
|             self._soup = BeautifulSoup(html) |             self._soup = BeautifulSoup(html) | ||||||
|             return self._soup |             return self._soup | ||||||
| 
 | 
 | ||||||
|  |     def _elem_key(self, attr_name=OG_PREVIEW_ATTR, getvalue=int, fallback=999999): | ||||||
|  |         def elem_sort_key(elem): | ||||||
|  |             try: | ||||||
|  |                 sort_key = getvalue(elem[attr_name]) | ||||||
|  |             except (KeyError, ValueError): | ||||||
|  |                 sort_key = fallback | ||||||
|  |             elem[attr_name] = sort_key | ||||||
|  |             return sort_key | ||||||
|  |         return elem_sort_key | ||||||
|  | 
 | ||||||
|  |     def _elem_pred(self, attr_name=OG_PREVIEW_ATTR, test=lambda n: n > 0): | ||||||
|  |         def elem_pred(elem): | ||||||
|  |             return test(elem[attr_name]) | ||||||
|  |         return elem_pred | ||||||
|  | 
 | ||||||
|  |     def _sort_and_slice_elems(self, elem_seq, elem_key, pred, *slice_args): | ||||||
|  |         seq = itertools.ifilter(pred, sorted(elem_seq, key=elem_key)) | ||||||
|  |         if slice_args: | ||||||
|  |             return itertools.islice(seq, *slice_args) | ||||||
|  |         else: | ||||||
|  |             return seq | ||||||
|  | 
 | ||||||
|     def get_description(self): |     def get_description(self): | ||||||
|         """Return a string with a brief excerpt of body text from the HTML.""" |         """Return a string with a brief excerpt of body text from the HTML.""" | ||||||
|         return u''.join(self._get_soup().some_body_text()) |         return u''.join(self._get_soup().some_body_text()) | ||||||
| 
 | 
 | ||||||
|     def get_image_urls(self): |     def get_image_urls(self, *slice_args): | ||||||
|         """Return an iterator of source URL strings of all images in the HTML. |         """Return an iterator of source URL strings of all images in the HTML. | ||||||
| 
 | 
 | ||||||
|         Images include <img> tags and <video> poster attributes. |         Images include <img> sources and <video> poster attributes. | ||||||
|         """ |         """ | ||||||
|         return self._get_soup().iter_image_urls() |         for elem in self._sort_and_slice_elems( | ||||||
|  |                 self._get_soup().iter_images(), | ||||||
|  |                 self._elem_key(), | ||||||
|  |                 self._elem_pred(), | ||||||
|  |                 *slice_args | ||||||
|  |         ): | ||||||
|  |             yield elem[BeautifulSoup.IMAGE_ATTRS[elem.name]] | ||||||
| 
 | 
 | ||||||
|     def get_video_urls(self): |     def get_one_image_url(self): | ||||||
|  |         return self.get_image_urls(1) | ||||||
|  | 
 | ||||||
|  |     def get_video_urls(self, *slice_args): | ||||||
|         """Return an iterator of source URL strings of all videos in the HTML.""" |         """Return an iterator of source URL strings of all videos in the HTML.""" | ||||||
|         return self._get_soup().iter_video_urls() |         for elem in self._sort_and_slice_elems( | ||||||
|  |                 self._get_soup().iter_videos(), | ||||||
|  |                 self._elem_key(), | ||||||
|  |                 self._elem_pred(), | ||||||
|  |                 *slice_args | ||||||
|  |         ): | ||||||
|  |             yield elem['src'] | ||||||
|  | 
 | ||||||
|  |     def get_one_video_url(self): | ||||||
|  |         return self.get_video_urls(1) | ||||||
|  |  | ||||||
|  | @ -2,8 +2,8 @@ | ||||||
| 
 | 
 | ||||||
| {% block head %} | {% block head %} | ||||||
| {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %} | {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %} | ||||||
| {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %} | {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %} | ||||||
| {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %} | {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %} | ||||||
| {% endblock %} | {% endblock %} | ||||||
| 
 | 
 | ||||||
| {% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %} | {% block subtitle %}{{ object.headline|striptags|safe }} - Conservancy Blog - {% endblock %} | ||||||
|  |  | ||||||
|  | @ -2,8 +2,8 @@ | ||||||
| 
 | 
 | ||||||
| {% block head %} | {% block head %} | ||||||
| {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %} | {% include "opengraph_partial.html" with url=object.get_absolute_url title=object.headline description=object.get_description %} | ||||||
| {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_image_urls fallback='/img/conservancy-logo.png' %} | {% include "opengraph_urllist_partial.html" with property='image' urls=object.get_one_image_url fallback='/img/conservancy-logo.png' %} | ||||||
| {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_video_urls %} | {% include "opengraph_urllist_partial.html" with property='video' urls=object.get_one_video_url %} | ||||||
| {% endblock %} | {% endblock %} | ||||||
| 
 | 
 | ||||||
| {% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %} | {% block subtitle %}{{ object.headline|striptags|safe }} - {% endblock %} | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Brett Smith
						Brett Smith