From 1afe753462f0293122dc7a9b534b4f5cdb1e5c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 9 Feb 2014 14:22:56 +0100 Subject: [PATCH] [slideshare] Fix description extraction and modernize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ‘og:description’ property doesn’t contain the full description --- youtube_dl/extractor/slideshare.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index afc3001b5..9c62825cc 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor): _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { - u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', - u'file': u'25665706.mp4', - u'info_dict': { - u'title': u'Managing Scale and Complexity', - u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix', + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). 
This presentation was given by Adrian Cockcroft from Netflix.', }, } @@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor): webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( r'var slideshare_object = ({.*?}); var user_info =', - webpage, u'slideshare object') + webpage, 'slideshare object') info = json.loads(slideshare_obj) - if info['slideshow']['type'] != u'video': - raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + if info['slideshow']['type'] != 'video': + raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) doc = info['doc'] bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) + description = self._html_search_regex( + r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description') return { '_type': 'video', @@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': self._og_search_description(webpage), + 'description': description, }