release 2016.07.07

[kamcord] Add extractor (Closes #10001 )
[daum.net] Fix extraction for specific examples
2016-07-07 01:54:29 +07:00 · 2016-07-07 01:50:39 +07:00 · 2016-07-07 01:26:14 +08:00 · 2016-07-07 01:13:37 +08:00 · 2016-07-06 23:37:54 +08:00 · 2016-07-06 20:32:03 +08:00
20 changed files with 455 additions and 139 deletions
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@

 ---

-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.06**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.07*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.07**

 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.07.06
+[debug] youtube-dl version 2016.07.07
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -312,6 +312,7 @@
 - **jpopsuki.tv**
 - **JWPlatform**
 - **Kaltura**
+ - **Kamcord**
 - **KanalPlay**: Kanal 5/9/11 Play
 - **Kankan**
 - **Karaoketv**
@@ -476,6 +477,8 @@
 - **Odnoklassniki**
 - **OktoberfestTV**
 - **on.aol.com**
+ - **onet.tv**
+ - **onet.tv:channel**
 - **OnionStudios**
 - **Ooyala**
 - **OoyalaExternal**
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -33,6 +33,7 @@ from youtube_dl.utils import (
    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
+    get_element_by_class,
    InAdvancePagedList,
    intlist_to_bytes,
    is_html,
@@ -991,5 +992,13 @@ The first line
        self.assertEqual(urshift(3, 1), 1)
        self.assertEqual(urshift(-3, 1), 2147483646)

+    def test_get_element_by_class(self):
+        html = '''
+            <span class="foo bar">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('no-such-class', html), None)
+
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -5,6 +5,8 @@ from .common import InfoExtractor
 from ..utils import (
    int_or_none,
    parse_iso8601,
+    mimetype2ext,
+    determine_ext,
 )


@@ -50,21 +52,25 @@ class AMPIE(InfoExtractor):
        if isinstance(media_content, dict):
            media_content = [media_content]
        for media_data in media_content:
-            media = media_data['@attributes']
-            media_type = media['type']
-            if media_type in ('video/f4m', 'application/f4m+xml'):
+            media = media_data.get('@attributes', {})
+            media_url = media.get('url')
+            if not media_url:
+                continue
+            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
+            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
-                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                    video_id, f4m_id='hds', fatal=False))
-            elif media_type == 'application/x-mpegURL':
+            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
-                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
                    'url': media['url'],
                    'tbr': int_or_none(media.get('bitrate')),
                    'filesize': int_or_none(media.get('fileSize')),
+                    'ext': ext,
                })

        self._sort_formats(formats)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -90,6 +90,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
+            'skip': 'Video gone',
        },
        {
            # test flv videos served by akamaihd.net
@@ -108,7 +109,7 @@ class BrightcoveLegacyIE(InfoExtractor):
            },
        },
        {
-            # playlist test
+            # playlist with 'videoList'
            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
            'info_dict': {
@@ -117,6 +118,15 @@ class BrightcoveLegacyIE(InfoExtractor):
            },
            'playlist_mincount': 7,
        },
+        {
+            # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965)
+            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
+            'info_dict': {
+                'id': '1522758701001',
+                'title': 'Lesson 08',
+            },
+            'playlist_mincount': 10,
+        },
    ]
    FLV_VCODECS = {
        1: 'SORENSON',
@@ -298,13 +308,19 @@ class BrightcoveLegacyIE(InfoExtractor):
            info_url, player_key, 'Downloading playlist information')

        json_data = json.loads(playlist_info)
-        if 'videoList' not in json_data:
+        if 'videoList' in json_data:
+            playlist_info = json_data['videoList']
+            playlist_dto = playlist_info['mediaCollectionDTO']
+        elif 'playlistTabs' in json_data:
+            playlist_info = json_data['playlistTabs']
+            playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0]
+        else:
            raise ExtractorError('Empty playlist')
-        playlist_info = json_data['videoList']
-        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+        videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']]

        return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
-                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+                                    playlist_title=playlist_dto['displayName'])

    def _extract_video_info(self, video_info):
        video_id = compat_str(video_info['id'])
--- a/youtube_dl/extractor/cliprs.py
+++ b/youtube_dl/extractor/cliprs.py
@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    int_or_none,
-    parse_iso8601,
-)
+from .onet import OnetBaseIE


-class ClipRsIE(InfoExtractor):
+class ClipRsIE(OnetBaseIE):
    _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
    _TEST = {
        'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
@@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor):
    }

    def _real_extract(self, url):
-        video_id = self._match_id(url)
+        display_id = self._match_id(url)

-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)

-        video_id = self._search_regex(
-            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+        mvp_id = self._search_mvp_id(webpage)

-        response = self._download_json(
-            'http://qi.ckm.onetapi.pl/', video_id,
-            query={
-                'body[id]': video_id,
-                'body[jsonrpc]': '2.0',
-                'body[method]': 'get_asset_detail',
-                'body[params][ID_Publikacji]': video_id,
-                'body[params][Service]': 'www.onet.pl',
-                'content-type': 'application/jsonp',
-                'x-onet-app': 'player.front.onetapi.pl',
-            })
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict['display_id'] = display_id

-        error = response.get('error')
-        if error:
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
-
-        video = response['result'].get('0')
-
-        formats = []
-        for _, formats_dict in video['formats'].items():
-            if not isinstance(formats_dict, dict):
-                continue
-            for format_id, format_list in formats_dict.items():
-                if not isinstance(format_list, list):
-                    continue
-                for f in format_list:
-                    if not f.get('url'):
-                        continue
-                    formats.append({
-                        'url': f['url'],
-                        'format_id': format_id,
-                        'height': int_or_none(f.get('vertical_resolution')),
-                        'width': int_or_none(f.get('horizontal_resolution')),
-                        'abr': float_or_none(f.get('audio_bitrate')),
-                        'vbr': float_or_none(f.get('video_bitrate')),
-                    })
-        self._sort_formats(formats)
-
-        meta = video.get('meta', {})
-
-        title = self._og_search_title(webpage, default=None) or meta['title']
-        description = self._og_search_description(webpage, default=None) or meta.get('description')
-        duration = meta.get('length') or meta.get('lenght')
-        timestamp = parse_iso8601(meta.get('addDate'), ' ')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-        }
+        return info_dict
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -16,6 +16,7 @@ from ..utils import (
    sanitized_Request,
    str_to_int,
    unescapeHTML,
+    mimetype2ext,
 )


@@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
        }
    ]

+    @staticmethod
+    def _extract_urls(webpage):
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        return list(map(lambda m: unescapeHTML(m[1]), matches))
+
    def _real_extract(self, url):
        video_id = self._match_id(url)

@@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                    type_ = media.get('type')
                    if type_ == 'application/vnd.lumberjack.manifest':
                        continue
-                    ext = determine_ext(media_url)
-                    if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+                    ext = mimetype2ext(type_) or determine_ext(media_url)
+                    if ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False))
-                    elif type_ == 'application/f4m' or ext == 'f4m':
+                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
+                            'ext': ext,
                        }
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -66,22 +66,32 @@ class DaumIE(InfoExtractor):
            'view_count': int,
            'comment_count': int,
        },
+    }, {
+        # Requires dte_type=WEB (#9972)
+        'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU',
+        'md5': 'a8917742069a4dd442516b86e7d66529',
+        'info_dict': {
+            'id': 's3794Uf1NZeZ1qMpGpeqeRU',
+            'ext': 'mp4',
+            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611',
+            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회',
+            'upload_date': '20160611',
+        },
    }]

    def _real_extract(self, url):
        video_id = compat_urllib_parse_unquote(self._match_id(url))
-        query = compat_urllib_parse_urlencode({'vid': video_id})
        movie_data = self._download_json(
-            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,
-            video_id, 'Downloading video formats info')
+            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json',
+            video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})

        # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid
        if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):
            return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)

        info = self._download_xml(
-            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
-            'Downloading video info')
+            'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id,
+            'Downloading video info', query={'vid': video_id})

        formats = []
        for format_el in movie_data['output_list']['output_list']:
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -139,9 +139,9 @@ from .chirbit import (
    ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
-from .cliprs import ClipRsIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
+from .cliprs import ClipRsIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
@@ -368,6 +368,7 @@ from .jove import JoveIE
 from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kaltura import KalturaIE
+from .kamcord import KamcordIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
@@ -581,6 +582,10 @@ from .nytimes import (
 from .nuvid import NuvidIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
+from .onet import (
+    OnetIE,
+    OnetChannelIE,
+)
 from .onionstudios import OnionStudiosIE
 from .ooyala import (
    OoyalaIE,
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -14,7 +14,10 @@ from ..utils import (
    parse_duration,
    determine_ext,
 )
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)


 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
        'params': {
            'skip_download': True,
        },
+    }, {
+        # Dailymotion embed
+        'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
+        'md5': 'ee7f1828f25a648addc90cb2687b1f12',
+        'info_dict': {
+            'id': 'x4iiko0',
+            'ext': 'mp4',
+            'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
+            'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
+            'timestamp': 1467011958,
+            'upload_date': '20160627',
+            'uploader': 'France Inter',
+            'uploader_id': 'x2q2ez',
+        },
+        'add_ie': ['Dailymotion'],
    }]

    def _real_extract(self, url):
@@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):

        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
        if dmcloud_url:
-            return self.url_result(dmcloud_url, 'DailymotionCloud')
+            return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key())
+
+        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        if dailymotion_urls:
+            return self.playlist_result([
+                self.url_result(dailymotion_url, DailymotionIE.ie_key())
+                for dailymotion_url in dailymotion_urls])

        video_id, catalogue = self._search_regex(
            (r'id-video=([^@]+@[^"]+)',
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -49,7 +49,10 @@ from .pornhub import PornHubIE
 from .xhamster import XHamsterEmbedIE
 from .tnaflix import TNAFlixNetworkEmbedIE
 from .vimeo import VimeoIE
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)
 from .onionstudios import OnionStudiosIE
 from .viewlift import ViewLiftEmbedIE
 from .screenwavemedia import ScreenwaveMediaIE
@@ -1673,12 +1676,9 @@ class GenericIE(InfoExtractor):
        if matches:
            return _playlist_from_matches(matches, lambda m: m[-1])

-        # Look for embedded Dailymotion player
-        matches = re.findall(
-            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        matches = DailymotionIE._extract_urls(webpage)
        if matches:
-            return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]))
+            return _playlist_from_matches(matches)

        # Look for embedded Dailymotion playlist player (#3822)
        m = re.search(
--- a/youtube_dl/extractor/kamcord.py
+++ b/youtube_dl/extractor/kamcord.py
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    qualities,
+)
+
+
+class KamcordIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.kamcord.com/v/hNYRduDgWb4',
+        'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c',
+        'info_dict': {
+            'id': 'hNYRduDgWb4',
+            'ext': 'mp4',
+            'title': 'Drinking Madness',
+            'uploader': 'jacksfilms',
+            'uploader_id': '3044562',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)',
+                webpage, 'video'),
+            video_id)['video']
+
+        title = video['title']
+
+        formats = self._extract_m3u8_formats(
+            video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native')
+        self._sort_formats(formats)
+
+        uploader = video.get('user', {}).get('username')
+        uploader_id = video.get('user', {}).get('id')
+
+        view_count = int_or_none(video.get('viewCount'))
+        like_count = int_or_none(video.get('heartCount'))
+        comment_count = int_or_none(video.get('messageCount'))
+
+        preference_key = qualities(('small', 'medium', 'large'))
+
+        thumbnails = [{
+            'url': thumbnail_url,
+            'id': thumbnail_id,
+            'preference': preference_key(thumbnail_id),
+        } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items()
+            if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -11,13 +11,14 @@ from ..utils import (
    determine_ext,
    ExtractorError,
    int_or_none,
-    sanitized_Request,
    urlencode_postdata,
+    get_element_by_attribute,
+    mimetype2ext,
 )


 class MetacafeIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = 'metacafe'
@@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):
                'uploader': 'ign',
                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
+            'skip': 'Page is temporarily unavailable.',
        },
        # AnyClip video
        {
@@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor):
                'id': 'an-dVVXnuY7Jh77J',
                'ext': 'mp4',
                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
-                'uploader': 'anyclip',
-                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+                'uploader': 'AnyClip',
+                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
            },
        },
        # age-restricted video
@@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor):
    def report_disclaimer(self):
        self.to_screen('Retrieving disclaimer')

-    def _real_initialize(self):
+    def _confirm_age(self):
        # Retrieve disclaimer
        self.report_disclaimer()
        self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')

        # Confirm age
-        disclaimer_form = {
-            'filters': '0',
-            'submit': "Continue - I'm over 18",
-        }
-        request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self.report_age_confirmation()
-        self._download_webpage(request, None, False, 'Unable to confirm age')
+        self._download_webpage(
+            self._FILTER_POST, None, False, 'Unable to confirm age',
+            data=urlencode_postdata({
+                'filters': '0',
+                'submit': "Continue - I'm over 18",
+            }), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })

    def _real_extract(self, url):
        # Extract id and simplified title from URL
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-
-        video_id = mobj.group(1)
+        video_id, display_id = re.match(self._VALID_URL, url).groups()

        # the video may come from an external site
        m_external = re.match('^(\w{2})-(.*)$', video_id)
@@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor):
            if prefix == 'cb':
                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')

-        # Retrieve video webpage to extract further information
-        req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id)
+        # self._confirm_age()

        # AnyClip videos require the flashversion cookie so that we get the link
        # to the mp4 file
-        mobj_an = re.match(r'^an-(.*?)$', video_id)
-        if mobj_an:
-            req.headers['Cookie'] = 'flashVersion=0;'
-        webpage = self._download_webpage(req, video_id)
+        headers = {}
+        if video_id.startswith('an-'):
+            headers['Cookie'] = 'flashVersion=0;'
+
+        # Retrieve video webpage to extract further information
+        webpage = self._download_webpage(url, video_id, headers=headers)
+
+        error = get_element_by_attribute(
+            'class', 'notfound-page-title', webpage)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        video_title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
@@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor):
                        'player_url': player_url,
                        'ext': play_path.partition(':')[0],
                    })
+        if video_url is None:
+            flashvars = self._parse_json(self._search_regex(
+                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+                default=None), video_id, fatal=False)
+            if flashvars:
+                video_url = []
+                for source in flashvars.get('sources'):
+                    source_url = source.get('src')
+                    if not source_url:
+                        continue
+                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                    if ext == 'm3u8':
+                        video_url.extend(self._extract_m3u8_formats(
+                            source_url, video_id, 'mp4',
+                            'm3u8_native', m3u8_id='hls', fatal=False))
+                    else:
+                        video_url.append({
+                            'url': source_url,
+                            'ext': ext,
+                        })

        if video_url is None:
            raise ExtractorError('Unsupported video type')

-        video_title = self._html_search_regex(
-            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'],
+            webpage, 'title', fatal=False)
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'title', fatal=False)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, 'uploader nickname', fatal=False)
        duration = int_or_none(
-            self._html_search_meta('video:duration', webpage))
-
+            self._html_search_meta('video:duration', webpage, default=None))
        age_limit = (
            18
            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
@@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor):
                'url': video_url,
                'ext': video_ext,
            }]
-
        self._sort_formats(formats)
+
        return {
            'id': video_id,
+            'display_id': display_id,
            'description': description,
            'uploader': video_uploader,
            'title': video_title,
--- a/youtube_dl/extractor/onet.py
+++ b/youtube_dl/extractor/onet.py
@@ -0,0 +1,172 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+    def _search_mvp_id(self, webpage):
+        return self._search_regex(
+            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+    def _extract_from_id(self, video_id, webpage):
+        response = self._download_json(
+            'http://qi.ckm.onetapi.pl/', video_id,
+            query={
+                'body[id]': video_id,
+                'body[jsonrpc]': '2.0',
+                'body[method]': 'get_asset_detail',
+                'body[params][ID_Publikacji]': video_id,
+                'body[params][Service]': 'www.onet.pl',
+                'content-type': 'application/jsonp',
+                'x-onet-app': 'player.front.onetapi.pl',
+            })
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+        video = response['result'].get('0')
+
+        formats = []
+        for _, formats_dict in video['formats'].items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_list in formats_dict.items():
+                if not isinstance(format_list, list):
+                    continue
+                for f in format_list:
+                    video_url = f.get('url')
+                    if not video_url:
+                        continue
+                    ext = determine_ext(video_url)
+                    if format_id == 'ism':
+                        # TODO: Support Microsoft Smooth Streaming
+                        continue
+                    elif ext == 'mpd':
+                        # TODO: Current DASH formats are broken - $Time$ pattern in
+                        # <SegmentTemplate> not implemented yet
+                        # formats.extend(self._extract_mpd_formats(
+                        #    video_url, video_id, mpd_id='dash', fatal=False))
+                        continue
+                    else:
+                        formats.append({
+                            'url': video_url,
+                            'format_id': format_id,
+                            'height': int_or_none(f.get('vertical_resolution')),
+                            'width': int_or_none(f.get('horizontal_resolution')),
+                            'abr': float_or_none(f.get('audio_bitrate')),
+                            'vbr': float_or_none(f.get('video_bitrate')),
+                        })
+        self._sort_formats(formats)
+
+        meta = video.get('meta', {})
+
+        title = self._og_search_title(webpage, default=None) or meta['title']
+        description = self._og_search_description(webpage, default=None) or meta.get('description')
+        duration = meta.get('length') or meta.get('lenght')
+        timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class OnetIE(OnetBaseIE):
+    _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.tv'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'md5': 'e3ffbf47590032ac3f27249204173d50',
+        'info_dict': {
+            'id': 'qbpyqc',
+            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+            'ext': 'mp4',
+            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+            'upload_date': '20160705',
+            'timestamp': 1467721580,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+
+        return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+    IE_NAME = 'onet.tv:channel'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival',
+        'info_dict': {
+            'id': 'openerfestival',
+            'title': 'Open\'er Festival Live',
+            'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+        },
+        'playlist_mincount': 46,
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
+        current_clip_info = self._parse_json(self._search_regex(
+            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
+        video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+        video_name = url_basename(current_clip_info['url'])
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_name)
+            return self._extract_from_id(video_id, webpage)
+
+        self.to_screen(
+            'Downloading channel %s - add --no-playlist to just download video %s' % (
+                channel_id, video_name))
+        matches = re.findall(
+            r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+            webpage)
+        entries = [
+            self.url_result(video_link, OnetIE.ie_key())
+            for video_link in matches]
+
+        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+        return self.playlist_result(entries, channel_id, channel_title, channel_description)
--- a/youtube_dl/extractor/onionstudios.py
+++ b/youtube_dl/extractor/onionstudios.py
@@ -8,6 +8,7 @@ from ..utils import (
    determine_ext,
    int_or_none,
    float_or_none,
+    mimetype2ext,
 )


@@ -50,9 +51,8 @@ class OnionStudiosIE(InfoExtractor):
            source_url = source.get('url')
            if not source_url:
                continue
-            content_type = source.get('content_type')
-            ext = determine_ext(source_url)
-            if content_type == 'application/x-mpegURL' or ext == 'm3u8':
+            ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
+            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            else:
--- a/youtube_dl/extractor/sixplay.py
+++ b/youtube_dl/extractor/sixplay.py
@@ -5,6 +5,8 @@ from .common import InfoExtractor
 from ..utils import (
    qualities,
    int_or_none,
+    mimetype2ext,
+    determine_ext,
 )


@@ -34,19 +36,21 @@ class SixPlayIE(InfoExtractor):
            source_type, source_url = source.get('type'), source.get('src')
            if not source_url or source_type == 'hls/primetime':
                continue
-            if source_type == 'application/vnd.apple.mpegURL':
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
                formats.extend(self._extract_f4m_formats(
                    source_url.replace('.m3u8', '.f4m'),
                    video_id, f4m_id='hds', fatal=False))
-            elif source_type == 'video/mp4':
+            elif ext == 'mp4':
                quality = source.get('quality')
                formats.append({
                    'url': source_url,
                    'format_id': quality,
                    'quality': quality_key(quality),
+                    'ext': ext,
                })
        self._sort_formats(formats)

--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -113,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor):
            'ext': 'mp4',
            'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
            'description': 're:^Patrick Kämnitz gehört.{100,}',
+            'upload_date': '20140825',
        },
    }, {
        'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
--- a/youtube_dl/extractor/threeqsdn.py
+++ b/youtube_dl/extractor/threeqsdn.py
@@ -92,12 +92,11 @@ class ThreeQSDNIE(InfoExtractor):
            if not item_url or item_url in urls:
                return
            urls.add(item_url)
-            type_ = item.get('type')
-            ext = determine_ext(item_url, default_ext=None)
-            if type_ == 'application/dash+xml' or ext == 'mpd':
+            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
+            if ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    item_url, video_id, mpd_id='mpd', fatal=False))
-            elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8':
+            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    item_url, video_id, 'mp4',
                    entry_protocol='m3u8' if live else 'm3u8_native',
@@ -111,7 +110,7 @@ class ThreeQSDNIE(InfoExtractor):
                formats.append({
                    'url': item_url,
                    'format_id': item.get('quality'),
-                    'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext,
+                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,
                    'vcodec': 'none' if stream_type == 'audio' else None,
                })

--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -310,9 +310,17 @@ def get_element_by_id(id, html):
    return get_element_by_attribute('id', id, html)


-def get_element_by_attribute(attribute, value, html):
+def get_element_by_class(class_name, html):
+    return get_element_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

+    value = re.escape(value) if escape_value else value
+
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@@ -321,7 +329,7 @@ def get_element_by_attribute(attribute, value, html):
        \s*>
        (?P<content>.*?)
        </\1>
-    ''' % (re.escape(attribute), re.escape(value)), html)
+    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
@@ -2097,6 +2105,7 @@ def mimetype2ext(mt):
        return ext

    _, _, res = mt.rpartition('/')
+    res = res.lower()

    return {
        '3gpp': '3gp',
@@ -2108,6 +2117,12 @@ def mimetype2ext(mt):
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m': 'f4m',
+        'f4m+xml': 'f4m',
    }.get(res, res)


--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2016.07.06'
+__version__ = '2016.07.07'
Author	SHA1	Message	Date
Sergey M․	34bc2d9dfd	release 2016.07.07	2016-07-07 01:54:29 +07:00
Sergey M․	08c7af4afa	[kamcord] Add extractor (Closes #10001 )	2016-07-07 01:50:39 +07:00
Yen Chi Hsuan	f7291a0b7c	[daum.net] Fix extraction for specific examples Closes #9972	2016-07-07 01:26:14 +08:00
Yen Chi Hsuan	c65aa4e9e1	[brightcove:legacy] Support 'playlistTabs' and skip a dead test Closes #9965	2016-07-07 01:13:37 +08:00
Yen Chi Hsuan	ad213a1d74	[francetv] Recognize more Dailymotion embedded videos Closes #9955	2016-07-06 23:37:54 +08:00
Yen Chi Hsuan	43f1e4e41e	[onet] Add MD5 checksum	2016-07-06 20:32:03 +08:00
Yen Chi Hsuan	54b0e909d5	[amp] Fix a typo	2016-07-06 20:10:47 +08:00
Yen Chi Hsuan	f8752b86ac	[Onet,ClipRs] Add new extractor for onet.tv and use it for clip.rs Closes #9950	2016-07-06 20:09:05 +08:00
Yen Chi Hsuan	84c237fb8a	[utils] Add get_element_by_class For #9950	2016-07-06 20:02:52 +08:00
Remita Amine	ab49d7a9fa	use mimetype2ext to determine manifest ext in multiple extractors	2016-07-06 09:11:46 +01:00
Remita Amine	b4173f1551	[utils] add mimetypes to determine manifest ext(m3u8, f4m, mpd)	2016-07-06 09:06:28 +01:00
Remita Amine	2817b99cf2	[metacafe] fix info extraction(closes #8539 )(closes #3253 )	2016-07-06 02:19:55 +01:00
Remita Amine	001fffd004	[spiegel:article] update test(closes #10018 )	2016-07-06 00:16:41 +01:00