Compare commits: 2013.07.02...2013.07.08 (25 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 29293c1e09 |  |
|  | 5fe3a3c3fb |  |
|  | b04621d155 |  |
|  | b227060388 |  |
|  | d93e4dcbb7 |  |
|  | 73e79f2a1b |  |
|  | fc79158de2 |  |
|  | 7763b04e5f |  |
|  | 9d7b44b4cc |  |
|  | 897f36d179 |  |
|  | 94c3637f6d |  |
|  | 04cc96173c |  |
|  | fbaaad49d7 |  |
|  | b29f3b250d |  |
|  | fa343954d4 |  |
|  | 2491f5898e |  |
|  | b27c856fbc |  |
|  | 9941ceb331 |  |
|  | c536d38059 |  |
|  | 8de64cac98 |  |
|  | 6d6d286539 |  |
|  | 5d2eac9eba |  |
|  | 9826925a20 |  |
|  | 24a267b562 |  |
|  | d4da3d6116 |  |
test/test_all_urls.py

```diff
@@ -61,6 +61,17 @@ class TestAllURLsMatching(unittest.TestCase):
                 else:
                     self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))
 
+    def test_keywords(self):
+        ies = gen_extractors()
+        matching_ies = lambda url: [ie.IE_NAME for ie in ies
+                                    if ie.suitable(url) and ie.IE_NAME != 'generic']
+        self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+
 
 if __name__ == '__main__':
     unittest.main()
```
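The new test drives each extractor's `suitable()` classmethod with keyword pseudo-URLs such as `:ytsubs`, which are matched by ordinary `_VALID_URL` regexes rather than resolved as real URLs. A minimal sketch of that mechanism, reusing the subscriptions pattern that appears later in this diff:

```python
import re

# Pattern copied from YoutubeSubscriptionsIE below; ':ytsubs' and
# ':ytsubscriptions' are keyword pseudo-URLs, not real URLs.
_VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

for url in (':ytsubs', ':ytsubscriptions',
            'https://www.youtube.com/feed/subscriptions'):
    assert re.match(_VALID_URL, url) is not None
print('all keyword forms match')
```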
youtube_dl/extractor/__init__.py

```diff
@@ -1,15 +1,18 @@
+from .archiveorg import ArchiveOrgIE
 from .ard import ARDIE
 from .arte import ArteTvIE
 from .auengine import AUEngineIE
 from .bandcamp import BandcampIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
+from .brightcove import BrightcoveIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE
 from .depositfiles import DepositFilesIE
+from .dreisat import DreiSatIE
 from .eighttracks import EightTracksIE
 from .escapist import EscapistIE
 from .facebook import FacebookIE
@@ -68,7 +71,15 @@ from .yahoo import YahooIE, YahooSearchIE
 from .youjizz import YouJizzIE
 from .youku import YoukuIE
 from .youporn import YouPornIE
-from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE
+from .youtube import (
+    YoutubeIE,
+    YoutubePlaylistIE,
+    YoutubeSearchIE,
+    YoutubeUserIE,
+    YoutubeChannelIE,
+    YoutubeShowIE,
+    YoutubeSubscriptionsIE,
+)
 from .zdf import ZDFIE
```
youtube_dl/extractor/archiveorg.py (new file, 66 lines)

```python
import json
import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unified_strdate,
)


class ArchiveOrgIE(InfoExtractor):
    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
    _TEST = {
        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
        u'info_dict': {
            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
            u"upload_date": u"19681210",
            u"uploader": u"SRI International"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        json_url = url + (u'&' if u'?' in url else u'?') + u'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        title = data['metadata']['title'][0]
        description = data['metadata']['description'][0]
        uploader = data['metadata']['creator'][0]
        upload_date = unified_strdate(data['metadata']['date'][0])

        formats = [{
                'format': fdata['format'],
                'url': 'http://' + data['server'] + data['dir'] + fn,
                'file_size': int(fdata['size']),
            }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata['format']]
        formats.sort(key=lambda fdata: fdata['file_size'])

        info = {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
        }
        thumbnail = data.get('misc', {}).get('image')
        if thumbnail:
            info['thumbnail'] = thumbnail

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return self.video_result(info)
```
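The extractor leans on archive.org's convention that appending `output=json` to a details URL returns the item's metadata: list-valued `metadata` fields, and a `files` dict keyed by path. A minimal offline sketch of the format-selection step, against a fabricated (hypothetical) response shaped the way the extractor reads it:

```python
import json

# Hypothetical, abbreviated output=json response: metadata values are
# lists, files is a dict mapping file paths to per-file metadata.
sample = json.loads('''{
  "server": "ia600401.us.archive.org",
  "dir": "/0/items/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
  "metadata": {"title": ["1968 Demo - FJCC Conference Presentation Reel #1"]},
  "files": {
    "/demo.ogv": {"format": "Ogg Video", "size": "1234"},
    "/demo_512kb.mp4": {"format": "512Kb MPEG4 Video", "size": "999"},
    "/demo.gif": {"format": "Animated GIF", "size": "10"}
  }
}''')

formats = [{'url': 'http://' + sample['server'] + sample['dir'] + fn,
            'file_size': int(fdata['size'])}
           for fn, fdata in sample['files'].items()
           if 'Video' in fdata['format']]   # keeps the two video files only
formats.sort(key=lambda f: f['file_size'])  # smallest first, best last
print(formats[-1]['url'])                   # the largest video file wins
```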
youtube_dl/extractor/arte.py

```diff
@@ -1,5 +1,6 @@
 import re
 import json
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
@@ -16,8 +17,8 @@ class ArteTvIE(InfoExtractor):
     www.arte.tv/guide, the extraction process is different for each one.
     The videos expire in 7 days, so we can't add tests.
     """
-    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
-    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
+    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
     _LIVE_URL = r'index-[0-9]+\.html$'
 
     IE_NAME = u'arte.tv'
@@ -57,22 +58,24 @@ class ArteTvIE(InfoExtractor):
         mobj = re.match(self._EMISSION_URL, url)
         if mobj is not None:
             name = mobj.group('name')
+            lang = mobj.group('lang')
             # This is not a real id, it can be for example AJT for the news
             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
             video_id = mobj.group('id')
-            return self._extract_emission(url, video_id)
+            return self._extract_emission(url, video_id, lang)
 
         mobj = re.match(self._VIDEOS_URL, url)
         if mobj is not None:
             id = mobj.group('id')
-            return self._extract_video(url, id)
+            lang = mobj.group('lang')
+            return self._extract_video(url, id, lang)
 
         if re.search(self._LIVE_URL, video_id) is not None:
             raise ExtractorError(u'Arte live streams are not yet supported, sorry')
             # self.extractLiveStream(url)
             # return
 
-    def _extract_emission(self, url, video_id):
+    def _extract_emission(self, url, video_id, lang):
         """Extract from www.arte.tv/guide"""
         webpage = self._download_webpage(url, video_id)
         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@@ -91,6 +94,16 @@ class ArteTvIE(InfoExtractor):
         }
 
         formats = player_info['VSR'].values()
+        def _match_lang(f):
+            # Return true if that format is in the language of the url
+            if lang == 'fr':
+                l = 'F'
+            elif lang == 'de':
+                l = 'A'
+            regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+            return any(re.match(r, f['versionCode']) for r in regexes)
+        # Some formats may not be in the same language as the url
+        formats = filter(_match_lang, formats)
         # We order the formats by quality
         formats = sorted(formats, key=lambda f: int(f['height']))
         # Pick the best quality
@@ -103,13 +116,15 @@ class ArteTvIE(InfoExtractor):
 
         return info_dict
 
-    def _extract_video(self, url, video_id):
+    def _extract_video(self, url, video_id, lang):
         """Extract from videos.arte.tv"""
-        config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
-        config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
-        config_xml = self._download_webpage(config_xml_url, video_id)
-        config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
-        config_xml = self._download_webpage(config_xml_url, video_id)
+        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
+        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
+        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+        config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
+        config_xml_url = config_node.attrib['ref']
+        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
 
         video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
         def _key(m):
```
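`_match_lang` keeps only the `VSR` entries whose `versionCode` marks them as French (`'F'`) or German (`'A'`), with `V%s-ST.` allowing subtitled variants. A small offline sketch of the same regexes, run over made-up (hypothetical) versionCode values:

```python
import re

def match_lang(version_code, lang):
    # Same regexes as the diff: 'F' marks French versions, 'A' German
    # ones; the second pattern admits subtitled variants like 'VF-STF'.
    l = {'fr': 'F', 'de': 'A'}[lang]
    regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
    return any(re.match(r, version_code) for r in regexes)

# Hypothetical versionCode values for illustration:
codes = ['VF', 'VOF', 'VF-STF', 'VA', 'VA-STA', 'VO']
print([c for c in codes if match_lang(c, 'fr')])  # ['VF', 'VOF', 'VF-STF']
print([c for c in codes if match_lang(c, 'de')])  # ['VA', 'VA-STA']
```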
youtube_dl/extractor/auengine.py

```diff
@@ -8,6 +8,14 @@ from ..utils import (
 )
 
 class AUEngineIE(InfoExtractor):
+    _TEST = {
+        u'url': u'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
+        u'file': u'lfvlytY6.mp4',
+        u'md5': u'48972bdbcf1a3a2f5533e62425b41d4f',
+        u'info_dict': {
+            u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
+        }
+    }
     _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
 
     def _real_extract(self, url):
```
youtube_dl/extractor/bliptv.py

```diff
@@ -27,7 +27,7 @@ class BlipTVIE(InfoExtractor):
     _TEST = {
         u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
         u'file': u'5779306.m4v',
-        u'md5': u'b2d849efcf7ee18917e4b4d9ff37cafe',
+        u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
         u'info_dict': {
             u"upload_date": u"20111205",
             u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",
@@ -103,7 +103,12 @@ class BlipTVIE(InfoExtractor):
         data = json_data
 
         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
-        video_url = data['media']['url']
+        if 'additionalMedia' in data:
+            formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
+            best_format = formats[-1]
+            video_url = best_format['url']
+        else:
+            video_url = data['media']['url']
         umobj = re.match(self._URL_EXT, video_url)
         if umobj is None:
             raise ValueError('Can not determine filename extension')
```
youtube_dl/extractor/brightcove.py (new file, 32 lines)

```python
import re
import json

from .common import InfoExtractor


class BrightcoveIE(InfoExtractor):
    _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = mobj.group('query')
        video_id = mobj.group('id')

        request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
        webpage = self._download_webpage(request_url, video_id)

        self.report_extraction(video_id)
        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        info = json.loads(info)['data']
        video_info = info['programmedContent']['videoPlayer']['mediaDTO']
        renditions = video_info['renditions']
        renditions = sorted(renditions, key=lambda r: r['size'])
        best_format = renditions[-1]

        return {'id': video_id,
                'title': video_info['displayName'],
                'url': best_format['defaultURL'],
                'ext': 'mp4',
                'description': video_info.get('shortDescription'),
                'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
                'uploader': video_info.get('publisherName'),
                }
```
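The interesting move here is pulling a JSON blob out of inline JavaScript with a non-greedy regex, then taking the largest rendition. An offline sketch against a fabricated (hypothetical) page fragment in the shape the extractor expects:

```python
import json
import re

# Hypothetical single-line page fragment; the real experienceJSON is
# much larger, but the nesting below matches what the extractor reads.
webpage = ('<script>var experienceJSON = {"data": {"programmedContent": '
           '{"videoPlayer": {"mediaDTO": {"displayName": "Demo clip", '
           '"renditions": [{"size": 100, "defaultURL": "http://example.com/low.mp4"}, '
           '{"size": 900, "defaultURL": "http://example.com/high.mp4"}]}}}}};</script>')

raw = re.search(r'var experienceJSON = ({.*?});', webpage).group(1)
media = json.loads(raw)['data']['programmedContent']['videoPlayer']['mediaDTO']
best = sorted(media['renditions'], key=lambda r: r['size'])[-1]
print(media['displayName'], best['defaultURL'])
# Demo clip http://example.com/high.mp4
```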
youtube_dl/extractor/common.py

```diff
@@ -3,6 +3,7 @@ import os
 import re
 import socket
 import sys
+import netrc
 
 from ..utils import (
     compat_http_client,
@@ -36,6 +37,8 @@ class InfoExtractor(object):
     The following fields are optional:
 
     format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
     thumbnail:      Full URL to a video thumbnail image.
     description:    One-line video description.
     uploader:       Full name of the video uploader.
@@ -161,6 +164,10 @@ class InfoExtractor(object):
         """Report attempt to confirm age."""
         self.to_screen(u'Confirming age')
 
+    def report_login(self):
+        """Report attempt to log in."""
+        self.to_screen(u'Logging in')
+
     #Methods for following #608
     #They set the correct value of the '_type' key
     def video_result(self, video_info):
@@ -225,6 +232,36 @@ class InfoExtractor(object):
         else:
             return res
 
+    def _get_login_info(self):
+        """
+        Get the login info as (username, password)
+        It will look in the netrc file using the _NETRC_MACHINE value
+        If there's no info available, return (None, None)
+        """
+        if self._downloader is None:
+            return (None, None)
+
+        username = None
+        password = None
+        downloader_params = self._downloader.params
+
+        # Attempt to use provided username and password or .netrc data
+        if downloader_params.get('username', None) is not None:
+            username = downloader_params['username']
+            password = downloader_params['password']
+        elif downloader_params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+
+        return (username, password)
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
```
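`_get_login_info` reads credentials either from the downloader options or from the standard `~/.netrc` file, keyed by the extractor's `_NETRC_MACHINE` (for the Vimeo extractor below that key is `'vimeo'`). A sketch of the netrc side alone, assuming an entry like `machine vimeo login alice password s3cret` exists:

```python
import netrc

try:
    # authenticators() returns (login, account, password) or None
    info = netrc.netrc().authenticators('vimeo')
except (IOError, netrc.NetrcParseError) as err:
    info = None
    print('could not parse .netrc: %s' % err)

if info is not None:
    username, _, password = info
    print('would log in as', username)
```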
youtube_dl/extractor/dailymotion.py

```diff
@@ -1,12 +1,11 @@
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_request,
     compat_urllib_parse,
 
     ExtractorError,
     unescapeHTML,
 )
 
 class DailymotionIE(InfoExtractor):
@@ -39,33 +38,10 @@ class DailymotionIE(InfoExtractor):
 
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)
-        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        flashvars = compat_urllib_parse.unquote(mobj.group(1))
-
-        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
-            if key in flashvars:
-                max_quality = key
-                self.to_screen(u'Using %s' % key)
-                break
-        else:
-            raise ExtractorError(u'Unable to extract video URL')
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
+                                              webpage, 'title')
 
-        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-
-        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
-
-        # TODO: support choosing qualities
-
-        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
-
         video_uploader = None
         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                              # Looking for official user
                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,6 +52,25 @@ class DailymotionIE(InfoExtractor):
         if mobj is not None:
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 
+        embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
+        embed_page = self._download_webpage(embed_url, video_id,
+                                            u'Downloading embed page')
+        info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info')
+        info = json.loads(info)
+
+        # TODO: support choosing qualities
+
+        for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
+                    'stream_h264_hq_url','stream_h264_url',
+                    'stream_h264_ld_url']:
+            if info.get(key):#key in info and info[key]:
+                max_quality = key
+                self.to_screen(u'Using %s' % key)
+                break
+        else:
+            raise ExtractorError(u'Unable to extract video URL')
+        video_url = info[max_quality]
+
         return [{
             'id': video_id,
             'url': video_url,
@@ -83,4 +78,5 @@ class DailymotionIE(InfoExtractor):
             'upload_date': video_upload_date,
             'title': video_title,
             'ext': video_extension,
+            'thumbnail': info['thumbnail_url']
         }]
```
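The quality selection relies on Python's for/else: the `else` branch only runs when the loop never hit `break`, i.e. when none of the preferred stream keys is present. A tiny offline demonstration with a made-up (hypothetical) info dict:

```python
# Hypothetical embed-page info dict; only the HQ stream is present.
info = {'stream_h264_hq_url': 'http://example.com/hq.mp4',
        'stream_h264_ld_url': None}

for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
            'stream_h264_hq_url', 'stream_h264_url',
            'stream_h264_ld_url']:
    if info.get(key):            # skips missing keys and None values alike
        print('Using %s' % key)  # -> Using stream_h264_hq_url
        break
else:
    raise Exception('Unable to extract video URL')
```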
youtube_dl/extractor/dreisat.py (new file, 85 lines)

```python
# coding: utf-8

import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    ExtractorError,
    unified_strdate,
)


class DreiSatIE(InfoExtractor):
    IE_NAME = '3sat'
    _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
    _TEST = {
        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
        u'file': u'36983.webm',
        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
        u'info_dict': {
            u"title": u"Kaffeeland Schweiz",
            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
            u"uploader": u"3sat",
            u"upload_date": u"20130622"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))

        thumbnail_els = details_doc.findall('.//teaserimage')
        thumbnails = [{
            'width': te.attrib['key'].partition('x')[0],
            'height': te.attrib['key'].partition('x')[2],
            'url': te.text,
        } for te in thumbnail_els]

        information_el = details_doc.find('.//information')
        video_title = information_el.find('./title').text
        video_description = information_el.find('./detail').text

        details_el = details_doc.find('.//details')
        video_uploader = details_el.find('./channel').text
        upload_date = unified_strdate(details_el.find('./airtime').text)

        format_els = details_doc.findall('.//formitaet')
        formats = [{
            'format_id': fe.attrib['basetype'],
            'width': int(fe.find('./width').text),
            'height': int(fe.find('./height').text),
            'url': fe.find('./url').text,
            'filesize': int(fe.find('./filesize').text),
            'video_bitrate': int(fe.find('./videoBitrate').text),
            '3sat_qualityname': fe.find('./quality').text,
        } for fe in format_els
            if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]

        def _sortkey(format):
            qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
            prefer_http = 1 if 'rtmp' in format['url'] else 0
            return (qidx, prefer_http, format['video_bitrate'])
        formats.sort(key=_sortkey)

        info = {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'description': video_description,
            'thumbnails': thumbnails,
            'thumbnail': thumbnails[-1]['url'],
            'uploader': video_uploader,
            'upload_date': upload_date,
        }

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return self.video_result(info)
```
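Sorting by a tuple key ranks formats by quality name first, then protocol, then bitrate; since the sort is ascending and `formats[-1]` is taken, later means better. A small offline sketch with invented formats; note that `http_rank` here is written so that HTTP wins ties with RTMP, which the name `prefer_http` in the diff suggests is the intent (an assumption on my part):

```python
QUALITIES = ['low', 'med', 'high', 'veryhigh']

# Hypothetical formats for illustration.
formats = [
    {'url': 'rtmp://example.com/a', 'quality': 'high', 'video_bitrate': 1500},
    {'url': 'http://example.com/b.mp4', 'quality': 'high', 'video_bitrate': 1400},
    {'url': 'http://example.com/c.mp4', 'quality': 'med', 'video_bitrate': 900},
]

def sortkey(f):
    qidx = QUALITIES.index(f['quality'])          # quality dominates
    http_rank = 0 if 'rtmp' in f['url'] else 1    # break ties in favour of http
    return (qidx, http_rank, f['video_bitrate'])  # bitrate as final tie-breaker

formats.sort(key=sortkey)
print(formats[-1]['url'])  # http://example.com/b.mp4 — best comes last
```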
youtube_dl/extractor/tudou.py

```diff
@@ -1,24 +1,34 @@
 # coding: utf-8
 
 import re
+import json
 
 from .common import InfoExtractor
 
 
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)'
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
     _TEST = {
         u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        u'file': u'159447792.f4v',
-        u'md5': u'ad7c358a01541e926a1e413612c6b10a',
+        u'file': u'159448201.f4v',
+        u'md5': u'140a49ed444bd22f93330985d8475fcb',
         u'info_dict': {
-            u"title": u"\u5361\u9a6c\u4e54\u56fd\u8db3\u5f00\u5927\u811a\u957f\u4f20\u51b2\u540a\u96c6\u9526"
+            u"title": u"卡马乔国足开大脚长传冲吊集锦"
         }
     }
 
+    def _url_for_id(self, id, quality = None):
+        info_url = "http://v2.tudou.com/f?id="+str(id)
+        if quality:
+            info_url += '&hd' + quality
+        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
+        final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
+        return final_url
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(2).replace('.html','')
+        video_id = mobj.group(2)
         webpage = self._download_webpage(url, video_id)
         video_id = re.search('"k":(.+?),',webpage).group(1)
         title = re.search(",kw:\"(.+)\"",webpage)
         if title is None:
             title = re.search(",kw: \'(.+)\'",webpage)
@@ -27,14 +37,27 @@ class TudouIE(InfoExtractor):
         if thumbnail_url is None:
             thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
         thumbnail_url = thumbnail_url.group(1)
-        info_url = "http://v2.tudou.com/f?id="+str(video_id)
-        webpage = self._download_webpage(info_url, video_id, "Opening the info webpage")
-        final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1)
-        ext = (final_url.split('?')[0]).split('.')[-1]
-        return [{
-            'id': video_id,
-            'url': final_url,
-            'ext': ext,
-            'title': title,
-            'thumbnail': thumbnail_url,
-        }]
+
+        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
+        segments = json.loads(segs_json)
+        # It looks like the keys are the arguments that have to be passed as
+        # the hd field in the request url, we pick the higher
+        quality = sorted(segments.keys())[-1]
+        parts = segments[quality]
+        result = []
+        len_parts = len(parts)
+        if len_parts > 1:
+            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
+        for part in parts:
+            part_id = part['k']
+            final_url = self._url_for_id(part_id, quality)
+            ext = (final_url.split('?')[0]).split('.')[-1]
+            part_info = {'id': part_id,
+                         'url': final_url,
+                         'ext': ext,
+                         'title': title,
+                         'thumbnail': thumbnail_url,
+                         }
+            result.append(part_info)
+
+        return result
```
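The rewritten extractor handles multi-part videos: the page embeds a `segs` JSON blob whose keys are quality codes and whose values list the physical parts, each resolved separately through `_url_for_id`. An offline sketch of the selection loop, with a fabricated (hypothetical) blob:

```python
import json

# Hypothetical 'segs' blob as embedded in a Tudou page: quality codes
# map to the list of physical parts making up the video.
segs_json = '{"2": [{"k": 159448201}, {"k": 159448202}], "1": [{"k": 159440001}]}'
segments = json.loads(segs_json)

quality = sorted(segments.keys())[-1]   # '2' — the highest quality code wins
for part in segments[quality]:
    # Each part id would be resolved via http://v2.tudou.com/f?id=<k>&hd<quality>
    print('would fetch part', part['k'])
```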
youtube_dl/extractor/vimeo.py

```diff
@@ -17,6 +17,7 @@ class VimeoIE(InfoExtractor):
 
     # _VALID_URL matches Vimeo URLs
     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
+    _NETRC_MACHINE = 'vimeo'
     IE_NAME = u'vimeo'
     _TEST = {
         u'url': u'http://vimeo.com/56015672',
@@ -31,6 +32,25 @@ class VimeoIE(InfoExtractor):
         }
     }
 
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        login_url = 'https://vimeo.com/log_in'
+        webpage = self._download_webpage(login_url, None, False)
+        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+        data = compat_urllib_parse.urlencode({'email': username,
+                                              'password': password,
+                                              'action': 'login',
+                                              'service': 'vimeo',
+                                              'token': token,
+                                              })
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        self._download_webpage(login_request, None, False, u'Wrong login info')
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
@@ -50,6 +70,9 @@ class VimeoIE(InfoExtractor):
                                  u'Verifying the password',
                                  u'Wrong password')
 
+    def _real_initialize(self):
+        self._login()
+
     def _real_extract(self, url):
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
```
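The login flow scrapes an `xsrft` token from the login page, then POSTs it back both in the form body and as a cookie. A sketch of just the request construction, ported to plain Python 3 `urllib` (the diff itself uses the `compat_*` wrappers) with a made-up token; nothing is actually sent:

```python
import urllib.parse
import urllib.request

token = 'abc123'  # hypothetical value scraped via r"xsrft: \'(.*?)\'"
data = urllib.parse.urlencode({'email': 'alice@example.com',
                               'password': 's3cret',
                               'action': 'login',
                               'service': 'vimeo',
                               'token': token}).encode('ascii')

req = urllib.request.Request('https://vimeo.com/log_in', data)
req.add_header('Content-Type', 'application/x-www-form-urlencoded')
req.add_header('Cookie', 'xsrft=%s' % token)  # token must match the form field
print(req.get_method(), req.full_url)         # POST https://vimeo.com/log_in
```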
youtube_dl/extractor/youtube.py

```diff
@@ -4,6 +4,7 @@ import json
 import netrc
 import re
 import socket
+import itertools
 
 from .common import InfoExtractor, SearchInfoExtractor
 from ..utils import (
@@ -19,6 +20,7 @@ from ..utils import (
     ExtractorError,
     unescapeHTML,
     unified_strdate,
+    orderedSet,
 )
 
 
@@ -122,7 +124,7 @@ class YoutubeIE(InfoExtractor):
     @classmethod
     def suitable(cls, url):
         """Receives a URL and returns True if suitable for this IE."""
-        if YoutubePlaylistIE.suitable(url): return False
+        if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 
     def report_lang(self):
@@ -471,7 +473,12 @@ class YoutubeIE(InfoExtractor):
         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 
         # thumbnail image
-        if 'thumbnail_url' not in video_info:
+        # We try first to get a high quality image:
+        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                            video_webpage, re.DOTALL)
+        if m_thumb is not None:
+            video_thumbnail = m_thumb.group(1)
+        elif 'thumbnail_url' not in video_info:
             self._downloader.report_warning(u'unable to extract video thumbnail')
             video_thumbnail = ''
         else:   # don't panic if we can't find it
@@ -864,3 +871,34 @@ class YoutubeShowIE(InfoExtractor):
         m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
         self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
         return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+
+
+class YoutubeSubscriptionsIE(YoutubeIE):
+    """It's a subclass of YoutubeIE because we need to login"""
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    IE_NAME = u'youtube:subscriptions'
+    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
+    _PAGING_STEP = 30
+
+    # Overwrite YoutubeIE properties we don't want
+    _TESTS = []
+    @classmethod
+    def suitable(cls, url):
+        return re.match(cls._VALID_URL, url) is not None
+
+    def _real_extract(self, url):
+        feed_entries = []
+        # The step argument is available only in 2.7 or higher
+        for i in itertools.count(0):
+            paging = i*self._PAGING_STEP
+            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
+                                          u'Downloading page %s' % i)
+            info = json.loads(info)
+            feed_html = info['feed_html']
+            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
+            ids = orderedSet(m.group(1) for m in m_ids)
+            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            if info['paging'] is None:
+                break
+        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
```
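The feed is walked with `itertools.count`, advancing a paging offset in steps of 30 until the service reports no further page (`paging` comes back as `None`). The same pattern offline, with a stub in place of the AJAX call (`fetch_page` is a hypothetical stand-in):

```python
import itertools

PAGING_STEP = 30

def fetch_page(paging):
    # Hypothetical stand-in for the feed_ajax request: three pages,
    # then 'paging' comes back as None to signal the end.
    last = paging >= 2 * PAGING_STEP
    return {'feed_html': '"/watch?v=vid%d"' % paging,
            'paging': None if last else paging + PAGING_STEP}

entries = []
for i in itertools.count(0):
    info = fetch_page(i * PAGING_STEP)
    entries.append(info['feed_html'])
    if info['paging'] is None:
        break
print(len(entries), 'pages fetched')  # 3 pages fetched
```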
youtube_dl/utils.py

```diff
@@ -623,7 +623,7 @@ def unified_strdate(date_str):
     date_str = date_str.replace(',',' ')
     # %z (UTC offset) is only supported in python>=3.2
     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
     for expression in format_expressions:
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -631,6 +631,13 @@
             pass
     return upload_date
 
+def determine_ext(url):
+    guess = url.partition(u'?')[0].rpartition(u'.')[2]
+    if re.match(r'^[A-Za-z0-9]+$', guess):
+        return guess
+    else:
+        return u'unknown_video'
+
 def date_from_str(date_str):
     """
     Return a datetime object from a string in the format YYYYMMDD or
```
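`determine_ext` strips the query string before taking whatever follows the last dot, and falls back to `'unknown_video'` when the guess contains non-alphanumeric characters; the new `'%d.%m.%Y %H:%M'` expression covers airtime stamps like the one in the 3sat test above. A quick usage sketch of both helpers:

```python
import datetime
import re

def determine_ext(url):
    # Same logic as the diff: drop the query, take the last dot suffix.
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else u'unknown_video'

print(determine_ext(u'http://example.com/v/clip.mp4?start=10'))  # mp4
print(determine_ext(u'http://example.com/v/clip'))               # unknown_video

# The newly added format expression parses 3sat-style airtimes:
print(datetime.datetime.strptime('22.06.2013 14:30', '%d.%m.%Y %H:%M')
      .strftime('%Y%m%d'))  # 20130622
```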
youtube_dl/version.py

```diff
@@ -1,2 +1,2 @@
-__version__ = '2013.07.02'
+__version__ = '2013.07.08.1'
```