release 2014.03.11

[generic/funnyordie] Add support for funnyordie embeds (Fixes #2546)
[playvid] Simplify (#2539)
2014-03-11 16:51:50 +01:00 · 2014-03-11 16:51:36 +01:00 · 2014-03-10 20:55:47 +01:00 · 2014-03-10 20:45:45 +01:00 · 2014-03-10 20:42:54 +01:00 · 2014-03-10 20:16:49 +01:00
12 changed files with 145 additions and 87 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -701,8 +701,11 @@ class YoutubeDL(object):
        else:
            formats = info_dict['formats']

+        if not formats:
+            raise ExtractorError('No video formats found!')
+
        # We check that all the formats have the format and format_id fields
-        for (i, format) in enumerate(formats):
+        for i, format in enumerate(formats):
            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
@@ -1167,7 +1170,7 @@ class YoutubeDL(object):

    def urlopen(self, req):
        """ Start an HTTP download """
-        return self._opener.open(req)
+        return self._opener.open(req, timeout=self._socket_timeout)

    def print_debug_header(self):
        if not self.params.get('verbose'):
@@ -1198,7 +1201,7 @@ class YoutubeDL(object):

    def _setup_opener(self):
        timeout_val = self.params.get('socket_timeout')
-        timeout = 600 if timeout_val is None else float(timeout_val)
+        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')
@@ -1236,7 +1239,3 @@ class YoutubeDL(object):
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
-
-        # TODO remove this global modification
-        compat_urllib_request.install_opener(opener)
-        socket.setdefaulttimeout(timeout)
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@@ -53,7 +53,6 @@ from .dailymotion import (
    DailymotionUserIE,
 )
 from .daum import DaumIE
-from .depositfiles import DepositFilesIE
 from .dotsub import DotsubIE
 from .dreisat import DreiSatIE
 from .defense import DefenseGouvFrIE
@@ -176,6 +175,7 @@ from .ooyala import OoyalaIE
 from .orf import ORFIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
+from .playvid import PlayvidIE
 from .podomatic import PodomaticIE
 from .pornhd import PornHdIE
 from .pornhub import PornHubIE
@@ -247,8 +247,8 @@ from .ustream import UstreamIE, UstreamChannelIE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
-from .vesti import VestiIE
 from .vevo import VevoIE
+from .vgtrk import VGTRKIE
 from .vice import ViceIE
 from .viddler import ViddlerIE
 from .videobam import VideoBamIE
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -118,9 +118,6 @@ class InfoExtractor(object):
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

-    _real_extract() must return a *list* of information dictionaries as
-    described above.
-
    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
--- a/youtube_dl/extractor/depositfiles.py
+++ b/youtube_dl/extractor/depositfiles.py
@@ -1,60 +0,0 @@
-import re
-import os
-import socket
-
-from .common import InfoExtractor
-from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
-    compat_urllib_parse,
-    compat_urllib_request,
-
-    ExtractorError,
-)
-
-
-class DepositFilesIE(InfoExtractor):
-    """Information extractor for depositfiles.com"""
-
-    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
-
-    def _real_extract(self, url):
-        file_id = url.split('/')[-1]
-        # Rebuild url in english locale
-        url = 'http://depositfiles.com/en/files/' + file_id
-
-        # Retrieve file webpage with 'Free download' button pressed
-        free_download_indication = {'gateway_result' : '1'}
-        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
-        try:
-            self.report_download_webpage(file_id)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
-
-        # Search for the real file URL
-        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
-        if (mobj is None) or (mobj.group(1) is None):
-            # Try to figure out reason of the error.
-            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
-            if (mobj is not None) and (mobj.group(1) is not None):
-                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
-                raise ExtractorError(u'%s' % restriction_message)
-            else:
-                raise ExtractorError(u'Unable to extract download URL from: %s' % url)
-
-        file_url = mobj.group(1)
-        file_extension = os.path.splitext(file_url)[1][1:]
-
-        # Search for file title
-        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
-
-        return [{
-            'id':       file_id.decode('utf-8'),
-            'url':      file_url.decode('utf-8'),
-            'uploader': None,
-            'upload_date':  None,
-            'title':    file_title,
-            'ext':      file_extension.decode('utf-8'),
-        }]
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,12 +1,13 @@
 from __future__ import unicode_literals

+import json
 import re

 from .common import InfoExtractor


 class FunnyOrDieIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
+    _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
    _TEST = {
        'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
        'file': '0732f586d7.mp4',
@@ -30,10 +31,20 @@ class FunnyOrDieIE(InfoExtractor):
            [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],
            webpage, 'video URL', flags=re.DOTALL)

+        if mobj.group('type') == 'embed':
+            post_json = self._search_regex(
+                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
+            post = json.loads(post_json)['attachment']
+            title = post['name']
+            description = post.get('description')
+        else:
+            title = self._og_search_title(webpage)
+            description = self._og_search_description(webpage)
+
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
+            'title': title,
+            'description': description,
        }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals

 import os
 import re
-import xml.etree.ElementTree

 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -17,6 +16,7 @@ from ..utils import (

    ExtractorError,
    HEADRequest,
+    parse_xml,
    smuggle_url,
    unescapeHTML,
    unified_strdate,
@@ -134,6 +134,17 @@ class GenericIE(InfoExtractor):
                'skip_download': True,
            },
        },
+        # funnyordie embed
+        {
+            'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
+            'md5': '7cf780be104d40fea7bae52eed4a470e',
+            'info_dict': {
+                'id': '18e820ec3f',
+                'ext': 'mp4',
+                'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
+                'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+            }
+        },
    ]

    def report_download_webpage(self, video_id):
@@ -274,7 +285,7 @@ class GenericIE(InfoExtractor):

        # Is it an RSS feed?
        try:
-            doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return self._extract_rss(url, video_id, doc)
        except compat_xml_parse_error:
@@ -432,6 +443,14 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))

+        # Look for funnyordie embed
+        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
+                     for eurl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+)
+
+
+class PlayvidIE(InfoExtractor):
+    _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+    _TEST = {
+        'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
+        'md5': '44930f8afa616efdf9482daf4fe53e1e',
+        'info_dict': {
+            'id': 'agbDDi7WZTV',
+            'ext': 'mp4',
+            'title': 'Michelle Lewin in Miami Beach',
+            'duration': 240,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = None
+        duration = None
+        video_thumbnail = None
+        formats = []
+
+        # most of the information is stored in the flashvars
+        flashvars = self._html_search_regex(
+            r'flashvars="(.+?)"', webpage, 'flashvars')
+
+        infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+        for info in infos:
+            videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
+            if videovars_match:
+                key = videovars_match.group(1)
+                val = videovars_match.group(2)
+
+                if key == 'title':
+                    video_title = compat_urllib_parse.unquote_plus(val)
+                if key == 'duration':
+                    try:
+                        duration = int(val)
+                    except ValueError:
+                        pass
+                if key == 'big_thumb':
+                    video_thumbnail = val
+
+                videourl_match = re.match(
+                    r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
+                if videourl_match:
+                    height = int(videourl_match.group('resolution'))
+                    formats.append({
+                        'height': height,
+                        'url': val,
+                    })
+        self._sort_formats(formats)
+
+        # Extract title - should be in the flashvars; if not, look elsewhere
+        if video_title is None:
+            video_title = self._html_search_regex(
+                r'<title>(.*?)</title', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'duration': duration,
+            'description': None,
+            'age_limit': 18
+        }
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -57,7 +57,7 @@ class VevoIE(InfoExtractor):
            'age_limit': 18,
            'title': 'Tunnel Vision (Explicit)',
            'uploader': 'Justin Timberlake',
-            'upload_date': '20130704',
+            'upload_date': '20130703',
        },
        'params': {
            'skip_download': 'true',
@@ -169,7 +169,7 @@ class VevoIE(InfoExtractor):

        timestamp_ms = int(self._search_regex(
            r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
-        upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
+        upload_date = datetime.datetime.utcfromtimestamp(timestamp_ms // 1000)
        return {
            'id': video_id,
            'title': video_info['title'],
--- a/youtube_dl/extractor/vgtrk.py
+++ b/youtube_dl/extractor/vgtrk.py
@@ -10,10 +10,9 @@ from ..utils import (
 )


-class VestiIE(InfoExtractor):
-    IE_NAME = 'vesti'
-    IE_DESC = 'Вести.Ru'
-    _VALID_URL = r'http://(?:(?:.+?\.)?vesti\.ru|(?:2\.)?russia\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'
+class VGTRKIE(InfoExtractor):
+    IE_DESC = 'ВГТРК'
+    _VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia2?\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'

    _TESTS = [
        {
@@ -85,7 +84,7 @@ class VestiIE(InfoExtractor):
                # m3u8 download
                'skip_download': True,
            },
-            'skip': 'Blocked outside Russia'
+            'skip': 'Blocked outside Russia',
        },
        {
            'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
@@ -142,6 +141,7 @@ class VestiIE(InfoExtractor):
                # m3u8 download
                'skip_download': True,
            },
+            'skip': 'Blocked outside Russia',
        },
        {
            'url': 'http://tvkultura.ru/video/show/brand_id/31724/episode_id/972347/video_id/978186',
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1285,10 +1285,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):

        # Decide which formats to download
        try:
-            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
            if not mobj:
                raise ValueError('Could not find vevo ID')
-            ytplayer_config = json.loads(mobj.group(1))
+            json_code = uppercase_escape(mobj.group(1))
+            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -22,6 +22,7 @@ import struct
 import subprocess
 import sys
 import traceback
+import xml.etree.ElementTree
 import zlib

 try:
@@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):

 def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def parse_xml(s):
+    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
+        def doctype(self, name, pubid, system):
+            pass  # Ignore doctypes
+
+    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
+    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
+    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2014.03.10'
+__version__ = '2014.03.11'
Author	SHA1	Message	Date
Philipp Hagemeister	34ca5d9ba0	release 2014.03.11	2014-03-11 16:51:50 +01:00
Philipp Hagemeister	60cc4dc4b4	[generic/funnyordie] Add support for funnyordie embeds (Fixes #2546)	2014-03-11 16:51:36 +01:00
Philipp Hagemeister	db95dc13a1	[playvid] Simplify (#2539)	2014-03-10 20:55:47 +01:00
Philipp Hagemeister	777ac90791	Merge remote-tracking branch 'MikeCol/playvid_extract'	2014-03-10 20:45:45 +01:00
Philipp Hagemeister	04f9bebbcb	Merge remote-tracking branch 'jaimeMF/remove_global_opener'	2014-03-10 20:42:54 +01:00
MikeCol	4ea3137e41	Playvid extractor	2014-03-10 20:16:49 +01:00
Jaime Marquínez Ferrándiz	a0792b738e	Don't install the global url opener. All the code now uses the urlopen method of YoutubeDL.	2014-03-10 19:04:51 +01:00
Jaime Marquínez Ferrándiz	19a41fc613	Don't set the global socket timeout. Use the timeout argument of the `OpenerDirector.open` method instead.	2014-03-10 19:03:37 +01:00
Sergey M․	3ee52157fb	[vgtrk] Rename vesti extractor	2014-03-11 00:58:05 +07:00
Sergey M․	c4d197ee2d	[vesti] Fix _VALID_URL regex	2014-03-11 00:49:41 +07:00
Philipp Hagemeister	a33932cfe3	[vevo] Correct test value. The date is now interpreted as UTC for consistency.	2014-03-10 17:56:54 +01:00
Philipp Hagemeister	bcf89ce62c	[generic] Suppress warning about doctypes in RSS parser	2014-03-10 17:31:32 +01:00
Philipp Hagemeister	e3899d0e00	Merge branch 'master' of github.com:rg3/youtube-dl	2014-03-10 16:42:22 +01:00
Philipp Hagemeister	dcb00da49c	[depositfiles] Remove extractor. This site requires a CAPTCHA to download, supports arbitrary files and not only audio/video, and I can't find a single uncopyrighted video with a quick Google search. Closes #1255	2014-03-10 16:41:08 +01:00
Sergey M․	aa51d20d19	[vesti] Skip geo restricted test	2014-03-10 22:31:22 +07:00
Philipp Hagemeister	ae7ed92057	[youtube] Fix up invalid JSON	2014-03-10 13:35:45 +01:00
Philipp Hagemeister	e45b31d9bd	[vevo] Interpret date as UTC instead of local time	2014-03-10 13:12:57 +01:00
Philipp Hagemeister	5a25f39653	Correct extractor documentation	2014-03-10 13:09:55 +01:00