release 2014.09.16

[nhl] Match videos without catid (Fixes #3764)
[behindkink] Remove call to report_extraction
2014-09-16 10:09:02 +02:00 · 2014-09-16 10:08:34 +02:00 · 2014-09-15 23:37:22 +02:00 · 2014-09-15 23:36:21 +02:00 · 2014-09-15 23:35:00 +02:00 · 2014-09-15 23:33:47 +02:00
11 changed files with 271 additions and 8 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -41,6 +41,8 @@ from youtube_dl.utils import (
    strip_jsonp,
    uppercase_escape,
    limit_length,
+    escape_rfc3986,
+    escape_url,
 )


@@ -294,5 +296,34 @@ class TestUtil(unittest.TestCase):
            limit_length('foo bar baz asd', 12).startswith('foo bar'))
        self.assertTrue('...' in limit_length('foo bar baz asd', 12))

+    def test_escape_rfc3986(self):
+        reserved = "!*'();:@&=+$,/?#[]"
+        unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
+        self.assertEqual(escape_rfc3986(reserved), reserved)
+        self.assertEqual(escape_rfc3986(unreserved), unreserved)
+        self.assertEqual(escape_rfc3986('тест'), '%D1%82%D0%B5%D1%81%D1%82')
+        self.assertEqual(escape_rfc3986('%D1%82%D0%B5%D1%81%D1%82'), '%D1%82%D0%B5%D1%81%D1%82')
+        self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')
+        self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')
+
+    def test_escape_url(self):
+        self.assertEqual(
+            escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
+            'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'
+        )
+        self.assertEqual(
+            escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
+            'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
+        )
+        self.assertEqual(
+            escape_url('http://тест.рф/фрагмент'),
+            'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'
+        )
+        self.assertEqual(
+            escape_url('http://тест.рф/абв?абв=абв#абв'),
+            'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'
+        )
+        self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -28,6 +28,7 @@ from .utils import (
    compat_str,
    compat_urllib_error,
    compat_urllib_request,
+    escape_url,
    ContentTooShortError,
    date_from_str,
    DateRange,
@@ -1241,6 +1242,25 @@ class YoutubeDL(object):

    def urlopen(self, req):
        """ Start an HTTP download """
+
+        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
+        # always respected by websites, some tend to give out URLs with non percent-encoded
+        # non-ASCII characters (see telemb.py, ard.py [#3412])
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # To work around aforementioned issue we will replace request's original URL with
+        # percent-encoded one
+        url = req if isinstance(req, compat_str) else req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            if isinstance(req, compat_str):
+                req = url_escaped
+            else:
+                req = compat_urllib_request.Request(
+                    url_escaped, data=req.data, headers=req.headers,
+                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+
        return self._opener.open(req, timeout=self._socket_timeout)

    def print_debug_header(self):
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -75,6 +75,7 @@ __authors__  = (
    'Ole Ernst',
    'Aaron McDaniel (mcd1992)',
    'Magnus Kolstad',
+    'Hari Padmanaban',
 )

 __license__ = 'Public Domain'
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -25,6 +25,7 @@ from .bambuser import BambuserIE, BambuserChannelIE
 from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
 from .beeg import BeegIE
+from .behindkink import BehindKinkIE
 from .bilibili import BiliBiliIE
 from .blinkx import BlinkxIE
 from .bliptv import BlipTVIE, BlipTVUserIE
@@ -83,6 +84,7 @@ from .dropbox import DropboxIE
 from .ebaumsworld import EbaumsWorldIE
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
+from .einthusan import EinthusanIE
 from .eitb import EitbIE
 from .ellentv import (
    EllenTVIE,
@@ -365,6 +367,7 @@ from .trutube import TruTubeIE
 from .tube8 import Tube8IE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
+from .turbo import TurboIE
 from .tutv import TutvIE
 from .tvigle import TvigleIE
 from .tvp import TvpIE
--- a/youtube_dl/extractor/behindkink.py
+++ b/youtube_dl/extractor/behindkink.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import url_basename
+
+
+class BehindKinkIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
+    _TEST = {
+        'url': 'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/',
+        'md5': '41ad01222b8442089a55528fec43ec01',
+        'info_dict': {
+            'id': '36370',
+            'ext': 'mp4',
+            'title': 'AB1576 - PERFORMERS VOICE FINALLY HEARD - THE BILL IS KILLED!',
+            'description': 'The adult industry voice was finally heard as Assembly Bill 1576 remained\xa0 in suspense today at the Senate Appropriations Hearing. AB1576 was, among other industry damaging issues, a condom mandate...',
+            'upload_date': '20140814',
+            'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/08/36370_AB1576_Win.jpg',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        year = mobj.group('year')
+        month = mobj.group('month')
+        day = mobj.group('day')
+        upload_date = year + month + day
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r"'file':\s*'([^']+)'",
+            webpage, 'URL base')
+
+        video_id = url_basename(video_url)
+        video_id = video_id.split('_')[0]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': self._og_search_title(webpage),
+            'display_id': display_id,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+            'upload_date': upload_date,
+            'age_limit': 18,
+        }
--- a/youtube_dl/extractor/einthusan.py
+++ b/youtube_dl/extractor/einthusan.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EinthusanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?einthusan\.com/movies/watch.php\?([^#]*?)id=(?P<id>[0-9]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.einthusan.com/movies/watch.php?id=2447',
+            'md5': 'af244f4458cd667205e513d75da5b8b1',
+            'info_dict': {
+                'id': '2447',
+                'ext': 'mp4',
+                'title': 'Ek Villain',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'description': 'md5:9d29fc91a7abadd4591fb862fa560d93',
+            }
+        },
+        {
+            'url': 'http://www.einthusan.com/movies/watch.php?id=1671',
+            'md5': 'ef63c7a803e22315880ed182c10d1c5c',
+            'info_dict': {
+                'id': '1671',
+                'ext': 'mp4',
+                'title': 'Soodhu Kavvuum',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'description': 'md5:05d8a0c0281a4240d86d76e14f2f4d51',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = self._html_search_regex(
+            r'<h1><a class="movie-title".*?>(.*?)</a></h1>', webpage, 'title')
+
+        video_url = self._html_search_regex(
+            r'''(?s)jwplayer\("mediaplayer"\)\.setup\({.*?'file': '([^']+)'.*?}\);''',
+            webpage, 'video url')
+
+        description = self._html_search_meta('description', webpage)
+        thumbnail = self._html_search_regex(
+            r'''<a class="movie-cover-wrapper".*?><img src=["'](.*?)["'].*?/></a>''',
+            webpage, "thumbnail url", fatal=False)
+        if thumbnail is not None:
+            thumbnail = thumbnail.replace('..', 'http://www.einthusan.com')
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -46,9 +46,9 @@ class NHLBaseInfoExtractor(InfoExtractor):

 class NHLIE(NHLBaseInfoExtractor):
    IE_NAME = 'nhl.com'
-    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?:[?&])id=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)'

-    _TEST = {
+    _TESTS = [{
        'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
        'info_dict': {
            'id': '453614',
@@ -58,7 +58,10 @@ class NHLIE(NHLBaseInfoExtractor):
            'duration': 18,
            'upload_date': '20131006',
        },
-    }
+    }, {
+        'url': 'http://video.flames.nhl.com/videocenter/console?id=630616',
+        'only_matching': True,
+    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
--- a/youtube_dl/extractor/nosvideo.py
+++ b/youtube_dl/extractor/nosvideo.py
@@ -8,11 +8,11 @@ from ..utils import (
    ExtractorError,
    compat_urllib_request,
    urlencode_postdata,
+    xpath_text,
    xpath_with_ns,
 )

 _x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'})
-_find = lambda el, p: el.find(_x(p)).text.strip()


 class NosVideoIE(InfoExtractor):
@@ -53,9 +53,15 @@ class NosVideoIE(InfoExtractor):
        playlist = self._download_xml(playlist_url, video_id)

        track = playlist.find(_x('.//xspf:track'))
-        title = _find(track, './xspf:title')
-        url = _find(track, './xspf:file')
-        thumbnail = _find(track, './xspf:image')
+        if track is None:
+            raise ExtractorError(
+                'XML playlist is missing the \'track\' element',
+                expected=True)
+        title = xpath_text(track, _x('./xspf:title'), 'title')
+        url = xpath_text(track, _x('./xspf:file'), 'URL', fatal=True)
+        thumbnail = xpath_text(track, _x('./xspf:image'), 'thumbnail')
+        if title is not None:
+            title = title.strip()

        formats = [{
            'format_id': 'sd',
--- a/youtube_dl/extractor/turbo.py
+++ b/youtube_dl/extractor/turbo.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+    xpath_text,
+)
+
+
+class TurboIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-'
+    _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}'
+    _TEST = {
+        'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html',
+        'md5': '33f4b91099b36b5d5a91f84b5bcba600',
+        'info_dict': {
+            'id': '454443',
+            'ext': 'mp4',
+            'duration': 3715,
+            'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist = self._download_xml(self._API_URL.format(video_id), video_id)
+        item = playlist.find('./channel/item')
+        if item is None:
+            raise ExtractorError('Playlist item was not found', expected=True)
+
+        title = xpath_text(item, './title', 'title')
+        duration = int_or_none(xpath_text(item, './durate', 'duration'))
+        thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
+        description = self._og_search_description(webpage)
+
+        formats = []
+        get_quality = qualities(['3g', 'sd', 'hq'])
+        for child in item:
+            m = re.search(r'url_video_(?P<quality>.+)', child.tag)
+            if m:
+                quality = m.group('quality')
+                formats.append({
+                    'format_id': quality,
+                    'url': child.text,
+                    'quality': get_quality(quality),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'description': description,
+            'formats': formats,
+        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1437,6 +1437,24 @@ def uppercase_escape(s):
        lambda m: unicode_escape(m.group(0))[0],
        s)

+
+def escape_rfc3986(s):
+    """Escape non-ASCII characters as suggested by RFC 3986"""
+    if sys.version_info < (3, 0) and isinstance(s, unicode):
+        s = s.encode('utf-8')
+    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+
+
+def escape_url(url):
+    """Escape URL as suggested by RFC 3986"""
+    url_parsed = compat_urllib_parse_urlparse(url)
+    return url_parsed._replace(
+        path=escape_rfc3986(url_parsed.path),
+        params=escape_rfc3986(url_parsed.params),
+        query=escape_rfc3986(url_parsed.query),
+        fragment=escape_rfc3986(url_parsed.fragment)
+    ).geturl()
+
 try:
    struct.pack(u'!I', 0)
 except TypeError:
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2014.09.15.1'
+__version__ = '2014.09.16'
Author	SHA1	Message	Date
Philipp Hagemeister	ed86ee3b4a	release 2014.09.16	2014-09-16 10:09:02 +02:00
Philipp Hagemeister	7bb5df1cda	[nhl] Match videos without catid (Fixes #3764)	2014-09-16 10:08:34 +02:00
Philipp Hagemeister	37a81dff04	[behindkink] Remove call to report_extraction	2014-09-15 23:37:22 +02:00
Philipp Hagemeister	fc96eb4e21	Merge remote-tracking branch '5moufl/behindkink'	2014-09-15 23:36:21 +02:00
Philipp Hagemeister	ae369738b0	Credit @haricharan for einthusan (#3755)	2014-09-15 23:35:00 +02:00
Philipp Hagemeister	e2037b3f7d	[einthusan] Add description and beautify	2014-09-15 23:33:47 +02:00
Philipp Hagemeister	5419033935	Fixed tests	2014-09-15 23:27:18 +02:00
Philipp Hagemeister	2eebf060af	Merge commit '98703c7fbfcf06348220aa63f9422cdd792cfe1a'	2014-09-15 23:26:54 +02:00
Philipp Hagemeister	acd9db5902	Merge remote-tracking branch 'naglis/nosvideo'	2014-09-15 16:10:52 +02:00
Naglis Jonaitis	d0e8b3d59b	[nosvideo] Make more robust against missing metadata	2014-09-15 16:59:03 +03:00
Philipp Hagemeister	c15dd15388	Merge remote-tracking branch 'naglis/turbo'	2014-09-15 15:48:48 +02:00
Philipp Hagemeister	0003a5c416	Merge remote-tracking branch 'dstftw/escape-non-ascii-in-urls' Conflicts: test/test_utils.py	2014-09-15 15:40:10 +02:00
5moufl	6d1f2431bd	[BehindKink] Minor fixes - fix _VALID_URL regex - remove unnecessary variable - remove second call of report_extraction	2014-09-15 15:09:17 +02:00
Naglis Jonaitis	fdea3abdf8	[turbo] Add new extractor	2014-09-15 16:08:20 +03:00
Haricharan Padmanaban	98703c7fbf	Einthusan Add new extractor	2014-09-14 23:14:00 -05:00
5moufl	2bca84e345	[BehindKink] Add new extractor	2014-09-13 17:47:19 +02:00
Sergey M․	984e8e14ea	[utils] Remove debug garbage	2014-09-13 21:08:04 +07:00
Sergey M․	d05cfe0600	[YoutubeDL/utils] Clarify rationale for URL escaping in comment, move escape routines to utils and add some tests	2014-09-13 20:59:16 +07:00
Sergey M․	37419b4f99	[YoutubeDL] Escape non-ASCII characters in URLs urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) Working around by replacing request's original URL with escaped one	2014-09-12 23:20:17 +07:00