Compare commits


55 Commits

Author SHA1 Message Date
Philipp Hagemeister
fd87ff26b9 release 2013.07.11 2013-07-11 21:04:59 +02:00
Jaime Marquínez Ferrándiz
85347e1cb6 YoutubeIE: a new algo for length 83 2013-07-11 20:21:45 +02:00
Jaime Marquínez Ferrándiz
41897817cc GametrailersIE: support multipart videos
Use xml.etree.ElementTree instead of re when possible
2013-07-11 18:24:53 +02:00
Philipp Hagemeister
45ff2d51d0 [brightcove] add import 2013-07-11 16:31:29 +02:00
Philipp Hagemeister
5de3ece225 [brightcove] fix on Python 2.6 2013-07-11 16:16:02 +02:00
Philipp Hagemeister
df50a41289 [arte] Fix on 2.6 2013-07-11 16:12:16 +02:00
Philipp Hagemeister
59ae56fad5 Add helper function find_xpath_attr 2013-07-11 16:12:08 +02:00
Philipp Hagemeister
690e872c51 Remove video_result helper method
Calling it was more complex than actually including the type in the video info
2013-07-11 12:12:30 +02:00
Philipp Hagemeister
81082e046e [ehow] improve minor bits 2013-07-11 12:11:00 +02:00
Philipp Hagemeister
3fa9550837 Merge remote-tracking branch 'yasoob/master' 2013-07-11 12:02:16 +02:00
M.Yasoob Khalid
b1082f01a6 added test for ehow 2013-07-11 14:30:25 +05:00
M.Yasoob Khalid
f35b84c807 added an IE for Ehow videos 2013-07-11 14:25:14 +05:00
Jaime Marquínez Ferrándiz
117adb0f0f GenericIE: detect more Brightcove videos
In some sites "class" contains more than BrightcoveExperience
2013-07-11 00:25:38 +02:00
Jaime Marquínez Ferrándiz
abb285fb1b BrightcoveIE: add support for playlists 2013-07-11 00:04:33 +02:00
Jaime Marquínez Ferrándiz
a431154706 Set the playlist_index and playlist fields for already resolved video results. 2013-07-10 23:36:30 +02:00
Jaime Marquínez Ferrándiz
cfe50f04ed GenericIE: Detect videos from Brightcove
Brightcove video info is usually found in an <object class="BrightcoveExperience"></object> node; this is passed to a new method of BrightcoveIE that builds a URL to extract the video.
2013-07-10 17:49:11 +02:00
Jaime Marquínez Ferrándiz
a7055eb956 YoutubeIE: show a more meaningful error when it finds an rtmpe download (related #343) 2013-07-10 14:35:11 +02:00
Philipp Hagemeister
0a1be1e997 release 2013.07.10 2013-07-10 11:36:11 +02:00
Jaime Marquínez Ferrándiz
c93898dae9 YoutubeIE: new algo for length 83 (closes #1017 and closes #1016) 2013-07-10 10:44:04 +02:00
Jaime Marquínez Ferrándiz
ebdf2af727 GameSpotIE: support more urls and download videos in the best quality 2013-07-09 20:07:52 +02:00
Jaime Marquínez Ferrándiz
c108eb73cc YoutubeIE: Fix vevo explicit videos (closes #956)
When an age restricted video is detected it simulates accessing the video from www.youtube.com/v/{video_id}
2013-07-09 15:43:44 +02:00
Jaime Marquínez Ferrándiz
3a1375dacf VeohIE: remove debug logging 2013-07-09 11:11:55 +02:00
Jaime Marquínez Ferrándiz
41bece30b4 DotsubIE: simplify and extract the upload date
Do not declare variables for fields in the info dictionary.
2013-07-08 22:40:42 +02:00
Jaime Marquínez Ferrándiz
16ea58cbda Merge pull request #1009 from yasoob/master
Added an IE and test for dotsub.com videos. ( closes #1008 )
2013-07-08 22:21:06 +02:00
Jaime Marquínez Ferrándiz
99e350d902 Add VeohIE (closes #1006) 2013-07-08 22:02:23 +02:00
M.Yasoob Khalid
13e06d298c added an IE and test for dotsub. 2013-07-09 00:05:52 +05:00
Jaime Marquínez Ferrándiz
81f0259b9e YoutubeSubscriptionsIE: raise an error if there's no login information. 2013-07-08 11:24:11 +02:00
Jaime Marquínez Ferrándiz
fefcb5d314 YoutubeIE: use the new method in the base IE for getting the login info 2013-07-08 11:24:11 +02:00
Philipp Hagemeister
345b0c9b46 Remove dead code 2013-07-08 02:13:50 +02:00
Philipp Hagemeister
20c3893f0e Do not redefine variables in list comprehensions 2013-07-08 02:12:20 +02:00
Philipp Hagemeister
29293c1e09 release 2013.07.08.1 2013-07-08 02:05:22 +02:00
Philipp Hagemeister
5fe3a3c3fb [archive.org] Add extractor (Fixes #1003) 2013-07-08 02:05:02 +02:00
Philipp Hagemeister
b04621d155 release 2013.07.08 2013-07-08 01:29:16 +02:00
Philipp Hagemeister
b227060388 [arte] Always look for the JSON URL (Fixes #1002) 2013-07-08 01:28:19 +02:00
Philipp Hagemeister
d93e4dcbb7 Merge branch 'master' of github.com:rg3/youtube-dl 2013-07-08 01:15:19 +02:00
Philipp Hagemeister
73e79f2a1b [3sat] Add support (Fixes #1001) 2013-07-08 01:13:55 +02:00
Jaime Marquínez Ferrándiz
fc79158de2 VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info 2013-07-07 23:24:34 +02:00
Jaime Marquínez Ferrándiz
7763b04e5f YoutubeIE: extract the thumbnail in the best possible quality 2013-07-07 21:21:15 +02:00
Philipp Hagemeister
9d7b44b4cc release 2013.07.07.01 2013-07-07 17:13:56 +02:00
Philipp Hagemeister
897f36d179 [youtube:subscriptions] Use colon for differentiation of shortcuts 2013-07-07 17:13:26 +02:00
Philipp Hagemeister
94c3637f6d release 2013.07.07 2013-07-07 16:55:06 +02:00
Jaime Marquínez Ferrándiz
04cc96173c [youtube] Add an extractor for the subscriptions feed (closes #498)
It can be downloaded using the ytsubscriptions keyword.
It needs the login information.
2013-07-07 13:58:23 +02:00
Jaime Marquínez Ferrándiz
fbaaad49d7 Add BrightcoveIE (closes #832)
It only accepts the urls that are used for embedding the video; it doesn't search generic webpages to find Brightcove videos
2013-07-05 21:31:50 +02:00
Jaime Marquínez Ferrándiz
b29f3b250d DailymotionIE: extract thumbnail 2013-07-05 19:39:37 +02:00
Philipp Hagemeister
fa343954d4 release 2013.07.05 2013-07-05 14:46:24 +02:00
Jaime Marquínez Ferrándiz
2491f5898e DailymotionIE: simplify the extraction of the title and remove an unused assignment of video_uploader 2013-07-05 14:20:15 +02:00
Jaime Marquínez Ferrándiz
b27c856fbc Dailymotion: fix the download of the video in the max quality (closes #986) 2013-07-05 14:15:26 +02:00
Jaime Marquínez Ferrándiz
9941ceb331 ArteTVIE: support emission urls that don't contain the video id
Like http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
2013-07-05 12:56:41 +02:00
Philipp Hagemeister
c536d38059 release 2013.07.04 2013-07-04 18:07:34 +02:00
Philipp Hagemeister
8de64cac98 [arte] Fix language selection (Fixes #988) 2013-07-04 18:07:03 +02:00
Philipp Hagemeister
6d6d286539 Merge branch 'master' of github.com:rg3/youtube-dl 2013-07-03 16:36:42 +02:00
Philipp Hagemeister
5d2eac9eba [auengine] Add tests (Fixes #985) 2013-07-03 16:36:36 +02:00
Jaime Marquínez Ferrándiz
9826925a20 ArteTVIE: extract the video with the correct language
Some urls from the French version of the page could download the German version.

Also, instead of extracting the json url from the webpage, build it directly to skip the download
2013-07-02 17:34:40 +02:00
Jaime Marquínez Ferrándiz
24a267b562 TudouIE: extract all the segments of the video and download the best quality (closes #975)
Also simplify the extraction of the id from the url a bit and write the title for the test video directly
2013-07-02 12:38:24 +02:00
Jaime Marquínez Ferrándiz
d4da3d6116 BlipTVIE: download the video in the best quality (closes #215) 2013-07-02 10:40:23 +02:00
27 changed files with 794 additions and 165 deletions

View File

@@ -20,9 +20,9 @@ tests = [
     # 84
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
      "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
-    # 83
+    # 83 - vflcaqGO8 2013/07/11
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
-     "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"),
+     "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"),
     # 82
     ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
      "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),

View File

@@ -61,6 +61,17 @@ class TestAllURLsMatching(unittest.TestCase):
         else:
             self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))

+    def test_keywords(self):
+        ies = gen_extractors()
+        matching_ies = lambda url: [ie.IE_NAME for ie in ies
+                                    if ie.suitable(url) and ie.IE_NAME != 'generic']
+        self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+
 if __name__ == '__main__':
     unittest.main()

View File

@@ -4,6 +4,7 @@
 import sys
 import unittest
+import xml.etree.ElementTree

 # Allow direct execution
 import os
@@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML
 from youtube_dl.utils import orderedSet
 from youtube_dl.utils import DateRange
 from youtube_dl.utils import unified_strdate
+from youtube_dl.utils import find_xpath_attr

 if sys.version_info < (3, 0):
     _compat_str = lambda b: b.decode('unicode-escape')
@@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')

+    def test_find_xpath_attr(self):
+        testxml = u'''<root>
+            <node/>
+            <node x="a"/>
+            <node x="a" y="c" />
+            <node x="b" y="d" />
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+
+        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+
 if __name__ == '__main__':
     unittest.main()
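find_xpath_attr itself lives in youtube_dl/utils.py, whose hunk is not shown in this excerpt. Judging from the test above, it presumably behaves like this sketch (the implementation details are assumed, not quoted from the commit):

    def find_xpath_attr(node, xpath, key, val):
        # Return the first element matching xpath whose attribute `key` equals `val`, else None
        for e in node.findall(xpath):
            if e.attrib.get(key) == val:
                return e
        return None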

View File

@@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase):
     def test_83(self):
         wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
-        right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"
+        right = "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"
         self.assertEqual(sig(wrong), right)

     def test_82(self):
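The new length-83 transform appears in the youtube extractor hunk further down; it can be checked by hand against this test vector. A quick sketch (the name sig_83 is made up here):

    def sig_83(s):
        # reorders an 83-character signature into its 81-character decrypted form
        return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:]

    wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
    assert sig_83(wrong) == "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"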

View File

@@ -348,6 +348,7 @@ class YoutubeDL(object):
         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
         if result_type == 'video':
+            ie_result.update(extra_info)
             if 'playlist' not in ie_result:
                 # It isn't part of a playlist
                 ie_result['playlist'] = None

View File

@@ -1,15 +1,19 @@
+from .archiveorg import ArchiveOrgIE
 from .ard import ARDIE
 from .arte import ArteTvIE
 from .auengine import AUEngineIE
 from .bandcamp import BandcampIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
+from .brightcove import BrightcoveIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE
 from .depositfiles import DepositFilesIE
+from .dotsub import DotsubIE
+from .dreisat import DreiSatIE
+from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .escapist import EscapistIE
 from .facebook import FacebookIE
@@ -55,6 +59,7 @@ from .tumblr import TumblrIE
 from .tutv import TutvIE
 from .ustream import UstreamIE
 from .vbox7 import Vbox7IE
+from .veoh import VeohIE
 from .vevo import VevoIE
 from .vimeo import VimeoIE
 from .vine import VineIE
@@ -68,7 +73,15 @@ from .yahoo import YahooIE, YahooSearchIE
 from .youjizz import YouJizzIE
 from .youku import YoukuIE
 from .youporn import YouPornIE
-from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE
+from .youtube import (
+    YoutubeIE,
+    YoutubePlaylistIE,
+    YoutubeSearchIE,
+    YoutubeUserIE,
+    YoutubeChannelIE,
+    YoutubeShowIE,
+    YoutubeSubscriptionsIE,
+)
 from .zdf import ZDFIE

View File

@@ -0,0 +1,67 @@
import json
import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unified_strdate,
)


class ArchiveOrgIE(InfoExtractor):
    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
    _TEST = {
        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
        u'info_dict': {
            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
            u"upload_date": u"19681210",
            u"uploader": u"SRI International"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        json_url = url + (u'&' if u'?' in url else u'?') + u'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        title = data['metadata']['title'][0]
        description = data['metadata']['description'][0]
        uploader = data['metadata']['creator'][0]
        upload_date = unified_strdate(data['metadata']['date'][0])

        formats = [{
            'format': fdata['format'],
            'url': 'http://' + data['server'] + data['dir'] + fn,
            'file_size': int(fdata['size']),
        }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata['format']]
        formats.sort(key=lambda fdata: fdata['file_size'])

        info = {
            '_type': 'video',
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
        }
        thumbnail = data.get('misc', {}).get('image')
        if thumbnail:
            info['thumbnail'] = thumbnail

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return info
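For the test URL above, json_url resolves to archive.org's JSON metadata endpoint, e.g.:

    http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect?output=json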

View File

@@ -32,7 +32,7 @@ class ARDIE(InfoExtractor):
         # determine title and media streams from webpage
         html = self._download_webpage(url, video_id)
         title = re.search(self._TITLE, html).group('title')
-        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
         if not streams:
             assert '"fsk"' in html
             raise ExtractorError(u'This video is only available after 8:00 pm')

View File

@@ -1,12 +1,11 @@
 import re
 import json
+import xml.etree.ElementTree

 from .common import InfoExtractor
 from ..utils import (
-    # This is used by the not implemented extractLiveStream method
-    compat_urllib_parse,
     ExtractorError,
+    find_xpath_attr,
     unified_strdate,
 )
@@ -16,8 +15,8 @@ class ArteTvIE(InfoExtractor):
     www.arte.tv/guide, the extraction process is different for each one.
     The videos expire in 7 days, so we can't add tests.
     """
-    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
-    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
+    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
     _LIVE_URL = r'index-[0-9]+\.html$'

     IE_NAME = u'arte.tv'
@@ -27,6 +26,7 @@ class ArteTvIE(InfoExtractor):
         return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))

     # TODO implement Live Stream
+    # from ..utils import compat_urllib_parse
     # def extractLiveStream(self, url):
     #     video_lang = url.split('/')[-4]
     #     info = self.grep_webpage(
@@ -56,23 +56,24 @@ class ArteTvIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._EMISSION_URL, url)
         if mobj is not None:
-            name = mobj.group('name')
+            lang = mobj.group('lang')
             # This is not a real id, it can be for example AJT for the news
             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
             video_id = mobj.group('id')
-            return self._extract_emission(url, video_id)
+            return self._extract_emission(url, video_id, lang)

         mobj = re.match(self._VIDEOS_URL, url)
         if mobj is not None:
             id = mobj.group('id')
-            return self._extract_video(url, id)
+            lang = mobj.group('lang')
+            return self._extract_video(url, id, lang)

         if re.search(self._LIVE_URL, video_id) is not None:
             raise ExtractorError(u'Arte live streams are not yet supported, sorry')
             # self.extractLiveStream(url)
             # return

-    def _extract_emission(self, url, video_id):
+    def _extract_emission(self, url, video_id, lang):
         """Extract from www.arte.tv/guide"""
         webpage = self._download_webpage(url, video_id)
         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@@ -91,6 +92,16 @@ class ArteTvIE(InfoExtractor):
         }

         formats = player_info['VSR'].values()
+        def _match_lang(f):
+            # Return true if that format is in the language of the url
+            if lang == 'fr':
+                l = 'F'
+            elif lang == 'de':
+                l = 'A'
+            regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+            return any(re.match(r, f['versionCode']) for r in regexes)
+        # Some formats may not be in the same language as the url
+        formats = filter(_match_lang, formats)
         # We order the formats by quality
         formats = sorted(formats, key=lambda f: int(f['height']))
         # Pick the best quality
@@ -103,13 +114,15 @@ class ArteTvIE(InfoExtractor):
         return info_dict

-    def _extract_video(self, url, video_id):
+    def _extract_video(self, url, video_id, lang):
         """Extract from videos.arte.tv"""
-        config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
-        config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
-        config_xml = self._download_webpage(config_xml_url, video_id)
-        config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
-        config_xml = self._download_webpage(config_xml_url, video_id)
+        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
+        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
+        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
+        config_xml_url = config_node.attrib['ref']
+        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

         video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
         def _key(m):
View File

@@ -8,6 +8,14 @@ from ..utils import (
 )

 class AUEngineIE(InfoExtractor):
+    _TEST = {
+        u'url': u'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
+        u'file': u'lfvlytY6.mp4',
+        u'md5': u'48972bdbcf1a3a2f5533e62425b41d4f',
+        u'info_dict': {
+            u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
+        }
+    }
     _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'

     def _real_extract(self, url):

View File

@@ -27,7 +27,7 @@ class BlipTVIE(InfoExtractor):
     _TEST = {
         u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
         u'file': u'5779306.m4v',
-        u'md5': u'b2d849efcf7ee18917e4b4d9ff37cafe',
+        u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
         u'info_dict': {
             u"upload_date": u"20111205",
             u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",
@@ -103,7 +103,12 @@ class BlipTVIE(InfoExtractor):
                     data = json_data

                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
-                video_url = data['media']['url']
+                if 'additionalMedia' in data:
+                    formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
+                    best_format = formats[-1]
+                    video_url = best_format['url']
+                else:
+                    video_url = data['media']['url']
                 umobj = re.match(self._URL_EXT, video_url)
                 if umobj is None:
                     raise ValueError('Can not determine filename extension')
@@ -184,5 +189,5 @@ class BlipTVUserIE(InfoExtractor):
             pagenum += 1

         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
-        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
+        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
         return [self.playlist_result(url_entries, playlist_title = username)]

View File

@@ -0,0 +1,85 @@
import re
import json
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    compat_urllib_parse,
    find_xpath_attr,
)


class BrightcoveIE(InfoExtractor):
    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'

    # There is a test for Brightcove in GenericIE; that way we test both the download
    # and the detection of videos, and we don't have to find a URL that is always valid

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Build a Brightcove url from an xml string containing
        <object class="BrightcoveExperience">{params}</object>
        """
        object_doc = xml.etree.ElementTree.fromstring(object_str)
        assert u'BrightcoveExperience' in object_doc.attrib['class']
        params = {'flashID': object_doc.attrib['id'],
                  'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
                  }
        playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
        # Not all pages define this value
        if playerKey is not None:
            params['playerKey'] = playerKey.attrib['value']
        videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
        if videoPlayer is not None:
            params['@videoPlayer'] = videoPlayer.attrib['value']
        data = compat_urllib_parse.urlencode(params)
        return cls._FEDERATED_URL_TEMPLATE % data

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = mobj.group('query')
        m_video_id = re.search(r'videoPlayer=(\d+)', query)
        if m_video_id is not None:
            video_id = m_video_id.group(1)
            return self._get_video_info(video_id, query)
        else:
            player_key = self._search_regex(r'playerKey=(.+?)(&|$)', query, 'playlist_id')
            return self._get_playlist_info(player_key)

    def _get_video_info(self, video_id, query):
        request_url = self._FEDERATED_URL_TEMPLATE % query
        webpage = self._download_webpage(request_url, video_id)

        self.report_extraction(video_id)
        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        info = json.loads(info)['data']
        video_info = info['programmedContent']['videoPlayer']['mediaDTO']

        return self._extract_video_info(video_info)

    def _get_playlist_info(self, player_key):
        playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
                                               player_key, u'Downloading playlist information')

        playlist_info = json.loads(playlist_info)['videoList']
        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]

        return self.playlist_result(videos, playlist_id=playlist_info['id'],
                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])

    def _extract_video_info(self, video_info):
        renditions = video_info['renditions']
        renditions = sorted(renditions, key=lambda r: r['size'])
        best_format = renditions[-1]

        return {'id': video_info['id'],
                'title': video_info['displayName'],
                'url': best_format['defaultURL'],
                'ext': 'mp4',
                'description': video_info.get('shortDescription'),
                'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
                'uploader': video_info.get('publisherName'),
                }
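For illustration, this is roughly how GenericIE (further down) hands an embed block to _build_brighcove_url; every value below is invented except the video id, which comes from the 8tv test in the generic extractor:

    object_str = '''<object id="myExperience" class="BrightcoveExperience">
      <param name="playerID" value="1234567890" />
      <param name="playerKey" value="AQ~~,SAMPLE_KEY" />
      <param name="@videoPlayer" value="2371591881001" />
    </object>'''
    url = BrightcoveIE._build_brighcove_url(object_str)
    # -> http://c.brightcove.com/services/viewer/htmlFederated?flashID=myExperience&playerID=1234567890&... (parameter order may vary)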

View File

@@ -3,6 +3,7 @@
 import os
 import re
 import socket
 import sys
+import netrc

 from ..utils import (
     compat_http_client,
@@ -36,6 +37,8 @@ class InfoExtractor(object):
     The following fields are optional:

     format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
     thumbnail:      Full URL to a video thumbnail image.
     description:    One-line video description.
     uploader:       Full name of the video uploader.
@@ -161,12 +164,11 @@ class InfoExtractor(object):
         """Report attempt to confirm age."""
         self.to_screen(u'Confirming age')

+    def report_login(self):
+        """Report attempt to log in."""
+        self.to_screen(u'Logging in')
+
     #Methods for following #608
-    #They set the correct value of the '_type' key
-    def video_result(self, video_info):
-        """Returns a video"""
-        video_info['_type'] = 'video'
-        return video_info
     def url_result(self, url, ie=None):
         """Returns a url that points to a page that should be processed"""
         #TODO: ie should be the class used for getting the info
@@ -225,6 +227,36 @@ class InfoExtractor(object):
         else:
             return res

+    def _get_login_info(self):
+        """
+        Get the login info as (username, password)
+        It will look in the netrc file using the _NETRC_MACHINE value
+        If there's no info available, return (None, None)
+        """
+        if self._downloader is None:
+            return (None, None)
+
+        username = None
+        password = None
+        downloader_params = self._downloader.params
+
+        # Attempt to use provided username and password or .netrc data
+        if downloader_params.get('username', None) is not None:
+            username = downloader_params['username']
+            password = downloader_params['password']
+        elif downloader_params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+
+        return (username, password)
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
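As a usage note: when --netrc (usenetrc) is given, _get_login_info reads a standard ~/.netrc entry keyed by the extractor's _NETRC_MACHINE, e.g. for the Vimeo extractor further down (credentials invented):

    machine vimeo login you@example.com password hunter2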

View File

@@ -1,12 +1,11 @@
 import re
+import json

 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_request,
-    compat_urllib_parse,
     ExtractorError,
-    unescapeHTML,
 )

 class DailymotionIE(InfoExtractor):
@@ -39,33 +38,10 @@ class DailymotionIE(InfoExtractor):
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)

-        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        flashvars = compat_urllib_parse.unquote(mobj.group(1))
-
-        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
-            if key in flashvars:
-                max_quality = key
-                self.to_screen(u'Using %s' % key)
-                break
-        else:
-            raise ExtractorError(u'Unable to extract video URL')
-
-        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-
-        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
-
-        # TODO: support choosing qualities
-
-        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
-
-        video_uploader = None
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
+                                              webpage, 'title')
+
         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                              # Looking for official user
                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,6 +52,25 @@ class DailymotionIE(InfoExtractor):
         if mobj is not None:
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

+        embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
+        embed_page = self._download_webpage(embed_url, video_id,
+                                            u'Downloading embed page')
+        info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info')
+        info = json.loads(info)
+
+        # TODO: support choosing qualities
+        for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
+                    'stream_h264_hq_url', 'stream_h264_url',
+                    'stream_h264_ld_url']:
+            if info.get(key):
+                max_quality = key
+                self.to_screen(u'Using %s' % key)
+                break
+        else:
+            raise ExtractorError(u'Unable to extract video URL')
+        video_url = info[max_quality]
+
         return [{
             'id': video_id,
             'url': video_url,
@@ -83,4 +78,5 @@ class DailymotionIE(InfoExtractor):
             'upload_date': video_upload_date,
             'title': video_title,
             'ext': video_extension,
+            'thumbnail': info['thumbnail_url']
         }]
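To make the quality loop above concrete, here is an invented embed-page info dict; the loop walks the keys from best to worst and stops at the first one with a non-empty value:

    info = {'stream_h264_hd1080_url': None,
            'stream_h264_hd_url': None,
            'stream_h264_hq_url': 'http://www.dailymotion.com/cdn/H264-1280x720/video/x123.mp4',
            'thumbnail_url': 'http://s1.dmcdn.net/ABC/x240-xyz.jpg'}
    # hd1080 and hd are falsy, so max_quality becomes 'stream_h264_hq_url'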

View File

@@ -0,0 +1,41 @@
import re
import json
import time

from .common import InfoExtractor


class DotsubIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
    _TEST = {
        u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
        u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
        u'md5': u'0914d4d69605090f623b7ac329fea66e',
        u'info_dict': {
            u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
            u"uploader": u"4v4l0n42",
            u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
            u'upload_date': u'20101213',
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group(1)
        info_url = "https://dotsub.com/api/media/%s/metadata" % video_id
        webpage = self._download_webpage(info_url, video_id)
        info = json.loads(webpage)
        date = time.gmtime(info['dateCreated']/1000)  # The timestamp is in milliseconds

        return [{
            'id': video_id,
            'url': info['mediaURI'],
            'ext': 'flv',
            'title': info['title'],
            'thumbnail': info['screenshotURI'],
            'description': info['description'],
            'uploader': info['user'],
            'view_count': info['numberOfViews'],
            'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
        }]

View File

@@ -0,0 +1,85 @@
# coding: utf-8

import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unified_strdate,
)


class DreiSatIE(InfoExtractor):
    IE_NAME = '3sat'
    _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
    _TEST = {
        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
        u'file': u'36983.webm',
        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
        u'info_dict': {
            u"title": u"Kaffeeland Schweiz",
            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
            u"uploader": u"3sat",
            u"upload_date": u"20130622"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))

        thumbnail_els = details_doc.findall('.//teaserimage')
        thumbnails = [{
            'width': te.attrib['key'].partition('x')[0],
            'height': te.attrib['key'].partition('x')[2],
            'url': te.text,
        } for te in thumbnail_els]

        information_el = details_doc.find('.//information')
        video_title = information_el.find('./title').text
        video_description = information_el.find('./detail').text

        details_el = details_doc.find('.//details')
        video_uploader = details_el.find('./channel').text
        upload_date = unified_strdate(details_el.find('./airtime').text)

        format_els = details_doc.findall('.//formitaet')
        formats = [{
            'format_id': fe.attrib['basetype'],
            'width': int(fe.find('./width').text),
            'height': int(fe.find('./height').text),
            'url': fe.find('./url').text,
            'filesize': int(fe.find('./filesize').text),
            'video_bitrate': int(fe.find('./videoBitrate').text),
            '3sat_qualityname': fe.find('./quality').text,
        } for fe in format_els
            if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]

        def _sortkey(format):
            qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
            prefer_http = 1 if 'rtmp' in format['url'] else 0
            return (qidx, prefer_http, format['video_bitrate'])
        formats.sort(key=_sortkey)

        info = {
            '_type': 'video',
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'description': video_description,
            'thumbnails': thumbnails,
            'thumbnail': thumbnails[-1]['url'],
            'uploader': video_uploader,
            'upload_date': upload_date,
        }

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return info
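A small worked example of _sortkey's tuple ordering (values invented): the quality index dominates, so after the ascending sort the last element is the best format, which the fallback 'url'/'ext' fields then use:

    fmt_low  = {'3sat_qualityname': 'low',      'url': 'http://example/a', 'video_bitrate': 400000}
    fmt_best = {'3sat_qualityname': 'veryhigh', 'url': 'http://example/b', 'video_bitrate': 1500000}
    # _sortkey(fmt_low)  == (0, 0, 400000)
    # _sortkey(fmt_best) == (3, 0, 1500000)
    # sorted order: [fmt_low, fmt_best]; formats[-1] is fmt_best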

View File

@@ -0,0 +1,51 @@
import re

from ..utils import (
    compat_urllib_parse,
    determine_ext
)
from .common import InfoExtractor


class EHowIE(InfoExtractor):
    IE_NAME = u'eHow'
    _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
    _TEST = {
        u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
        u'file': u'12245069.flv',
        u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
        u'info_dict': {
            u"title": u"Hardwood Flooring Basics",
            u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
            u"uploader": u"Erick Nathan"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
                                       webpage, u'video URL')
        final_url = compat_urllib_parse.unquote(video_url)
        thumbnail_url = self._search_regex(r'<meta property="og:image" content="(.+?)" />',
                                           webpage, u'thumbnail URL')
        uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
                                      webpage, u'uploader')
        title = self._search_regex(r'<meta property="og:title" content="(.+?)" />',
                                   webpage, u'Video title').replace(' | eHow', '')
        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
                                         webpage, u'video description')
        ext = determine_ext(final_url)

        return {
            '_type': 'video',
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
            'uploader': uploader,
        }

View File

@@ -4,14 +4,15 @@ import xml.etree.ElementTree
 from .common import InfoExtractor
 from ..utils import (
     unified_strdate,
+    compat_urllib_parse,
 )

 class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
+    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
     _TEST = {
         u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
         u"file": u"6410818.mp4",
-        u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
+        u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
         u"info_dict": {
             u"title": u"Arma III - Community Guide: SITREP I",
             u"upload_date": u"20130627",
@@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(3).split("-")[-1]
-        info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
+        page_id = mobj.group('page_id')
+        webpage = self._download_webpage(url, page_id)
+        video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
+                                            r'http://www\.gamespot\.com/videoembed/(\d+)'],
+                                           webpage, 'video id')
+        data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
+        info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
         info_xml = self._download_webpage(info_url, video_id)
         doc = xml.etree.ElementTree.fromstring(info_xml)
         clip_el = doc.find('./playList/clip')

-        video_url = clip_el.find('./URI').text
+        http_urls = [{'url': node.find('filePath').text,
+                      'rate': int(node.find('rate').text)}
+                     for node in clip_el.find('./httpURI')]
+        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+        video_url = best_quality['url']
         title = clip_el.find('./title').text
         ext = video_url.rpartition('.')[2]
         thumbnail_url = clip_el.find('./screenGrabURI').text

View File

@@ -1,4 +1,5 @@
 import re
+import xml.etree.ElementTree

 from .common import InfoExtractor
 from ..utils import (
@@ -11,7 +12,7 @@ class GametrailersIE(InfoExtractor):
     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
     _TEST = {
         u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
-        u'file': u'zbvr8i.flv',
+        u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.flv',
         u'md5': u'c3edbc995ab4081976e16779bd96a878',
         u'info_dict': {
             u"title": u"E3 2013: Debut Trailer"
@@ -24,45 +25,39 @@ class GametrailersIE(InfoExtractor):
         if mobj is None:
             raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('id')
-        video_type = mobj.group('type')
         webpage = self._download_webpage(url, video_id)
-        if video_type == 'full-episodes':
-            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
-        else:
-            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
-        mgid = self._search_regex(mgid_re, webpage, u'mgid')
+        mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
+                                   r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
+                                  webpage, u'mgid')
         data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

         info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                            video_id, u'Downloading video info')
-        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
-                                               video_id, u'Downloading video urls info')
-        self.report_extraction(video_id)
-        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
-                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
-                      <image>.*
-                      <url>(?P<thumb>.*?)</url>.*
-                      </image>'''
-        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
-        if m_info is None:
-            raise ExtractorError(u'Unable to extract video info')
-        video_title = m_info.group('title')
-        video_description = m_info.group('description')
-        video_thumb = m_info.group('thumb')
-        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
-        if m_urls is None or len(m_urls) == 0:
-            raise ExtractorError(u'Unable to extract video url')
-        # They are sorted from worst to best quality
-        video_url = m_urls[-1].group('url')
-        return {'url': video_url,
-                'id': video_id,
-                'title': video_title,
-                # Videos are actually flv not mp4
-                'ext': 'flv',
-                'thumbnail': video_thumb,
-                'description': video_description,
-                }
+        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
+        default_thumb = doc.find('./channel/image/url').text
+
+        media_namespace = {'media': 'http://search.yahoo.com/mrss/'}
+        parts = [{
+            'title': video_doc.find('title').text,
+            # Videos are actually flv not mp4
+            'ext': 'flv',
+            'id': video_doc.find('guid').text.rpartition(':')[2],
+            'url': self._get_video_url(video_doc.find('media:group/media:content', media_namespace).attrib['url'], video_id),
+            # The thumbnail may not be defined, it would be ''
+            'thumbnail': video_doc.find('media:group/media:thumbnail', media_namespace).attrib['url'] or default_thumb,
+            'description': video_doc.find('description').text,
+        } for video_doc in doc.findall('./channel/item')]
+        return parts
+
+    def _get_video_url(self, mediagen_url, video_id):
+        if 'acceptMethods' not in mediagen_url:
+            mediagen_url += '&acceptMethods=fms'
+        links_webpage = self._download_webpage(mediagen_url,
+                                               video_id, u'Downloading video urls info')
+        doc = xml.etree.ElementTree.fromstring(links_webpage)
+        urls = list(doc.iter('src'))
+        if len(urls) == 0:
+            raise ExtractorError(u'Unable to extract video url')
+        # They are sorted from worst to best quality
+        return urls[-1].text

View File

@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 import os
 import re
@@ -9,20 +11,34 @@ from ..utils import (
     ExtractorError,
 )
+from .brightcove import BrightcoveIE

 class GenericIE(InfoExtractor):
     IE_DESC = u'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = u'generic'
-    _TEST = {
-        u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
-        u'file': u'13601338388002.mp4',
-        u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
-        u'info_dict': {
-            u"uploader": u"www.hodiho.fr",
-            u"title": u"R\u00e9gis plante sa Jeep"
-        }
-    }
+    _TESTS = [
+        {
+            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+            u'file': u'13601338388002.mp4',
+            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'info_dict': {
+                u"uploader": u"www.hodiho.fr",
+                u"title": u"R\u00e9gis plante sa Jeep"
+            }
+        },
+        {
+            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+            u'file': u'2371591881001.mp4',
+            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'note': u'Test Brightcove downloads and detection in GenericIE',
+            u'info_dict': {
+                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+                u'uploader': u'8TV',
+                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+            }
+        },
+    ]

     def report_download_webpage(self, video_id):
         """Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
             raise ExtractorError(u'Invalid URL: %s' % url)

         self.report_extraction(video_id)
+
+        # Look for Brightcove:
+        m_brightcove = re.search(r'<object.+?class=".*?BrightcoveExperience.*?".+?</object>', webpage, re.DOTALL)
+        if m_brightcove is not None:
+            self.to_screen(u'Brightcove video detected.')
+            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+            return self.url_result(bc_url, 'Brightcove')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:

View File

@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
             u"file": u"81300.flv",
             u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
             u"info_dict": {
-                u"title": u"Terraria 1.1 Trailer"
+                u"title": u"Terraria 1.1 Trailer",
+                u'playlist_index': 1,
             }
         },
         {
             u"file": u"80859.flv",
             u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
             u"info_dict": {
-                u"title": u"Terraria Trailer"
+                u"title": u"Terraria Trailer",
+                u'playlist_index': 2,
             }
         }
     ]

View File

@@ -1,24 +1,34 @@
+# coding: utf-8
+
 import re
+import json

 from .common import InfoExtractor

 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)'
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
     _TEST = {
         u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        u'file': u'159447792.f4v',
-        u'md5': u'ad7c358a01541e926a1e413612c6b10a',
+        u'file': u'159448201.f4v',
+        u'md5': u'140a49ed444bd22f93330985d8475fcb',
         u'info_dict': {
-            u"title": u"\u5361\u9a6c\u4e54\u56fd\u8db3\u5f00\u5927\u811a\u957f\u4f20\u51b2\u540a\u96c6\u9526"
+            u"title": u"卡马乔国足开大脚长传冲吊集锦"
         }
     }

+    def _url_for_id(self, id, quality=None):
+        info_url = "http://v2.tudou.com/f?id=" + str(id)
+        if quality:
+            info_url += '&hd' + quality
+        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
+        final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url')
+        return final_url
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(2).replace('.html','')
+        video_id = mobj.group(2)
         webpage = self._download_webpage(url, video_id)
-        video_id = re.search('"k":(.+?),',webpage).group(1)
         title = re.search(",kw:\"(.+)\"",webpage)
         if title is None:
             title = re.search(",kw: \'(.+)\'",webpage)
@@ -27,14 +37,27 @@ class TudouIE(InfoExtractor):
         if thumbnail_url is None:
             thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
         thumbnail_url = thumbnail_url.group(1)
-        info_url = "http://v2.tudou.com/f?id="+str(video_id)
-        webpage = self._download_webpage(info_url, video_id, "Opening the info webpage")
-        final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1)
-        ext = (final_url.split('?')[0]).split('.')[-1]
-        return [{
-            'id': video_id,
-            'url': final_url,
-            'ext': ext,
-            'title': title,
-            'thumbnail': thumbnail_url,
-        }]
+
+        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
+        segments = json.loads(segs_json)
+        # It looks like the keys are the arguments that have to be passed as
+        # the hd field in the request url, we pick the higher
+        quality = sorted(segments.keys())[-1]
+        parts = segments[quality]
+        result = []
+        len_parts = len(parts)
+        if len_parts > 1:
+            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
+        for part in parts:
+            part_id = part['k']
+            final_url = self._url_for_id(part_id, quality)
+            ext = (final_url.split('?')[0]).split('.')[-1]
+            part_info = {'id': part_id,
+                         'url': final_url,
+                         'ext': ext,
+                         'title': title,
+                         'thumbnail': thumbnail_url,
+                         }
+            result.append(part_info)
+        return result
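The comment about the segs keys is easier to see with an invented value of the segs JSON: each key is an hd level, and each part carries its own 'k' id:

    segs = '{"1": [{"k": 159448201}], "3": [{"k": 159448202}]}'
    segments = json.loads(segs)
    quality = sorted(segments.keys())[-1]    # '3', the highest hd value
    # each part's 'k' is then passed to _url_for_id(part_id, quality),
    # which requests http://v2.tudou.com/f?id=159448202&hd3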

View File

@@ -0,0 +1,47 @@
import re
import json

from .common import InfoExtractor
from ..utils import (
    determine_ext,
)


class VeohIE(InfoExtractor):
    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'

    _TEST = {
        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
        u'file': u'56314296.mp4',
        u'md5': u'620e68e6a3cff80086df3348426c9ca3',
        u'info_dict': {
            u'title': u'Straight Backs Are Stronger',
            u'uploader': u'LUMOback',
            u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
        if m_youtube is not None:
            youtube_id = m_youtube.group(1)
            self.to_screen(u'%s: detected Youtube video.' % video_id)
            return self.url_result(youtube_id, 'Youtube')

        self.report_extraction(video_id)
        info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
        info = json.loads(info)
        video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')

        return {'id': info['videoId'],
                'title': info['title'],
                'ext': determine_ext(video_url),
                'url': video_url,
                'uploader': info['username'],
                'thumbnail': info.get('highResImage') or info.get('medResImage'),
                'description': info['description'],
                'view_count': info['views'],
                }

View File

@@ -17,6 +17,7 @@ class VimeoIE(InfoExtractor):
     # _VALID_URL matches Vimeo URLs
     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
+    _NETRC_MACHINE = 'vimeo'
     IE_NAME = u'vimeo'
     _TEST = {
         u'url': u'http://vimeo.com/56015672',
@@ -31,6 +32,25 @@ class VimeoIE(InfoExtractor):
         }
     }

+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        login_url = 'https://vimeo.com/log_in'
+        webpage = self._download_webpage(login_url, None, False)
+        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+        data = compat_urllib_parse.urlencode({'email': username,
+                                              'password': password,
+                                              'action': 'login',
+                                              'service': 'vimeo',
+                                              'token': token,
+                                              })
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        self._download_webpage(login_request, None, False, u'Wrong login info')
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
@@ -50,6 +70,9 @@ class VimeoIE(InfoExtractor):
                                u'Verifying the password',
                                u'Wrong password')

+    def _real_initialize(self):
+        self._login()
+
     def _real_extract(self, url, new_video=True):
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
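With this in place, Vimeo authentication can be exercised from the command line; the credentials here are invented:

    youtube-dl --username you@example.com --password hunter2 http://vimeo.com/56015672
    # or, with a 'machine vimeo' entry in ~/.netrc:
    youtube-dl -n http://vimeo.com/56015672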

View File

@@ -4,6 +4,7 @@ import json
 import netrc
 import re
 import socket
+import itertools

 from .common import InfoExtractor, SearchInfoExtractor
 from ..utils import (
@@ -19,6 +20,7 @@ from ..utils import (
     ExtractorError,
     unescapeHTML,
     unified_strdate,
+    orderedSet,
 )
@@ -115,24 +117,32 @@ class YoutubeIE(InfoExtractor):
                 u"uploader": u"IconaPop",
                 u"uploader_id": u"IconaPop"
             }
-        }
+        },
+        {
+            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
+            u"file": u"07FYdnEawAQ.mp4",
+            u"note": u"Test VEVO video with age protection (#956)",
+            u"info_dict": {
+                u"upload_date": u"20130703",
+                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
+                u"description": u"md5:64249768eec3bc4276236606ea996373",
+                u"uploader": u"justintimberlakeVEVO",
+                u"uploader_id": u"justintimberlakeVEVO"
+            }
+        },
     ]

     @classmethod
     def suitable(cls, url):
         """Receives a URL and returns True if suitable for this IE."""
-        if YoutubePlaylistIE.suitable(url): return False
+        if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

     def report_lang(self):
         """Report attempt to set language."""
         self.to_screen(u'Setting language')

-    def report_login(self):
-        """Report attempt to log in."""
-        self.to_screen(u'Logging in')
-
     def report_video_webpage_download(self, video_id):
         """Report attempt to download video webpage."""
         self.to_screen(u'%s: Downloading video webpage' % video_id)
@@ -180,7 +190,7 @@ class YoutubeIE(InfoExtractor):
         elif len(s) == 84:
             return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
         elif len(s) == 83:
-            return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
+            return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:]
         elif len(s) == 82:
             return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
@@ -294,26 +304,6 @@ class YoutubeIE(InfoExtractor):
         if self._downloader is None:
             return

-        username = None
-        password = None
-        downloader_params = self._downloader.params
-
-        # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username', None) is not None:
-            username = downloader_params['username']
-            password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
-                return
-
         # Set language
         request = compat_urllib_request.Request(self._LANG_URL)
         try:
@@ -323,6 +313,8 @@ class YoutubeIE(InfoExtractor):
             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
             return

+        (username, password) = self._get_login_info()
+
         # No authentication to be performed
         if username is None:
             return
@@ -430,15 +422,35 @@ class YoutubeIE(InfoExtractor):
         # Get video info
         self.report_video_info_webpage_download(video_id)
-        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                    % (video_id, el_type))
-            video_info_webpage = self._download_webpage(video_info_url, video_id,
-                                    note=False,
-                                    errnote='unable to download video info webpage')
-            video_info = compat_parse_qs(video_info_webpage)
-            if 'token' in video_info:
-                break
+        if re.search(r'player-age-gate-content">', video_webpage) is not None:
+            self.report_age_confirmation()
+            age_gate = True
+            # We simulate the access to the video from www.youtube.com/v/{video_id}
+            # this can be viewed without login into Youtube
+            data = compat_urllib_parse.urlencode({'video_id': video_id,
+                                                  'el': 'embedded',
+                                                  'gl': 'US',
+                                                  'hl': 'en',
+                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                                                  'asv': 3,
+                                                  'sts':'1588',
+                                                  })
+            video_info_url = 'https://www.youtube.com/get_video_info?' + data
+            video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                    note=False,
+                                    errnote='unable to download video info webpage')
+            video_info = compat_parse_qs(video_info_webpage)
+        else:
+            age_gate = False
+            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+                        % (video_id, el_type))
+                video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                        note=False,
+                                        errnote='unable to download video info webpage')
+                video_info = compat_parse_qs(video_info_webpage)
+                if 'token' in video_info:
+                    break

         if 'token' not in video_info:
             if 'reason' in video_info:
                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
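
For reference, `get_video_info` answers with a URL-encoded query string, which is why the response goes through `compat_parse_qs` and why the presence of a `token` key marks success. A sketch with an invented, heavily truncated response (real responses carry many more fields):

    try:
        from urllib.parse import parse_qs  # Python 3
    except ImportError:
        from urlparse import parse_qs      # Python 2

    # Invented, truncated get_video_info response (illustrative only).
    sample = 'status=ok&token=abc123&title=Tunnel+Vision&thumbnail_url=...'
    video_info = parse_qs(sample)
    if 'token' in video_info:
        print(video_info['title'][0])  # Tunnel Vision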
@@ -471,7 +483,12 @@ class YoutubeIE(InfoExtractor):
         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

         # thumbnail image
-        if 'thumbnail_url' not in video_info:
+        # We try first to get a high quality image:
+        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                            video_webpage, re.DOTALL)
+        if m_thumb is not None:
+            video_thumbnail = m_thumb.group(1)
+        elif 'thumbnail_url' not in video_info:
             self._downloader.report_warning(u'unable to extract video thumbnail')
             video_thumbnail = ''
         else:   # don't panic if we can't find it
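
The new high-quality lookup scrapes the schema.org microdata on the watch page before falling back to `thumbnail_url`. Against an invented fragment of markup (the href sits in a nested tag on real pages, which the lazy `re.DOTALL` match tolerates):

    import re

    # Invented fragment of a watch page (illustrative only).
    webpage = ('<span itemprop="thumbnail">\n'
               '  <link itemprop="url" href="https://i.ytimg.com/vi/x/maxresdefault.jpg">\n'
               '</span>')
    m = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', webpage, re.DOTALL)
    print(m.group(1))  # https://i.ytimg.com/vi/x/maxresdefault.jpg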
@@ -550,6 +567,8 @@ class YoutubeIE(InfoExtractor):
             self.report_rtmp_download()
             video_url_list = [(None, video_info['conn'][0])]
         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
             url_map = {}
             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                 url_data = compat_parse_qs(url_data_str)
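
The literal `'rtmpe%3Dyes'` looks odd until you remember the stream map is itself one URL-encoded value, so the inner `conn=rtmpe=yes` pair arrives with its `=` escaped as `%3D`. A quick illustration, with invented field values:

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode        # Python 2

    # An rtmpe-protected entry as it would be serialized (values invented).
    entry = urlencode({'conn': 'rtmpe=yes,cdn=...', 'itag': '35'})
    print('rtmpe%3Dyes' in entry)  # True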
@@ -560,9 +579,15 @@ class YoutubeIE(InfoExtractor):
                 elif 's' in url_data:
                     if self._downloader.params.get('verbose'):
                         s = url_data['s'][0]
-                        player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
-                            'html5 player', fatal=False)
-                        self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
+                        if age_gate:
+                            player_version = self._search_regex(r'ad3-(.+?)\.swf',
+                                video_info['ad3_module'][0], 'flash player',
+                                fatal=False)
+                            player = 'flash player %s' % player_version
+                        else:
+                            player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
+                                'html5 player', fatal=False)
+                        self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
                             (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
                     signature = self._decrypt_signature(url_data['s'][0])
                     url += '&signature=' + signature
@@ -690,7 +715,7 @@ class YoutubePlaylistIE(InfoExtractor):
         videos = [v[1] for v in sorted(videos)]

-        url_results = [self.url_result(url, 'Youtube') for url in videos]
+        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
         return [self.playlist_result(url_results, playlist_id, playlist_title)]
@@ -748,7 +773,7 @@ class YoutubeChannelIE(InfoExtractor):
         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-        url_entries = [self.url_result(url, 'Youtube') for url in urls]
+        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
         return [self.playlist_result(url_entries, channel_id)]
@@ -805,7 +830,7 @@ class YoutubeUserIE(InfoExtractor):
             pagenum += 1

         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        url_results = [self.url_result(url, 'Youtube') for url in urls]
+        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
         return [self.playlist_result(url_results, playlist_title = username)]

 class YoutubeSearchIE(SearchInfoExtractor):
@@ -864,3 +889,40 @@ class YoutubeShowIE(InfoExtractor):
         m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
         self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
         return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+
+
+class YoutubeSubscriptionsIE(YoutubeIE):
+    """It's a subclass of YoutubeIE because we need to login"""
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    IE_NAME = u'youtube:subscriptions'
+    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
+    _PAGING_STEP = 30
+
+    # Overwrite YoutubeIE properties we don't want
+    _TESTS = []
+    @classmethod
+    def suitable(cls, url):
+        return re.match(cls._VALID_URL, url) is not None
+
+    def _real_initialize(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
+        super(YoutubeSubscriptionsIE, self)._real_initialize()
+
+    def _real_extract(self, url):
+        feed_entries = []
+        # The step argument is available only in 2.7 or higher
+        for i in itertools.count(0):
+            paging = i*self._PAGING_STEP
+            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
+                                          u'Downloading page %s' % i)
+            info = json.loads(info)
+            feed_html = info['feed_html']
+            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
+            ids = orderedSet(m.group(1) for m in m_ids)
+            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            if info['paging'] is None:
+                break
+        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
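
Two details worth calling out in `_real_extract`: the same video id can appear several times in one feed page, which is why the ids pass through `orderedSet` (dedup that keeps first-seen order), and pagination simply runs `itertools.count` until the JSON answer's `paging` field is null. A sketch of the dedup half, with a hand-written stand-in for `orderedSet` and invented feed HTML:

    import re

    def ordered_set(iterable):
        # Stand-in for youtube_dl.utils.orderedSet: dedup, preserving order.
        seen = set()
        return [x for x in iterable if not (x in seen or seen.add(x))]

    # Invented feed_html fragment (illustrative only).
    feed_html = '"/watch?v=abc" ... "/watch?v=abc" ... "/watch?v=xyz"'
    ids = ordered_set(m.group(1) for m in re.finditer(r'"/watch\?v=(.*?)"', feed_html))
    print(ids)  # ['abc', 'xyz']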

View File

@@ -198,6 +198,20 @@ else:
         with open(fn, 'w', encoding='utf-8') as f:
             json.dump(obj, f)

+if sys.version_info >= (2,7):
+    def find_xpath_attr(node, xpath, key, val):
+        """ Find the xpath xpath[@key=val] """
+        assert re.match(r'^[a-zA-Z]+$', key)
+        assert re.match(r'^[a-zA-Z@]*$', val)
+        expr = xpath + u"[@%s='%s']" % (key, val)
+        return node.find(expr)
+else:
+    def find_xpath_attr(node, xpath, key, val):
+        for f in node.findall(xpath):
+            if f.attrib.get(key) == val:
+                return f
+        return None
+
 def htmlentity_transform(matchobj):
     """Transforms an HTML entity to a character.
@@ -623,7 +637,7 @@ def unified_strdate(date_str):
     date_str = date_str.replace(',',' ')
     # %z (UTC offset) is only supported in python>=3.2
     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
     for expression in format_expressions:
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -631,6 +645,13 @@ def unified_strdate(date_str):
             pass
     return upload_date

+def determine_ext(url):
+    guess = url.partition(u'?')[0].rpartition(u'.')[2]
+    if re.match(r'^[A-Za-z0-9]+$', guess):
+        return guess
+    else:
+        return u'unknown_video'
+
 def date_from_str(date_str):
     """
     Return a datetime object from a string in the format YYYYMMDD or
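
Quick checks for both utils changes, assuming the youtube_dl package is importable: the new '%d.%m.%Y %H:%M' expression covers German-style timestamps, and determine_ext strips the query string before guessing, returning u'unknown_video' when the guess is not a plausible extension:

    from youtube_dl.utils import determine_ext, unified_strdate

    print(unified_strdate('11.07.2013 20:00'))                   # 20130711
    print(determine_ext('http://example.com/v/clip.mp4?sig=x'))  # mp4
    print(determine_ext('http://example.com/play?id=5'))         # unknown_video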

View File

@@ -1,2 +1,2 @@
-__version__ = '2013.07.02'
+__version__ = '2013.07.11'