Compare commits


49 Commits

Author SHA1 Message Date
Philipp Hagemeister
ffd1833b87 release 2013.07.17 2013-07-17 01:14:38 +02:00
Philipp Hagemeister
896d5b63e8 [metacafe] Add support for AnyClip videos (#1059) 2013-07-17 01:14:30 +02:00
Jaime Marquínez Ferrándiz
67de24e449 [freesound] Minor improvements 2013-07-15 21:33:45 +02:00
Jaime Marquínez Ferrándiz
66400c470c Merge pull request #1050 from yasoob/master
Added an IE and test for Freesound.org.
2013-07-15 21:06:51 +02:00
M.Yasoob Khalid
7665010267 added test for freesound.org 2013-07-15 20:17:09 +05:00
M.Yasoob Khalid
5d9b75051a Added an IE for freesound.org 2013-07-15 20:16:44 +05:00
Jaime Marquínez Ferrándiz
ab2f744b90 GametrailersIE: make it a subclass of MTVIE to reuse most of the extraction process 2013-07-14 14:29:15 +02:00
Jaime Marquínez Ferrándiz
300fcad8a6 MTVIE: fix xml tags in the media namespace (python2.6) 2013-07-14 14:02:04 +02:00
Jaime Marquínez Ferrándiz
f7e025958a [mtv]: rework MTVIE and add tests (closes #913)
It uses the same system as ComedyCentralIE to transform rtmp urls into http.
2013-07-14 13:41:46 +02:00
Jaime Marquínez Ferrándiz
0ab5531363 [livestream] fix import statement 2013-07-14 09:25:51 +02:00
Jaime Marquínez Ferrándiz
b4444d5ca2 Add LivestreamIE (closes #1042) 2013-07-13 23:58:04 +02:00
Philipp Hagemeister
b9d3e1635f Strip hash info from URL when making requests (Fixes #1038) 2013-07-13 22:52:12 +02:00
Philipp Hagemeister
aa6b734e02 [instagram] really fix uploader_id detection (Fixes #1038) 2013-07-13 21:45:33 +02:00
Philipp Hagemeister
73b57f0ccb [instagram] fix uploader_id detection (Fixes #1038) 2013-07-13 20:40:04 +02:00
Philipp Hagemeister
3c4e6d8337 Improve OpenGraph property matching 2013-07-13 20:39:47 +02:00
Philipp Hagemeister
36034aecc2 Merge remote-tracking branch 'jaimeMF/opengraph' 2013-07-13 20:33:23 +02:00
Jaime Marquínez Ferrándiz
ffca4b5c32 Add CanalplusIE (closes #59 and closes #918) 2013-07-13 13:36:15 +02:00
Jaime Marquínez Ferrándiz
b0e72bcf34 CriterionIE: simplify some parts and use _html_search_regex 2013-07-13 12:26:05 +02:00
Jaime Marquínez Ferrándiz
7fd930c0c8 Merge pull request #1036 from yasoob/master
Added an IE and test for Criterion videos (closes #1035).
2013-07-13 12:18:03 +02:00
Jaime Marquínez Ferrándiz
2e78b2bead YouJizzIE: support videos that define the urls in a playlist page (closes #1037) 2013-07-13 12:07:07 +02:00
Jaime Marquínez Ferrándiz
44dbe89035 Use re.DOTALL by default when searching OpenGraph properties 2013-07-13 11:29:08 +02:00
M.Yasoob Khalid
2d5a8b5512 added test for criterion.com 2013-07-13 09:18:03 +05:00
M.Yasoob Khalid
159736c1b8 added an IE for criterion.com 2013-07-13 09:17:48 +05:00
Jaime Marquínez Ferrándiz
46720279c2 InfoExtractor: add some helper methods to extract OpenGraph info 2013-07-12 22:12:04 +02:00
Jaime Marquínez Ferrándiz
d8269e1dfb Don't try to save the thumbnail if it's None
It means the extractor couldn't find it
2013-07-12 22:11:59 +02:00
Jaime Marquínez Ferrándiz
cbdbb76665 Use determine_ext when saving the thumbnail
Urls that contain a query produced filenames with wrong extensions
2013-07-12 22:08:49 +02:00
Jaime Marquínez Ferrándiz
6543f0dca5 BrightcoveIE: Use parse_qs to extract the fields of the query (closes #1032)
Add a compat_urlparse to utils.
2013-07-12 14:53:28 +02:00
Jaime Marquínez Ferrándiz
232eb88bfe GenericIE: allow matching declarations of the Brightcove parameters that use ' instead of " 2013-07-12 14:52:01 +02:00
Jaime Marquínez Ferrándiz
a95967f8b7 [ign]: support some country versions and add an extractor for 1up.com
1up.com uses the ign video system; the extractor is a subclass of IGNIE that just replaces the video id
2013-07-12 11:39:40 +02:00
Jaime Marquínez Ferrándiz
2ef648d3d3 Add IGNIE
Only for www.ign.com; it doesn't support country-specific versions (like es.ign.com)
2013-07-12 00:03:59 +02:00
Philipp Hagemeister
33f6830fd5 release 2013.07.12 2013-07-11 23:54:34 +02:00
Jaime Marquínez Ferrándiz
606d7e67fd YoutubeIE: add algo for length 81 (closes #1026) 2013-07-11 23:47:54 +02:00
Philipp Hagemeister
fd87ff26b9 release 2013.07.11 2013-07-11 21:04:59 +02:00
Jaime Marquínez Ferrándiz
85347e1cb6 YoutubeIE: a new algo for length 83 2013-07-11 20:21:45 +02:00
Jaime Marquínez Ferrándiz
41897817cc GametrailersIE: support multipart videos
Use xml.etree.ElementTree instead of re when possible
2013-07-11 18:24:53 +02:00
Philipp Hagemeister
45ff2d51d0 [brightcove] add import 2013-07-11 16:31:29 +02:00
Philipp Hagemeister
5de3ece225 [brightcove] fix on Python 2.6 2013-07-11 16:16:02 +02:00
Philipp Hagemeister
df50a41289 [arte] Fix on 2.6 2013-07-11 16:12:16 +02:00
Philipp Hagemeister
59ae56fad5 Add helper function find_xpath_attr 2013-07-11 16:12:08 +02:00
Philipp Hagemeister
690e872c51 Remove video_result helper method
Calling it was more complex than actually including the type in the video info
2013-07-11 12:12:30 +02:00
Philipp Hagemeister
81082e046e [ehow] improve minor bits 2013-07-11 12:11:00 +02:00
Philipp Hagemeister
3fa9550837 Merge remote-tracking branch 'yasoob/master' 2013-07-11 12:02:16 +02:00
M.Yasoob Khalid
b1082f01a6 added test for ehow 2013-07-11 14:30:25 +05:00
M.Yasoob Khalid
f35b84c807 added an IE for Ehow videos 2013-07-11 14:25:14 +05:00
Jaime Marquínez Ferrándiz
117adb0f0f GenericIE: detect more Brightcove videos
On some sites "class" contains more than just BrightcoveExperience
2013-07-11 00:25:38 +02:00
Jaime Marquínez Ferrándiz
abb285fb1b BrightcoveIE: add support for playlists 2013-07-11 00:04:33 +02:00
Jaime Marquínez Ferrándiz
a431154706 Set the playlist_index and playlist fields for already resolved video results. 2013-07-10 23:36:30 +02:00
Jaime Marquínez Ferrándiz
cfe50f04ed GenericIE: Detect videos from Brightcove
Brightcove video info is usually found in an <object class="BrightcoveExperience"></object> node; this is passed to a new method of BrightcoveIE that builds a url to extract the video.
2013-07-10 17:49:11 +02:00
Jaime Marquínez Ferrándiz
a7055eb956 YoutubeIE: show a more meaningful error when it finds an rtmpe download (related #343) 2013-07-10 14:35:11 +02:00
40 changed files with 704 additions and 263 deletions

View File

@@ -20,12 +20,15 @@ tests = [
# 84
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
"<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
# 83 - vfl26ng3K 2013/07/10
# 83 - vflcaqGO8 2013/07/11
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
"qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
"urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"),
# 82
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
"Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),
# 81
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
"urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."),
]
def find_matching(wrong, right):

View File

@@ -4,6 +4,7 @@
import sys
import unittest
import xml.etree.ElementTree
# Allow direct execution
import os
@@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML
from youtube_dl.utils import orderedSet
from youtube_dl.utils import DateRange
from youtube_dl.utils import unified_strdate
from youtube_dl.utils import find_xpath_attr
if sys.version_info < (3, 0):
_compat_str = lambda b: b.decode('unicode-escape')
@@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
def test_find_xpath_attr(self):
testxml = u'''<root>
<node/>
<node x="a"/>
<node x="a" y="c" />
<node x="b" y="d" />
</root>'''
doc = xml.etree.ElementTree.fromstring(testxml)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
if __name__ == '__main__':
unittest.main()

View File

@@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase):
def test_83(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
right = "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"
self.assertEqual(sig(wrong), right)
def test_82(self):
@@ -53,5 +53,10 @@ class TestYoutubeSig(unittest.TestCase):
right = "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"
self.assertEqual(sig(wrong), right)
def test_81(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>."
right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."
self.assertEqual(sig(wrong), right)
if __name__ == '__main__':
unittest.main()

View File

@@ -348,6 +348,7 @@ class YoutubeDL(object):
result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
if result_type == 'video':
ie_result.update(extra_info)
if 'playlist' not in ie_result:
# It isn't part of a playlist
ie_result['playlist'] = None
@@ -528,10 +529,8 @@ class YoutubeDL(object):
return
if self.params.get('writethumbnail', False):
if 'thumbnail' in info_dict:
thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
if not thumb_format:
thumb_format = 'jpg'
if info_dict.get('thumbnail') is not None:
thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
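
The switch to determine_ext matters for thumbnail URLs that carry a query string. A minimal standalone sketch of the behaviour, using the helper as defined in the utils.py diff further down (the example URLs are made up):

import re

def determine_ext(url, default_ext=u'unknown_video'):
    # Drop the query string before guessing the extension, falling back to default_ext.
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext

print(determine_ext(u'http://example.com/thumb.jpg?width=640'))  # jpg (the old rpartition code produced "jpg?width=640")
print(determine_ext(u'http://example.com/thumbnail', u'jpg'))    # jpg, the default YoutubeDL now passes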

View File

@@ -6,17 +6,21 @@ from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .canalplus import CanalplusIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .criterion import CriterionIE
from .cspan import CSpanIE
from .dailymotion import DailymotionIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .facebook import FacebookIE
from .flickr import FlickrIE
from .freesound import FreesoundIE
from .funnyordie import FunnyOrDieIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
@@ -26,6 +30,7 @@ from .googlesearch import GoogleSearchIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
@@ -33,6 +38,7 @@ from .jukebox import JukeboxIE
from .justintv import JustinTVIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE
from .metacafe import MetacafeIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE

View File

@@ -48,6 +48,7 @@ class ArchiveOrgIE(InfoExtractor):
formats.sort(key=lambda fdata: fdata['file_size'])
info = {
'_type': 'video',
'id': video_id,
'title': title,
'formats': formats,
@@ -63,4 +64,4 @@ class ArchiveOrgIE(InfoExtractor):
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
return self.video_result(info)
return info

View File

@@ -5,6 +5,7 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
ExtractorError,
find_xpath_attr,
unified_strdate,
)
@@ -119,7 +120,7 @@ class ArteTvIE(InfoExtractor):
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

View File

@@ -1,28 +1,82 @@
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
find_xpath_attr,
compat_urlparse,
)
class BrightcoveIE(InfoExtractor):
_VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
_VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
# There is a test for Brigtcove in GenericIE, that way we test both the download
# and the detection of videos, and we don't have to find an URL that is always valid
@classmethod
def _build_brighcove_url(cls, object_str):
"""
Build a Brightcove url from a xml string containing
<object class="BrightcoveExperience">{params}</object>
"""
object_doc = xml.etree.ElementTree.fromstring(object_str)
assert u'BrightcoveExperience' in object_doc.attrib['class']
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
# Not all pages define this value
if playerKey is not None:
params['playerKey'] = playerKey.attrib['value']
videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
if videoPlayer is not None:
params['@videoPlayer'] = videoPlayer.attrib['value']
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = mobj.group('query')
video_id = mobj.group('id')
query_str = mobj.group('query')
query = compat_urlparse.parse_qs(query_str)
request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
return self._get_video_info(videoPlayer[0], query_str)
else:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
def _get_video_info(self, video_id, query):
request_url = self._FEDERATED_URL_TEMPLATE % query
webpage = self._download_webpage(request_url, video_id)
self.report_extraction(video_id)
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
info = json.loads(info)['data']
video_info = info['programmedContent']['videoPlayer']['mediaDTO']
return self._extract_video_info(video_info)
def _get_playlist_info(self, player_key):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
player_key, u'Downloading playlist information')
playlist_info = json.loads(playlist_info)['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
return self.playlist_result(videos, playlist_id=playlist_info['id'],
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
renditions = video_info['renditions']
renditions = sorted(renditions, key=lambda r: r['size'])
best_format = renditions[-1]
return {'id': video_id,
return {'id': video_info['id'],
'title': video_info['displayName'],
'url': best_format['defaultURL'],
'ext': 'mp4',
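
A self-contained sketch of the conversion _build_brighcove_url performs on the <object> markup that GenericIE hands it; the ids and values below are made up:

import xml.etree.ElementTree
try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode        # Python 2

object_str = '''<object class="BrightcoveExperience" id="myExperience">
  <param name="playerID" value="1234567890" />
  <param name="@videoPlayer" value="2371591881001" />
</object>'''
object_doc = xml.etree.ElementTree.fromstring(object_str)
params = {'flashID': object_doc.attrib['id']}
for param in object_doc.findall('./param'):
    if param.attrib['name'] in ('playerID', 'playerKey', '@videoPlayer'):
        params[param.attrib['name']] = param.attrib['value']
# _real_extract later parses this query back with compat_urlparse.parse_qs and,
# since '@videoPlayer' is present, takes the single-video path rather than the playlist one.
print('http://c.brightcove.com/services/viewer/htmlFederated?' + urlencode(params))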

View File

@@ -0,0 +1,46 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
class CanalplusIE(InfoExtractor):
_VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
IE_NAME = u'canalplus.fr'
_TEST = {
u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
u'file': u'889861.flv',
u'md5': u'590a888158b5f0d6832f84001fbf3e99',
u'info_dict': {
u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
u'upload_date': u'20130620',
},
u'skip': u'Requires rtmpdump'
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
info_page = self._download_webpage(info_url,video_id,
u'Downloading video info')
self.report_extraction(video_id)
doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
video_info = [video for video in doc if video.find('ID').text == video_id][0]
infos = video_info.find('INFOS')
media = video_info.find('MEDIA')
formats = [media.find('VIDEOS/%s' % format)
for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
video_url = [format.text for format in formats if format is not None][-1]
return {'id': video_id,
'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
infos.find('TITRAGE/SOUS_TITRE').text),
'url': video_url,
'ext': 'flv',
'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
'thumbnail': media.find('IMAGES/GRAND').text,
}

View File

@@ -125,6 +125,11 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -169,11 +174,6 @@ class InfoExtractor(object):
self.to_screen(u'Logging in')
#Methods for following #608
#They set the correct value of the '_type' key
def video_result(self, video_info):
"""Returns a video"""
video_info['_type'] = 'video'
return video_info
def url_result(self, url, ie=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
@@ -262,6 +262,30 @@ class InfoExtractor(object):
return (username, password)
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regex(prop):
return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
return self._html_search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
def _og_search_thumbnail(self, html, **kargs):
return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', **kargs):
return self._html_search_regex([self._og_regex('video:secure_url'),
self._og_regex('video')],
html, name, **kargs)
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
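
Each helper boils down to one regex per og: property, searched with re.DOTALL (see commit 44dbe89035). A quick standalone illustration with an invented HTML snippet:

import re

def _og_regex(prop):
    return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

html = u'<meta property="og:title" content="Le Petit Journal 20/06/13" />'
m = re.search(_og_regex('title'), html, re.DOTALL)
print(m.group(1))  # Le Petit Journal 20/06/13

Inside an extractor this becomes self._og_search_title(webpage); thumbnail and description are looked up with fatal=False, so a miss returns None instead of aborting the extraction.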

View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import re
from .common import InfoExtractor
from ..utils import determine_ext
class CriterionIE(InfoExtractor):
_VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+'
_TEST = {
u'url': u'http://www.criterion.com/films/184-le-samourai',
u'file': u'184.mp4',
u'md5': u'bc51beba55685509883a9a7830919ec3',
u'info_dict': {
u"title": u"Le Samouraï",
u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;',
webpage, 'video url')
title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />',
webpage, 'video title')
description = self._html_search_regex(r'<meta name="description" content="(.+?)" />',
webpage, 'video description')
thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
webpage, 'thumbnail url')
return {'id': video_id,
'url' : final_url,
'title': title,
'ext': determine_ext(final_url),
'description': description,
'thumbnail': thumbnail,
}

View File

@@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor):
description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
webpage, 'description',
flags=re.MULTILINE|re.DOTALL)
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
webpage, 'thumbnail')
url = self._search_regex(r'<string name="URL">(.*?)</string>',
video_info, 'video url')
@@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor):
'url': url,
'play_path': path,
'description': description,
'thumbnail': thumbnail,
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor):
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
webpage, 'title')
video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
# Looking for official user
r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,7 +73,7 @@ class DailymotionIE(InfoExtractor):
'url': video_url,
'uploader': video_uploader,
'upload_date': video_upload_date,
'title': video_title,
'title': self._og_search_title(webpage),
'ext': video_extension,
'thumbnail': info['thumbnail_url']
}]

View File

@@ -67,6 +67,7 @@ class DreiSatIE(InfoExtractor):
formats.sort(key=_sortkey)
info = {
'_type': 'video',
'id': video_id,
'title': video_title,
'formats': formats,
@@ -81,4 +82,4 @@ class DreiSatIE(InfoExtractor):
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
return self.video_result(info)
return info

View File

@@ -0,0 +1,46 @@
import re
from ..utils import (
compat_urllib_parse,
determine_ext
)
from .common import InfoExtractor
class EHowIE(InfoExtractor):
IE_NAME = u'eHow'
_VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
_TEST = {
u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
u'file': u'12245069.flv',
u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
u'info_dict': {
u"title": u"Hardwood Flooring Basics",
u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
u"uploader": u"Erick Nathan"
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
webpage, u'video URL')
final_url = compat_urllib_parse.unquote(video_url)
uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
webpage, u'uploader')
title = self._og_search_title(webpage).replace(' | eHow', '')
ext = determine_ext(final_url)
return {
'_type': 'video',
'id': video_id,
'url': final_url,
'ext': ext,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'uploader': uploader,
}

View File

@@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor):
videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
webpage, u'thumbnail', fatal=False)
playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
webpage, u'player url')
playerUrl = self._og_search_video_url(webpage, name='player url')
title = self._html_search_regex('<meta name="title" content="([^"]*)"',
webpage, u'player url').split(' : ')[-1]
@@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor):
'upload_date': None,
'title': title,
'ext': 'mp4',
'thumbnail': imgUrl,
'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
}

View File

@@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor):
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'video title')
video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': video_title,
'description': video_description,
'thumbnail': thumbnail,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id': video_uploader_id,
}]

View File

@@ -0,0 +1,36 @@
import re
from .common import InfoExtractor
from ..utils import determine_ext
class FreesoundIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
_TEST = {
u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/',
u'file': u'194503.mp3',
u'md5': u'12280ceb42c81f19a515c745eae07650',
u'info_dict': {
u"title": u"gulls in the city.wav",
u"uploader" : u"miklovan",
u'description': u'the sounds of seagulls in the city',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
music_id = mobj.group('id')
webpage = self._download_webpage(url, music_id)
title = self._html_search_regex(r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
webpage, 'music title', flags=re.DOTALL)
music_url = self._og_search_property('audio', webpage, 'music url')
description = self._html_search_regex(r'<div id="sound_description">(.*?)</div>',
webpage, 'description', fatal=False, flags=re.DOTALL)
return [{
'id': music_id,
'title': title,
'url': music_url,
'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'),
'ext': determine_ext(music_url),
'description': description,
}]

View File

@@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor):
title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', fatal=False, flags=re.DOTALL)
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
'description': video_description,
'description': self._og_search_description(webpage),
}
return [info]

View File

@@ -1,68 +1,36 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
from .mtv import MTVIE, _media_xml_tag
ExtractorError,
)
class GametrailersIE(InfoExtractor):
class GametrailersIE(MTVIE):
"""
Gametrailers use the same videos system as MTVIE, it just changes the feed
url, where the uri is and the method to get the thumbnails.
"""
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'zbvr8i.flv',
u'md5': u'c3edbc995ab4081976e16779bd96a878',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
u'info_dict': {
u"title": u"E3 2013: Debut Trailer"
u'title': u'E3 2013: Debut Trailer',
u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
u'skip': u'Requires rtmpdump'
}
# Overwrite MTVIE properties we don't want
_TESTS = []
_FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
def _get_thumbnail_url(self, uri, itemdoc):
search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
return itemdoc.find(search_path).attrib['url']
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
video_type = mobj.group('type')
webpage = self._download_webpage(url, video_id)
if video_type == 'full-episodes':
mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
else:
mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
mgid = self._search_regex(mgid_re, webpage, u'mgid')
data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
video_id, u'Downloading video info')
links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
video_id, u'Downloading video urls info')
self.report_extraction(video_id)
info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<image>.*
<url>(?P<thumb>.*?)</url>.*
</image>'''
m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
if m_info is None:
raise ExtractorError(u'Unable to extract video info')
video_title = m_info.group('title')
video_description = m_info.group('description')
video_thumb = m_info.group('thumb')
m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
if m_urls is None or len(m_urls) == 0:
raise ExtractorError(u'Unable to extract video url')
# They are sorted from worst to best quality
video_url = m_urls[-1].group('url')
return {'url': video_url,
'id': video_id,
'title': video_title,
# Videos are actually flv not mp4
'ext': 'flv',
'thumbnail': video_thumb,
'description': video_description,
}
mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
webpage, u'mgid')
return self._get_videos_info(mgid)

View File

@@ -1,3 +1,5 @@
# encoding: utf-8
import os
import re
@@ -9,20 +11,34 @@ from ..utils import (
ExtractorError,
)
from .brightcove import BrightcoveIE
class GenericIE(InfoExtractor):
IE_DESC = u'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = u'generic'
_TEST = {
u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
u'file': u'13601338388002.mp4',
u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
u'info_dict': {
u"uploader": u"www.hodiho.fr",
u"title": u"R\u00e9gis plante sa Jeep"
}
}
_TESTS = [
{
u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
u'file': u'13601338388002.mp4',
u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
u'info_dict': {
u"uploader": u"www.hodiho.fr",
u"title": u"R\u00e9gis plante sa Jeep"
}
},
{
u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
u'file': u'2371591881001.mp4',
u'md5': u'9e80619e0a94663f0bdc849b4566af19',
u'note': u'Test Brightcove downloads and detection in GenericIE',
u'info_dict': {
u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
u'uploader': u'8TV',
u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
}
},
]
def report_download_webpage(self, video_id):
"""Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
self.report_extraction(video_id)
# Look for BrigthCove:
m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
if m_brightcove is not None:
self.to_screen(u'Brightcove video detected.')
bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
return self.url_result(bc_url, 'Brightcove')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
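
A quick check of the detection regex added above; the backreference \1 is what commit 232eb88bfe relies on so the class attribute may be quoted with either " or ' (the sample markup is made up):

import re

BRIGHTCOVE_OBJECT_RE = r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>'
webpage = ("<object id='myExperience' class='BrightcoveExperience'>"
           "<param name='playerID' value='123' /></object>")
m = re.search(BRIGHTCOVE_OBJECT_RE, webpage, re.DOTALL)
print(m is not None)  # True; GenericIE then hands m.group() to BrightcoveIE._build_brighcove_url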

View File

@@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):
video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
thumbnail = self._html_search_regex(r'"og:image" content="(.*)"',
webpage_src, u'thumbnail', fatal=False)
results = [{
'id': video_id,
'url' : video_url,
'title' : video_title,
'thumbnail' : thumbnail,
'thumbnail' : self._og_search_thumbnail(webpage_src),
'ext' : 'mp3',
}]
return results
return results

View File

@@ -0,0 +1,91 @@
import re
import json
from .common import InfoExtractor
from ..utils import (
determine_ext,
)
class IGNIE(InfoExtractor):
"""
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported
"""
_VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)'
IE_NAME = u'ign.com'
_CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
_DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>',
r'id="my_show_video">.*?<p>(.*?)</p>',
]
_TEST = {
u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
u'info_dict': {
u'title': u'The Last of Us Review',
u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
}
}
def _find_video_id(self, webpage):
res_id = [r'data-video-id="(.+?)"',
r'<object id="vid_(.+?)"',
r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
]
return self._search_regex(res_id, webpage, 'video id')
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name_or_id = mobj.group('name_or_id')
webpage = self._download_webpage(url, name_or_id)
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE,
webpage, 'video description',
flags=re.DOTALL)
result['description'] = description
return result
def _get_video_info(self, video_id):
config_url = self._CONFIG_URL_TEMPLATE % video_id
config = json.loads(self._download_webpage(config_url, video_id,
u'Downloading video info'))
media = config['playlist']['media']
video_url = media['url']
return {'id': media['metadata']['videoId'],
'url': video_url,
'ext': determine_ext(video_url),
'title': media['metadata']['title'],
'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
}
class OneUPIE(IGNIE):
"""Extractor for 1up.com, it uses the ign videos system."""
_VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
_TEST = {
u'url': u'http://gamevideos.1up.com/video/id/34976',
u'file': u'34976.mp4',
u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
u'info_dict': {
u'title': u'Sniper Elite V2 - Trailer',
u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
id = mobj.group('name_or_id')
result = super(OneUPIE, self)._real_extract(url)
result['id'] = id
return result

View File

@@ -5,7 +5,7 @@ from .common import InfoExtractor
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
_TEST = {
u'url': u'http://instagram.com/p/aye83DjauH/#',
u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
u'md5': u'0d2da106a9d2631273e192b372806516',
u'info_dict': {
@@ -18,25 +18,20 @@ class InstagramIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'<meta property="og:video" content="(.+?)"',
webpage, u'video URL')
thumbnail_url = self._html_search_regex(
r'<meta property="og:image" content="(.+?)" />',
webpage, u'thumbnail URL', fatal=False)
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title', flags=re.DOTALL)
title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip()
uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram',
webpage, u'uploader name', fatal=False)
uploader_id = self._html_search_regex(
r'<div class="media-user" id="media_user">.*?<h2><a href="[^"]*">([^<]*)</a></h2>',
webpage, u'uploader id', fatal=False, flags=re.DOTALL)
ext = 'mp4'
return [{
'id': video_id,
'url': video_url,
'url': self._og_search_video_url(webpage),
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id
}]

View File

@@ -24,8 +24,7 @@ class KeekIE(InfoExtractor):
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
webpage, u'title')
video_title = self._og_search_title(webpage)
uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
webpage, u'uploader', fatal=False)

View File

@@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor):
video_url = self._search_regex(r'file: "(.*?)",',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
webpage, u'title').replace('LiveLeak.com -', '').strip()
video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', fatal=False)
video_description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
webpage, u'uploader', fatal=False)

View File

@@ -0,0 +1,52 @@
import re
import json
from .common import InfoExtractor
from ..utils import compat_urllib_parse_urlparse, compat_urlparse
class LivestreamIE(InfoExtractor):
_VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
_TEST = {
u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
u'file': u'4719370.mp4',
u'md5': u'0d2186e3187d185a04b3cdd02b828836',
u'info_dict': {
u'title': u'Live from Webster Hall NYC',
u'upload_date': u'20121012',
}
}
def _extract_video_info(self, video_data):
video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url')
return {'id': video_data['id'],
'url': video_url,
'ext': 'mp4',
'title': video_data['caption'],
'thumbnail': video_data['thumbnail_url'],
'upload_date': video_data['updated_at'].replace('-','')[:8],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
event_name = mobj.group('event_name')
webpage = self._download_webpage(url, video_id or event_name)
if video_id is None:
# This is an event page:
api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'',
webpage, 'api url')
info = json.loads(self._download_webpage(api_url, event_name,
u'Downloading event info'))
videos = [self._extract_video_info(video_data['data'])
for video_data in info['feed']['data'] if video_data['type'] == u'video']
return self.playlist_result(videos, info['id'], info['full_name'])
else:
og_video = self._og_search_video_url(webpage, name=u'player url')
query_str = compat_urllib_parse_urlparse(og_video).query
query = compat_urlparse.parse_qs(query_str)
api_url = query['play_url'][0].replace('.smil', '')
info = json.loads(self._download_webpage(api_url, video_id,
u'Downloading video info'))
return self._extract_video_info(info)
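
For the single-video branch, the API url is dug out of the og:video player url's query string; a small sketch with a made-up url:

try:
    import urllib.parse as compat_urlparse  # Python 3
except ImportError:
    import urlparse as compat_urlparse      # Python 2

og_video = ('http://cdn.livestream.com/grid/LSPlayer.swf'
            '?play_url=http%3A%2F%2Fapi.new.livestream.com%2Fvideos%2F4719370.smil')
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(og_video).query)
print(query['play_url'][0].replace('.smil', ''))  # http://api.new.livestream.com/videos/4719370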

View File

@@ -9,7 +9,7 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
compat_str,
determine_ext,
ExtractorError,
)
@@ -20,7 +20,7 @@ class MetacafeIE(InfoExtractor):
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
_TEST = {
_TESTS = [{
u"add_ie": ["Youtube"],
u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
u"file": u"_aUehQsCQtM.flv",
@@ -31,7 +31,15 @@ class MetacafeIE(InfoExtractor):
u"uploader": u"PBS",
u"uploader_id": u"PBS"
}
}
},
{
u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
u"file": u"an-dVVXnuY7Jh77J.mp4",
u"info_dict": {
u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
u"uploader": u"AnyClip",
}
}]
def report_disclaimer(self):
@@ -73,14 +81,16 @@ class MetacafeIE(InfoExtractor):
return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
# Retrieve video webpage to extract further information
webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
req.headers['Cookie'] = 'flashVersion=0;'
webpage = self._download_webpage(req, video_id)
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
if mobj is not None:
mediaURL = compat_urllib_parse.unquote(mobj.group(1))
video_extension = mediaURL[-3:]
video_ext = mediaURL[-3:]
# Extract gdaKey if available
mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
@@ -90,34 +100,36 @@ class MetacafeIE(InfoExtractor):
gdaKey = mobj.group(1)
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
else:
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
vardict = compat_parse_qs(mobj.group(1))
if 'mediaData' not in vardict:
raise ExtractorError(u'Unable to extract media URL')
mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
mediaURL = mobj.group('mediaURL').replace('\\/', '/')
video_extension = mediaURL[-3:]
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
mobj = re.search(r'<video src="([^"]+)"', webpage)
if mobj:
video_url = mobj.group(1)
video_ext = 'mp4'
else:
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
vardict = compat_parse_qs(mobj.group(1))
if 'mediaData' not in vardict:
raise ExtractorError(u'Unable to extract media URL')
mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
mediaURL = mobj.group('mediaURL').replace('\\/', '/')
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
video_ext = determine_ext(video_url)
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract title')
video_title = mobj.group(1).decode('utf-8')
mobj = re.search(r'submitter=(.*?);', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract uploader nickname')
video_uploader = mobj.group(1)
video_uploader = self._html_search_regex(r'submitter=(.*?);|<p class="By">\s*By\s*<a[^>]*>(.*?)</a>', webpage, u'uploader nickname', fatal=False)
return [{
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
'ext': video_extension.decode('utf-8'),
'ext': video_ext,
}]
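
The AnyClip change sends a Cookie header before fetching the watch page, presumably so Metacafe serves a plain <video src="..."> tag (which the new regex picks up) instead of the Flash player markup. A minimal sketch of building such a request; _download_webpage accepts either a url string or a Request object, so it can be passed straight through:

try:
    import urllib.request as compat_urllib_request  # Python 3
except ImportError:
    import urllib2 as compat_urllib_request         # Python 2

req = compat_urllib_request.Request('http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/')
# The diff assigns req.headers['Cookie'] directly; add_header() is the stock way to do the same.
req.add_header('Cookie', 'flashVersion=0;')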

View File

@@ -1,28 +1,110 @@
import re
import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_request,
compat_urllib_parse,
ExtractorError,
)
def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
class MTVIE(InfoExtractor):
_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
_WORKING = False
_VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
_FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
_TESTS = [
{
u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
u'file': u'853555.mp4',
u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
u'info_dict': {
u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
},
},
{
u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
u'file': u'USCJY1331283.mp4',
u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
u'info_dict': {
u'title': u'Everything Has Changed',
u'upload_date': u'20130606',
u'uploader': u'Taylor Swift',
},
u'skip': u'VEVO is only available in some countries',
},
]
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
# This was originally implemented for ComedyCentral, but it also works here
@staticmethod
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
if not m:
raise ExtractorError(u'Cannot transform RTMP url')
base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
return base + m.group('finalid')
def _get_thumbnail_url(self, uri, itemdoc):
return 'http://mtv.mtvnimages.com/uri/' + uri
def _extract_video_url(self, metadataXml):
if '/error_country_block.swf' in metadataXml:
raise ExtractorError(u'This video is not available from your country.', expected=True)
mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
renditions = mdoc.findall('.//rendition')
# For now, always pick the highest quality.
rendition = renditions[-1]
try:
_,_,ext = rendition.attrib['type'].partition('/')
format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
rtmp_video_url = rendition.find('./src').text
except KeyError:
raise ExtractorError('Invalid rendition field.')
video_url = self._transform_rtmp_url(rtmp_video_url)
return {'ext': ext, 'url': video_url, 'format': format}
def _get_video_info(self, itemdoc):
uri = itemdoc.find('guid').text
video_id = self._id_from_uri(uri)
self.report_extraction(video_id)
mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
if 'acceptMethods' not in mediagen_url:
mediagen_url += '&acceptMethods=fms'
mediagen_page = self._download_webpage(mediagen_url, video_id,
u'Downloading video urls')
video_info = self._extract_video_url(mediagen_page)
description_node = itemdoc.find('description')
if description_node is not None:
description = description_node.text
else:
description = None
video_info.update({'title': itemdoc.find('title').text,
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,
})
return video_info
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
u'Downloading info')
idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
return [self._get_video_info(item) for item in idoc.findall('.//item')]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
if not mobj.group('proto'):
url = 'http://' + url
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
@@ -35,46 +117,5 @@ class MTVIE(InfoExtractor):
self.to_screen(u'Vevo video detected: %s' % vevo_id)
return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
#song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
# webpage, u'song name', fatal=False)
video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
webpage, u'title')
mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
webpage, u'mtvn_uri', fatal=False)
content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
webpage, u'content id', fatal=False)
videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
self.report_extraction(video_id)
request = compat_urllib_request.Request(videogen_url)
try:
metadataXml = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
mdoc = xml.etree.ElementTree.fromstring(metadataXml)
renditions = mdoc.findall('.//rendition')
# For now, always pick the highest quality.
rendition = renditions[-1]
try:
_,_,ext = rendition.attrib['type'].partition('/')
format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
video_url = rendition.find('./src').text
except KeyError:
raise ExtractorError('Invalid rendition field.')
info = {
'id': video_id,
'url': video_url,
'upload_date': None,
'title': video_title,
'ext': ext,
'format': format,
}
return [info]
uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
return self._get_videos_info(uri)
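
The "same system as ComedyCentralIE" mentioned in commit f7e025958a is the rtmp-to-http rewrite in _transform_rtmp_url above; a worked example with an invented rtmp url:

import re

def _transform_rtmp_url(rtmp_video_url):
    # Keep everything from "gsp." onwards and graft it onto the llnwd.net HTTP mirror.
    m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
    base = ('http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined'
            '+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/')
    return base + m.group('finalid')

print(_transform_rtmp_url('rtmpe://cp12345.edgefcs.net/ondemand/gsp.music.mtv/video123.mp4'))
# prints the http mirror url ending in .../mtvnorigin/gsp.music.mtv/video123.mp4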

View File

@@ -30,8 +30,7 @@ class NBAIE(InfoExtractor):
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
shortened_video_id = video_id.rpartition('/')[2]
title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
# It isn't there in the HTML it returns to us
# uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

View File

@@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'<meta property="og:video:secure_url" content="(.+?)">',
webpage, u'video URL')
thumbnail_url = self._html_search_regex(
r'<meta property="og:image" content="(.+?)" />',
webpage, u'thumbnail URL', fatal=False)
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, u'title')
@@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor):
return [{
'id': video_id,
'url': video_url,
'url': self._og_search_video_url(webpage),
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id
}]

View File

@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
u"file": u"81300.flv",
u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
u"info_dict": {
u"title": u"Terraria 1.1 Trailer"
u"title": u"Terraria 1.1 Trailer",
u'playlist_index': 1,
}
},
{
u"file": u"80859.flv",
u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
u"info_dict": {
u"title": u"Terraria Trailer"
u"title": u"Terraria Trailer",
u'playlist_index': 2,
}
}
]

View File

@@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor):
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
webpage, u'thumbnail', fatal=False)
video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
webpage, u'description', fatal=False)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
@@ -49,7 +40,7 @@ class TeamcocoIE(InfoExtractor):
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': video_title,
'thumbnail': thumbnail,
'description': video_description,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
}]

View File

@@ -24,11 +24,8 @@ class TrailerAddictIE(InfoExtractor):
webpage, 'video title').replace(' - Trailer Addict','')
view_count = self._search_regex(r'Views: (.+?)<br />',
webpage, 'Views Count')
description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
webpage, 'video description')
video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />',
webpage, 'Video id').split('=')[1]
video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))
info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
@@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor):
'ext' : ext,
'title' : title,
'thumbnail' : thumbnail_url,
'description' : description,
'description' : self._og_search_description(webpage),
'view_count' : view_count,
}]

View File

@@ -22,8 +22,6 @@ class TutvIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<meta property="og:title" content="(.*?)">', webpage, u'title')
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
@@ -36,6 +34,6 @@ class TutvIE(InfoExtractor):
'id': internal_id,
'url': video_url,
'ext': ext,
'title': title,
'title': self._og_search_title(webpage),
}
return [info]

View File

@@ -27,12 +27,6 @@ class VineIE(InfoExtractor):
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
webpage, u'thumbnail', fatal=False)
uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
@@ -40,7 +34,7 @@ class VineIE(InfoExtractor):
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': video_title,
'thumbnail': thumbnail,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader,
}]

View File

@@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(embed_page_url, video_id)
# Get the video URL
video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
webpage, u'video URL')
m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
if m_playlist is not None:
playlist_url = m_playlist.group('playlist')
playlist_page = self._download_webpage(playlist_url, video_id,
u'Downloading playlist page')
m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))
if len(m_levels) == 0:
raise ExtractorError(u'Unable to extract video url')
videos = [(int(m.group(1)), m.group(2)) for m in m_levels]
(_, video_url) = sorted(videos)[0]
video_url = video_url.replace('%252F', '%2F')
else:
video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
webpage, u'video URL')
info = {'id': video_id,
'url': video_url,
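
A standalone sketch of the new playlist handling; the XML snippet is invented, and like the diff above it picks the first (lowest-bitrate) level:

import re

playlist_page = ('<level bitrate="300" file="http://example.com/low.flv"/>'
                 '<level bitrate="700" file="http://example.com/high.flv"/>')
videos = [(int(m.group(1)), m.group(2))
          for m in re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page)]
(_, video_url) = sorted(videos)[0]
print(video_url)  # http://example.com/low.flv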

View File

@@ -190,9 +190,11 @@ class YoutubeIE(InfoExtractor):
elif len(s) == 84:
return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
elif len(s) == 83:
return s[:81]
return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:]
elif len(s) == 82:
return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
elif len(s) == 81:
return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81]
else:
raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
@@ -567,6 +569,8 @@ class YoutubeIE(InfoExtractor):
self.report_rtmp_download()
video_url_list = [(None, video_info['conn'][0])]
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
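
The new length-81 algorithm can be checked directly against the test vector added to TestYoutubeSig above:

def sig81(s):
    # Same expression as the new `elif len(s) == 81` branch.
    return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81]

wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>."
right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."
assert sig81(wrong) == right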

View File

@@ -35,6 +35,11 @@ try:
except ImportError: # Python 2
from urlparse import urlparse as compat_urllib_parse_urlparse
try:
import urllib.parse as compat_urlparse
except ImportError: # Python 2
import urlparse as compat_urlparse
try:
import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
@@ -198,6 +203,20 @@ else:
with open(fn, 'w', encoding='utf-8') as f:
json.dump(obj, f)
if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z]+$', key)
assert re.match(r'^[a-zA-Z@]*$', val)
expr = xpath + u"[@%s='%s']" % (key, val)
return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val):
for f in node.findall(xpath):
if f.attrib.get(key) == val:
return f
return None
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
@@ -631,12 +650,12 @@ def unified_strdate(date_str):
pass
return upload_date
def determine_ext(url):
def determine_ext(url, default_ext=u'unknown_video'):
guess = url.partition(u'?')[0].rpartition(u'.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
return guess
else:
return u'unknown_video'
return default_ext
def date_from_str(date_str):
"""

View File

@@ -1,2 +1,2 @@
__version__ = '2013.07.10'
__version__ = '2013.07.17'