Compare commits

..

5 Commits

Author SHA1 Message Date
Philipp Hagemeister
a81b4d5c8f release 2013.11.18 2013-11-18 13:30:43 +01:00
Philipp Hagemeister
887c6acdf2 Support multiple embedded YouTube URLs (Fixes #1787) 2013-11-18 13:28:26 +01:00
Philipp Hagemeister
83aa529330 Support protocol-independent URLs (#1787) 2013-11-18 13:18:17 +01:00
Philipp Hagemeister
96b31b6533 Add iPhone to UA (#1746) 2013-11-18 13:05:58 +01:00
Philipp Hagemeister
fccd377198 Suppor embed-only videos (Fixes #1746) 2013-11-18 13:05:18 +01:00
4 changed files with 35 additions and 20 deletions

View File

@@ -162,6 +162,16 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Failed to download URL: %s' % url)
self.report_extraction(video_id)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'video title', default=u'video', flags=re.DOTALL)
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is not None:
@@ -177,11 +187,13 @@ class GenericIE(InfoExtractor):
return self.url_result(surl, 'Vimeo')
# Look for embedded YouTube player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)
if mobj:
surl = unescapeHTML(mobj.group(u'url'))
return self.url_result(surl, 'Youtube')
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
for tuppl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
@@ -226,15 +238,6 @@ class GenericIE(InfoExtractor):
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'video title', default=u'video', flags=re.DOTALL)
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')

View File

@@ -139,9 +139,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
_VALID_URL = r"""^
_VALID_URL = r"""(?xi)^
(
(?:https?://)? # http(s):// (optional)
(?:https?://|//)? # http(s):// or protocol-independent URL (optional)
(?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
@@ -363,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader_id": u"justintimberlakeVEVO"
}
},
{
u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
u"file": u"yZIXLfi8CZQ.mp4",
u"note": u"Embed-only video (#1746)",
u"info_dict": {
u"upload_date": u"20120608",
u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
u"uploader": u"SET India",
u"uploader_id": u"setindia"
}
},
]
@@ -370,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
return re.match(cls._VALID_URL, url) is not None
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
@@ -1272,7 +1284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
data = compat_urllib_parse.urlencode({'video_id': video_id,
'el': 'embedded',
'el': 'player_embedded',
'gl': 'US',
'hl': 'en',
'eurl': 'https://youtube.googleapis.com/v/' + video_id,

View File

@@ -176,7 +176,7 @@ def compat_ord(c):
compiled_regex_type = type(re.compile(''))
std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome) (iPhone)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',

View File

@@ -1,2 +1,2 @@
__version__ = '2013.11.17'
__version__ = '2013.11.18'