Compare commits

...

48 Commits

Author SHA1 Message Date
Philipp Hagemeister
2417dc1715 release 2014.10.12 2014-10-12 22:25:18 +02:00
Naglis Jonaitis
23d83ad4d5 [niconico] Fix ignored --netrc flag
See issue #3753
2014-10-12 23:18:42 +03:00
Jaime Marquínez Ferrándiz
772ece3571 Merge pull request #3932 from Dineshs91/ndr-fix
Description changed
2014-10-12 17:54:51 +02:00
dinesh
2c9f31188b Description changed 2014-10-12 20:09:12 +05:30
Naglis Jonaitis
d18be55533 [theonion] Fix a small mistake in string formatting 2014-10-12 15:47:31 +03:00
Naglis Jonaitis
ac20fc047a [theonion] Add new extractor (closes #3928) 2014-10-12 15:42:35 +03:00
Jaime Marquínez Ferrándiz
b4c3c8c172 [mixcloud] Fix metadata extraction (fixes #3930) 2014-10-12 13:06:31 +02:00
Jaime Marquínez Ferrándiz
3357110a4c [vimeo] Make the protocol mandatory in the url (fixes #3926)
If it's missing, it will be correctly handled by the generic IE.
2014-10-11 22:26:26 +02:00
Sergey M.
e29fdedb45 Merge pull request #3923 from Dineshs91/howstuffworks-fix
Replace 404 url
2014-10-12 01:48:11 +07:00
Sergey M․
4828703f14 [googleplus] Modernize and extract all formats 2014-10-12 01:44:13 +07:00
Sergey M.
afe08e0d4a Merge pull request #3924 from Dineshs91/googleplus-fix
Fix download error in GooglePlus
2014-10-12 00:48:58 +07:00
dinesh
071420e136 Fix download error in GooglePlus 2014-10-11 21:10:53 +05:30
dinesh
f4cf848d1d Replace 404 url 2014-10-11 15:59:42 +05:30
Sergey M.
b7b2ca6e2b Merge pull request #3921 from Dineshs91/ndr-fix
Fix ndr.de outdated test url
2014-10-11 16:07:39 +07:00
dinesh
1409704afa Fix ndr.de outdated test url 2014-10-11 12:20:13 +05:30
Sergey M.
c8e390c2b0 Merge pull request #3911 from Dockheas23/master
KeyError on initialising YoutubeDL in python3 #3910
2014-10-10 22:28:18 +07:00
Sergey M․
823f1e015a [yahoo] Wipe out yahoo news extractor 2014-10-10 22:18:37 +07:00
Sergey M․
3c06d3715e [yahoo] Generalize, support arbitrary subdomains, support iframe videos, capture error message (Closes #2470) 2014-10-10 22:11:30 +07:00
Sergey M․
762958d5af [yahoo] Add support for regional subdomains and extract duration (Closes #3915) 2014-10-10 19:50:29 +07:00
George Boyle
53d9009bdb KeyError on initialising YoutubeDL in python3 #3910 2014-10-10 10:03:24 +01:00
George Boyle
1b725173a5 Fixed typo 2014-10-10 09:35:41 +01:00
Sergey M․
0ca41c3d9c [walla] Fix typo 2014-10-09 21:10:07 +07:00
Sergey M․
fc6861b175 [sportbox] Add extractor (Closes #3906) 2014-10-09 21:05:39 +07:00
Sergey M․
b097b5f246 [mlb] Remove unused import 2014-10-09 20:07:34 +07:00
Sergey M․
385009fc44 [mlb] Fix thumbnails extraction (Closes #3905) 2014-10-09 19:56:55 +07:00
Sergey M․
ced659bb4d [generic] Ignore some non-video file extensions during generic extraction (Closes #3900) 2014-10-09 19:26:23 +07:00
Sergey M․
842cca7d56 [pornhd] Fix formats extraction (Closes #3898) 2014-10-08 20:08:29 +07:00
Sergey M․
b3826f6c8d Merge branch 'lenaten-walla' 2014-10-07 22:23:22 +07:00
Sergey M․
7bc8780c57 [walla] Fix extractor and add subtitle tests 2014-10-07 22:23:05 +07:00
Sergey M․
c59c3c84ed Merge branch 'walla' of https://github.com/lenaten/youtube-dl into lenaten-walla 2014-10-07 20:24:52 +07:00
net
31d06400ec add missed init file 2014-10-06 03:03:05 +03:00
Philipp Hagemeister
642b76ac15 release 2014.10.05.2 2014-10-05 22:04:02 +02:00
Philipp Hagemeister
4c4de296d4 release 2014.10.05.1 2014-10-05 22:00:16 +02:00
Philipp Hagemeister
b10609d98c [dailymotion] Alternative title search (Fixes #3882) 2014-10-05 21:59:53 +02:00
Sergey M
3ae165aa10 [gorillavid] Add check for non existing videos 2014-10-06 01:48:01 +07:00
Sergey M
e4b85e35d0 [gorillavid] Fix title extraction and make thumbnail optional (Closes #3884) 2014-10-06 01:47:22 +07:00
Philipp Hagemeister
bb0c206f59 release 2014.10.05 2014-10-05 07:53:25 +02:00
Philipp Hagemeister
b81f484b60 [gorillavid] Add support for movpod.in (Fixes #3881) 2014-10-05 07:53:02 +02:00
Naglis Jonaitis
5e69192ef7 [thesixtyone] Add new extractor (closes #3781) 2014-10-04 22:40:36 +03:00
Naglis Jonaitis
e9be9a6acd [utils] Add additional format to unified_strdate 2014-10-04 22:38:23 +03:00
Sergey M․
f47754f061 [globo] Initial extractor implementation (Closes #3823) 2014-10-04 18:56:36 +07:00
Philipp Hagemeister
d838b1bd4a [utils] Default age_limit to None
If we can't parse it, it means we don't have any information, not that the content is unrestricted.
2014-10-03 20:17:12 +02:00
Naglis Jonaitis
fe506288bd [planetaplay] Add new extractor (closes #3839) 2014-10-03 19:43:36 +03:00
Sergey M․
d397c0b3dd [breakcom] Extract all formats 2014-10-03 19:37:47 +07:00
Sergey M․
146c80e256 [utils] Add parse_age_limit 2014-10-03 19:37:25 +07:00
Sergey M․
f78c01f68b [breakcom] Cover more URLs with _VALID_URL (Closes #3876) 2014-10-03 18:57:18 +07:00
Naglis Jonaitis
8489578df4 [generic] Support embedded Dailymotion playlists (fixes #3822) 2014-10-02 21:42:45 +03:00
net
e4d6cca0c1 [walla] Add new extractor 2014-10-01 23:45:35 +03:00
25 changed files with 1129 additions and 193 deletions

View File

@@ -15,6 +15,7 @@ from youtube_dl.extractor import (
DailymotionIE,
TEDIE,
VimeoIE,
WallaIE,
)
@@ -279,5 +280,32 @@ class TestVimeoSubtitles(BaseTestSubtitles):
self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
class TestWallaSubtitles(BaseTestSubtitles):
url = 'http://vod.walla.co.il/movie/2705958/the-yes-men'
IE = WallaIE
def test_list_subtitles(self):
self.DL.expect_warning(u'Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self):
self.DL.expect_warning(u'Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['heb']))
self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920')
def test_nosubtitles(self):
self.DL.expect_warning(u'video doesn\'t have subtitles')
self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0)
if __name__ == '__main__':
unittest.main()

View File

@@ -228,11 +228,11 @@ class YoutubeDL(object):
if (sys.version_info >= (3,) and sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
and not params['restrictfilenames']):
and not params.get('restrictfilenames', False)):
# On Python 3, the Unicode filesystem API will throw errors (#1474)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
'cannot encode all charactes. '
'cannot encode all characters. '
'Set the LC_ALL environment variable to fix this.')
self.params['restrictfilenames'] = True

View File

@@ -134,6 +134,7 @@ from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .globo import GloboIE
from .godtube import GodTubeIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
@@ -275,6 +276,7 @@ from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .planetaplay import PlanetaPlayIE
from .played import PlayedIE
from .playfm import PlayFMIE
from .playvid import PlayvidIE
@@ -345,6 +347,7 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
@@ -368,7 +371,9 @@ from .telemb import TeleMBIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE
from .theonion import TheOnionIE
from .theplatform import ThePlatformIE
from .thesixtyone import TheSixtyOneIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
@@ -437,6 +442,7 @@ from .vporn import VpornIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vulture import VultureIE
from .walla import WallaIE
from .washingtonpost import WashingtonPostIE
from .wat import WatIE
from .wayofthemaster import WayOfTheMasterIE
@@ -458,7 +464,6 @@ from .xvideos import XVideosIE
from .xtube import XTubeUserIE, XTubeIE
from .yahoo import (
YahooIE,
YahooNewsIE,
YahooSearchIE,
)
from .ynet import YnetIE

View File

@@ -4,37 +4,61 @@ import re
import json
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_age_limit,
)
class BreakIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)'
_TEST = {
_VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b',
'md5': '33aa4ff477ecd124d18d7b5d23b87ce5',
'info_dict': {
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
}
}
}, {
'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1).split("-")[-1]
embed_url = 'http://www.break.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://www.break.com/embed/%s' % video_id, video_id)
info = json.loads(self._search_regex(
r'var embedVars = ({.*})\s*?</script>',
webpage, 'info json', flags=re.DOTALL))
youtube_id = info.get('youtubeId')
if youtube_id:
return self.url_result(youtube_id, 'Youtube')
final_url = video_url + '?' + info['AuthToken']
formats = [{
'url': media['uri'] + '?' + info['AuthToken'],
'tbr': media['bitRate'],
'width': media['width'],
'height': media['height'],
} for media in info['media']]
if not formats:
formats.append({
'url': info['videoUri']
})
self._sort_formats(formats)
duration = int_or_none(info.get('videoLengthInSeconds'))
age_limit = parse_age_limit(info.get('audienceRating'))
return {
'id': video_id,
'url': final_url,
'title': info['contentName'],
'thumbnail': info['thumbUri'],
'duration': duration,
'age_limit': age_limit,
'formats': formats,
}

View File

@@ -138,6 +138,8 @@ class InfoExtractor(object):
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.

View File

@@ -82,11 +82,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
]
def _real_extract(self, url):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
url = 'http://www.dailymotion.com/video/%s' % video_id
# Retrieve video webpage to extract further information
@@ -147,18 +143,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self._list_available_subtitles(video_id, webpage)
return
view_count = self._search_regex(
r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False)
if view_count is not None:
view_count = str_to_int(view_count)
view_count = str_to_int(self._search_regex(
r'video_views_count[^>]+>\s+([\d\.,]+)',
webpage, 'view count', fatal=False))
title = self._og_search_title(webpage, default=None)
if title is None:
title = self._html_search_regex(
r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
'title')
return {
'id': video_id,
'id': video_id,
'formats': formats,
'uploader': info['owner.screenname'],
'upload_date': video_upload_date,
'title': self._og_search_title(webpage),
'subtitles': video_subtitles,
'upload_date': video_upload_date,
'title': title,
'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
'view_count': view_count,

View File

@@ -639,6 +639,16 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
if m:
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
return _playlist_from_matches(
playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
match = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
@@ -837,47 +847,51 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
def filter_video(urls):
return list(filter(check_video, urls))
# Start with something easy: JW Player in SWFObject
found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
if not found:
# Look for gorilla-vid style embedding
found = re.findall(r'''(?sx)
found = filter_video(re.findall(r'''(?sx)
(?:
jw_plugins|
JWPlayerOptions|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
)
.*?file\s*:\s*["\'](.*?)["\']''', webpage)
.*?file\s*:\s*["\'](.*?)["\']''', webpage))
if not found:
# Broaden the search a little bit
found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
found = filter_video(re.findall(
r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
if not found:
# Flow player
found = re.findall(r'''(?xs)
found = filter_video(re.findall(r'''(?xs)
flowplayer\("[^"]+",\s*
\{[^}]+?\}\s*,
\s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
["']?url["']?\s*:\s*["']([^"']+)["']
''', webpage)
''', webpage))
if not found:
# Try to find twitter cards info
found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
found = filter_video(re.findall(
r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
if not found:
# We look for Open Graph info:
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
return '.' in vpath and vext not in ('swf', 'png', 'jpg')
found = list(filter(
check_video,
re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)

View File

@@ -0,0 +1,398 @@
# coding: utf-8
from __future__ import unicode_literals
import random
import math
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
compat_str,
compat_chr,
compat_ord,
)
class GloboIE(InfoExtractor):
_VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)'
_API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist'
_SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s'
_VIDEOID_REGEXES = [
r'\bdata-video-id="(\d+)"',
r'\bdata-player-videosids="(\d+)"',
r'<div[^>]+\bid="(\d+)"',
]
_RESIGN_EXPIRATION = 86400
_TESTS = [
{
'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/',
'md5': '03ebf41cb7ade43581608b7d9b71fab0',
'info_dict': {
'id': '3654973',
'ext': 'mp4',
'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão',
'duration': 251.585,
'uploader': 'SporTV',
'uploader_id': 698,
'like_count': int,
}
},
{
'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
'info_dict': {
'id': '3607726',
'ext': 'mp4',
'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
'duration': 103.204,
'uploader': 'Globo.com',
'uploader_id': 265,
'like_count': int,
}
},
{
'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
'info_dict': {
'id': '3652183',
'ext': 'mp4',
'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
'duration': 110.711,
'uploader': 'Rede Globo',
'uploader_id': 196,
'like_count': int,
}
},
]
class MD5():
HEX_FORMAT_LOWERCASE = 0
HEX_FORMAT_UPPERCASE = 1
BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = ''
BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '='
PADDING = '=0xFF01DD'
hexcase = 0
b64pad = ''
def __init__(self):
pass
class JSArray(list):
def __getitem__(self, y):
try:
return list.__getitem__(self, y)
except IndexError:
return 0
def __setitem__(self, i, y):
try:
return list.__setitem__(self, i, y)
except IndexError:
self.extend([0] * (i - len(self) + 1))
self[-1] = y
@classmethod
def hex_md5(cls, param1):
return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1)))
@classmethod
def b64_md5(cls, param1, param2=None):
return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2)))
@classmethod
def any_md5(cls, param1, param2):
return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2)
@classmethod
def rstr_md5(cls, param1):
return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8))
@classmethod
def rstr2hex(cls, param1):
_loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef'
_loc_3 = ''
for _loc_5 in range(0, len(param1)):
_loc_4 = compat_ord(param1[_loc_5])
_loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15]
return _loc_3
@classmethod
def rstr2b64(cls, param1):
_loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
_loc_3 = ''
_loc_4 = len(param1)
for _loc_5 in range(0, _loc_4, 3):
_loc_6_1 = compat_ord(param1[_loc_5]) << 16
_loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0
_loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0
_loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3
for _loc_7 in range(0, 4):
if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8:
_loc_3 += cls.b64pad
else:
_loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63]
return _loc_3
@staticmethod
def rstr2any(param1, param2):
_loc_3 = len(param2)
_loc_4 = []
_loc_9 = [0] * ((len(param1) >> 2) + 1)
for _loc_5 in range(0, len(_loc_9)):
_loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1])
while len(_loc_9) > 0:
_loc_8 = []
_loc_7 = 0
for _loc_5 in range(0, len(_loc_9)):
_loc_7 = (_loc_7 << 16) + _loc_9[_loc_5]
_loc_6 = math.floor(_loc_7 / _loc_3)
_loc_7 -= _loc_6 * _loc_3
if len(_loc_8) > 0 or _loc_6 > 0:
_loc_8[len(_loc_8)] = _loc_6
_loc_4[len(_loc_4)] = _loc_7
_loc_9 = _loc_8
_loc_10 = ''
_loc_5 = len(_loc_4) - 1
while _loc_5 >= 0:
_loc_10 += param2[_loc_4[_loc_5]]
_loc_5 -= 1
return _loc_10
@classmethod
def str2rstr_utf8(cls, param1, param2=None):
_loc_3 = ''
_loc_4 = -1
if not param2:
param2 = cls.PADDING
param1 = param1 + param2[1:9]
while True:
_loc_4 += 1
if _loc_4 >= len(param1):
break
_loc_5 = compat_ord(param1[_loc_4])
_loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0
if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343:
_loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023)
_loc_4 += 1
if _loc_5 <= 127:
_loc_3 += compat_chr(_loc_5)
continue
if _loc_5 <= 2047:
_loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63)
continue
if _loc_5 <= 65535:
_loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr(
128 | _loc_5 & 63)
continue
if _loc_5 <= 2097151:
_loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr(
128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63)
return _loc_3
@staticmethod
def rstr2binl(param1):
_loc_2 = [0] * ((len(param1) >> 2) + 1)
for _loc_3 in range(0, len(_loc_2)):
_loc_2[_loc_3] = 0
for _loc_3 in range(0, len(param1) * 8, 8):
_loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32
return _loc_2
@staticmethod
def binl2rstr(param1):
_loc_2 = ''
for _loc_3 in range(0, len(param1) * 32, 8):
_loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255)
return _loc_2
@classmethod
def binl_md5(cls, param1, param2):
param1 = cls.JSArray(param1)
param1[param2 >> 5] |= 128 << param2 % 32
param1[(param2 + 64 >> 9 << 4) + 14] = param2
_loc_3 = 1732584193
_loc_4 = -271733879
_loc_5 = -1732584194
_loc_6 = 271733878
for _loc_7 in range(0, len(param1), 16):
_loc_8 = _loc_3
_loc_9 = _loc_4
_loc_10 = _loc_5
_loc_11 = _loc_6
_loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936)
_loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586)
_loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819)
_loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330)
_loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897)
_loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426)
_loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341)
_loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983)
_loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416)
_loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417)
_loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063)
_loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162)
_loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682)
_loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101)
_loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290)
_loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329)
_loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510)
_loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632)
_loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713)
_loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302)
_loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691)
_loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083)
_loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335)
_loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848)
_loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438)
_loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690)
_loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961)
_loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501)
_loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467)
_loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784)
_loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473)
_loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734)
_loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558)
_loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463)
_loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562)
_loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556)
_loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060)
_loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353)
_loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632)
_loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640)
_loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174)
_loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222)
_loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979)
_loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189)
_loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487)
_loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835)
_loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520)
_loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651)
_loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844)
_loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415)
_loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905)
_loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055)
_loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571)
_loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606)
_loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523)
_loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799)
_loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359)
_loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744)
_loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380)
_loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649)
_loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070)
_loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379)
_loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259)
_loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551)
_loc_3 = cls.safe_add(_loc_3, _loc_8)
_loc_4 = cls.safe_add(_loc_4, _loc_9)
_loc_5 = cls.safe_add(_loc_5, _loc_10)
_loc_6 = cls.safe_add(_loc_6, _loc_11)
return [_loc_3, _loc_4, _loc_5, _loc_6]
@classmethod
def md5_cmn(cls, param1, param2, param3, param4, param5, param6):
return cls.safe_add(
cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3)
@classmethod
def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7):
return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7)
@classmethod
def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7):
return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7)
@classmethod
def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7):
return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7)
@classmethod
def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7):
return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7)
@classmethod
def safe_add(cls, param1, param2):
_loc_3 = (param1 & 65535) + (param2 & 65535)
_loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16)
return cls.lshift(_loc_4, 16) | _loc_3 & 65535
@classmethod
def bit_rol(cls, param1, param2):
return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2)
@staticmethod
def lshift(value, count):
r = (0xFFFFFFFF & value) << count
return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
video = self._download_json(
self._API_URL_TEMPLATE % video_id, video_id)['videos'][0]
title = video['title']
duration = float_or_none(video['duration'], 1000)
like_count = video['likes']
uploader = video['channel']
uploader_id = video['channel_id']
formats = []
for resource in video['resources']:
resource_id = resource.get('_id')
if not resource_id:
continue
security = self._download_json(
self._SECURITY_URL_TEMPLATE % (video_id, resource_id),
video_id, 'Downloading security hash for %s' % resource_id)
security_hash = security.get('hash')
if not security_hash:
message = security.get('message')
if message:
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, message), expected=True)
continue
hash_code = security_hash[:2]
received_time = int(security_hash[2:12])
received_random = security_hash[12:22]
received_md5 = security_hash[22:]
sign_time = received_time + self._RESIGN_EXPIRATION
padding = '%010d' % random.randint(1, 10000000000)
signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding)
signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5
formats.append({
'url': '%s?h=%s&k=%s' % (resource['url'], signed_hash, 'flash'),
'format_id': resource_id,
'height': resource['height']
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'like_count': like_count,
'formats': formats
}

View File

@@ -1,13 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import re
import codecs
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
from ..utils import unified_strdate
class GooglePlusIE(InfoExtractor):
@@ -19,74 +17,57 @@ class GooglePlusIE(InfoExtractor):
'info_dict': {
'id': 'ZButuJc6CtH',
'ext': 'flv',
'title': '嘆きの天使 降臨',
'upload_date': '20120613',
'uploader': '井上ヨシマサ',
'title': '嘆きの天使 降臨',
}
}
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
# Step 1, Retrieve post webpage to extract further information
webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
self.report_extraction(video_id)
# Extract update date
upload_date = self._html_search_regex(
title = self._og_search_description(webpage).splitlines()[0]
upload_date = unified_strdate(self._html_search_regex(
r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
webpage, 'upload date', fatal=False, flags=re.VERBOSE)
if upload_date:
# Convert timestring to a format suitable for filename
upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
upload_date = upload_date.strftime('%Y%m%d')
# Extract uploader
uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
webpage, 'uploader', fatal=False)
# Extract title
# Get the first line for title
video_title = self._og_search_description(webpage).splitlines()[0]
webpage, 'upload date', fatal=False, flags=re.VERBOSE))
uploader = self._html_search_regex(
r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
# Step 2, Simulate clicking the image box to launch video
DOMAIN = 'https://plus.google.com/'
video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
video_page = self._search_regex(
r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, 'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
def unicode_escape(s):
decoder = codecs.getdecoder('unicode_escape')
return re.sub(
r'\\u[0-9a-fA-F]{4,}',
lambda m: decoder(m.group(0))[0],
s)
# Extract video links all sizes
pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
mobj = re.findall(pattern, webpage)
if len(mobj) == 0:
raise ExtractorError('Unable to extract video links')
# Sort in resolution
links = sorted(mobj)
# Choose the lowest of the sort, i.e. highest resolution
video_url = links[-1]
# Only get the url. The resolution part in the tuple has no use anymore
video_url = video_url[-1]
# Treat escaped \u0026 style hex
try:
video_url = video_url.decode("unicode_escape")
except AttributeError: # Python 3
video_url = bytes(video_url, 'ascii').decode('unicode-escape')
formats = [{
'url': unicode_escape(video_url),
'ext': 'flv',
'width': int(width),
'height': int(height),
} for width, height, video_url in re.findall(
r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)]
self._sort_formats(formats)
return {
'id': video_id,
'url': video_url,
'title': title,
'uploader': uploader,
'upload_date': upload_date,
'title': video_title,
'ext': 'flv',
'formats': formats,
}

View File

@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
compat_urllib_parse,
compat_urllib_request,
@@ -12,20 +13,22 @@ from ..utils import (
class GorillaVidIE(InfoExtractor):
IE_DESC = 'GorillaVid.in and daclips.in'
IE_DESC = 'GorillaVid.in, daclips.in and movpod.in'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
(?:daclips\.in|gorillavid\.in))/
(?:daclips\.in|gorillavid\.in|movpod\.in))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
_FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
_TESTS = [{
'url': 'http://gorillavid.in/06y9juieqpmi',
'md5': '5ae4a3580620380619678ee4875893ba',
'info_dict': {
'id': '06y9juieqpmi',
'ext': 'flv',
'title': 'Rebecca Black My Moment Official Music Video Reaction',
'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',
'thumbnail': 're:http://.*\.jpg',
},
}, {
@@ -46,6 +49,9 @@ class GorillaVidIE(InfoExtractor):
'title': 'Micro Pig piglets ready on 16th July 2009',
'thumbnail': 're:http://.*\.jpg',
},
}, {
'url': 'http://movpod.in/0wguyyxi1yca',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -54,6 +60,9 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id)
if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
fields = dict(re.findall(r'''(?x)<input\s+
type="hidden"\s+
name="([^"]+)"\s+
@@ -69,14 +78,14 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage(req, video_id, 'Downloading video page')
title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title')
thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail')
url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url')
title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title')
video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url')
thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False)
formats = [{
'format_id': 'sd',
'url': url,
'ext': determine_ext(url),
'url': video_url,
'ext': determine_ext(video_url),
'quality': 1,
}]

View File

@@ -28,13 +28,13 @@ class HowStuffWorksIE(InfoExtractor):
}
},
{
'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm',
'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
'info_dict': {
'id': '553470',
'display_id': 'deadliest-catch-jakes-farewell-pots',
'id': '453464',
'display_id': 'survival-zone-food-and-water-in-the-savanna',
'ext': 'mp4',
'title': 'Deadliest Catch: Jake\'s Farewell Pots',
'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc',
'title': 'Survival Zone: Food and Water In the Savanna',
'description': 'md5:7e1c89f6411434970c15fa094170c371',
'thumbnail': 're:^https?://.*\.jpg$',
},
'params': {

View File

@@ -70,7 +70,7 @@ class MixcloudIE(InfoExtractor):
raise ExtractorError('Unable to extract track url')
PREFIX = (
r'<div class="cloudcast-play-button-container"'
r'<div class="cloudcast-play-button-container[^"]*?"'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')

View File

@@ -6,7 +6,6 @@ from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
find_xpath_attr,
)
@@ -88,8 +87,9 @@ class MLBIE(InfoExtractor):
duration = parse_duration(detail.find('./duration').text)
timestamp = parse_iso8601(detail.attrib['date'][:-5])
thumbnail = find_xpath_attr(
detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
thumbnails = [{
'url': thumbnail.text,
} for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')]
formats = []
for media_url in detail.findall('./url'):
@@ -116,5 +116,5 @@ class MLBIE(InfoExtractor):
'duration': duration,
'timestamp': timestamp,
'formats': formats,
'thumbnail': thumbnail,
'thumbnails': thumbnails,
}

View File

@@ -18,16 +18,16 @@ class NDRIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.ndr.de/fernsehen/media/dienordreportage325.html',
'md5': '4a4eeafd17c3058b65f0c8f091355855',
'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
'note': 'Video file',
'info_dict': {
'id': '325',
'id': '25866',
'ext': 'mp4',
'title': 'Blaue Bohnen aus Blocken',
'description': 'md5:190d71ba2ccddc805ed01547718963bc',
'duration': 1715,
},
'title': 'Kartoffeltage in der Lewitz',
'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
'duration': 166,
}
},
{
'url': 'http://www.ndr.de/info/audio51535.html',

View File

@@ -39,18 +39,17 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
# Determine whether the downloader uses authentication to download video
_AUTHENTICATE = False
# Determine whether the downloader used authentication to download video
_AUTHENTICATED = False
def _real_initialize(self):
if self._downloader.params.get('username', None) is not None:
self._AUTHENTICATE = True
if self._AUTHENTICATE:
self._login()
self._login()
def _login(self):
(username, password) = self._get_login_info()
# No authentication to be performed
if not username:
return True
# Log in
login_form_strs = {
@@ -68,6 +67,8 @@ class NiconicoIE(InfoExtractor):
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
self._downloader.report_warning('unable to log in: bad username or password')
return False
# Successful login
self._AUTHENTICATED = True
return True
def _real_extract(self, url):
@@ -82,7 +83,7 @@ class NiconicoIE(InfoExtractor):
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note='Downloading video info page')
if self._AUTHENTICATE:
if self._AUTHENTICATED:
# Get flv info
flv_info_webpage = self._download_webpage(
'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,

View File

@@ -0,0 +1,60 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class PlanetaPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)'
_API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}'
_THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}'
_TEST = {
'url': 'http://planetaplay.com/?sng=3586',
'md5': '9d569dceb7251a4e01355d5aea60f9db',
'info_dict': {
'id': '3586',
'ext': 'flv',
'title': 'md5:e829428ee28b1deed00de90de49d1da1',
}
}
_SONG_FORMATS = {
'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'),
'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'),
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
response = self._download_json(
self._API_URL.format(video_id), video_id)['response']
try:
data = response.get('data')[0]
except IndexError:
raise ExtractorError(
'%s: failed to get the playlist' % self.IE_NAME, expected=True)
title = '{song_artists:} - {sng_name:}'.format(**data)
thumbnail = self._THUMBNAIL_URL.format(**data)
formats = []
for format_id, (quality, url_template) in self._SONG_FORMATS.items():
formats.append({
'format_id': format_id,
'url': url_template.format(**data),
'quality': quality,
'ext': 'flv',
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
}

View File

@@ -4,19 +4,27 @@ import re
import json
from .common import InfoExtractor
from ..utils import int_or_none
from ..utils import (
int_or_none,
js_to_json,
qualities,
determine_ext,
)
class PornHdIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
_VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
_TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
'id': '1962',
'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'ext': 'mp4',
'title': 'Sierra loves doing laundry',
'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
'thumbnail': 're:^https?://.*\.jpg',
'view_count': int,
'age_limit': 18,
}
}
@@ -24,8 +32,9 @@ class PornHdIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, video_id)
webpage = self._download_webpage(url, display_id or video_id)
title = self._html_search_regex(
r'<title>(.+) porn HD.+?</title>', webpage, 'title')
@@ -33,38 +42,21 @@ class PornHdIE(InfoExtractor):
r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
view_count = int_or_none(self._html_search_regex(
r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
thumbnail = self._search_regex(
r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
videos = re.findall(
r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
if mobj:
flashvars = json.loads(mobj.group('flashvars'))
for key, quality in [('hashlink', 'low'), ('hd', 'high')]:
redirect_url = flashvars.get(key)
if redirect_url:
videos.append(('flv', quality, redirect_url))
thumbnail = flashvars['urlWallpaper']
else:
thumbnail = self._og_search_thumbnail(webpage)
formats = []
for format_, quality, redirect_url in videos:
format_id = '%s-%s' % (format_.lower(), quality.lower())
video_url = self._download_webpage(
redirect_url, video_id, 'Downloading %s video link' % format_id, fatal=False)
if not video_url:
continue
formats.append({
'url': video_url,
'ext': format_.lower(),
'format_id': format_id,
'quality': 1 if quality.lower() == 'high' else 0,
})
quality = qualities(['SD', 'HD'])
formats = [{
'url': source['file'],
'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])),
'quality': quality(source['label']),
} for source in json.loads(js_to_json(self._search_regex(
r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))]
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,

View File

@@ -0,0 +1,81 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
int_or_none,
)
class SportBoxIE(InfoExtractor):
_VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
_TESTS = [
{
'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
'md5': 'ff56a598c2cf411a9a38a69709e97079',
'info_dict': {
'id': '80822',
'ext': 'mp4',
'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1411896237,
'upload_date': '20140928',
'duration': 4846,
'view_count': int,
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
'only_matching': True,
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
player = self._download_webpage(
'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
display_id, 'Downloading player webpage')
hls = self._search_regex(
r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
title = self._html_search_regex(
r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
description = self._html_search_regex(
r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
timestamp = parse_iso8601(self._search_regex(
r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
duration = parse_duration(self._html_search_regex(
r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False))
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@@ -0,0 +1,70 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class TheOnionIE(InfoExtractor):
_VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
_TEST = {
'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
'md5': '19eaa9a39cf9b9804d982e654dc791ee',
'info_dict': {
'id': '2133',
'ext': 'mp4',
'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image',
'description': 'md5:cc12448686b5600baae9261d3e180910',
'thumbnail': 're:^https?://.*\.jpg\?\d+$',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
article_id = mobj.group('article_id')
webpage = self._download_webpage(url, article_id)
video_id = self._search_regex(
r'"videoId":\s(\d+),', webpage, 'video ID')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
if not sources:
raise ExtractorError(
'No sources found for video %s' % video_id, expected=True)
formats = []
for src, type_ in sources:
if type_ == 'video/mp4':
formats.append({
'format_id': 'mp4_sd',
'preference': 1,
'url': src,
})
elif type_ == 'video/webm':
formats.append({
'format_id': 'webm_sd',
'preference': 0,
'url': src,
})
elif type_ == 'application/x-mpegURL':
formats.extend(
self._extract_m3u8_formats(src, video_id, preference=-1))
else:
self.report_warning(
'Encountered unexpected format: %s' % type_)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'description': description,
}

View File

@@ -0,0 +1,100 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import unified_strdate
class TheSixtyOneIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?thesixtyone\.com/
(?:.*?/)*
(?:
s|
song/comments/list|
song
)/(?P<id>[A-Za-z0-9]+)/?$'''
_SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
_SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
_THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
_TESTS = [
{
'url': 'http://www.thesixtyone.com/s/SrE3zD7s1jt/',
'md5': '821cc43b0530d3222e3e2b70bb4622ea',
'info_dict': {
'id': 'SrE3zD7s1jt',
'ext': 'mp3',
'title': 'CASIO - Unicorn War Mixtape',
'thumbnail': 're:^https?://.*_desktop$',
'upload_date': '20071217',
'duration': 3208,
}
},
{
'url': 'http://www.thesixtyone.com/song/comments/list/SrE3zD7s1jt',
'only_matching': True,
},
{
'url': 'http://www.thesixtyone.com/s/ULoiyjuJWli#/s/SrE3zD7s1jt/',
'only_matching': True,
},
{
'url': 'http://www.thesixtyone.com/#/s/SrE3zD7s1jt/',
'only_matching': True,
},
{
'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
'only_matching': True,
},
]
_DECODE_MAP = {
"x": "a",
"m": "b",
"w": "c",
"q": "d",
"n": "e",
"p": "f",
"a": "0",
"h": "1",
"e": "2",
"u": "3",
"s": "4",
"i": "5",
"o": "6",
"y": "7",
"r": "8",
"c": "9"
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
song_id = mobj.group('id')
webpage = self._download_webpage(
self._SONG_URL_TEMPLATE.format(song_id), song_id)
song_data = json.loads(self._search_regex(
r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
url = self._SONG_FILE_URL_TEMPLATE.format(
"".join(reversed(keys)), **song_data)
formats = [{
'format_id': 'sd',
'url': url,
'ext': 'mp3',
}]
return {
'id': song_id,
'title': '{artist:} - {name:}'.format(**song_data),
'formats': formats,
'comment_count': song_data.get('comments_count'),
'duration': song_data.get('play_time'),
'like_count': song_data.get('score'),
'thumbnail': self._THUMBNAIL_URL_TEMPLATE.format(**song_data),
'upload_date': unified_strdate(song_data.get('publish_date')),
}

View File

@@ -56,7 +56,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
(?P<proto>(?:https?:)?//)?
https?://
(?:(?:www|(?P<player>player))\.)?
vimeo(?P<pro>pro)?\.com/
(?!channels/[^/?#]+/?(?:$|[?#])|album/)

View File

@@ -0,0 +1,89 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
xpath_text,
int_or_none,
)
class WallaIE(SubtitlesInfoExtractor):
_VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
_TEST = {
'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
'info_dict': {
'id': '2642630',
'display_id': 'one-direction-all-for-one',
'ext': 'flv',
'title': 'וואן דיירקשן: ההיסטריה',
'description': 'md5:de9e2512a92442574cdb0913c49bc4d8',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 3600,
},
'params': {
# rtmp download
'skip_download': True,
}
}
_SUBTITLE_LANGS = {
'עברית': 'heb',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
video = self._download_xml(
'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id,
display_id)
item = video.find('./items/item')
title = xpath_text(item, './title', 'title')
description = xpath_text(item, './synopsis', 'description')
thumbnail = xpath_text(item, './preview_pic', 'thumbnail')
duration = int_or_none(xpath_text(item, './duration', 'duration'))
subtitles = {}
for subtitle in item.findall('./subtitles/subtitle'):
lang = xpath_text(subtitle, './title')
subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src')
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
subtitles = self.extract_subtitles(video_id, subtitles)
formats = []
for quality in item.findall('./qualities/quality'):
format_id = xpath_text(quality, './title')
fmt = {
'url': 'rtmp://wafla.walla.co.il/vod',
'play_path': xpath_text(quality, './src'),
'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf',
'page_url': url,
'ext': 'flv',
'format_id': xpath_text(quality, './title'),
}
m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
if m:
fmt['height'] = int(m.group('height'))
formats.append(fmt)
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
}

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
@@ -6,6 +7,7 @@ import re
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse,
compat_urlparse,
clean_html,
@@ -15,7 +17,7 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
_VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
_VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -25,6 +27,7 @@ class YahooIE(InfoExtractor):
'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
'duration': 6863,
},
},
{
@@ -34,7 +37,8 @@ class YahooIE(InfoExtractor):
'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
'duration': 151,
},
},
{
@@ -45,15 +49,95 @@ class YahooIE(InfoExtractor):
'ext': 'mp4',
'title': "Yahoo Saves 'Community'",
'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
'duration': 170,
}
},
{
'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html',
'md5': '92a7fdd8a08783c68a174d7aa067dde8',
'info_dict': {
'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb',
'ext': 'mp4',
'title': '選情站報 街頭民調 台北市篇',
'description': '選情站報 街頭民調 台北市篇',
'duration': 429,
}
},
{
'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
'md5': '0b51660361f0e27c9789e7037ef76f4b',
'info_dict': {
'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
'ext': 'mp4',
'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
'description': 'md5:f66c890e1490f4910a9953c941dee944',
'duration': 97,
}
},
{
'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
'md5': '57e06440778b1828a6079d2f744212c4',
'info_dict': {
'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
'ext': 'mp4',
'title': 'Program that makes hockey more affordable not offered in Manitoba',
'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
'duration': 121,
}
}, {
'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html',
'md5': '3e401e4eed6325aa29d9b96125fd5b4f',
'info_dict': {
'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83',
'ext': 'mp4',
'title': "Apple Is The World's Most Valuable Brand",
'description': 'md5:73eabc1a11c6f59752593b2ceefa1262',
'duration': 21,
}
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '67010fdf3a08d290e060a4dd96baa07b',
'info_dict': {
'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
'ext': 'mp4',
'title': 'China Moses Is Crazy About the Blues',
'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
'duration': 128,
}
}, {
'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
'md5': 'd9a083ccf1379127bf25699d67e4791b',
'info_dict': {
'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
'ext': 'mp4',
'title': 'Connect the Dots: Dark Side of Virgo',
'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
'duration': 201,
}
}, {
'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
'only_matching': True,
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
url = mobj.group('url')
webpage = self._download_webpage(url, video_id)
host = mobj.group('host')
webpage = self._download_webpage(url, display_id)
# Look for iframed media first
iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
if iframe_m:
iframepage = self._download_webpage(
host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
items_json = self._search_regex(
r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
if items_json:
items = json.loads(items_json)
video_id = items[0]['id']
return self._get_info(video_id, display_id, webpage)
items_json = self._search_regex(
r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
@@ -64,20 +148,22 @@ class YahooIE(InfoExtractor):
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
r'"first_videoid"\s*:\s*"([^"]+)"',
]
long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
video_id = long_id
video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
else:
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
# from another page
long_id = info['id']
return self._get_info(long_id, video_id, webpage)
video_id = info['id']
return self._get_info(video_id, display_id, webpage)
def _get_info(self, long_id, video_id, webpage):
def _get_info(self, video_id, display_id, webpage):
region = self._search_regex(
r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
webpage, 'region', fatal=False, default='US')
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
' AND protocol="http"' % long_id)
' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"'
' AND protocol="http"' % (video_id, region))
data = compat_urllib_parse.urlencode({
'q': query,
'env': 'prod',
@@ -85,9 +171,17 @@ class YahooIE(InfoExtractor):
})
query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, 'Downloading video info')
display_id, 'Downloading video info')
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
meta = info.get('meta')
if not meta:
msg = info['status'].get('msg')
if msg:
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, msg), expected=True)
raise ExtractorError('Unable to extract media object meta')
formats = []
for s in info['streams']:
@@ -114,36 +208,15 @@ class YahooIE(InfoExtractor):
return {
'id': video_id,
'display_id': display_id,
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
'duration': int_or_none(meta.get('duration')),
}
class YahooNewsIE(YahooIE):
IE_NAME = 'yahoo:news'
_VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
_TESTS = [{
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '67010fdf3a08d290e060a4dd96baa07b',
'info_dict': {
'id': '104538833',
'ext': 'mp4',
'title': 'China Moses Is Crazy About the Blues',
'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
return self._get_info(long_id, video_id, webpage)
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
_MAX_RESULTS = 1000

View File

@@ -894,6 +894,7 @@ def unified_strdate(date_str):
'%Y/%m/%d %H:%M:%S',
'%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
'%d.%m.%Y %H.%M',
'%Y-%m-%dT%H:%M:%SZ',
@@ -1574,6 +1575,13 @@ US_RATINGS = {
}
def parse_age_limit(s):
if s is None:
return None
m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)

View File

@@ -1,2 +1,2 @@
__version__ = '2014.10.02'
__version__ = '2014.10.12'