[vk] improve extraction

- add support for Odnoklassniki embeds
- update tests
- extract more video from user lists(closes #4470)
- fix wall post audio extraction(closes #18332)
- improve error detection(closes #22568)
This commit is contained in:
Remita Amine 2019-10-25 19:35:07 +01:00
parent 416c3ca7f5
commit 3c989818e7

View File

@ -12,7 +12,6 @@ from ..utils import (
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
remove_start,
str_or_none, str_or_none,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
@ -21,6 +20,7 @@ from ..utils import (
urlencode_postdata, urlencode_postdata,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
from .odnoklassniki import OdnoklassnikiIE
from .pladform import PladformIE from .pladform import PladformIE
from .vimeo import VimeoIE from .vimeo import VimeoIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
@ -60,6 +60,18 @@ class VKBaseIE(InfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _download_payload(self, path, video_id, data, fatal=True):
data['al'] = 1
code, payload = self._download_json(
'https://vk.com/%s.php' % path, video_id,
data=urlencode_postdata(data), fatal=fatal,
headers={'X-Requested-With': 'XMLHttpRequest'})['payload']
if code == '3':
self.raise_login_required()
elif code == '8':
raise ExtractorError(clean_html(payload[0][1:-1]), expected=True)
return payload
class VKIE(VKBaseIE): class VKIE(VKBaseIE):
IE_NAME = 'vk' IE_NAME = 'vk'
@ -96,7 +108,6 @@ class VKIE(VKBaseIE):
}, },
{ {
'url': 'http://vk.com/video205387401_165548505', 'url': 'http://vk.com/video205387401_165548505',
'md5': '6c0aeb2e90396ba97035b9cbde548700',
'info_dict': { 'info_dict': {
'id': '205387401_165548505', 'id': '205387401_165548505',
'ext': 'mp4', 'ext': 'mp4',
@ -110,18 +121,18 @@ class VKIE(VKBaseIE):
}, },
{ {
'note': 'Embedded video', 'note': 'Embedded video',
'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa',
'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 'md5': '7babad3b85ea2e91948005b1b8b0cb84',
'info_dict': { 'info_dict': {
'id': '32194266_162925554', 'id': '-77521_162222515',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Vladimir Gavrin', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'title': 'Lin Dan', 'title': 'ProtivoGunz - Хуёвая песня',
'duration': 101, 'duration': 195,
'upload_date': '20120730', 'upload_date': '20120212',
'view_count': int, 'timestamp': 1329049880,
'uploader_id': '-77521',
}, },
'skip': 'This video has been removed from public access.',
}, },
{ {
# VIDEO NOW REMOVED # VIDEO NOW REMOVED
@ -138,18 +149,19 @@ class VKIE(VKBaseIE):
'upload_date': '20121218', 'upload_date': '20121218',
'view_count': int, 'view_count': int,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Removed',
}, },
{ {
'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
'md5': '4d7a5ef8cf114dfa09577e57b2993202',
'info_dict': { 'info_dict': {
'id': '-43215063_168067957', 'id': '-43215063_168067957',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Киномания - лучшее из мира кино', 'uploader': 'Bro Mazter',
'title': ' ', 'title': ' ',
'duration': 7291, 'duration': 7291,
'upload_date': '20140328', 'upload_date': '20140328',
'uploader_id': '223413403',
'timestamp': 1396018030,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, },
@ -165,7 +177,7 @@ class VKIE(VKBaseIE):
'upload_date': '20140626', 'upload_date': '20140626',
'view_count': int, 'view_count': int,
}, },
'skip': 'Only works from Russia', 'skip': 'Removed',
}, },
{ {
# video (removed?) only available with list id # video (removed?) only available with list id
@ -247,6 +259,9 @@ class VKIE(VKBaseIE):
'uploader_id': '-387766', 'uploader_id': '-387766',
'timestamp': 1475137527, 'timestamp': 1475137527,
}, },
'params': {
'skip_download': True,
},
}, },
{ {
# live stream, hls and rtmp links, most likely already finished live # live stream, hls and rtmp links, most likely already finished live
@ -288,80 +303,94 @@ class VKIE(VKBaseIE):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid') video_id = mobj.group('videoid')
mv_data = {}
if video_id: if video_id:
info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id data = {
'act': 'show_inline',
'video': video_id,
}
# Some videos (removed?) can only be downloaded with list id specified # Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id') list_id = mobj.group('list_id')
if list_id: if list_id:
info_url += '&list=%s' % list_id data['list'] = list_id
payload = self._download_payload('al_video', video_id, data)
info_page = payload[1]
opts = payload[-1]
mv_data = opts.get('mvData') or {}
player = opts.get('player') or {}
else: else:
info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_page = self._download_webpage(info_url, video_id) info_page = self._download_webpage(
'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id)
error_message = self._html_search_regex( error_message = self._html_search_regex(
[r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None) info_page, 'error message', default=None)
if error_message: if error_message:
raise ExtractorError(error_message, expected=True) raise ExtractorError(error_message, expected=True)
if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
raise ExtractorError( raise ExtractorError(
'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
expected=True) expected=True)
ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.' ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
ERRORS = { ERRORS = {
r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
ERROR_COPYRIGHT, ERROR_COPYRIGHT,
r'>The video .*? was removed from public access by request of the copyright holder.<': r'>The video .*? was removed from public access by request of the copyright holder.<':
ERROR_COPYRIGHT, ERROR_COPYRIGHT,
r'<!>Please log in or <': r'<!>Please log in or <':
'Video %s is only available for registered users, ' 'Video %s is only available for registered users, '
'use --username and --password options to provide account credentials.', 'use --username and --password options to provide account credentials.',
r'<!>Unknown error': r'<!>Unknown error':
'Video %s does not exist.', 'Video %s does not exist.',
r'<!>Видео временно недоступно': r'<!>Видео временно недоступно':
'Video %s is temporarily unavailable.', 'Video %s is temporarily unavailable.',
r'<!>Access denied': r'<!>Access denied':
'Access denied to video %s.', 'Access denied to video %s.',
r'<!>Видеозапись недоступна, так как её автор был заблокирован.': r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
'Video %s is no longer available, because its author has been blocked.', 'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because its author has been blocked.': r'<!>This video is no longer available, because its author has been blocked.':
'Video %s is no longer available, because its author has been blocked.', 'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because it has been deleted.': r'<!>This video is no longer available, because it has been deleted.':
'Video %s is no longer available, because it has been deleted.', 'Video %s is no longer available, because it has been deleted.',
r'<!>The video .+? is not available in your region.': r'<!>The video .+? is not available in your region.':
'Video %s is not available in your region.', 'Video %s is not available in your region.',
} }
for error_re, error_msg in ERRORS.items(): for error_re, error_msg in ERRORS.items():
if re.search(error_re, info_page): if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True) raise ExtractorError(error_msg % video_id, expected=True)
player = self._parse_json(self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n',
info_page, 'player params'), video_id)
youtube_url = YoutubeIE._extract_url(info_page) youtube_url = YoutubeIE._extract_url(info_page)
if youtube_url: if youtube_url:
return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) return self.url_result(youtube_url, YoutubeIE.ie_key())
vimeo_url = VimeoIE._extract_url(url, info_page) vimeo_url = VimeoIE._extract_url(url, info_page)
if vimeo_url is not None: if vimeo_url is not None:
return self.url_result(vimeo_url) return self.url_result(vimeo_url, VimeoIE.ie_key())
pladform_url = PladformIE._extract_url(info_page) pladform_url = PladformIE._extract_url(info_page)
if pladform_url: if pladform_url:
return self.url_result(pladform_url) return self.url_result(pladform_url, PladformIE.ie_key())
m_rutube = re.search( m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page)
@ -374,6 +403,10 @@ class VKIE(VKBaseIE):
if dailymotion_urls: if dailymotion_urls:
return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page)
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts: if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
@ -383,38 +416,7 @@ class VKIE(VKBaseIE):
opts_url = 'http:' + opts_url opts_url = 'http:' + opts_url
return self.url_result(opts_url) return self.url_result(opts_url)
# vars does not look to be served anymore since 24.10.2016 data = player['params'][0]
data = self._parse_json(
self._search_regex(
r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'),
video_id, fatal=False)
# <!json> is served instead
if not data:
data = self._parse_json(
self._search_regex(
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
info_page, 'json', default='{}'),
video_id)
if data:
data = data['player']['params'][0]
if not data:
data = self._parse_json(
self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page,
'player params', default='{}'),
video_id)
if data:
data = data['params'][0]
# <!--{...}
if not data:
data = self._parse_json(
self._search_regex(
r'<!--\s*({.+})', info_page, 'payload'),
video_id)['payload'][-1][-1]['player']['params'][0]
title = unescapeHTML(data['md_title']) title = unescapeHTML(data['md_title'])
# 2 = live # 2 = live
@ -463,12 +465,12 @@ class VKIE(VKBaseIE):
'title': title, 'title': title,
'thumbnail': data.get('jpg'), 'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'), 'uploader': data.get('md_author'),
'uploader_id': str_or_none(data.get('author_id')), 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')),
'duration': data.get('duration'), 'duration': int_or_none(data.get('duration') or mv_data.get('duration')),
'timestamp': timestamp, 'timestamp': timestamp,
'view_count': view_count, 'view_count': view_count,
'like_count': int_or_none(data.get('liked')), 'like_count': int_or_none(mv_data.get('likes')),
'dislike_count': int_or_none(data.get('nolikes')), 'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live, 'is_live': is_live,
} }
@ -482,7 +484,6 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://vk.com/videos205387401', 'url': 'http://vk.com/videos205387401',
'info_dict': { 'info_dict': {
'id': '205387401', 'id': '205387401',
'title': "Tom Cruise's Videos",
}, },
'playlist_mincount': 4, 'playlist_mincount': 4,
}, { }, {
@ -498,22 +499,25 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://new.vk.com/videos205387401', 'url': 'http://new.vk.com/videos205387401',
'only_matching': True, 'only_matching': True,
}] }]
_VIDEO = collections.namedtuple(
'Video', ['owner_id', 'id', 'thumb', 'title', 'flags', 'duration', 'hash', 'moder_acts', 'owner', 'date', 'views', 'platform', 'blocked', 'music_video_meta'])
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) l = self._download_payload('al_video', page_id, {
'act': 'load_videos_silent',
'oid': page_id,
})[0]['']['list']
entries = [ entries = []
self.url_result( for video in l:
'http://vk.com/video' + video_id, 'VK', video_id=video_id) v = self._VIDEO._make(video)
for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] video_id = '%d_%d' % (v.owner_id, v.id)
entries.append(self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id))
title = unescapeHTML(self._search_regex( return self.playlist_result(entries, page_id)
r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
webpage, 'title', default=page_id))
return self.playlist_result(entries, page_id, title)
class VKWallPostIE(VKBaseIE): class VKWallPostIE(VKBaseIE):
@ -523,15 +527,15 @@ class VKWallPostIE(VKBaseIE):
# public page URL, audio playlist # public page URL, audio playlist
'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'url': 'https://vk.com/bs.official?w=wall-23538238_35',
'info_dict': { 'info_dict': {
'id': '23538238_35', 'id': '-23538238_35',
'title': 'Black Shadow - Wall post 23538238_35', 'title': 'Black Shadow - Wall post -23538238_35',
'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
}, },
'playlist': [{ 'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': { 'info_dict': {
'id': '135220665_111806521', 'id': '135220665_111806521',
'ext': 'mp3', 'ext': 'mp4',
'title': 'Black Shadow - Слепое Верование', 'title': 'Black Shadow - Слепое Верование',
'duration': 370, 'duration': 370,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
@ -542,18 +546,16 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233', 'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': { 'info_dict': {
'id': '135220665_111802303', 'id': '135220665_111802303',
'ext': 'mp3', 'ext': 'mp4',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423, 'duration': 423,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
'artist': 'Black Shadow', 'artist': 'Black Shadow',
'track': 'Война - Негасимое Бездны Пламя!', 'track': 'Война - Негасимое Бездны Пламя!',
}, },
'params': {
'skip_download': True,
},
}], }],
'params': { 'params': {
'skip_download': True,
'usenetrc': True, 'usenetrc': True,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
@ -562,7 +564,7 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://vk.com/wall85155021_6319', 'url': 'https://vk.com/wall85155021_6319',
'info_dict': { 'info_dict': {
'id': '85155021_6319', 'id': '85155021_6319',
'title': 'Sergey Gorbunov - Wall post 85155021_6319', 'title': 'Сергей Горбунов - Wall post 85155021_6319',
}, },
'playlist_count': 1, 'playlist_count': 1,
'params': { 'params': {
@ -578,58 +580,73 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://m.vk.com/wall-23538238_35', 'url': 'https://m.vk.com/wall-23538238_35',
'only_matching': True, 'only_matching': True,
}] }]
_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
_AUDIO = collections.namedtuple(
'Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads', 'subtitle', 'main_artists', 'feat_artists', 'album', 'track_code', 'restriction', 'album_part', 'new_stats', 'access_key'])
def _decode(self, enc):
dec = ''
e = n = 0
for c in enc:
r = self._BASE64_CHARS.index(c)
cond = n % 4
e = 64 * e + r if cond else r
n += 1
if cond:
dec += chr(255 & e >> (-2 * n & 6))
return dec
def _unmask_url(self, mask_url, vk_id):
if 'audio_api_unavailable' in mask_url:
extra = mask_url.split('?extra=')[1].split('#')
func, base = self._decode(extra[1]).split(chr(11))
assert (func == 'i')
mask_url = list(self._decode(extra[0]))
url_len = len(mask_url)
indexes = [None] * url_len
index = int(base) ^ vk_id
for n in range(url_len - 1, -1, -1):
index = (url_len * (n + 1) ^ index + n) % url_len
indexes[n] = index
for n in range(1, url_len):
c = mask_url[n]
index = indexes[url_len - 1 - n]
mask_url[n] = mask_url[index]
mask_url[index] = c
mask_url = ''.join(mask_url)
return mask_url
def _real_extract(self, url): def _real_extract(self, url):
post_id = self._match_id(url) post_id = self._match_id(url)
wall_url = 'https://vk.com/wall%s' % post_id webpage = self._download_payload('wkview', post_id, {
'act': 'show',
post_id = remove_start(post_id, '-') 'w': 'wall' + post_id,
})[1]
webpage = self._download_webpage(wall_url, post_id)
error = self._html_search_regex(
r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
webpage, 'error', default=None)
if error:
raise ExtractorError('VK said: %s' % error, expected=True)
description = clean_html(get_element_by_class('wall_post_text', webpage)) description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class('author', webpage)) uploader = clean_html(get_element_by_class('author', webpage))
thumbnail = self._og_search_thumbnail(webpage)
entries = [] entries = []
audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) for audio in re.findall(r'data-audio="([^"]+)', webpage):
if audio_ids: audio = self._parse_json(unescapeHTML(audio), post_id)
al_audio = self._download_webpage( a = self._AUDIO._make(audio)
'https://vk.com/al_audio.php', post_id, if not a.url:
note='Downloading audio info', fatal=False, continue
data=urlencode_postdata({ title = unescapeHTML(a.title)
'act': 'reload_audio', entries.append({
'al': '1', 'id': '%s_%s' % (a.owner_id, a.id),
'ids': ','.join(audio_ids) 'url': self._unmask_url(a.url, a.ads['vk_id']),
})) 'title': '%s - %s' % (a.performer, title) if a.performer else title,
if al_audio: 'thumbnail': a.cover_url.split(',') if a.cover_url else None,
Audio = collections.namedtuple( 'duration': a.duration,
'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) 'uploader': uploader,
audios = self._parse_json( 'artist': a.performer,
self._search_regex( 'track': title,
r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'), 'ext': 'mp4',
post_id, fatal=False, transform_source=unescapeHTML) 'protocol': 'm3u8',
if isinstance(audios, list): })
for audio in audios:
a = Audio._make(audio[:6])
entries.append({
'id': '%s_%s' % (a.user_id, a.id),
'url': a.url,
'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
'thumbnail': thumbnail,
'duration': a.duration,
'uploader': uploader,
'artist': a.artist,
'track': a.track,
})
for video in re.finditer( for video in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):