Merge remote-tracking branch 'origin/master' into pr-bbcnews

This commit is contained in:
fnord 2015-06-25 00:34:46 -05:00
commit aa5740fb61
21 changed files with 702 additions and 258 deletions

View File

@ -4,7 +4,10 @@ from .abc import ABCIE
from .abc7news import Abc7NewsIE from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE from .addanime import AddAnimeIE
from .adobetv import AdobeTVIE from .adobetv import (
AdobeTVIE,
AdobeTVVideoIE,
)
from .adultswim import AdultSwimIE from .adultswim import AdultSwimIE
from .aftenposten import AftenpostenIE from .aftenposten import AftenpostenIE
from .aftonbladet import AftonbladetIE from .aftonbladet import AftonbladetIE
@ -103,6 +106,7 @@ from .dailymotion import (
DailymotionIE, DailymotionIE,
DailymotionPlaylistIE, DailymotionPlaylistIE,
DailymotionUserIE, DailymotionUserIE,
DailymotionCloudIE,
) )
from .daum import DaumIE from .daum import DaumIE
from .dbtv import DBTVIE from .dbtv import DBTVIE
@ -401,6 +405,7 @@ from .pbs import PBSIE
from .philharmoniedeparis import PhilharmonieDeParisIE from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE from .photobucket import PhotobucketIE
from .pinkbike import PinkbikeIE
from .planetaplay import PlanetaPlayIE from .planetaplay import PlanetaPlayIE
from .pladform import PladformIE from .pladform import PladformIE
from .played import PlayedIE from .played import PlayedIE
@ -696,7 +701,10 @@ from .wrzuta import WrzutaIE
from .wsj import WSJIE from .wsj import WSJIE
from .xbef import XBefIE from .xbef import XBefIE
from .xboxclips import XboxClipsIE from .xboxclips import XboxClipsIE
from .xhamster import XHamsterIE from .xhamster import (
XHamsterIE,
XHamsterEmbedIE,
)
from .xminus import XMinusIE from .xminus import XMinusIE
from .xnxx import XNXXIE from .xnxx import XNXXIE
from .xstream import XstreamIE from .xstream import XstreamIE

View File

@ -5,6 +5,8 @@ from ..utils import (
parse_duration, parse_duration,
unified_strdate, unified_strdate,
str_to_int, str_to_int,
float_or_none,
ISO639Utils,
) )
@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
'view_count': view_count, 'view_count': view_count,
'formats': formats, 'formats': formats,
} }
class AdobeTVVideoIE(InfoExtractor):
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
'url': 'https://video.tv.adobe.com/v/2456/',
'md5': '43662b577c018ad707a63766462b1e87',
'info_dict': {
'id': '2456',
'ext': 'mp4',
'title': 'New experience with Acrobat DC',
'description': 'New experience with Acrobat DC',
'duration': 248.667,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
player_params = self._parse_json(self._search_regex(
r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
video_id)
formats = [{
'url': source['src'],
'width': source.get('width'),
'height': source.get('height'),
'tbr': source.get('bitrate'),
} for source in player_params['sources']]
# For both metadata and downloaded files the duration varies among
# formats. I just pick the max one
duration = max(filter(None, [
float_or_none(source.get('duration'), scale=1000)
for source in player_params['sources']]))
subtitles = {}
for translation in player_params.get('translations', []):
lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
if lang_id not in subtitles:
subtitles[lang_id] = []
subtitles[lang_id].append({
'url': translation['vttPath'],
'ext': 'vtt',
})
return {
'id': video_id,
'formats': formats,
'title': player_params['title'],
'description': self._og_search_description(webpage),
'duration': duration,
'subtitles': subtitles,
}

View File

@ -255,26 +255,11 @@ class BBCCoUkIE(InfoExtractor):
for connection in self._extract_connections(media): for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
srt = ''
def _extract_text(p):
if p.text is not None:
stripped_text = p.text.strip()
if stripped_text:
return stripped_text
return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
for pos, p in enumerate(ps):
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
subtitles[lang] = [ subtitles[lang] = [
{ {
'url': connection.get('href'), 'url': connection.get('href'),
'ext': 'ttml', 'ext': 'ttml',
}, },
{
'data': srt,
'ext': 'srt',
},
] ]
return subtitles return subtitles

View File

@ -13,6 +13,7 @@ from ..compat import (
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
compat_xml_parse_error,
) )
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
try: try:
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
except xml.etree.ElementTree.ParseError: except compat_xml_parse_error:
return return
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')

View File

@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': full_user, 'title': full_user,
'entries': self._extract_entries(user), 'entries': self._extract_entries(user),
} }
class DailymotionCloudIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P<id>[^/?]+)'
_TEST = {
# From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
# Tested at FranceTvInfo_2
'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
'only_matching': True,
}
@classmethod
def _extract_dmcloud_url(self, webpage):
mobj = re.search(r'<iframe[^>]+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
if mobj:
return mobj.group(1)
mobj = re.search(r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage)
if mobj:
return mobj.group(1)
def _real_extract(self, url):
video_id = self._match_id(url)
request = self._build_request(url)
webpage = self._download_webpage(request, video_id)
title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
video_info = self._parse_json(self._search_regex(
r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
# TODO: parse ios_url, which is in fact a manifest
video_url = video_info['mp4_url']
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': video_info.get('thumbnail_url'),
}

View File

@ -6,6 +6,8 @@ import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse, compat_urlparse,
) )
from ..utils import ( from ..utils import (
@ -17,7 +19,39 @@ from ..utils import (
) )
class DramaFeverIE(InfoExtractor): class DramaFeverBaseIE(InfoExtractor):
_LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
_NETRC_MACHINE = 'dramafever'
def _real_initialize(self):
self._login()
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_form = {
'username': username,
'password': password,
}
request = compat_urllib_request.Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
if all(logout_pattern not in response
for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
error = self._html_search_regex(
r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
response, 'error message', default=None)
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
class DramaFeverIE(DramaFeverBaseIE):
IE_NAME = 'dramafever' IE_NAME = 'dramafever'
_VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
_TEST = { _TEST = {
@ -97,7 +131,7 @@ class DramaFeverIE(InfoExtractor):
} }
class DramaFeverSeriesIE(InfoExtractor): class DramaFeverSeriesIE(DramaFeverBaseIE):
IE_NAME = 'dramafever:series' IE_NAME = 'dramafever:series'
_VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
_TESTS = [{ _TESTS = [{
@ -151,8 +185,11 @@ class DramaFeverSeriesIE(InfoExtractor):
% (consumer_secret, series_id, self._PAGE_SIZE, page_num), % (consumer_secret, series_id, self._PAGE_SIZE, page_num),
series_id, 'Downloading episodes JSON page #%d' % page_num) series_id, 'Downloading episodes JSON page #%d' % page_num)
for episode in episodes.get('value', []): for episode in episodes.get('value', []):
episode_url = episode.get('episode_url')
if not episode_url:
continue
entries.append(self.url_result( entries.append(self.url_result(
compat_urlparse.urljoin(url, episode['episode_url']), compat_urlparse.urljoin(url, episode_url),
'DramaFever', episode.get('guid'))) 'DramaFever', episode.get('guid')))
if page_num == episodes['num_pages']: if page_num == episodes['num_pages']:
break break

View File

@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
'md5': 'fe330252ddea607635cf2eb2c99a0af3',
'info_dict': { 'info_dict': {
'id': '65517', 'id': '65517',
'ext': 'mp4', 'ext': 'mp4',
@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
'upload_date': '20110120', 'upload_date': '20110120',
'duration': 3664, 'duration': 3664,
}, },
'params': {
'skip_download': True, # requires rtmp
},
}, { }, {
'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
'md5': '6dfe039417e76795fb783c52da3de11d', 'md5': '6dfe039417e76795fb783c52da3de11d',
@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
'format_id': file['Type'].replace('Video', ''), 'format_id': file['Type'].replace('Video', ''),
'preference': preferencemap.get(file['Type'], -10), 'preference': preferencemap.get(file['Type'], -10),
}) })
if format['url'].startswith('rtmp'):
rtmp_url = format['url']
format['rtmp_live'] = True # --resume does not work
if '/bonanza/' in rtmp_url:
format['play_path'] = rtmp_url.split('/bonanza/')[1]
formats.append(format) formats.append(format)
elif file['Type'] == "Thumb": elif file['Type'] == "Thumb":
thumbnail = file['Location'] thumbnail = file['Location']
@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
description = '%s\n%s\n%s\n' % ( description = '%s\n%s\n%s\n' % (
info['Description'], info['Actors'], info['Colophon']) info['Description'], info['Actors'], info['Colophon'])
for f in formats:
f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
self._sort_formats(formats) self._sort_formats(formats)
display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id

View File

@ -6,9 +6,9 @@ from .common import InfoExtractor
class FazIE(InfoExtractor): class FazIE(InfoExtractor):
IE_NAME = 'faz.net' IE_NAME = 'faz.net'
_VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
_TEST = { _TESTS = [{
'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', 'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
'info_dict': { 'info_dict': {
'id': '12610585', 'id': '12610585',
@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', 'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
'description': 'md5:1453fbf9a0d041d985a47306192ea253', 'description': 'md5:1453fbf9a0d041d985a47306192ea253',
}, },
} }, {
'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
'only_matching': True,
}, {
'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
'only_matching': True,
}, {
'url': 'http://www.faz.net/-13659345.html',
'only_matching': True,
}, {
'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
'only_matching': True,
}, {
'url': 'http://www.faz.net/foobarblafasel-13659345.html',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -18,6 +18,7 @@ from ..utils import (
parse_duration, parse_duration,
determine_ext, determine_ext,
) )
from .dailymotion import DailymotionCloudIE
class FranceTVBaseInfoExtractor(InfoExtractor): class FranceTVBaseInfoExtractor(InfoExtractor):
@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'skip_download': 'HLS (reqires ffmpeg)' 'skip_download': 'HLS (reqires ffmpeg)'
}, },
'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
}, {
'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
'md5': 'f485bda6e185e7d15dbc69b72bae993e',
'info_dict': {
'id': '556e03339473995ee145930c',
'ext': 'mp4',
'title': 'Les entreprises familiales : le secret de la réussite',
'thumbnail': 're:^https?://.*\.jpe?g$',
}
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title') page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title) webpage = self._download_webpage(url, page_title)
dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
if dmcloud_url:
return self.url_result(dmcloud_url, 'DailymotionCloud')
video_id, catalogue = self._search_regex( video_id, catalogue = self._search_regex(
r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
return self._extract_video(video_id, catalogue) return self._extract_video(video_id, catalogue)

View File

@ -43,6 +43,9 @@ from .senateisvp import SenateISVPIE
from .bliptv import BlipTVIE from .bliptv import BlipTVIE
from .svt import SVTIE from .svt import SVTIE
from .pornhub import PornHubIE from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -333,6 +336,15 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
# XHamster embed
{
'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
'info_dict': {
'id': 'showthread',
'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
},
'playlist_mincount': 7,
},
# Embedded TED video # Embedded TED video
{ {
'url': 'http://en.support.wordpress.com/videos/ted-talks/', 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@ -812,6 +824,29 @@ class GenericIE(InfoExtractor):
'description': 'To understand why he was the Toronto Blue Jays top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'description': 'To understand why he was the Toronto Blue Jays top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
'uploader': 'Rogers Sportsnet', 'uploader': 'Rogers Sportsnet',
}, },
},
# Dailymotion Cloud video
{
'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
'md5': '49444254273501a64675a7e68c502681',
'info_dict': {
'id': '5585de919473990de4bee11b',
'ext': 'mp4',
'title': 'Le débat',
'thumbnail': 're:^https?://.*\.jpe?g$',
}
},
# AdobeTVVideo embed
{
'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
'md5': '43662b577c018ad707a63766462b1e87',
'info_dict': {
'id': '2456',
'ext': 'mp4',
'title': 'New experience with Acrobat DC',
'description': 'New experience with Acrobat DC',
'duration': 248.667,
},
} }
] ]
@ -1089,18 +1124,9 @@ class GenericIE(InfoExtractor):
if matches: if matches:
return _playlist_from_matches(matches, ie='RtlNl') return _playlist_from_matches(matches, ie='RtlNl')
# Look for embedded (iframe) Vimeo player vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
mobj = re.search( if vimeo_url is not None:
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) return self.url_result(vimeo_url)
if mobj:
player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
return self.url_result(surl)
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
if mobj:
return self.url_result(mobj.group(1))
# Look for embedded YouTube player # Look for embedded YouTube player
matches = re.findall(r'''(?x) matches = re.findall(r'''(?x)
@ -1327,6 +1353,11 @@ class GenericIE(InfoExtractor):
if pornhub_url: if pornhub_url:
return self.url_result(pornhub_url, 'PornHub') return self.url_result(pornhub_url, 'PornHub')
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
# Look for embedded Tvigle player # Look for embedded Tvigle player
mobj = re.search( mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@ -1494,6 +1525,20 @@ class GenericIE(InfoExtractor):
if senate_isvp_url: if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP') return self.url_result(senate_isvp_url, 'SenateISVP')
# Look for Dailymotion Cloud videos
dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
if dmcloud_url:
return self.url_result(dmcloud_url, 'DailymotionCloud')
# Look for AdobeTVVideo embeds
mobj = re.search(
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
webpage)
if mobj is not None:
return self.url_result(
self._proto_relative_url(unescapeHTML(mobj.group(1))),
'AdobeTVVideo')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

View File

@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
format_info = info['videoPlayerObject']['video'] format_info = info['videoPlayerObject']['video']
formats.append({ formats.append({
'format_id': f_id, 'format_id': f_id,
'url': format_info['url'], 'url': format_info['videoInfoList'][0]['videoUrl'],
}) })
return { return {

View File

@ -0,0 +1,96 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
remove_end,
remove_start,
str_to_int,
unified_strdate,
)
class PinkbikeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.pinkbike.com/video/402811/',
'md5': '4814b8ca7651034cd87e3361d5c2155a',
'info_dict': {
'id': '402811',
'ext': 'mp4',
'title': 'Brandon Semenuk - RAW 100',
'description': 'Official release: www.redbull.ca/rupertwalker',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 100,
'upload_date': '20150406',
'uploader': 'revelco',
'location': 'Victoria, British Columbia, Canada',
'view_count': int,
'comment_count': int,
}
}, {
'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://www.pinkbike.com/video/%s' % video_id, video_id)
formats = []
for _, format_id, src in re.findall(
r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
height = int_or_none(self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None))
formats.append({
'url': src,
'format_id': format_id,
'height': height,
})
self._sort_formats(formats)
title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
description = self._html_search_regex(
r'(?s)id="media-description"[^>]*>(.+?)<',
webpage, 'description', default=None) or remove_start(
self._og_search_description(webpage), title + '. ')
thumbnail = self._og_search_thumbnail(webpage)
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration'))
uploader = self._search_regex(
r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._search_regex(
r'class="fullTime"[^>]+title="([^"]+)"',
webpage, 'upload date', fatal=False))
location = self._html_search_regex(
r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
webpage, 'location', fatal=False)
def extract_count(webpage, label):
return str_to_int(self._search_regex(
r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
webpage, label, fatal=False))
view_count = extract_count(webpage, 'Views')
comment_count = extract_count(webpage, 'Comments')
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'upload_date': upload_date,
'uploader': uploader,
'location': location,
'view_count': view_count,
'comment_count': comment_count,
'formats': formats
}

View File

@ -6,9 +6,12 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_urllib_request compat_urllib_request,
compat_urllib_parse,
)
from ..utils import (
ExtractorError,
) )
from ..utils import ExtractorError
class SohuIE(InfoExtractor): class SohuIE(InfoExtractor):
@ -26,7 +29,7 @@ class SohuIE(InfoExtractor):
'skip': 'On available in China', 'skip': 'On available in China',
}, { }, {
'url': 'http://tv.sohu.com/20150305/n409385080.shtml', 'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', 'md5': '699060e75cf58858dd47fb9c03c42cfb',
'info_dict': { 'info_dict': {
'id': '409385080', 'id': '409385080',
'ext': 'mp4', 'ext': 'mp4',
@ -34,7 +37,7 @@ class SohuIE(InfoExtractor):
} }
}, { }, {
'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
'md5': '49308ff6dafde5ece51137d04aec311e', 'md5': '9bf34be48f2f4dadcb226c74127e203c',
'info_dict': { 'info_dict': {
'id': '78693464', 'id': '78693464',
'ext': 'mp4', 'ext': 'mp4',
@ -48,7 +51,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
}, },
'playlist': [{ 'playlist': [{
'md5': '492923eac023ba2f13ff69617c32754a', 'md5': 'bdbfb8f39924725e6589c146bc1883ad',
'info_dict': { 'info_dict': {
'id': '78910339_part1', 'id': '78910339_part1',
'ext': 'mp4', 'ext': 'mp4',
@ -56,7 +59,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
} }
}, { }, {
'md5': 'de604848c0e8e9c4a4dde7e1347c0637', 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
'info_dict': { 'info_dict': {
'id': '78910339_part2', 'id': '78910339_part2',
'ext': 'mp4', 'ext': 'mp4',
@ -64,7 +67,7 @@ class SohuIE(InfoExtractor):
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
} }
}, { }, {
'md5': '93584716ee0657c0b205b8aa3d27aa13', 'md5': '8407e634175fdac706766481b9443450',
'info_dict': { 'info_dict': {
'id': '78910339_part3', 'id': '78910339_part3',
'ext': 'mp4', 'ext': 'mp4',
@ -139,21 +142,42 @@ class SohuIE(InfoExtractor):
for i in range(part_count): for i in range(part_count):
formats = [] formats = []
for format_id, format_data in formats_json.items(): for format_id, format_data in formats_json.items():
data = format_data['data'] allot = format_data['allot']
data = format_data['data']
clips_url = data['clipsURL']
su = data['su']
# URLs starts with http://newflv.sohu.ccgslb.net/ is not usable
# so retry until got a working URL
video_url = 'newflv.sohu.ccgslb.net' video_url = 'newflv.sohu.ccgslb.net'
cdnId = None
retries = 0 retries = 0
while 'newflv.sohu.ccgslb.net' in video_url and retries < 5:
download_note = 'Download information from CDN gateway for format ' + format_id while 'newflv.sohu.ccgslb.net' in video_url:
params = {
'prot': 9,
'file': clips_url[i],
'new': su[i],
'prod': 'flash',
}
if cdnId is not None:
params['idc'] = cdnId
download_note = 'Downloading %s video URL part %d of %d' % (
format_id, i + 1, part_count)
if retries > 0: if retries > 0:
download_note += ' (retry #%d)' % retries download_note += ' (retry #%d)' % retries
part_info = self._parse_json(self._download_webpage(
'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
video_id, download_note), video_id)
video_url = part_info['url']
cdnId = part_info.get('nid')
retries += 1 retries += 1
cdn_info = self._download_json( if retries > 5:
'http://data.vod.itc.cn/cdnList?new=' + data['su'][i], raise ExtractorError('Failed to get video URL')
video_id, download_note)
video_url = cdn_info['url']
formats.append({ formats.append({
'url': video_url, 'url': video_url,

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from .pornhub import PornHubIE from .pornhub import PornHubIE
from .vimeo import VimeoIE
class TumblrIE(InfoExtractor): class TumblrIE(InfoExtractor):
@ -40,6 +41,17 @@ class TumblrIE(InfoExtractor):
'timestamp': 1430931613, 'timestamp': 1430931613,
}, },
'add_ie': ['Vidme'], 'add_ie': ['Vidme'],
}, {
'url': 'http://camdamage.tumblr.com/post/98846056295/',
'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
'info_dict': {
'id': '105463834',
'ext': 'mp4',
'title': 'Cam Damage-HD 720p',
'uploader': 'John Moyer',
'uploader_id': 'user32021558',
},
'add_ie': ['Vimeo'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -60,6 +72,10 @@ class TumblrIE(InfoExtractor):
if pornhub_url: if pornhub_url:
return self.url_result(pornhub_url, 'PornHub') return self.url_result(pornhub_url, 'PornHub')
vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
if vimeo_url:
return self.url_result(vimeo_url, 'Vimeo')
iframe_url = self._search_regex( iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
webpage, 'iframe url') webpage, 'iframe url')

View File

@ -28,11 +28,15 @@ class VikiBaseIE(InfoExtractor):
_NETRC_MACHINE = 'viki' _NETRC_MACHINE = 'viki'
_token = None
def _prepare_call(self, path, timestamp=None, post_data=None): def _prepare_call(self, path, timestamp=None, post_data=None):
path += '?' if '?' not in path else '&' path += '?' if '?' not in path else '&'
if not timestamp: if not timestamp:
timestamp = int(time.time()) timestamp = int(time.time())
query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
if self._token:
query += '&token=%s' % self._token
sig = hmac.new( sig = hmac.new(
self._APP_SECRET.encode('ascii'), self._APP_SECRET.encode('ascii'),
query.encode('ascii'), query.encode('ascii'),
@ -76,10 +80,14 @@ class VikiBaseIE(InfoExtractor):
'password': password, 'password': password,
} }
self._call_api( login = self._call_api(
'sessions.json', None, 'sessions.json', None,
'Logging in as %s' % username, post_data=login_form) 'Logging in as %s' % username, post_data=login_form)
self._token = login.get('token')
if not self._token:
self.report_warning('Unable to get session token, login has probably failed')
class VikiIE(VikiBaseIE): class VikiIE(VikiBaseIE):
IE_NAME = 'viki' IE_NAME = 'viki'

View File

@ -22,6 +22,7 @@ from ..utils import (
unified_strdate, unified_strdate,
unsmuggle_url, unsmuggle_url,
urlencode_postdata, urlencode_postdata,
unescapeHTML,
) )
@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
}, },
] ]
@staticmethod
def _extract_vimeo_url(url, webpage):
# Look for embedded (iframe) Vimeo player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
if mobj:
player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
return surl
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
if mobj:
return mobj.group(1)
def _verify_video_password(self, url, video_id, webpage): def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None) password = self._downloader.params.get('videopassword', None)
if password is None: if password is None:

View File

@ -13,7 +13,6 @@ from ..utils import (
class XHamsterIE(InfoExtractor): class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [ _TESTS = [
{ {
@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
'age_limit': age_limit, 'age_limit': age_limit,
'formats': formats, 'formats': formats,
} }
class XHamsterEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
_TEST = {
'url': 'http://xhamster.com/xembed.php?video=3328539',
'info_dict': {
'id': '3328539',
'ext': 'mp4',
'title': 'Pen Masturbation',
'upload_date': '20140728',
'uploader_id': 'anonymous',
'duration': 5,
'age_limit': 18,
}
}
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
webpage, 'xhamster url')
return self.url_result(video_url, 'XHamster')

View File

@ -5,10 +5,12 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request,
) )
from ..utils import ( from ..utils import (
clean_html, clean_html,
ExtractorError, ExtractorError,
determine_ext,
) )
@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
} }
} }
_ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor):
video_thumbnail = self._search_regex( video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False) r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
formats = [{
'url': video_url,
}]
android_req = compat_urllib_request.Request(url)
android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
android_webpage = self._download_webpage(android_req, video_id, fatal=False)
if android_webpage is not None:
player_params_str = self._search_regex(
'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
android_webpage, 'player parameters', default='')
player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
if player_params:
formats.extend([{
'url': param,
'preference': -10,
} for param in player_params if determine_ext(param) == 'mp4'])
self._sort_formats(formats)
return { return {
'id': video_id, 'id': video_id,
'url': video_url, 'formats': formats,
'title': video_title, 'title': video_title,
'ext': 'flv', 'ext': 'flv',
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,

View File

@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'44': {'ext': 'webm', 'width': 854, 'height': 480}, '44': {'ext': 'webm', 'width': 854, 'height': 480},
'45': {'ext': 'webm', 'width': 1280, 'height': 720}, '45': {'ext': 'webm', 'width': 1280, 'height': 720},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080}, '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
'59': {'ext': 'mp4', 'width': 854, 'height': 480},
'78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3d videos # 3d videos

View File

@ -21,6 +21,7 @@ from ..utils import (
shell_quote, shell_quote,
subtitles_filename, subtitles_filename,
dfxp2srt, dfxp2srt,
ISO639Utils,
) )
@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
# See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
_lang_map = {
'aa': 'aar',
'ab': 'abk',
'ae': 'ave',
'af': 'afr',
'ak': 'aka',
'am': 'amh',
'an': 'arg',
'ar': 'ara',
'as': 'asm',
'av': 'ava',
'ay': 'aym',
'az': 'aze',
'ba': 'bak',
'be': 'bel',
'bg': 'bul',
'bh': 'bih',
'bi': 'bis',
'bm': 'bam',
'bn': 'ben',
'bo': 'bod',
'br': 'bre',
'bs': 'bos',
'ca': 'cat',
'ce': 'che',
'ch': 'cha',
'co': 'cos',
'cr': 'cre',
'cs': 'ces',
'cu': 'chu',
'cv': 'chv',
'cy': 'cym',
'da': 'dan',
'de': 'deu',
'dv': 'div',
'dz': 'dzo',
'ee': 'ewe',
'el': 'ell',
'en': 'eng',
'eo': 'epo',
'es': 'spa',
'et': 'est',
'eu': 'eus',
'fa': 'fas',
'ff': 'ful',
'fi': 'fin',
'fj': 'fij',
'fo': 'fao',
'fr': 'fra',
'fy': 'fry',
'ga': 'gle',
'gd': 'gla',
'gl': 'glg',
'gn': 'grn',
'gu': 'guj',
'gv': 'glv',
'ha': 'hau',
'he': 'heb',
'hi': 'hin',
'ho': 'hmo',
'hr': 'hrv',
'ht': 'hat',
'hu': 'hun',
'hy': 'hye',
'hz': 'her',
'ia': 'ina',
'id': 'ind',
'ie': 'ile',
'ig': 'ibo',
'ii': 'iii',
'ik': 'ipk',
'io': 'ido',
'is': 'isl',
'it': 'ita',
'iu': 'iku',
'ja': 'jpn',
'jv': 'jav',
'ka': 'kat',
'kg': 'kon',
'ki': 'kik',
'kj': 'kua',
'kk': 'kaz',
'kl': 'kal',
'km': 'khm',
'kn': 'kan',
'ko': 'kor',
'kr': 'kau',
'ks': 'kas',
'ku': 'kur',
'kv': 'kom',
'kw': 'cor',
'ky': 'kir',
'la': 'lat',
'lb': 'ltz',
'lg': 'lug',
'li': 'lim',
'ln': 'lin',
'lo': 'lao',
'lt': 'lit',
'lu': 'lub',
'lv': 'lav',
'mg': 'mlg',
'mh': 'mah',
'mi': 'mri',
'mk': 'mkd',
'ml': 'mal',
'mn': 'mon',
'mr': 'mar',
'ms': 'msa',
'mt': 'mlt',
'my': 'mya',
'na': 'nau',
'nb': 'nob',
'nd': 'nde',
'ne': 'nep',
'ng': 'ndo',
'nl': 'nld',
'nn': 'nno',
'no': 'nor',
'nr': 'nbl',
'nv': 'nav',
'ny': 'nya',
'oc': 'oci',
'oj': 'oji',
'om': 'orm',
'or': 'ori',
'os': 'oss',
'pa': 'pan',
'pi': 'pli',
'pl': 'pol',
'ps': 'pus',
'pt': 'por',
'qu': 'que',
'rm': 'roh',
'rn': 'run',
'ro': 'ron',
'ru': 'rus',
'rw': 'kin',
'sa': 'san',
'sc': 'srd',
'sd': 'snd',
'se': 'sme',
'sg': 'sag',
'si': 'sin',
'sk': 'slk',
'sl': 'slv',
'sm': 'smo',
'sn': 'sna',
'so': 'som',
'sq': 'sqi',
'sr': 'srp',
'ss': 'ssw',
'st': 'sot',
'su': 'sun',
'sv': 'swe',
'sw': 'swa',
'ta': 'tam',
'te': 'tel',
'tg': 'tgk',
'th': 'tha',
'ti': 'tir',
'tk': 'tuk',
'tl': 'tgl',
'tn': 'tsn',
'to': 'ton',
'tr': 'tur',
'ts': 'tso',
'tt': 'tat',
'tw': 'twi',
'ty': 'tah',
'ug': 'uig',
'uk': 'ukr',
'ur': 'urd',
'uz': 'uzb',
've': 'ven',
'vi': 'vie',
'vo': 'vol',
'wa': 'wln',
'wo': 'wol',
'xh': 'xho',
'yi': 'yid',
'yo': 'yor',
'za': 'zha',
'zh': 'zho',
'zu': 'zul',
}
@classmethod
def _conver_lang_code(cls, code):
"""Convert language code from ISO 639-1 to ISO 639-2/T"""
return cls._lang_map.get(code[:2])
def run(self, information): def run(self, information):
if information['ext'] not in ['mp4', 'mkv']: if information['ext'] not in ['mp4', 'mkv']:
self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
opts += ['-c:s', 'mov_text'] opts += ['-c:s', 'mov_text']
for (i, lang) in enumerate(sub_langs): for (i, lang) in enumerate(sub_langs):
opts.extend(['-map', '%d:0' % (i + 1)]) opts.extend(['-map', '%d:0' % (i + 1)])
lang_code = self._conver_lang_code(lang) lang_code = ISO639Utils.short2long(lang)
if lang_code is not None: if lang_code is not None:
opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])

View File

@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data): def dfxp2srt(dfxp_data):
_x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) _x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
})
def parse_node(node): def parse_node(node):
str_or_empty = functools.partial(str_or_none, default='') str_or_empty = functools.partial(str_or_none, default='')
@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text) out = str_or_empty(node.text)
for child in node: for child in node:
if child.tag in (_x('ttml:br'), 'br'): if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
out += '\n' + str_or_empty(child.tail) out += '\n' + str_or_empty(child.tail)
elif child.tag in (_x('ttml:span'), 'span'): elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
out += str_or_empty(parse_node(child)) out += str_or_empty(parse_node(child))
else: else:
out += str_or_empty(xml.etree.ElementTree.tostring(child)) out += str_or_empty(xml.etree.ElementTree.tostring(child))
@ -1860,7 +1863,7 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = [] out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
if not paras: if not paras:
raise ValueError('Invalid dfxp/TTML subtitle') raise ValueError('Invalid dfxp/TTML subtitle')
@ -1879,6 +1882,208 @@ def dfxp2srt(dfxp_data):
return ''.join(out) return ''.join(out)
class ISO639Utils(object):
# See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
_lang_map = {
'aa': 'aar',
'ab': 'abk',
'ae': 'ave',
'af': 'afr',
'ak': 'aka',
'am': 'amh',
'an': 'arg',
'ar': 'ara',
'as': 'asm',
'av': 'ava',
'ay': 'aym',
'az': 'aze',
'ba': 'bak',
'be': 'bel',
'bg': 'bul',
'bh': 'bih',
'bi': 'bis',
'bm': 'bam',
'bn': 'ben',
'bo': 'bod',
'br': 'bre',
'bs': 'bos',
'ca': 'cat',
'ce': 'che',
'ch': 'cha',
'co': 'cos',
'cr': 'cre',
'cs': 'ces',
'cu': 'chu',
'cv': 'chv',
'cy': 'cym',
'da': 'dan',
'de': 'deu',
'dv': 'div',
'dz': 'dzo',
'ee': 'ewe',
'el': 'ell',
'en': 'eng',
'eo': 'epo',
'es': 'spa',
'et': 'est',
'eu': 'eus',
'fa': 'fas',
'ff': 'ful',
'fi': 'fin',
'fj': 'fij',
'fo': 'fao',
'fr': 'fra',
'fy': 'fry',
'ga': 'gle',
'gd': 'gla',
'gl': 'glg',
'gn': 'grn',
'gu': 'guj',
'gv': 'glv',
'ha': 'hau',
'he': 'heb',
'hi': 'hin',
'ho': 'hmo',
'hr': 'hrv',
'ht': 'hat',
'hu': 'hun',
'hy': 'hye',
'hz': 'her',
'ia': 'ina',
'id': 'ind',
'ie': 'ile',
'ig': 'ibo',
'ii': 'iii',
'ik': 'ipk',
'io': 'ido',
'is': 'isl',
'it': 'ita',
'iu': 'iku',
'ja': 'jpn',
'jv': 'jav',
'ka': 'kat',
'kg': 'kon',
'ki': 'kik',
'kj': 'kua',
'kk': 'kaz',
'kl': 'kal',
'km': 'khm',
'kn': 'kan',
'ko': 'kor',
'kr': 'kau',
'ks': 'kas',
'ku': 'kur',
'kv': 'kom',
'kw': 'cor',
'ky': 'kir',
'la': 'lat',
'lb': 'ltz',
'lg': 'lug',
'li': 'lim',
'ln': 'lin',
'lo': 'lao',
'lt': 'lit',
'lu': 'lub',
'lv': 'lav',
'mg': 'mlg',
'mh': 'mah',
'mi': 'mri',
'mk': 'mkd',
'ml': 'mal',
'mn': 'mon',
'mr': 'mar',
'ms': 'msa',
'mt': 'mlt',
'my': 'mya',
'na': 'nau',
'nb': 'nob',
'nd': 'nde',
'ne': 'nep',
'ng': 'ndo',
'nl': 'nld',
'nn': 'nno',
'no': 'nor',
'nr': 'nbl',
'nv': 'nav',
'ny': 'nya',
'oc': 'oci',
'oj': 'oji',
'om': 'orm',
'or': 'ori',
'os': 'oss',
'pa': 'pan',
'pi': 'pli',
'pl': 'pol',
'ps': 'pus',
'pt': 'por',
'qu': 'que',
'rm': 'roh',
'rn': 'run',
'ro': 'ron',
'ru': 'rus',
'rw': 'kin',
'sa': 'san',
'sc': 'srd',
'sd': 'snd',
'se': 'sme',
'sg': 'sag',
'si': 'sin',
'sk': 'slk',
'sl': 'slv',
'sm': 'smo',
'sn': 'sna',
'so': 'som',
'sq': 'sqi',
'sr': 'srp',
'ss': 'ssw',
'st': 'sot',
'su': 'sun',
'sv': 'swe',
'sw': 'swa',
'ta': 'tam',
'te': 'tel',
'tg': 'tgk',
'th': 'tha',
'ti': 'tir',
'tk': 'tuk',
'tl': 'tgl',
'tn': 'tsn',
'to': 'ton',
'tr': 'tur',
'ts': 'tso',
'tt': 'tat',
'tw': 'twi',
'ty': 'tah',
'ug': 'uig',
'uk': 'ukr',
'ur': 'urd',
'uz': 'uzb',
've': 'ven',
'vi': 'vie',
'vo': 'vol',
'wa': 'wln',
'wo': 'wol',
'xh': 'xho',
'yi': 'yid',
'yo': 'yor',
'za': 'zha',
'zh': 'zho',
'zu': 'zul',
}
@classmethod
def short2long(cls, code):
"""Convert language code from ISO 639-1 to ISO 639-2/T"""
return cls._lang_map.get(code[:2])
@classmethod
def long2short(cls, code):
"""Convert language code from ISO 639-2/T to ISO 639-1"""
for short_name, long_name in cls._lang_map.items():
if long_name == code:
return short_name
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None): def __init__(self, proxies=None):
# Set default handlers # Set default handlers