Compare commits

..

34 Commits

Author SHA1 Message Date
Philipp Hagemeister
a21420389e release 2015.02.19.3 2015-02-19 19:28:17 +01:00
Jaime Marquínez Ferrándiz
6140baf4e1 [nationalgeographic] Add extractor (closes #4960) 2015-02-19 18:17:31 +01:00
Sergey M․
8fc642eb5b [pornhub] Fix uploader regex 2015-02-19 22:15:49 +06:00
Sergey M․
e66e1a0046 [pornhub] Add support for playlists (Closes #4995) 2015-02-19 22:15:19 +06:00
Sergey M․
d5c69f1da4 [5min] Cover joystiq.com URLs (Closes #4962) 2015-02-19 21:47:11 +06:00
Jaime Marquínez Ferrándiz
5c8a3f862a [nbc] Use a test video that works outside the US 2015-02-19 15:00:39 +01:00
Jaime Marquínez Ferrándiz
a3b9157f49 [cbssports] Add extractor (closes #4996) 2015-02-19 13:06:53 +01:00
Philipp Hagemeister
b88ba05356 [imgur] Simplify 2015-02-19 05:53:09 +01:00
Philipp Hagemeister
b74d505577 Merge remote-tracking branch 'jbboehr/imgur-gifv-improvements' 2015-02-19 05:16:11 +01:00
John Boehr
9e2d7dca87 [imgur] improve error check for non-video URLs 2015-02-18 19:47:54 -08:00
John Boehr
d236b37ac9 [imgur] improve regex #4998 2015-02-18 19:28:19 -08:00
Philipp Hagemeister
e880c66bd8 [theonion] Modernize 2015-02-19 04:12:40 +01:00
Philipp Hagemeister
383456aa29 [Makefile] Also delete *.avi files in clean 2015-02-19 04:09:52 +01:00
John Boehr
1a13940c8d [imgur] support regular URL 2015-02-18 18:12:48 -08:00
Philipp Hagemeister
3d54788495 [webofstories] Fix extraction 2015-02-19 02:12:08 +01:00
Philipp Hagemeister
71d53ace2f [sockshare] Do not require thumbnail anymore
Thumbnail is not present on the website anymore.
2015-02-19 02:04:30 +01:00
Philipp Hagemeister
f37e3f99f0 [generic] Correct test case
Video has been reuploaded / edited
2015-02-19 02:00:52 +01:00
Philipp Hagemeister
bd03ffc16e [netzkino] Skip download in test case
Works fine from Germany, but fails from everywhere else
2015-02-19 01:58:54 +01:00
Philipp Hagemeister
1ac1af9b47 release 2015.02.19.2 2015-02-19 01:43:28 +01:00
Philipp Hagemeister
3bf5705316 [imgur] Add new extractor 2015-02-19 01:43:20 +01:00
Philipp Hagemeister
1c2528c8a3 [cbs] Modernize 2015-02-19 01:22:50 +01:00
Philipp Hagemeister
7bd15b1a03 release 2015.02.19.1 2015-02-19 01:04:24 +01:00
Philipp Hagemeister
6b961a85fd [patreon] Add support for embedlies (fixes #4969) 2015-02-19 01:04:19 +01:00
Philipp Hagemeister
7707004043 [patreon] Modernize 2015-02-19 00:38:05 +01:00
Philipp Hagemeister
a025d3c5a5 release 2015.02.19 2015-02-19 00:31:23 +01:00
Philipp Hagemeister
c460bdd56b [sandia] Add new extractor (#4974) 2015-02-19 00:31:01 +01:00
Philipp Hagemeister
b81a359eb6 [YoutubeDL] Use render_table for format listing 2015-02-19 00:28:58 +01:00
Philipp Hagemeister
d61aefb24c Merge remote-tracking branch 'origin/master' 2015-02-19 00:01:14 +01:00
Philipp Hagemeister
d305dd73a3 [utils] Fix js_to_json
Previously, the runtime could be atrocious for longer inputs.
2015-02-18 23:59:51 +01:00
Jaime Marquínez Ferrándiz
93a16ba238 [vimeo] Raise the ExtractorError with expected=True when no video password is given 2015-02-18 22:00:12 +01:00
Philipp Hagemeister
85d5866177 [yahoo] Remove md5sum from test case
The md5 sum has changed repeatedly, and we check whether it looks like a video anyways nowadays.
2015-02-18 20:03:04 +01:00
Philipp Hagemeister
9789d7535d [xtube] Fix test case 2015-02-18 19:58:41 +01:00
Philipp Hagemeister
d8443cd3f7 [wsj] Correct test case 2015-02-18 19:56:24 +01:00
Philipp Hagemeister
d47c26e168 [brightcove] Correct keys in playlists 2015-02-18 19:56:10 +01:00
29 changed files with 443 additions and 79 deletions

View File

@@ -1,7 +1,7 @@
all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean:
rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin

View File

@@ -68,6 +68,7 @@
- **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
- **CBS**
- **CBSNews**: CBS News
- **CBSSports**
- **CeskaTelevize**
- **channel9**: Channel 9
- **Chilloutzone**
@@ -121,6 +122,7 @@
- **EllenTV**
- **EllenTV:clips**
- **ElPais**: El País
- **Embedly**
- **EMPFlix**
- **Engadget**
- **Eporner**
@@ -190,6 +192,7 @@
- **ign.com**
- **imdb**: Internet Movie Database trailers
- **imdb:list**: Internet Movie Database lists
- **Imgur**
- **Ina**
- **InfoQ**
- **Instagram**
@@ -262,6 +265,7 @@
- **myvideo**
- **MyVidster**
- **n-tv.de**
- **NationalGeographic**
- **Naver**
- **NBA**
- **NBC**
@@ -319,6 +323,7 @@
- **podomatic**
- **PornHd**
- **PornHub**
- **PornHubPlaylist**
- **Pornotube**
- **PornoXO**
- **PromptFile**
@@ -352,6 +357,7 @@
- **rutube:movie**: Rutube movies
- **rutube:person**: Rutube person videos
- **RUTV**: RUTV.RU
- **Sandia**: Sandia National Laboratories
- **Sapo**: SAPO Vídeos
- **savefrom.net**
- **SBS**: sbs.com.au

View File

@@ -113,6 +113,16 @@ def expect_info_dict(self, got_dict, expected_dict):
self.assertTrue(
got.startswith(start_str),
'field %s (value: %r) should start with %r' % (info_field, got, start_str))
elif isinstance(expected, compat_str) and expected.startswith('contains:'):
got = got_dict.get(info_field)
contains_str = expected[len('contains:'):]
self.assertTrue(
isinstance(got, compat_str),
'Expected a %s object, but got %s for field %s' % (
compat_str.__name__, type(got).__name__, info_field))
self.assertTrue(
contains_str in got,
'field %s (value: %r) should contain %r' % (info_field, got, contains_str))
elif isinstance(expected, type):
got = got_dict.get(info_field)
self.assertTrue(isinstance(got, expected),

View File

@@ -370,6 +370,10 @@ class TestUtil(unittest.TestCase):
"playlist":[{"controls":{"all":null}}]
}''')
inp = '"SAND Number: SAND 2013-7800P\\nPresenter: Tom Russo\\nHabanero Software Training - Xyce Software\\nXyce, Sandia\\u0027s"'
json_code = js_to_json(inp)
self.assertEqual(json.loads(json_code), json.loads(inp))
def test_js_to_json_edgecases(self):
on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})

View File

@@ -1534,29 +1534,18 @@ class YoutubeDL(object):
return res
def list_formats(self, info_dict):
def line(format, idlen=20):
return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
format['format_id'],
format['ext'],
self.format_resolution(format),
self._format_note(format),
))
formats = info_dict.get('formats', [info_dict])
idlen = max(len('format code'),
max(len(f['format_id']) for f in formats))
formats_s = [
line(f, idlen) for f in formats
table = [
[f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
for f in formats
if f.get('preference') is None or f['preference'] >= -1000]
if len(formats) > 1:
formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
header_line = line({
'format_id': 'format code', 'ext': 'extension',
'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
header_line = ['format code', 'extension', 'resolution', 'note']
self.to_screen(
'[info] Available formats for %s:\n%s\n%s' %
(info_dict['id'], header_line, '\n'.join(formats_s)))
'[info] Available formats for %s:\n%s' %
(info_dict['id'], render_table(header_line, table)))
def list_thumbnails(self, info_dict):
thumbnails = info_dict.get('thumbnails')

View File

@@ -58,6 +58,7 @@ from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .cbsnews import CBSNewsIE
from .cbssports import CBSSportsIE
from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
@@ -121,6 +122,7 @@ from .ellentv import (
EllenTVClipsIE,
)
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
@@ -204,6 +206,7 @@ from .imdb import (
ImdbIE,
ImdbListIE
)
from .imgur import ImgurIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
@@ -282,6 +285,7 @@ from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .myvidster import MyVidsterIE
from .nationalgeographic import NationalGeographicIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@@ -350,7 +354,10 @@ from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornhub import (
PornHubIE,
PornHubPlaylistIE,
)
from .pornotube import PornotubeIE
from .pornoxo import PornoXOIE
from .promptfile import PromptFileIE
@@ -386,6 +393,7 @@ from .rutube import (
RutubePersonIE,
)
from .rutv import RUTVIE
from .sandia import SandiaIE
from .sapo import SapoIE
from .savefrom import SaveFromIE
from .sbs import SBSIE

View File

@@ -95,6 +95,7 @@ class BrightcoveIE(InfoExtractor):
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
'info_dict': {
'title': 'Sealife',
'id': '3550319591001',
},
'playlist_mincount': 7,
},
@@ -247,7 +248,7 @@ class BrightcoveIE(InfoExtractor):
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
return self.playlist_result(videos, playlist_id=playlist_info['id'],
return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):

View File

@@ -1,7 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -39,8 +37,7 @@ class CBSIE(InfoExtractor):
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
real_id = self._search_regex(
r"video\.settings\.pid\s*=\s*'([^']+)';",

View File

@@ -0,0 +1,30 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class CBSSportsIE(InfoExtractor):
_VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
_TEST = {
'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
'info_dict': {
'id': '_d5_GbO8p1sT',
'ext': 'flv',
'title': 'US Open flashbacks: 1990s',
'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
section = mobj.group('section')
video_id = mobj.group('id')
all_videos = self._download_json(
'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
video_id)
# The json file contains the info of all the videos in the section
video_info = next(v for v in all_videos if v['pcid'] == video_id)
return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')

View File

@@ -0,0 +1,16 @@
# encoding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
class EmbedlyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
_TESTS = [{
'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
'only_matching': True,
}]
def _real_extract(self, url):
return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))

View File

@@ -14,6 +14,7 @@ class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
(?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
5min:)
(?P<id>\d+)
'''

View File

@@ -532,7 +532,7 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
'upload_date': '20150204',
'upload_date': '20150212',
'uploader': 'The National Archives UK',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'uploader_id': 'NationalArchives08',

View File

@@ -0,0 +1,97 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
mimetype2ext,
ExtractorError,
)
class ImgurIE(InfoExtractor):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 'MRW gifv is up and running without any bugs',
'description': 'The Internet\'s visual storytelling community. Explore, share, and discuss the best visual stories the Internet has to offer.',
},
}, {
'url': 'https://imgur.com/A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 'MRW gifv is up and running without any bugs',
'description': 'The Internet\'s visual storytelling community. Explore, share, and discuss the best visual stories the Internet has to offer.',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
width = int_or_none(self._search_regex(
r'<param name="width" value="([0-9]+)"',
webpage, 'width', fatal=False))
height = int_or_none(self._search_regex(
r'<param name="height" value="([0-9]+)"',
webpage, 'height', fatal=False))
video_elements = self._search_regex(
r'(?s)<div class="video-elements">(.*?)</div>',
webpage, 'video elements', default=None)
if not video_elements:
raise ExtractorError(
'No sources found for video %s. Maybe an image?' % video_id,
expected=True)
formats = []
for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
formats.append({
'format_id': m.group('type').partition('/')[2],
'url': self._proto_relative_url(m.group('src')),
'ext': mimetype2ext(m.group('type')),
'acodec': 'none',
'width': width,
'height': height,
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
gif_json = self._search_regex(
r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
webpage, 'GIF code', fatal=False)
if gif_json:
gifd = self._parse_json(
gif_json, video_id, transform_source=js_to_json)
formats.append({
'format_id': 'gif',
'preference': -10,
'width': width,
'height': height,
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
'url': self._proto_relative_url(gifd['gifUrl']),
'filesize': gifd.get('size'),
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'description': self._og_search_description(webpage),
'title': self._og_search_title(webpage),
}

View File

@@ -0,0 +1,38 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
smuggle_url,
url_basename,
)
class NationalGeographicIE(InfoExtractor):
_VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?'
_TEST = {
'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
'info_dict': {
'id': '4DmDACA6Qtk_',
'ext': 'flv',
'title': 'Mating Crabs Busted by Sharks',
'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
},
'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
name = url_basename(url)
webpage = self._download_webpage(url, name)
feed_url = self._search_regex(r'data-feed-url="([^"]+)"', webpage, 'feed url')
guid = self._search_regex(r'data-video-guid="([^"]+)"', webpage, 'guid')
feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
content = feed.find('.//{http://search.yahoo.com/mrss/}content')
theplatform_id = url_basename(content.attrib.get('url'))
return self.url_result(smuggle_url(
'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
# For some reason, the normal links don't work and we must force the use of f4m
{'force_smil_url': True}))

View File

@@ -18,13 +18,13 @@ class NBCIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
# md5 checksum is not stable
'info_dict': {
'id': 'bTmnLCvIbaaH',
'id': 'c9xnCo0YPOPH',
'ext': 'flv',
'title': 'I Am a Firefighter',
'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
},
},
{

View File

@@ -29,6 +29,9 @@ class NetzkinoIE(InfoExtractor):
'timestamp': 1344858571,
'age_limit': 12,
},
'params': {
'skip_download': 'Download only works from Germany',
}
}
def _real_extract(self, url):

View File

@@ -1,9 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
@@ -11,7 +8,7 @@ from ..utils import (
class PatreonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
_VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
_TESTS = [
{
'url': 'http://www.patreon.com/creation?hid=743933',
@@ -35,6 +32,23 @@ class PatreonIE(InfoExtractor):
'thumbnail': 're:^https?://.*$',
},
},
{
'url': 'https://www.patreon.com/creation?hid=1682498',
'info_dict': {
'id': 'SU4fj_aEMVw',
'ext': 'mp4',
'title': 'I\'m on Patreon!',
'uploader': 'TraciJHines',
'thumbnail': 're:^https?://.*$',
'upload_date': '20150211',
'description': 'md5:c5a706b1f687817a3de09db1eb93acd4',
'uploader_id': 'TraciJHines',
},
'params': {
'noplaylist': True,
'skip_download': True,
}
}
]
# Currently Patreon exposes download URL via hidden CSS, so login is not
@@ -65,26 +79,29 @@ class PatreonIE(InfoExtractor):
'''
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage).strip()
attach_fn = self._html_search_regex(
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
embed = self._html_search_regex(
r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
webpage, 'embedded URL', default=None)
if attach_fn is not None:
video_url = 'http://www.patreon.com' + attach_fn
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
r'<strong>(.*?)</strong> is creating', webpage, 'uploader')
elif embed is not None:
return self.url_result(embed)
else:
playlist_js = self._search_regex(
playlist = self._parse_json(self._search_regex(
r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])',
webpage, 'playlist JSON')
playlist_json = js_to_json(playlist_js)
playlist = json.loads(playlist_json)
webpage, 'playlist JSON'),
video_id, transform_source=js_to_json)
data = playlist[0]
video_url = self._proto_relative_url(data['mp3'])
thumbnail = self._proto_relative_url(data.get('cover'))

View File

@@ -56,7 +56,7 @@ class PornHubIE(InfoExtractor):
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
@@ -110,3 +110,33 @@ class PornHubIE(InfoExtractor):
'formats': formats,
'age_limit': 18,
}
class PornHubPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.pornhub.com/playlist/6201671',
'info_dict': {
'id': '6201671',
'title': 'P0p4',
},
'playlist_mincount': 35,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
entries = [
self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
]
playlist = self._parse_json(
self._search_regex(
r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
playlist_id)
return self.playlist_result(
entries, playlist_id, playlist.get('title'), playlist.get('description'))

View File

@@ -0,0 +1,117 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import json
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_request,
compat_urlparse,
)
from ..utils import (
int_or_none,
js_to_json,
mimetype2ext,
unified_strdate,
)
class SandiaIE(InfoExtractor):
IE_DESC = 'Sandia National Laboratories'
_VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)'
_TEST = {
'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
'md5': '9422edc9b9a60151727e4b6d8bef393d',
'info_dict': {
'id': '24aace4429fc450fb5b38cdbf424a66e1d',
'ext': 'mp4',
'title': 'Xyce Software Training - Section 1',
'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
'upload_date': '20120904',
'duration': 7794,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
webpage = self._download_webpage(req, video_id)
js_path = self._search_regex(
r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
webpage, 'JS code URL')
js_url = compat_urlparse.urljoin(url, js_path)
js_code = self._download_webpage(
js_url, video_id, note='Downloading player')
def extract_str(key, **args):
return self._search_regex(
r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
js_code, key, **args)
def extract_data(key, **args):
data_json = extract_str(key, **args)
if data_json is None:
return data_json
return self._parse_json(
data_json, video_id, transform_source=js_to_json)
formats = []
for i in itertools.count():
fd = extract_data('VideoUrls[%d]' % i, default=None)
if fd is None:
break
formats.append({
'format_id': '%s' % i,
'format_note': fd['MimeType'].partition('/')[2],
'ext': mimetype2ext(fd['MimeType']),
'url': fd['Location'],
'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
})
self._sort_formats(formats)
slide_baseurl = compat_urlparse.urljoin(
url, extract_data('SlideBaseUrl'))
slide_template = slide_baseurl + re.sub(
r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
slides = []
last_slide_time = 0
for i in itertools.count(1):
sd = extract_str('Slides[%d]' % i, default=None)
if sd is None:
break
timestamp = int_or_none(self._search_regex(
r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
sd, 'slide %s timestamp' % i, fatal=False))
slides.append({
'url': slide_template % i,
'duration': timestamp - last_slide_time,
})
last_slide_time = timestamp
formats.append({
'format_id': 'slides',
'protocol': 'slideshow',
'url': json.dumps(slides),
'preference': -10000, # Downloader not yet written
})
self._sort_formats(formats)
title = extract_data('Title')
description = extract_data('Description', fatal=False)
duration = int_or_none(extract_data(
'Duration', fatal=False), scale=1000)
upload_date = unified_strdate(extract_data('AirDate', fatal=False))
return {
'id': video_id,
'title': title,
'description': description,
'formats': formats,
'upload_date': upload_date,
'duration': duration,
}

View File

@@ -25,7 +25,6 @@ class SockshareIE(InfoExtractor):
'id': '437BE28B89D799D7',
'title': 'big_buck_bunny_720p_surround.avi',
'ext': 'avi',
'thumbnail': 're:^http://.*\.jpg$',
}
}
@@ -45,7 +44,7 @@ class SockshareIE(InfoExtractor):
''', webpage, 'hash')
fields = {
"hash": confirm_hash,
"hash": confirm_hash.encode('utf-8'),
"confirm": "Continue as Free User"
}
@@ -68,7 +67,7 @@ class SockshareIE(InfoExtractor):
webpage, 'title', default=None)
thumbnail = self._html_search_regex(
r'<img\s+src="([^"]*)".+?name="bg"',
webpage, 'thumbnail')
webpage, 'thumbnail', default=None)
formats = [{
'format_id': 'sd',

View File

@@ -4,11 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class TheOnionIE(InfoExtractor):
_VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
_VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
_TEST = {
'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
'md5': '19eaa9a39cf9b9804d982e654dc791ee',
@@ -22,10 +21,8 @@ class TheOnionIE(InfoExtractor):
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
article_id = mobj.group('article_id')
webpage = self._download_webpage(url, article_id)
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'"videoId":\s(\d+),', webpage, 'video ID')
@@ -34,10 +31,6 @@ class TheOnionIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage)
sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
if not sources:
raise ExtractorError(
'No sources found for video %s' % video_id, expected=True)
formats = []
for src, type_ in sources:
if type_ == 'video/mp4':
@@ -54,15 +47,15 @@ class TheOnionIE(InfoExtractor):
})
elif type_ == 'application/x-mpegURL':
formats.extend(
self._extract_m3u8_formats(src, video_id, preference=-1))
self._extract_m3u8_formats(src, display_id, preference=-1))
else:
self.report_warning(
'Encountered unexpected format: %s' % type_)
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,

View File

@@ -71,7 +71,9 @@ class ThePlatformIE(SubtitlesInfoExtractor):
if not provider_id:
provider_id = 'dJ5BDC'
if mobj.group('config'):
if smuggled_data.get('force_smil_url', False):
smil_url = url
elif mobj.group('config'):
config_url = url + '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')

View File

@@ -175,7 +175,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({
'password': password,

View File

@@ -45,19 +45,17 @@ class WebOfStoriesIE(InfoExtractor):
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
story_filename = self._search_regex(
r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
speaker_id = self._search_regex(
r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
story_id = self._search_regex(
r'\.storyId\((\d+)\)', webpage, 'story ID')
speaker_type = self._search_regex(
r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
great_life = self._search_regex(
r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
webpage, 'embed params').split(',')]
(
_, speaker_id, story_id, story_duration,
speaker_type, great_life, _thumbnail, _has_subtitles,
story_filename, _story_order) = embed_params
is_great_life_series = great_life == 'true'
duration = int_or_none(self._search_regex(
r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
duration = int_or_none(story_duration)
# URL building, see: http://www.webofstories.com/scripts/player.js
ms_prefix = ''

View File

@@ -18,8 +18,8 @@ class WSJIE(InfoExtractor):
'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'ext': 'mp4',
'upload_date': '20150202',
'uploader_id': 'bbright',
'creator': 'bbright',
'uploader_id': 'jdesai',
'creator': 'jdesai',
'categories': list, # a long list
'duration': 90,
'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',

View File

@@ -22,7 +22,7 @@ class XTubeIE(InfoExtractor):
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
'description': 'http://www.xtube.com an ET kind of thing',
'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,

View File

@@ -24,7 +24,6 @@ class YahooIE(InfoExtractor):
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
'ext': 'mp4',

View File

@@ -1560,8 +1560,8 @@ def js_to_json(code):
return '"%s"' % v
res = re.sub(r'''(?x)
"(?:[^"\\]*(?:\\\\|\\")?)*"|
'(?:[^'\\]*(?:\\\\|\\')?)*'|
"(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
'(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
[a-zA-Z_][.a-zA-Z_0-9]*
''', fix_kv, code)
res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
@@ -1616,6 +1616,15 @@ def args_to_str(args):
return ' '.join(shlex_quote(a) for a in args)
def mimetype2ext(mt):
_, _, res = mt.rpartition('/')
return {
'x-ms-wmv': 'wmv',
'x-mp4-fragmented': 'mp4',
}.get(res, res)
def urlhandle_detect_ext(url_handle):
try:
url_handle.headers
@@ -1631,7 +1640,7 @@ def urlhandle_detect_ext(url_handle):
if e:
return e
return getheader('Content-Type').split("/")[1]
return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2015.02.18.1'
__version__ = '2015.02.19.3'