Compare commits

...

16 Commits

Author SHA1 Message Date
Philipp Hagemeister
614582bcc4 release 2014.08.24 2014-08-24 02:44:36 +02:00
Philipp Hagemeister
e1ab5000b2 [brightcove] Add support for videoId= in og:video meta (Fixes #3571) 2014-08-24 02:41:21 +02:00
Philipp Hagemeister
a5ed3e571e [brightcove] Detect geoblocking 2014-08-24 02:40:26 +02:00
Philipp Hagemeister
10eaeb20c5 [generic] Require og:video URLs to contain a dot 2014-08-24 02:29:56 +02:00
Philipp Hagemeister
fa8deaf38b [generic] Prevent from downloading a .swf as a video
We're seeing quite a number of people who do not put a video file in the og:video field, but the player URL. Try to detect some of these and filter them out.
2014-08-24 02:24:49 +02:00
Philipp Hagemeister
6857590059 [brightcove] Add a truncated URL warning message (#3571) 2014-08-24 02:11:26 +02:00
Philipp Hagemeister
a3db22ebdf [grooveshark] Use proper imports 2014-08-24 02:06:59 +02:00
Philipp Hagemeister
c8e9a235d9 [generic] Add support for camtasia videos (Fixes #3574) 2014-08-24 02:02:17 +02:00
Philipp Hagemeister
30b871b0ca Merge remote-tracking branch 'origin/master' 2014-08-24 01:34:28 +02:00
Philipp Hagemeister
eb9da9b732 [grooveshark] Fix test md5sum 2014-08-24 01:33:55 +02:00
Philipp Hagemeister
d769be6c96 [grooveshark,http] Make HTTP POST downloads work 2014-08-24 01:31:35 +02:00
Sergey M․
a54bda3ae2 [wat] Add support for SD and HD videos (Closes #3558) 2014-08-24 02:22:10 +07:00
Philipp Hagemeister
00558d9414 Merge remote-tracking branch 'sehrgut/Grooveshark'
Conflicts:
	youtube_dl/__init__.py
	youtube_dl/extractor/__init__.py
2014-08-23 16:41:14 +02:00
Keith Beckman
ee1a7032d5 Fixed errors found by travisci:
py26: re.split can't take flags. use inline flags or re.compile
py27: info_dict must be serializable. remove request object
py335, py34: no urlparse module. use utils.compat_urlparse
2014-05-20 22:28:32 -04:00
Keith Beckman
7ed806d241 Fixed pyflakes and pep8 warnings 2014-05-20 02:55:21 -04:00
Keith Beckman
dd06c95e43 Added new IE for Grooveshark 2014-05-20 02:47:34 -04:00
9 changed files with 353 additions and 19 deletions

View File

@@ -71,6 +71,7 @@ __authors__ = (
'Sebastian Haas',
'Alexander Kirk',
'Erik Johnson',
'Keith Beckman',
)
__license__ = 'Public Domain'

View File

@@ -27,8 +27,16 @@ class HttpFD(FileDownloader):
headers['Youtubedl-user-agent'] = info_dict['user_agent']
if 'http_referer' in info_dict:
headers['Referer'] = info_dict['http_referer']
basic_request = compat_urllib_request.Request(url, None, headers)
request = compat_urllib_request.Request(url, None, headers)
add_headers = info_dict.get('http_headers')
if add_headers:
headers.update(add_headers)
data = info_dict.get('http_post_data')
http_method = info_dict.get('http_method')
basic_request = compat_urllib_request.Request(url, data, headers)
request = compat_urllib_request.Request(url, data, headers)
if http_method is not None:
basic_request.get_method = lambda: http_method
request.get_method = lambda: http_method
is_test = self.params.get('test', False)

View File

@@ -126,6 +126,7 @@ from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
from .hark import HarkIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE

View File

@@ -154,12 +154,14 @@ class BrightcoveIE(InfoExtractor):
def _extract_brightcove_urls(cls, webpage):
"""Return a list of all Brightcove URLs from the webpage """
url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
url_m = re.search(
r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
webpage)
if url_m:
url = unescapeHTML(url_m.group(1))
# Some sites don't add it, we can't download with this url, for example:
# http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
if 'playerKey' in url:
if 'playerKey' in url or 'videoId' in url:
return [url]
matches = re.findall(
@@ -188,9 +190,13 @@ class BrightcoveIE(InfoExtractor):
referer = smuggled_data.get('Referer', url)
return self._get_video_info(
videoPlayer[0], query_str, query, referer=referer)
else:
elif 'playerKey' in query:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
else:
raise ExtractorError(
'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
expected=True)
def _get_video_info(self, video_id, query_str, query, referer=None):
request_url = self._FEDERATED_URL_TEMPLATE % query_str
@@ -202,6 +208,13 @@ class BrightcoveIE(InfoExtractor):
req.add_header('Referer', referer)
webpage = self._download_webpage(req, video_id)
error_msg = self._html_search_regex(
r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage,
'error message', default=None)
if error_msg is not None:
raise ExtractorError(
'brightcove said: %s' % error_msg, expected=True)
self.report_extraction(video_id)
info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json')
info = json.loads(info)['data']

View File

@@ -84,6 +84,12 @@ class InfoExtractor(object):
format, irrespective of the file format.
-1 for default (order by other properties),
-2 or smaller for less than default.
* http_referer HTTP Referer header value to set.
* http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
to add to the request.
* http_post_data Additional data to send with a POST
request.
url: Final video URL.
ext: Video filename extension.
format: The video format, defaults to ext (used for --get-format)

View File

@@ -15,6 +15,7 @@ from ..utils import (
compat_xml_parse_error,
ExtractorError,
float_or_none,
HEADRequest,
orderedSet,
parse_xml,
@@ -305,6 +306,30 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True,
}
},
# Camtasia studio
{
'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
'playlist': [{
'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
'info_dict': {
'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
'ext': 'flv',
'duration': 2235.90,
}
}, {
'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
'info_dict': {
'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
'ext': 'flv',
'duration': 2235.93,
}
}],
'info_dict': {
'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
}
}
]
@@ -389,6 +414,43 @@ class GenericIE(InfoExtractor):
'entries': entries,
}
def _extract_camtasia(self, url, video_id, webpage):
    """ Returns None if no camtasia video can be found.

    Looks for the ``csConfigFile`` flash variable in the webpage, downloads
    the XML configuration it points to (resolved relative to ``url``) and
    builds one playlist entry per child of the ``playlist/array/fileset``
    node that carries a ``uri`` element.

    On success, returns a playlist info dict with the keys ``_type``,
    ``entries`` and ``title``.
    """
    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing we can extract
        return None

    entries = []
    # Iterate over the element itself; Element.getchildren() is deprecated
    # and no longer exists in Python 3.9+
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        # The duration element is optional for our purposes
        duration_n = n.find('./duration')
        duration = None
        if duration_n is not None:
            duration = float_or_none(duration_n.text)

        entries.append({
            # File name of the media URI without its extension
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': duration,
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
def _real_extract(self, url):
if url.startswith('//'):
return {
@@ -477,6 +539,11 @@ class GenericIE(InfoExtractor):
except compat_xml_parse_error:
pass
# Is it a Camtasia project?
camtasia_res = self._extract_camtasia(url, video_id, webpage)
if camtasia_res is not None:
return camtasia_res
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows handling those cases in a generic way
@@ -764,7 +831,12 @@ class GenericIE(InfoExtractor):
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
return '.' in vpath and not vpath.endswith('.swf')
found = list(filter(
check_video,
re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
if not found:
# HTML5 video
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)

View File

@@ -0,0 +1,190 @@
# coding: utf-8
from __future__ import unicode_literals
import time
import math
import os.path
import re
from .common import InfoExtractor
from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
from ..utils import (
compat_urllib_parse,
compat_urlparse,
)
class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
    """HTML parser collecting <object> elements and their <param> children.

    After feeding a document, ``objects`` contains one dict per closed
    <object> element, of the form
    ``{'attrs': {name: value, ...}, 'params': [{name: value, ...}, ...]}``.
    """

    def __init__(self):
        # The <object> element currently being parsed, None outside of one
        self._current_object = None
        self.objects = []
        compat_html_parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'object':
            self._current_object = {'attrs': attrs, 'params': []}
        elif tag == 'param' and self._current_object is not None:
            # A <param> outside of any <object> has nothing to attach to
            # and is ignored
            self._current_object['params'].append(attrs)

    def handle_endtag(self, tag):
        # Ignore a stray </object> that was never opened instead of
        # recording None as an object
        if tag == 'object' and self._current_object is not None:
            self.objects.append(self._current_object)
            self._current_object = None

    @classmethod
    def extract_object_tags(cls, html):
        """Parse html and return the list of collected <object> dicts."""
        p = cls()
        p.feed(html)
        p.close()
        return p.objects
class GroovesharkIE(InfoExtractor):
    """Information extractor for Grooveshark song URLs (``/#!/s/<name>/<token>``).

    The extraction requests the player page, the player bootstrap data and
    the track metadata from ``preload.php``, then returns an info dict whose
    download is an HTTP POST to the ``stream.php`` of the host named in the
    metadata.
    """
    _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
    _TEST = {
        'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
        'md5': '7ecf8aefa59d6b2098517e1baa530023',
        'info_dict': {
            'id': '6SS1DW',
            'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
            'ext': 'mp3',
            'duration': 227,
        }
    }

    # Switches for the two optional preparatory requests in _real_extract
    do_playerpage_request = True
    do_bootstrap_request = True

    def _parse_target(self, target):
        """Split a song URL into (parsed URL, fragment path, song token).

        The fragment path is the URL fragment without its first character
        and without any query part; the token is its last path component.
        """
        uri = compat_urlparse.urlparse(target)
        song_hash = uri.fragment[1:].split('?')[0]
        token = os.path.basename(song_hash.rstrip('/'))
        return (uri, song_hash, token)

    def _build_bootstrap_url(self, target):
        """Return (preload.php URL requesting a communication token, song token)."""
        (uri, song_hash, token) = self._parse_target(target)
        query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(song_hash, safe=''), self.ts)
        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)

    def _build_meta_url(self, target):
        """Return (preload.php URL requesting the track metadata, song token)."""
        (uri, song_hash, token) = self._parse_target(target)
        query = 'hash=%s&%d' % (compat_urllib_parse.quote(song_hash, safe=''), self.ts)
        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)

    def _build_stream_url(self, meta):
        """Return the stream.php URL on the host given in meta['streamKey']['ip']."""
        return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))

    def _build_swf_referer(self, target, obj):
        """Return the URL of the player object's ``data`` attribute on the target's host."""
        (uri, _, _) = self._parse_target(target)
        return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))

    def _transform_bootstrap(self, js):
        """Cut the JSON value out of the bootstrap JavaScript.

        Keeps everything before the first line starting with ``try {`` and
        returns the right-hand side of the first `` = `` assignment in it.
        """
        # Inline (?m) flag instead of a flags argument to re.split
        return re.split(r'(?m)^\s*try\s*{', js)[0] \
            .split(' = ', 1)[1].strip().rstrip(';')

    def _transform_meta(self, js):
        """Return the right-hand side of the assignment on the first line of js."""
        # Split on the first '=' only, so that a '=' inside the assigned
        # value does not truncate it
        return js.split('\n')[0].split('=', 1)[1].rstrip(';')

    def _get_meta(self, target):
        """Download and return the 'getStreamKeyWithSong' track metadata.

        Raises ExtractorError if the key is missing or its value is None.
        """
        (meta_url, token) = self._build_meta_url(target)
        self.to_screen('Metadata URL: %s' % meta_url)

        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
        req = compat_urllib_request.Request(meta_url, headers=headers)
        res = self._download_json(req, token,
                                  transform_source=self._transform_meta)

        if 'getStreamKeyWithSong' not in res:
            raise ExtractorError(
                'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')

        if res['getStreamKeyWithSong'] is None:
            raise ExtractorError(
                'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
                expected=True)

        return res['getStreamKeyWithSong']

    def _get_bootstrap(self, target):
        """Download the player bootstrap data (non-fatal) and return the result."""
        (bootstrap_url, token) = self._build_bootstrap_url(target)

        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
        req = compat_urllib_request.Request(bootstrap_url, headers=headers)
        res = self._download_json(req, token, fatal=False,
                                  note='Downloading player bootstrap data',
                                  errnote='Unable to download player bootstrap data',
                                  transform_source=self._transform_bootstrap)
        return res

    def _get_playerpage(self, target):
        """Download the player page (non-fatal).

        Returns (webpage, objects) where objects is the list of <object>
        elements with id 'jsPlayerEmbed', or None if no page was obtained.
        Raises ExtractorError if the page contains an error message.
        """
        (_, _, token) = self._parse_target(target)

        webpage = self._download_webpage(
            target, token,
            note='Downloading player page',
            errnote='Unable to download player page',
            fatal=False)

        # The download is non-fatal; treat any empty result as "no page"
        if not webpage:
            return (webpage, None)

        # Search (for example German) error message
        error_msg = self._html_search_regex(
            r'<div id="content">\s*<h2>(.*?)</h2>', webpage,
            'error message', default=None)
        if error_msg is not None:
            error_msg = error_msg.replace('\n', ' ')
            # Reported by the site itself, so not a bug to be reported
            raise ExtractorError(
                'Grooveshark said: %s' % error_msg, expected=True)

        o = GroovesharkHtmlParser.extract_object_tags(webpage)
        # Not every <object> necessarily has an id attribute
        return (webpage, [x for x in o if x['attrs'].get('id') == 'jsPlayerEmbed'])

    def _real_initialize(self):
        self.ts = int(time.time() * 1000)  # timestamp in millis

    def _real_extract(self, url):
        (_, _, token) = self._parse_target(url)

        # 1. Fill cookiejar by making a request to the player page
        swf_referer = None
        if self.do_playerpage_request:
            (_, player_objs) = self._get_playerpage(url)
            # player_objs is None without a page and may be an empty list
            # if the page holds no matching <object>
            if player_objs:
                swf_referer = self._build_swf_referer(url, player_objs[0])
                self.to_screen('SWF Referer: %s' % swf_referer)

        # 2. Ask preload.php for swf bootstrap data to better mimic webapp
        if self.do_bootstrap_request:
            bootstrap = self._get_bootstrap(url)
            # The bootstrap download is non-fatal, so only report the token
            # when it was actually obtained
            if isinstance(bootstrap, dict) and 'getCommunicationToken' in bootstrap:
                self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])

        # 3. Ask preload.php for track metadata.
        meta = self._get_meta(url)

        # 4. Construct stream request for track.
        stream_url = self._build_stream_url(meta)
        # uSecs is in microseconds; round up to whole seconds
        duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
        post_dict = {'streamKey': meta['streamKey']['streamKey']}
        post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8')
        headers = {
            'Content-Length': len(post_data),
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        if swf_referer is not None:
            headers['Referer'] = swf_referer

        return {
            'id': token,
            'title': meta['song']['Name'],
            'http_method': 'POST',
            'url': stream_url,
            'ext': 'mp3',
            'format': 'mp3 audio',
            'duration': duration,
            'http_post_data': post_data,
            'http_headers': headers,
        }

View File

@@ -2,27 +2,30 @@
from __future__ import unicode_literals
import re
import time
import hashlib
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unified_strdate,
)
class WatIE(InfoExtractor):
_VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
_VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'
IE_NAME = 'wat.tv'
_TEST = {
'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
'md5': 'ce70e9223945ed26a8056d413ca55dc9',
'info_dict': {
'id': '10631273',
'id': '11713067',
'display_id': 'soupe-figues-l-orange-aux-epices',
'ext': 'mp4',
'title': 'World War Z - Philadelphia VOST',
'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
},
'params': {
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
'title': 'Soupe de figues à l\'orange et aux épices',
'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
'upload_date': '20140819',
'duration': 120,
},
}
@@ -36,13 +39,20 @@ class WatIE(InfoExtractor):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
mobj = re.match(self._VALID_URL, url)
short_id = mobj.group('shortID')
webpage = self._download_webpage(url, short_id)
short_id = mobj.group('short_id')
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id or short_id)
real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
video_info = self.download_video_info(real_id)
if video_info.get('geolock'):
raise ExtractorError('This content is not available in your area', expected=True)
chapters = video_info['chapters']
first_chapter = chapters[0]
files = video_info['files']
first_file = files[0]
if real_id_for_chapter(first_chapter) != real_id:
self.to_screen('Multipart video detected')
@@ -61,12 +71,45 @@ class WatIE(InfoExtractor):
upload_date = unified_strdate(first_chapter['date_diffusion'])
# Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url
formats = [{
'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
'format_id': 'Mobile',
}]
fmts = [('SD', 'web')]
if first_file.get('hasHD'):
fmts.append(('HD', 'webhd'))
def compute_token(param):
timestamp = '%08x' % int(time.time())
magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
for fmt in fmts:
webid = '/%s/%s' % (fmt[1], real_id)
video_url = self._download_webpage(
'http://www.wat.tv/get%s?token=%s&getURL=1' % (webid, compute_token(webid)),
real_id,
'Downloding %s video URL' % fmt[0],
'Failed to download %s video URL' % fmt[0],
False)
if not video_url:
continue
formats.append({
'url': video_url,
'ext': 'mp4',
'format_id': fmt[0],
})
return {
'id': real_id,
'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
'display_id': display_id,
'title': first_chapter['title'],
'thumbnail': first_chapter['preview'],
'description': first_chapter['description'],
'view_count': video_info['views'],
'upload_date': upload_date,
'duration': first_file['duration'],
'formats': formats,
}

View File

@@ -1,2 +1,2 @@
__version__ = '2014.08.23'
__version__ = '2014.08.24'