Compare commits

...

18 Commits

Author SHA1 Message Date
Philipp Hagemeister
34ca5d9ba0 release 2014.03.11 2014-03-11 16:51:50 +01:00
Philipp Hagemeister
60cc4dc4b4 [generic/funnyordie] Add support for funnyordie embeds (Fixes #2546) 2014-03-11 16:51:36 +01:00
Philipp Hagemeister
db95dc13a1 [playvid] Simplify (#2539) 2014-03-10 20:55:47 +01:00
Philipp Hagemeister
777ac90791 Merge remote-tracking branch 'MikeCol/playvid_extract' 2014-03-10 20:45:45 +01:00
Philipp Hagemeister
04f9bebbcb Merge remote-tracking branch 'jaimeMF/remove_global_opener' 2014-03-10 20:42:54 +01:00
MikeCol
4ea3137e41 Playvid extractor 2014-03-10 20:16:49 +01:00
Jaime Marquínez Ferrándiz
a0792b738e Don't install the global url opener
All the code uses now the urlopen method of YoutubeDL
2014-03-10 19:04:51 +01:00
Jaime Marquínez Ferrándiz
19a41fc613 Don't set the global socket timeout
Use the timeout argument of the `OpenerDirector.open` method instead
2014-03-10 19:03:37 +01:00
Sergey M․
3ee52157fb [vgtrk] Rename vesti extractor 2014-03-11 00:58:05 +07:00
Sergey M․
c4d197ee2d [vesti] Fix _VALID_URL regex 2014-03-11 00:49:41 +07:00
Philipp Hagemeister
a33932cfe3 [vevo] Correct test value
The date is now interpreted as UTC for consistency.
2014-03-10 17:56:54 +01:00
Philipp Hagemeister
bcf89ce62c [generic] Suppress warning about doctypes in RSS parser 2014-03-10 17:31:32 +01:00
Philipp Hagemeister
e3899d0e00 Merge branch 'master' of github.com:rg3/youtube-dl 2014-03-10 16:42:22 +01:00
Philipp Hagemeister
dcb00da49c [depositfiles] Remove extractor
This site requires a CAPTCHA to download, supports arbitrary files and not only audio/video, and I can't find a single uncopyrighted video with a quick google search.
Closes #1255
2014-03-10 16:41:08 +01:00
Sergey M․
aa51d20d19 [vesti] Skip geo restricted test 2014-03-10 22:31:22 +07:00
Philipp Hagemeister
ae7ed92057 [youtube] Fix up invalid JSON 2014-03-10 13:35:45 +01:00
Philipp Hagemeister
e45b31d9bd [vevo] Interpret date as UTC instead of local time 2014-03-10 13:12:57 +01:00
Philipp Hagemeister
5a25f39653 Correct extractor documentation 2014-03-10 13:09:55 +01:00
12 changed files with 145 additions and 87 deletions

View File

@@ -701,8 +701,11 @@ class YoutubeDL(object):
else:
formats = info_dict['formats']
if not formats:
raise ExtractorError('No video formats found!')
# We check that all the formats have the format and format_id fields
for (i, format) in enumerate(formats):
for i, format in enumerate(formats):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
if format.get('format') is None:
@@ -1167,7 +1170,7 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
return self._opener.open(req)
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
if not self.params.get('verbose'):
@@ -1198,7 +1201,7 @@ class YoutubeDL(object):
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
timeout = 600 if timeout_val is None else float(timeout_val)
self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
opts_cookiefile = self.params.get('cookiefile')
opts_proxy = self.params.get('proxy')
@@ -1236,7 +1239,3 @@ class YoutubeDL(object):
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = []
self._opener = opener
# TODO remove this global modification
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(timeout)

View File

@@ -53,7 +53,6 @@ from .dailymotion import (
DailymotionUserIE,
)
from .daum import DaumIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .defense import DefenseGouvFrIE
@@ -176,6 +175,7 @@ from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
@@ -247,8 +247,8 @@ from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
from .vgtrk import VGTRKIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE

View File

@@ -118,9 +118,6 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
_real_extract() must return a *list* of information dictionaries as
described above.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""

View File

@@ -1,60 +0,0 @@
import re
import os
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class DepositFilesIE(InfoExtractor):
"""Information extractor for depositfiles.com"""
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
def _real_extract(self, url):
file_id = url.split('/')[-1]
# Rebuild url in english locale
url = 'http://depositfiles.com/en/files/' + file_id
# Retrieve file webpage with 'Free download' button pressed
free_download_indication = {'gateway_result' : '1'}
request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
try:
self.report_download_webpage(file_id)
webpage = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
# Search for the real file URL
mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
if (mobj is None) or (mobj.group(1) is None):
# Try to figure out reason of the error.
mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
if (mobj is not None) and (mobj.group(1) is not None):
restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
raise ExtractorError(u'%s' % restriction_message)
else:
raise ExtractorError(u'Unable to extract download URL from: %s' % url)
file_url = mobj.group(1)
file_extension = os.path.splitext(file_url)[1][1:]
# Search for file title
file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
return [{
'id': file_id.decode('utf-8'),
'url': file_url.decode('utf-8'),
'uploader': None,
'upload_date': None,
'title': file_title,
'ext': file_extension.decode('utf-8'),
}]

View File

@@ -1,12 +1,13 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
class FunnyOrDieIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
_VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
_TEST = {
'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
'file': '0732f586d7.mp4',
@@ -30,10 +31,20 @@ class FunnyOrDieIE(InfoExtractor):
[r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],
webpage, 'video URL', flags=re.DOTALL)
if mobj.group('type') == 'embed':
post_json = self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
post = json.loads(post_json)['attachment']
title = post['name']
description = post.get('description')
else:
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'title': title,
'description': description,
}

View File

@@ -4,7 +4,6 @@ from __future__ import unicode_literals
import os
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from .youtube import YoutubeIE
@@ -17,6 +16,7 @@ from ..utils import (
ExtractorError,
HEADRequest,
parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@@ -134,6 +134,17 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
# funnyordie embed
{
'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
'md5': '7cf780be104d40fea7bae52eed4a470e',
'info_dict': {
'id': '18e820ec3f',
'ext': 'mp4',
'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
}
},
]
def report_download_webpage(self, video_id):
@@ -274,7 +285,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed?
try:
doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
doc = parse_xml(webpage)
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
except compat_xml_parse_error:
@@ -432,6 +443,14 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
for eurl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@@ -0,0 +1,80 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
)
class PlayvidIE(InfoExtractor):
_VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
_TEST = {
'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
'md5': '44930f8afa616efdf9482daf4fe53e1e',
'info_dict': {
'id': 'agbDDi7WZTV',
'ext': 'mp4',
'title': 'Michelle Lewin in Miami Beach',
'duration': 240,
'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_title = None
duration = None
video_thumbnail = None
formats = []
# most of the information is stored in the flashvars
flashvars = self._html_search_regex(
r'flashvars="(.+?)"', webpage, 'flashvars')
infos = compat_urllib_parse.unquote(flashvars).split(r'&')
for info in infos:
videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
if videovars_match:
key = videovars_match.group(1)
val = videovars_match.group(2)
if key == 'title':
video_title = compat_urllib_parse.unquote_plus(val)
if key == 'duration':
try:
duration = int(val)
except ValueError:
pass
if key == 'big_thumb':
video_thumbnail = val
videourl_match = re.match(
r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
if videourl_match:
height = int(videourl_match.group('resolution'))
formats.append({
'height': height,
'url': val,
})
self._sort_formats(formats)
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
return {
'id': video_id,
'formats': formats,
'title': video_title,
'thumbnail': video_thumbnail,
'duration': duration,
'description': None,
'age_limit': 18
}

View File

@@ -57,7 +57,7 @@ class VevoIE(InfoExtractor):
'age_limit': 18,
'title': 'Tunnel Vision (Explicit)',
'uploader': 'Justin Timberlake',
'upload_date': '20130704',
'upload_date': '20130703',
},
'params': {
'skip_download': 'true',
@@ -169,7 +169,7 @@ class VevoIE(InfoExtractor):
timestamp_ms = int(self._search_regex(
r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
upload_date = datetime.datetime.utcfromtimestamp(timestamp_ms // 1000)
return {
'id': video_id,
'title': video_info['title'],

View File

@@ -10,10 +10,9 @@ from ..utils import (
)
class VestiIE(InfoExtractor):
IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:(?:.+?\.)?vesti\.ru|(?:2\.)?russia\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'
class VGTRKIE(InfoExtractor):
IE_DESC = 'ВГТРК'
_VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia2?\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'
_TESTS = [
{
@@ -85,7 +84,7 @@ class VestiIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia'
'skip': 'Blocked outside Russia',
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
@@ -142,6 +141,7 @@ class VestiIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia',
},
{
'url': 'http://tvkultura.ru/video/show/brand_id/31724/episode_id/972347/video_id/978186',

View File

@@ -1285,10 +1285,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Decide which formats to download
try:
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
if not mobj:
raise ValueError('Could not find vevo ID')
ytplayer_config = json.loads(mobj.group(1))
json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code)
args = ytplayer_config['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# this signatures are encrypted

View File

@@ -22,6 +22,7 @@ import struct
import subprocess
import sys
import traceback
import xml.etree.ElementTree
import zlib
try:
@@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):
def urlencode_postdata(*args, **kargs):
return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
def parse_xml(s):
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
def doctype(self, name, pubid, system):
pass # Ignore doctypes
parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)

View File

@@ -1,2 +1,2 @@
__version__ = '2014.03.10'
__version__ = '2014.03.11'