Compare commits

...

34 Commits

Author SHA1 Message Date
Ricardo Garcia
44e16fa17f Bump version number 2010-10-31 11:26:34 +01:00
Ricardo Garcia
d983524781 Add --no-progress option (fixes issue #98) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
1392f3f52c Give preference to format 34 before format 5 in quality list 2010-10-31 11:26:34 +01:00
Ricardo Garcia
43ab0ca432 Do not error out on problems printing the file name 2010-10-31 11:26:34 +01:00
Ricardo Garcia
31cbdaafd4 Properly support simple titles in the newest InfoExtractors 2010-10-31 11:26:34 +01:00
Ricardo Garcia
bd3cdf6dc4 Do not pass URLs around in Unicode form (fixes issue #92) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
8cc468de75 Bump version number 2010-10-31 11:26:31 +01:00
Ricardo Garcia
31bcb48001 Tweak final filename in the open attempt, to be platform and filename-agnostic 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c201ebc915 Fix SyntaxError triggered by mistake in user-agent commit 2010-10-31 11:26:30 +01:00
Ricardo Garcia
ce9c6a3097 Fix problem with sanitize_title not replacing Windows directory separator 2010-10-31 11:26:30 +01:00
Ricardo Garcia
4cfeb46544 Update user-agent string 2010-10-31 11:26:30 +01:00
Ricardo Garcia
490fd7aea7 Cherry-pick obeythepenguin's changes and merge them into main branch 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c05fc6a345 Support simplest new URLs in YouTube 2010-10-31 11:26:30 +01:00
Ricardo Garcia
91bce611c7 Bump version number 2010-10-31 11:26:26 +01:00
Ricardo Garcia
1c1821f8eb Improve rtmpdump support 2010-10-31 11:25:09 +01:00
Ricardo Garcia
60f8049d05 Only verify the URL when it's an HTTP download 2010-10-31 11:25:08 +01:00
obeythepenguin@gmail.com
49c0028a7a patched to add Google Video and Photobucket support 2010-10-31 11:25:08 +01:00
Ricardo Garcia
f1b4bee09d Bump version number 2010-10-31 11:25:05 +01:00
Ricardo Garcia
a04e80a481 Add flexibility importing the "parse_qs" function (fixes issue #81) 2010-10-31 11:25:05 +01:00
Ricardo Garcia
fe788f2c6f Bump version number 2010-10-31 11:25:01 +01:00
Ricardo Garcia
75a4cf3c97 Fix minor problems with Youtube user InfoExtractor 2010-10-31 11:25:01 +01:00
Ricardo Garcia
0487b407a1 Add support for using rtmpdump 2010-10-31 11:25:01 +01:00
Ricardo Garcia
a692ca7c49 Bump version number 2010-10-31 11:24:57 +01:00
Ricardo Garcia
9c457d2a20 Handle file open mode correctly (fixes issue #76) 2010-10-31 11:24:56 +01:00
Archanamiya
c39c05cdd7 Added support to download all of a user's videos! 2010-10-31 11:24:56 +01:00
Ricardo Garcia
29f0756805 Fix detection of uploader nickname in metacafe (fixes issue #67) 2010-10-31 11:24:56 +01:00
Ricardo Garcia
d9bc015b3c Take format 37 into account (fixes issue #65) 2010-10-31 11:24:56 +01:00
Ricardo Garcia
4bec29ef4b Add self-updating code 2010-10-31 11:24:56 +01:00
Ricardo Garcia
ab1f697827 Use unquote_plus to decode video title 2010-10-31 11:24:56 +01:00
Ricardo Garcia
583c714fde Allow empty titles because they do appear in some videos (fixes issue #53) 2010-10-31 11:24:56 +01:00
Ricardo Garcia
850ab76560 Use default values for "continuedl" and "nooverwrites" downloader parameters 2010-10-31 11:24:56 +01:00
Ricardo Garcia
f5a5bec351 Avoid using Unicode strings when forming URL requests (fixes issue #50) 2010-10-31 11:24:56 +01:00
Ricardo Garcia
f94b636c3e Improve preferred encoding detection method 2010-10-31 11:24:56 +01:00
Ricardo Garcia
0833f1eb83 Restore INTERNAL version number 2010-10-31 11:24:56 +01:00
2 changed files with 522 additions and 74 deletions

View File

@@ -1 +1 @@
2009.09.13
2010.03.07

View File

@@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
@@ -13,13 +14,20 @@ import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
from urlparse import parse_qs
except ImportError:
from cgi import parse_qs
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@@ -33,15 +41,68 @@ def preferredencoding():
Returns the best encoding scheme for the system, based on
locale.getpreferredencoding() and some further tweaks.
"""
def yield_preferredencoding():
try:
pref = locale.getpreferredencoding()
u'TEST'.encode(pref)
except:
pref = 'UTF-8'
while True:
yield pref
return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used with
    the re.sub() function: group 1 of the match must hold the entity text
    found between the '&' and the ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
    entities may be decimal ('#233') or hexadecimal ('#xe9'). Anything that
    cannot be resolved to a character is returned in its literal '&...;'
    form, so no text is ever lost.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. Hexadecimal references need the whole
    # set of hexadecimal digits, and the pattern is anchored at the end so
    # trailing garbage is not silently dropped.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return unichr(long(numstr, base))
        except (ValueError, OverflowError):
            # Code point not representable by unichr(): keep it literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitize a video title so it can be used as part of a filename.

    HTML entities in the Unicode string utitle are decoded with
    htmlentity_transform() and every directory separator of the running
    platform is replaced by u'%'. The sanitized Unicode title is returned.
    """
    utitle = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    utitle = utitle.replace(unicode(os.sep), u'%')
    # Some platforms accept a second separator (os.altsep), which would
    # split the title into path components just like os.sep does
    if os.altsep is not None:
        utitle = utitle.replace(unicode(os.altsep), u'%')
    return utitle
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.
Attempts to open the given filename. If this fails, it tries to change
the filename slightly, step by step, until it's either able to open it
or it fails and raises a final exception, like the standard open()
function.
It returns the tuple (stream, definitive_file_name).
"""
try:
pref = locale.getpreferredencoding()
# Mac OSX systems have this problem sometimes
if pref == '':
return 'UTF-8'
return pref
except:
sys.stderr.write('WARNING: problem obtaining preferred encoding. Falling back to UTF-8.\n')
return 'UTF-8'
stream = open(filename, open_mode)
return (stream, filename)
except (IOError, OSError), err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
# An exception here should be caught in the caller
stream = open(filename, open_mode)
return (stream, filename)
class DownloadError(Exception):
"""Download Error exception.
@@ -131,6 +192,7 @@ class FileDownloader(object):
ratelimit: Download speed limit, in bytes/sec.
nooverwrites: Prevent overwriting files.
continuedl: Try to continue downloads if possible.
noprogress: Do not print the progress bar.
"""
params = None
@@ -239,11 +301,15 @@ class FileDownloader(object):
self._pps.append(pp)
pp.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
"""Print message to stdout if not in quiet mode."""
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
try:
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
sys.stdout.flush()
except (UnicodeEncodeError), err:
if not ignore_encoding_errors:
raise
def to_stderr(self, message):
"""Print message to stderr."""
@@ -281,10 +347,12 @@ class FileDownloader(object):
def report_destination(self, filename):
"""Report destination filename."""
self.to_stdout(u'[download] Destination: %s' % filename)
self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
"""Report download progress."""
if self.params.get('noprogress', False):
return
self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
@@ -294,7 +362,10 @@ class FileDownloader(object):
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
try:
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
except (UnicodeEncodeError), err:
self.to_stdout(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
"""Report it was impossible to resume download."""
@@ -302,22 +373,27 @@ class FileDownloader(object):
def report_finish(self):
"""Report download finished."""
self.to_stdout(u'')
if self.params.get('noprogress', False):
self.to_stdout(u'[download] Download completed')
else:
self.to_stdout(u'')
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
try:
info_dict['url'] = self.verify_url(info_dict['url'])
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Verify URL if it's an HTTP one
if info_dict['url'].startswith('http'):
try:
info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(preferredencoding())
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
if self.params.get('forceurl', False):
print info_dict['url'].encode(preferredencoding())
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
return
@@ -327,7 +403,7 @@ class FileDownloader(object):
filename = self.params['outtmpl'] % template_dict
except (ValueError, KeyError), err:
self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
if self.params['nooverwrites'] and os.path.exists(filename):
if self.params.get('nooverwrites', False) and os.path.exists(filename):
self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
return
@@ -338,7 +414,7 @@ class FileDownloader(object):
return
try:
success = self._do_download(filename, info_dict['url'])
success = self._do_download(filename, info_dict['url'].encode('utf-8'))
except (OSError, IOError), err:
raise UnavailableFormatError
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@@ -390,21 +466,53 @@ class FileDownloader(object):
if info is None:
break
def _do_download(self, filename, url):
stream = None
open_mode = 'ab'
def _download_with_rtmpdump(self, filename, url):
    """Download the RTMP stream at url into filename using rtmpdump.

    Returns True when rtmpdump finishes with exit code 0 and False
    otherwise. Failures are also reported through self.trouble().
    """
    self.report_destination(filename)

    # Check for rtmpdump first
    try:
        devnull = open(os.path.devnull, 'w')
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        finally:
            # Do not leak the handle used to silence the probe
            devnull.close()
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
    first_args = []
    if self.params.get('continuedl', False):
        first_args = ['-e', '-k', '1']
    retval = subprocess.call(basic_args + first_args)
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(filename)
        self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(2.0) # This seems to be needed
        resume_args = ['-e']
        if retval == 1:
            resume_args = resume_args + ['-k', '1']
        retval = subprocess.call(basic_args + resume_args)
        # Stop retrying when rtmpdump fails again without having written
        # anything new; otherwise this loop would never end
        if retval == 1 and os.path.getsize(filename) == prevsize:
            break
    if retval == 0:
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
        return True
    else:
        self.trouble('ERROR: rtmpdump exited with code %d' % retval)
        return False
def _do_download(self, filename, url):
# Attempt to download using rtmpdump
if url.startswith('rtmp'):
return self._download_with_rtmpdump(filename, url)
stream = None
open_mode = 'wb'
basic_request = urllib2.Request(url, None, std_headers)
request = urllib2.Request(url, None, std_headers)
# Attempt to resume download with "continuedl" option
# Establish possible resume length
if os.path.isfile(filename):
resume_len = os.path.getsize(filename)
else:
resume_len = 0
if self.params['continuedl'] and resume_len != 0:
# Request parameters in case of being able to resume
if self.params.get('continuedl', False) and resume_len != 0:
self.report_resuming_byte(resume_len)
request.add_header('Range','bytes=%d-' % resume_len)
open_mode = 'ab'
# Establish connection
try:
@@ -412,12 +520,16 @@ class FileDownloader(object):
except (urllib2.HTTPError, ), err:
if err.code != 416: # 416 is 'Requested range not satisfiable'
raise
# Unable to resume
data = urllib2.urlopen(basic_request)
content_length = data.info()['Content-Length']
if content_length is not None and long(content_length) == resume_len:
# Because the file had already been fully downloaded
self.report_file_already_downloaded(filename)
return True
else:
# Because the server didn't let us
self.report_unable_to_resume()
open_mode = 'wb'
@@ -439,7 +551,7 @@ class FileDownloader(object):
# Open file just in time
if stream is None:
try:
stream = open(filename, open_mode)
(stream, filename) = sanitize_open(filename, open_mode)
self.report_destination(filename)
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
@@ -525,46 +637,24 @@ class InfoExtractor(object):
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
_available_formats = ['22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
_video_extensions = {
'13': '3gp',
'17': 'mp4',
'18': 'mp4',
'22': 'mp4',
'37': 'mp4',
}
@staticmethod
def suitable(url):
return (re.match(YoutubeIE._VALID_URL, url) is not None)
@staticmethod
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a Unicode character."""
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
# Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith(u'x'):
base = 16
numstr = u'0%s' % numstr
else:
base = 10
return unichr(long(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
def report_lang(self):
"""Report attempt to set language."""
self._downloader.to_stdout(u'[youtube] Setting language')
@@ -589,6 +679,10 @@ class YoutubeIE(InfoExtractor):
"""Report extracted video URL."""
self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Tell the user that an RTMP download has been detected."""
    notice = u'[youtube] RTMP download detected'
    self._downloader.to_stdout(notice)
def _real_initialize(self):
if self._downloader is None:
return
@@ -687,46 +781,47 @@ class YoutubeIE(InfoExtractor):
try:
self.report_video_info_webpage_download(video_id)
video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage)
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
return
self.report_information_extraction(video_id)
# "t" param
mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'token' not in video_info:
# Attempt to see if YouTube has issued an error message
mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'reason' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
stream.write(video_info_webpage)
stream.close()
else:
reason = urllib.unquote_plus(mobj.group(1))
reason = urllib.unquote_plus(video_info['reason'][0])
self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
return
token = urllib.unquote(mobj.group(1))
token = urllib.unquote_plus(video_info['token'][0])
video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
if format_param is not None:
video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
# Check possible RTMP download
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
video_real_url = video_info['conn'][0]
# uploader
mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'author' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = urllib.unquote(mobj.group(1))
video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = urllib.unquote(mobj.group(1))
video_title = urllib.unquote_plus(video_info['title'][0])
video_title = video_title.decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
video_title = sanitize_title(video_title)
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
@@ -866,8 +961,9 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
@@ -887,6 +983,260 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
"""Information extractor for video.google.com."""
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'mp4'
# Retrieve video webpage to extract further information
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r"download_url:'([^']+)'", webpage)
if mobj is None:
video_extension = 'flv'
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
mediaURL = mediaURL.replace('\\x3d', '\x3d')
mediaURL = mediaURL.replace('\\x26', '\x26')
video_url = mediaURL
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Google Video doesn't show uploader nicknames?
video_uploader = 'NA'
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'flv'
# Retrieve video webpage to extract further information
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
video_url = mediaURL
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
video_uploader = mobj.group(2).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class GenericIE(InfoExtractor):
"""Generic last-resort information extractor."""
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return True
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
video_id = url.split('/')[-1]
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
except ValueError, err:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_url = urllib.unquote(mobj.group(1))
video_id = os.path.basename(video_url)
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_uploader = mobj.group(1).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
@@ -906,6 +1256,7 @@ class YoutubeSearchIE(InfoExtractor):
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
@@ -919,6 +1270,7 @@ class YoutubeSearchIE(InfoExtractor):
prefix, query = query.split(':')
prefix = prefix[8:]
query = query.encode('utf-8')
if prefix == '':
self._download_n_results(query, 1)
return
@@ -1036,6 +1388,61 @@ class YoutubePlaylistIE(InfoExtractor):
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
class YoutubeUserIE(InfoExtractor):
"""Information Extractor for YouTube users."""
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
self._youtube_ie = youtube_ie
@staticmethod
def suitable(url):
return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
def report_download_page(self, username):
"""Report attempt to download user page."""
self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
def _real_initialize(self):
self._youtube_ie.initialize()
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
return
# Download user page
username = mobj.group(1)
video_ids = []
pagenum = 1
self.report_download_page(username)
request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
return
# Extract video identifiers
ids_in_page = []
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
class PostProcessor(object):
"""Post Processor class.
@@ -1089,6 +1496,22 @@ if __name__ == '__main__':
import getpass
import optparse
# Function to update the program file with the latest version from bitbucket.org
def update_self(downloader, filename):
    """Overwrite filename with the latest stable version of the program.

    The name of the latest version is read from the LATEST_VERSION file,
    then the matching program file is downloaded and written over
    filename. The process exits with an error message if filename is not
    writable, if a download fails or comes back empty, or if the file
    cannot be rewritten.
    """
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_stdout('Updating to latest stable version...')

    # Fetch everything before touching the file, so a failed download
    # leaves the current program intact
    try:
        latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
        latest_version = urllib.urlopen(latest_url).read().strip()
        prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
        newcontent = urllib.urlopen(prog_url).read()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')
    # An empty answer would wipe the program instead of updating it
    if not latest_version or not newcontent:
        sys.exit('ERROR: unable to download latest version')

    # Binary mode writes the downloaded bytes without newline translation
    try:
        stream = open(filename, 'wb')
        try:
            stream.write(newcontent)
        finally:
            stream.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_stdout('Updated to version %s' % latest_version)
# General configuration
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
@@ -1097,7 +1520,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2009.09.13',
version='2010.03.07',
conflict_handler='resolve',
)
@@ -1105,6 +1528,8 @@ if __name__ == '__main__':
action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
action='version', help='print program version and exit')
parser.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
@@ -1139,6 +1564,8 @@ if __name__ == '__main__':
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
@@ -1157,7 +1584,7 @@ if __name__ == '__main__':
parser.add_option_group(filesystem)
(opts, args) = parser.parse_args()
# Batch file verification
batchurls = []
if opts.batchfile is not None:
@@ -1170,8 +1597,6 @@ if __name__ == '__main__':
all_urls = batchurls + args
# Conflicting, missing and erroneous options
if len(all_urls) < 1:
parser.error(u'you must provide at least one URL')
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
@@ -1192,7 +1617,11 @@ if __name__ == '__main__':
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
photobucket_ie = PhotobucketIE()
generic_ie = GenericIE()
# File downloader
fd = FileDownloader({
@@ -1212,11 +1641,30 @@ if __name__ == '__main__':
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(photobucket_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Update version
if opts.update_self:
update_self(fd, sys.argv[0])
# Maybe do nothing
if len(all_urls) < 1:
if not opts.update_self:
parser.error(u'you must provide at least one URL')
else:
sys.exit()
retcode = fd.download(all_urls)
sys.exit(retcode)