Compare commits

...

36 Commits

Author SHA1 Message Date
Ricardo Garcia
fd20984889 Bump version number 2010-10-31 11:23:48 +01:00
Ricardo Garcia
111ae3695c Document new -w option 2010-10-31 11:23:48 +01:00
Ricardo Garcia
0beeff4b3e Add the -w or --no-overwrites option 2010-10-31 11:23:48 +01:00
Ricardo Garcia
64a6f26c5d Put Danny Colligan as an author in the script itself 2010-10-31 11:23:48 +01:00
Ricardo Garcia
a9633f1457 Use quote_plus instead of manually replacing spaces by plus signs 2010-10-31 11:23:48 +01:00
Ricardo Garcia
a20e4c2f96 Improve documentation of new features in webpage 2010-10-31 11:23:47 +01:00
Ricardo Garcia
d1536018a8 Include Danny Colligan in credits 2010-10-31 11:23:47 +01:00
Ricardo Garcia
25af2bce3a Include Danny Colligan's YouTube search InfoExtractor 2010-10-31 11:23:47 +01:00
Ricardo Garcia
d1580ed990 Fix NameError 2010-10-31 11:23:45 +01:00
Ricardo Garcia
eb0d2909a8 Document new -a option 2010-10-31 11:23:44 +01:00
Ricardo Garcia
ba72f8a5d1 Bump version and increase Firefox version number 2010-10-31 11:23:44 +01:00
Ricardo Garcia
c6fd0bb806 Add -a (--batch-file) option 2010-10-31 11:23:44 +01:00
Ricardo Garcia
72ac78b8b0 Fix for YouTube internationalization changes 2010-10-31 11:23:44 +01:00
Ricardo Garcia
240b737ebd Bump version number 2010-10-31 11:23:41 +01:00
Ricardo Garcia
27d98b6e25 Fix TypeError in decode() method and unordered playlist URLs 2010-10-31 11:23:41 +01:00
Ricardo Garcia
5487aea5d8 Improve documentation 2010-10-31 11:23:41 +01:00
Ricardo Garcia
9ca4851a00 Bump version number 2010-10-31 11:23:38 +01:00
Ricardo Garcia
1e9daf2a48 Make the YouTube login mechanism work across countries 2010-10-31 11:23:38 +01:00
Ricardo Garcia
d853063955 Bump version number 2010-10-31 11:23:38 +01:00
Ricardo Garcia
2546e7679f Fix metacafe.com and UTF8 output filenames 2010-10-31 11:23:35 +01:00
Ricardo Garcia
0ddf38df18 Bump version string 2010-10-31 11:23:31 +01:00
Ricardo Garcia
65cd34c5d7 Add initial version of postprocessing framework 2010-10-31 11:23:31 +01:00
Ricardo Garcia
5352678576 Improve a couple of detection strings to avoid i18n-related bugs 2010-10-31 11:23:31 +01:00
Ricardo Garcia
a7d06f400c Increase version number 2010-10-31 11:23:31 +01:00
Ricardo Garcia
b1a1f8ea8f Improve error message regarding output templates and charsets 2010-10-31 11:23:31 +01:00
Ricardo Garcia
f807dc157e Update webpage to reflect changes 2010-10-31 11:23:31 +01:00
Ricardo Garcia
97accc0ece Simplify a statement 2010-10-31 11:23:31 +01:00
Ricardo Garcia
76a7f36400 Make the most prominent output strings Unicode and fix Unicode title bug 2010-10-31 11:23:31 +01:00
Ricardo Garcia
0c2dc87d9e Add YoutubePlaylistIE class 2010-10-31 11:23:31 +01:00
Ricardo Garcia
020f7150aa Add metacafe.com support and minor changes 2010-10-31 11:23:31 +01:00
Ricardo Garcia
3af1e17284 Fix directory creation not working with absolute paths 2010-10-31 11:23:31 +01:00
Ricardo Garcia
acd3d84298 Add --rate-limit program option 2010-10-31 11:23:31 +01:00
Ricardo Garcia
7337efbfe4 Modify ignore filters 2010-10-31 11:23:31 +01:00
Ricardo Garcia
3c53b78720 Strip newline from version 2010-10-31 11:23:31 +01:00
Ricardo Garcia
bb02834692 Add script to regenerate index.html 2010-10-31 11:23:30 +01:00
Ricardo Garcia
3e1cabc338 Add old version of webpage 2010-10-31 11:23:30 +01:00
4 changed files with 726 additions and 49 deletions


@@ -1,3 +1,4 @@
syntax: glob
youtube-dl-old
index.html
youtube-dl-*
.*.swp

generate-index (new executable file, 15 lines)

@@ -0,0 +1,15 @@
#!/usr/bin/env python
import hashlib
import subprocess
template = file('index.html.in', 'r').read()
version = subprocess.Popen(['./youtube-dl', '--version'], stdout=subprocess.PIPE).communicate()[0].strip()
data = file('youtube-dl', 'rb').read()
md5sum = hashlib.md5(data).hexdigest()
sha1sum = hashlib.sha1(data).hexdigest()
sha256sum = hashlib.sha256(data).hexdigest()
template = template.replace('@PROGRAM_VERSION@', version)
template = template.replace('@PROGRAM_MD5SUM@', md5sum)
template = template.replace('@PROGRAM_SHA1SUM@', sha1sum)
template = template.replace('@PROGRAM_SHA256SUM@', sha256sum)
file('index.html', 'w').write(template)
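
The script above relies on Python 2 idioms (the file() builtin, Popen().communicate()). As a rough sketch, the same steps in Python 3 terms might read as follows (hypothetical modernization, not part of the repository; assumes Python 3.5+ and the youtube-dl script in the working directory):

#!/usr/bin/env python3
# Hypothetical Python 3 rendering of the same index-generation steps.
import hashlib
import subprocess

with open('index.html.in') as f:
    template = f.read()
# Ask the script itself for its version string.
version = subprocess.run(['./youtube-dl', '--version'],
                         stdout=subprocess.PIPE, check=True).stdout.decode().strip()
with open('youtube-dl', 'rb') as f:
    data = f.read()
# Substitute version and checksums into the template placeholders.
for placeholder, value in [('@PROGRAM_VERSION@', version),
                           ('@PROGRAM_MD5SUM@', hashlib.md5(data).hexdigest()),
                           ('@PROGRAM_SHA1SUM@', hashlib.sha1(data).hexdigest()),
                           ('@PROGRAM_SHA256SUM@', hashlib.sha256(data).hexdigest())]:
    template = template.replace(placeholder, value)
with open('index.html', 'w') as f:
    f.write(template)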

index.html.in (new file, 229 lines)

@@ -0,0 +1,229 @@
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
<title>youtube-dl: Download videos from YouTube.com</title>
<style type="text/css"><!--
body {
font-family: sans-serif;
font-size: small;
}
h1 {
text-align: center;
text-decoration: underline;
color: #006699;
}
h2 {
color: #006699;
}
p {
text-align: justify;
margin-left: 5%;
margin-right: 5%;
}
ul {
margin-left: 5%;
margin-right: 5%;
list-style-type: square;
}
li {
margin-bottom: 0.5ex;
}
.smallnote {
font-size: x-small;
text-align: center;
}
--></style>
</head>
<body>
<h1>youtube-dl: Download videos from YouTube.com</h1>
<p class="smallnote">(and more...)</p>
<h2>What is it?</h2>
<p><em>youtube-dl</em> is a small command-line program to download videos
from YouTube.com. It requires the <a href="http://www.python.org/">Python
interpreter</a>, version 2.4 or later, and it's not platform specific.
It should work on your Unix box, on Windows or on Mac OS X. The latest version
is <strong>@PROGRAM_VERSION@</strong>. It's released to the public domain,
which means you can modify it, redistribute it or use it however you like.</p>
<p>I'll try to keep it updated if YouTube.com changes the way you access
their videos. After all, it's a simple and short program. However, I can't
guarantee anything. If you detect it stops working, check for new versions
and/or inform me about the problem, indicating the program version you
are using. If the program stops working and I can't solve the problem but
you have a solution, I'd like to know it. If that happens and you feel you
can maintain the program yourself, tell me. My contact information is
at <a href="http://freshmeat.net/~rg3/">freshmeat.net</a>.</p>
<p>Thanks for all the feedback received so far. I'm glad people find my
program useful.</p>
<h2>Usage instructions</h2>
<p>In Windows, once you have installed the Python interpreter, save the
program with the <em>.py</em> extension and put it somewhere in the PATH.
Try to follow the
<a href="http://rg03.wordpress.com/youtube-dl-under-windows-xp/">guide to
install youtube-dl under Windows XP</a>.</p>
<p>In Unix, download it, give it execution permission and copy it to one
of the PATH directories (typically, <em>/usr/local/bin</em>).</p>
<p>After that, you should be able to call it from the command line as
<em>youtube-dl</em> or <em>youtube-dl.py</em>. I will use <em>youtube-dl</em>
in the following examples. Usage instructions are easy. Use <em>youtube-dl</em>
followed by a video URL or identifier. Example: <em>youtube-dl
"http://www.youtube.com/watch?v=foobar"</em>. The video will be saved
to the file <em>foobar.flv</em> in that example. As YouTube.com
videos are in Flash Video format, their extension should be <em>flv</em>.
In Linux and other unices, video players using a recent version of
<em>ffmpeg</em> can play them. That includes MPlayer, VLC, etc. Those two
work under Windows and other platforms, but you could also get a
specific FLV player of your taste.</p>
<p>If you try to run the program and you receive an error message containing the
keyword <em>SyntaxError</em> near the end, it means your Python interpreter
is too old.</p>
<h2>More usage tips</h2>
<ul>
<li>You can change the file name of the video using the -o option, like in
<em>youtube-dl -o vid.flv "http://www.youtube.com/watch?v=foobar"</em>.
Read the <a href="#otpl">Output template</a> section for more details on
this.</li>
<li>Some videos require an account in order to be downloaded, mostly because they're
flagged as mature content. You can pass the program a username and password
for a YouTube.com account with the -u and -p options, like <em>youtube-dl
-u myusername -p mypassword "http://www.youtube.com/watch?v=foobar"</em>.</li>
<li>The account data can also be read from the user .netrc file by indicating
the -n or --netrc option. The machine name is <em>youtube</em> in that
case.</li>
<li>The <em>simulate mode</em> (activated with -s or --simulate) can be used
to just get the real video URL and use it with a download manager if you
prefer that option.</li>
<li>The <em>quiet mode</em> (activated with -q or --quiet) can be used to
suppress all output messages. This allows, on systems featuring /dev/stdout
and other similar special files, writing the video data to standard output
in order to pipe it to another program without interference.</li>
<li>The program can be told to simply print the final video URL to standard
output using the -g or --get-url option.</li>
<li>In a similar line, the -e or --get-title option tells the program to print
the video title.</li>
<li>The default filename is <em>video_id.flv</em>. But you can also use the
video title in the filename with the -t or --title option, or preserve the
literal title in the filename with the -l or --literal option.</li>
<li>You can make the program append <em>&amp;fmt=something</em> to the URL
by using the -f or --format option. This makes it possible to download high
quality versions of the videos when available.</li>
<li>The -b or --best-quality option is an alias for -f 18.</li>
<li>The -m or --mobile-version option is an alias for -f 17.</li>
<li>Normally, the program will stop on the first error, but you can tell it
to attempt to download every video with the -i or --ignore-errors option.</li>
<li>The -a or --batch-file option lets you specify a file to read URLs from.
The file must contain one URL per line.</li>
<li>The program can be told not to overwrite existing files using the -w or
--no-overwrites option.</li>
<li>For YouTube, you can also use the URL of a playlist, and it will download
all the videos in that playlist.</li>
<li>For YouTube, you can also use the special word <em>ytsearch</em> to
download search results. With <em>ytsearch</em> it will download the
first search result. With <em>ytsearchN</em>, where N is a number, it
will download the first N results. With <em>ytsearchall</em> it will
download every result for that search. In most systems you'll need to
use quotes for multiple words. Example: <em>youtube-dl "ytsearch3:cute
kittens"</em>.
<li><em>youtube-dl</em> honors the <em>http_proxy</em> environment variable
if you want to use a proxy. Set it to something like
<em>http://proxy.example.com:8080</em>, and do not leave the <em>http://</em>
prefix out.</li>
<li>You can get the program version by calling it as <em>youtube-dl
-v</em> or <em>youtube-dl --version</em>.</li>
<li>For usage instructions, use <em>youtube-dl -h</em> or <em>youtube-dl
--help.</em></li>
<li>You can cancel the program at any time by pressing Ctrl+C. It may print
some error lines saying something about <em>KeyboardInterrupt</em>.
That's ok.</li>
</ul>
<h2>Download it</h2>
<p>Note that if you directly click on these hyperlinks, your web browser will
most likely display the program contents. It's usually better to
right-click on it and choose the appropriate option, normally called <em>Save
Target As</em> or <em>Save Link As</em>, depending on the web browser you
are using.</p>
<p><a href="youtube-dl">@PROGRAM_VERSION@</a></p>
<ul>
<li><strong>MD5</strong>: @PROGRAM_MD5SUM@</li>
<li><strong>SHA1</strong>: @PROGRAM_SHA1SUM@</li>
<li><strong>SHA256</strong>: @PROGRAM_SHA256SUM@</li>
</ul>
<h2 id="otpl">Output template</h2>
<p>The -o option allows users to indicate a template for the output file names.
The basic usage is not to set any template arguments when downloading a single
file, like in <em>youtube-dl -o funny_video.flv 'http://some/video'</em>.
However, it may contain special sequences that will be replaced when
downloading each video. The special sequences have the format
<strong>%(NAME)s</strong>. To clarify, that's a percent symbol followed by a
name in parentheses, followed by a lowercase s. Allowed names are:</p>
<ul>
<li><em>id</em>: The sequence will be replaced by the video identifier.</li>
<li><em>url</em>: The sequence will be replaced by the video URL.</li>
<li><em>uploader</em>: The sequence will be replaced by the nickname of the
person who uploaded the video.</li>
<li><em>title</em>: The sequence will be replaced by the literal video
title.</li>
<li><em>stitle</em>: The sequence will be replaced by a simplified video
title, restricted to alphanumeric characters and dashes.</li>
<li><em>ext</em>: The sequence will be replaced by the appropriate
extension (like <em>flv</em> or <em>mp4</em>).</li>
</ul>
<p>As you may have guessed, the default template is <em>%(id)s.%(ext)s</em>.
When some command line options are used, it's replaced by other templates like
<em>%(title)s-%(id)s.%(ext)s</em>. You can specify your own.</p>
<h2>Authors</h2>
<ul>
<li>Ricardo Garcia Gonzalez: program core, YouTube.com InfoExtractor,
metacafe.com InfoExtractor and YouTube playlist InfoExtractor.</li>
<li>Danny Colligan: YouTube search InfoExtractor, ideas and patches.</li>
<li>Many other people contributing patches, code, ideas and kind messages. Too
many to be listed here. You know who you are. Thank you very much.</li>
</ul>
<p class="smallnote">Copyright &copy; 2006-2007 Ricardo Garcia Gonzalez</p>
</body>
</html>
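
The %(NAME)s sequences described in the Output template section are expanded with Python's dictionary-based % string formatting; in the script's code below this is literally filename = outtmpl % result. A tiny illustration with hypothetical values:

# Hypothetical values, for illustration of the template expansion only.
info = {'id': 'foobar', 'stitle': 'cute-kittens', 'ext': 'flv'}
print('%(id)s.%(ext)s' % info)             # foobar.flv (the default template)
print('%(stitle)s-%(id)s.%(ext)s' % info)  # cute-kittens-foobar.flv (-t / --title)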


@@ -1,9 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
@@ -17,7 +19,7 @@ import urllib
import urllib2
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@@ -42,6 +44,14 @@ class SameFileError(Exception):
"""
pass
class PostProcessingError(Exception):
"""Post Processing exception.
This exception may be raised by PostProcessor's .run() method to
indicate an error in the postprocessing task.
"""
pass
class FileDownloader(object):
"""File Downloader class.
@@ -78,14 +88,18 @@ class FileDownloader(object):
format: Video format code.
outtmpl: Template for output names.
ignoreerrors: Do not stop on download errors.
ratelimit: Download speed limit, in bytes/sec.
nooverwrites: Prevent overwriting files.
"""
_params = None
_ies = []
_pps = []
def __init__(self, params):
"""Create a FileDownloader object with the given options."""
self._ies = []
self._pps = []
self.set_params(params)
@staticmethod
@@ -93,6 +107,7 @@ class FileDownloader(object):
"""Create directory components in filename. Similar to Unix "mkdir -p"."""
components = filename.split(os.sep)
aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
for dir in aggregate:
if not os.path.exists(dir):
os.mkdir(dir)
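# Editorial note, worked example: for '/home/user/vids/video.flv' on a Unix system,
# components == ['', 'home', 'user', 'vids', 'video.flv'] and the two comprehensions
# above yield ['/', '/home/', '/home/user/', '/home/user/vids/']; finishing each name
# with os.sep is what makes the function cope with absolute paths.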
@@ -149,6 +164,16 @@ class FileDownloader(object):
return int(new_min)
return int(rate)
@staticmethod
def parse_bytes(bytestr):
"""Parse a string indicating a byte quantity into a long integer."""
matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
if matchobj is None:
return None
number = float(matchobj.group(1))
multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
return long(round(number * multiplier))
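# Editorial note, worked example: parse_bytes('50k') matches number 50.0 and suffix
# 'k'; 'bkmgtpezy'.index('k') == 1, so the multiplier is 1024.0 ** 1 and the result
# is 51200. A bare number such as '300' gets the empty suffix, index 0, and a
# multiplier of 1.0.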
def set_params(self, params):
"""Sets parameters."""
if type(params) != dict:
@@ -164,15 +189,20 @@ class FileDownloader(object):
self._ies.append(ie)
ie.set_downloader(self)
def add_post_processor(self, pp):
"""Add a PostProcessor object to the end of the chain."""
self._pps.append(pp)
pp.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
if not self._params.get('quiet', False):
sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
print u'%s%s' % (message, [u'\n', u''][skip_eol]),
sys.stdout.flush()
def to_stderr(self, message):
"""Print message to stderr."""
sys.stderr.write('%s\n' % message)
print >>sys.stderr, message
def fixed_template(self):
"""Checks if the output template is fixed."""
@@ -193,18 +223,31 @@ class FileDownloader(object):
raise DownloadError(message)
return 1
def slow_down(self, start_time, byte_counter):
"""Sleep if the download speed is over the rate limit."""
rate_limit = self._params.get('ratelimit', None)
if rate_limit is None or byte_counter == 0:
return
now = time.time()
elapsed = now - start_time
if elapsed <= 0.0:
return
speed = float(byte_counter) / elapsed
if speed > rate_limit:
time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
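# Editorial note: the sleep amount equals byte_counter / rate_limit (the time the
# transfer should have taken at the configured limit) minus the time actually
# elapsed, so after waking up the average speed is back at or below the limit.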
def report_destination(self, filename):
"""Report destination filename."""
self.to_stdout('[download] Destination: %s' % filename)
self.to_stdout(u'[download] Destination: %s' % filename)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
"""Report download progress."""
self.to_stdout('\r[download] %s of %s at %s ETA %s' %
self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_finish(self):
"""Report download finished."""
self.to_stdout('')
self.to_stdout(u'')
def download(self, url_list):
"""Download a given list of URLs."""
@@ -228,7 +271,6 @@ class FileDownloader(object):
raise SameFileError(self._params['outtmpl'])
for result in results:
# Forced printings
if self._params.get('forcetitle', False):
print result['title']
@@ -243,7 +285,10 @@ class FileDownloader(object):
filename = self._params['outtmpl'] % result
self.report_destination(filename)
except (ValueError, KeyError), err:
retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
continue
if self._params['nooverwrites'] and os.path.exists(filename):
self.to_stderr('WARNING: file exists: %s; skipping' % filename)
continue
try:
self.pmkdir(filename)
@@ -264,11 +309,26 @@ class FileDownloader(object):
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
continue
try:
self.post_process(filename, result)
except (PostProcessingError), err:
retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
continue
break
if not suitable_found:
retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
return retcode
def post_process(self, filename, ie_info):
"""Run the postprocessing chain on the given file."""
info = dict(ie_info)
info['filepath'] = filename
for pp in self._pps:
info = pp.run(info)
if info is None:
break
def _do_download(self, stream, url):
request = urllib2.Request(url, None, std_headers)
@@ -296,6 +356,9 @@ class FileDownloader(object):
stream.write(data_block)
block_size = self.best_block_size(after - before, data_block_len)
# Apply rate limit
self.slow_down(start, byte_counter)
self.report_finish()
if data_len is not None and str(byte_counter) != data_len:
raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
@@ -335,7 +398,7 @@ class InfoExtractor(object):
@staticmethod
def suitable(url):
"""Receives a URL and returns True if suitable for this IE."""
return True
return False
def initialize(self):
"""Initializes an instance (authentication, etc)."""
@@ -359,7 +422,7 @@ class InfoExtractor(object):
def to_stderr(self, message):
"""Print message to stderr."""
sys.stderr.write('%s\n' % message)
print >>sys.stderr, message
def _real_initialize(self):
"""Real initialization process. Redefine in subclasses."""
@@ -372,29 +435,39 @@ class InfoExtractor(object):
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_LOGIN_URL = 'http://www.youtube.com/login?next=/'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
@staticmethod
def suitable(url):
return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
"""Report attempt to set language."""
self.to_stdout(u'[youtube] Setting language')
def report_login(self):
"""Report attempt to log in."""
self.to_stdout('[youtube] Logging in')
self.to_stdout(u'[youtube] Logging in')
def report_age_confirmation(self):
"""Report attempt to confirm age."""
self.to_stdout('[youtube] Confirming age')
self.to_stdout(u'[youtube] Confirming age')
def report_webpage_download(self, video_id):
"""Report attempt to download webpage."""
self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self.to_stdout('[youtube] %s: Extracting video information' % video_id)
self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
def report_video_url(self, video_id, video_real_url):
"""Report extracted video URL."""
self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
def _real_initialize(self):
if self._downloader is None:
@@ -417,13 +490,22 @@ class YoutubeIE(InfoExtractor):
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError), err:
self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
return
# No authentication to be performed
if username is None:
return
# Set language
request = urllib2.Request(self._LOGIN_URL, None, std_headers)
try:
self.report_lang()
urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
return
# Log in
login_form = {
'current_form': 'loginForm',
@@ -437,10 +519,10 @@ class YoutubeIE(InfoExtractor):
self.report_login()
login_results = urllib2.urlopen(request).read()
if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
self.to_stderr('WARNING: unable to log in: bad username or password')
self.to_stderr(u'WARNING: unable to log in: bad username or password')
return
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr('WARNING: unable to log in: %s' % str(err))
self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
return
# Confirm age
@@ -453,14 +535,14 @@ class YoutubeIE(InfoExtractor):
self.report_age_confirmation()
age_results = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
return
def _real_extract(self, url):
# Extract video id from URL
mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self.to_stderr('ERROR: invalid URL: %s' % url)
self.to_stderr(u'ERROR: invalid URL: %s' % url)
return [None]
video_id = mobj.group(2)
@@ -474,7 +556,7 @@ class YoutubeIE(InfoExtractor):
video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
# Normalize URL, including format
normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
if format_param is not None:
normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
request = urllib2.Request(normalized_url, None, std_headers)
@@ -482,14 +564,14 @@ class YoutubeIE(InfoExtractor):
self.report_webpage_download(video_id)
video_webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
return [None]
self.report_information_extraction(video_id)
# "t" param
mobj = re.search(r', "t": "([^"]+)"', video_webpage)
if mobj is None:
self.to_stderr('ERROR: unable to extract "t" parameter')
self.to_stderr(u'ERROR: unable to extract "t" parameter')
return [None]
video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
if format_param is not None:
@@ -497,16 +579,16 @@ class YoutubeIE(InfoExtractor):
self.report_video_url(video_id, video_real_url)
# uploader
mobj = re.search(r'More From: ([^<]*)<', video_webpage)
mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
if mobj is None:
self.to_stderr('ERROR: unable to extract uploader nickname')
self.to_stderr(u'ERROR: unable to extract uploader nickname')
return [None]
video_uploader = mobj.group(1)
# title
mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
if mobj is None:
self.to_stderr('ERROR: unable to extract video title')
self.to_stderr(u'ERROR: unable to extract video title')
return [None]
video_title = mobj.group(1).decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
@@ -518,14 +600,333 @@ class YoutubeIE(InfoExtractor):
# Return information
return [{
'id': video_id,
'url': video_real_url,
'uploader': video_uploader,
'id': video_id.decode('utf-8'),
'url': video_real_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title,
'stitle': simple_title,
'ext': video_extension,
'ext': video_extension.decode('utf-8'),
}]
class MetacafeIE(InfoExtractor):
"""Information Extractor for metacafe.com."""
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
self._youtube_ie = youtube_ie
@staticmethod
def suitable(url):
return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
"""Report disclaimer retrieval."""
self.to_stdout(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
"""Report attempt to confirm age."""
self.to_stdout(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
"""Report webpage download."""
self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
# Retrieve disclaimer
request = urllib2.Request(self._DISCLAIMER, None, std_headers)
try:
self.report_disclaimer()
disclaimer = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
return
# Confirm age
disclaimer_form = {
'filters': '0',
'submit': "Continue - I'm over 18",
}
request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
try:
self.report_age_confirmation()
disclaimer = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
return
def _real_extract(self, url):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self.to_stderr(u'ERROR: invalid URL: %s' % url)
return [None]
video_id = mobj.group(1)
# Check if video comes from YouTube
mobj2 = re.match(r'^yt-(.*)$', video_id)
if mobj2 is not None:
return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
simple_title = mobj.group(2).decode('utf-8')
video_extension = 'flv'
# Retrieve video webpage to extract further information
request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'ERROR: unable to retrieve video webpage: %s' % str(err))
return [None]
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
if mobj is None:
self.to_stderr(u'ERROR: unable to extract media URL')
return [None]
mediaURL = mobj.group(1).replace('\\', '')
mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
if mobj is None:
self.to_stderr(u'ERROR: unable to extract gdaKey')
return [None]
gdaKey = mobj.group(1)
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
if mobj is None:
self.to_stderr(u'ERROR: unable to extract title')
return [None]
video_title = mobj.group(1).decode('utf-8')
mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
if mobj is None:
self.to_stderr(u'ERROR: unable to extract uploader nickname')
return [None]
video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
# Return information
return [{
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
}]
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
_MORE_PAGES_INDICATOR = r'>Next</a>'
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
self._youtube_ie = youtube_ie
@staticmethod
def suitable(url):
return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
self._youtube_ie.initialize()
def _real_extract(self, query):
mobj = re.match(self._VALID_QUERY, query)
if mobj is None:
self.to_stderr(u'ERROR: invalid search query "%s"' % query)
return [None]
prefix, query = query.split(':')
prefix = prefix[8:]
if prefix == '':
return self._download_n_results(query, 1)
elif prefix == 'all':
return self._download_n_results(query, -1)
else:
try:
n = int(prefix)
if n <= 0:
self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
return [None]
return self._download_n_results(query, n)
except ValueError: # parsing prefix as int fails
return self._download_n_results(query, 1)
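# Editorial note, worked example: for 'ytsearch3:cute kittens', split(':') yields
# prefix 'ytsearch3' and query 'cute kittens'; prefix[8:] == '3' (len('ytsearch')
# is 8), so three results are requested.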
def _download_n_results(self, query, n):
"""Downloads a specified number of results for a query"""
video_ids = []
already_seen = set()
pagenum = 1
while True:
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
return [None]
# Extract video identifiers
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
if video_id not in already_seen:
video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n:
# Specified n videos reached
information = []
for id in video_ids:
information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
return information
if self._MORE_PAGES_INDICATOR not in page:
information = []
for id in video_ids:
information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
return information
pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
"""Information Extractor for YouTube playlists."""
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
self._youtube_ie = youtube_ie
@staticmethod
def suitable(url):
return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
def report_download_page(self, playlist_id, pagenum):
"""Report attempt to download playlist page with given number."""
self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
def _real_initialize(self):
self._youtube_ie.initialize()
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self.to_stderr(u'ERROR: invalid url: %s' % url)
return [None]
# Download playlist pages
playlist_id = mobj.group(1)
video_ids = []
pagenum = 1
while True:
self.report_download_page(playlist_id, pagenum)
request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
return [None]
# Extract video identifiers
ids_in_page = []
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
break
pagenum = pagenum + 1
information = []
for id in video_ids:
information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
return information
class PostProcessor(object):
"""Post Processor class.
PostProcessor objects can be added to downloaders with their
add_post_processor() method. When the downloader has finished a
successful download, it will take its internal chain of PostProcessors
and start calling the run() method on each one of them, first with
an initial argument and then with the returned value of the previous
PostProcessor.
The chain will be stopped if one of them ever returns None or the end
of the chain is reached.
PostProcessor objects follow a "mutual registration" process similar
to InfoExtractor objects.
"""
_downloader = None
def __init__(self, downloader=None):
self._downloader = downloader
def to_stdout(self, message):
"""Print message to stdout if downloader is not in quiet mode."""
if self._downloader is None or not self._downloader.get_params().get('quiet', False):
print message
def to_stderr(self, message):
"""Print message to stderr."""
print >>sys.stderr, message
def set_downloader(self, downloader):
"""Sets the downloader for this PP."""
self._downloader = downloader
def run(self, information):
"""Run the PostProcessor.
The "information" argument is a dictionary like the ones
returned by InfoExtractors. The only difference is that this
one has an extra field called "filepath" that points to the
downloaded file.
When this method returns None, the postprocessing chain is
stopped. However, this method may return an information
dictionary that will be passed to the next postprocessing
object in the chain. It can be the one it received after
changing some fields.
In addition, this method may raise a PostProcessingError
exception that will be taken into account by the downloader
it was called from.
"""
return information # by default, do nothing
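# Editorial note: a minimal, hypothetical PostProcessor sketch (not part of this
# change) illustrating the contract documented above: run() receives the info
# dictionary extended with 'filepath' and returns it to keep the chain going,
# or None to stop the chain.
class PrintLocationPP(PostProcessor):
    def run(self, information):
        self.to_stdout(u'[postprocess] saved to %s' % information['filepath'])
        return information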
### MAIN PROGRAM ###
if __name__ == '__main__':
try:
# Modules needed only when running the main program
@@ -540,7 +941,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2008.07.22',
version='2009.02.07',
conflict_handler='resolve',
)
parser.add_option('-h', '--help',
@@ -575,26 +976,52 @@ if __name__ == '__main__':
action='store_const', dest='format', help='alias for -f 17', const='17')
parser.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
parser.add_option('-a', '--batch-file',
dest='batchfile', metavar='F', help='file containing URLs to download')
parser.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
(opts, args) = parser.parse_args()
# Batch file verification
batchurls = []
if opts.batchfile is not None:
try:
batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
except IOError:
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Conflicting, missing and erroneous options
if len(args) < 1:
sys.exit('ERROR: you must provide at least one URL')
if len(all_urls) < 1:
sys.exit(u'ERROR: you must provide at least one URL')
if opts.usenetrc and (opts.username is not None or opts.password is not None):
sys.exit('ERROR: using .netrc conflicts with giving username/password')
sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
sys.exit('ERROR: account username missing')
sys.exit(u'ERROR: account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
sys.exit('ERROR: using output template conflicts with using title or literal title')
sys.exit(u'ERROR: using output template conflicts with using title or literal title')
if opts.usetitle and opts.useliteral:
sys.exit('ERROR: using title conflicts with using literal title')
sys.exit(u'ERROR: using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
opts.password = getpass.getpass('Type account password and press return:')
opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
sys.exit(u'ERROR: invalid rate limit specified')
opts.ratelimit = numeric_limit
# Information extractors
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
# File downloader
charset = locale.getdefaultlocale()[1]
if charset is None:
charset = 'ascii'
fd = FileDownloader({
'usenetrc': opts.usenetrc,
'username': opts.username,
@@ -604,19 +1031,24 @@ if __name__ == '__main__':
'forcetitle': opts.gettitle,
'simulate': (opts.simulate or opts.geturl or opts.gettitle),
'format': opts.format,
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
or '%(id)s.%(ext)s'),
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s'),
'ignoreerrors': opts.ignoreerrors,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(youtube_ie)
retcode = fd.download(args)
retcode = fd.download(all_urls)
sys.exit(retcode)
except DownloadError:
sys.exit(1)
except SameFileError:
sys.exit('ERROR: fixed output name but more than one file to download')
sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
sys.exit('\nERROR: Interrupted by user')
sys.exit(u'\nERROR: Interrupted by user')