Compare commits

..

14 Commits

Author SHA1 Message Date
Ricardo Garcia
b58faab5e7 Bump version number 2011-02-26 00:47:29 +01:00
Idan Kamara
377086af3d Use '--' to separate the file argument from the options when calling ffmpeg
This is to avoid a potential issue if the file name begins with a hyphen since ffmpeg will interpret it as an option
2011-02-25 23:24:58 +02:00
Ricardo Garcia
820eedcb50 Bump version number 2011-02-25 21:54:16 +01:00
Ricardo Garcia
da273188f3 Catch possible exceptions when running ffprobe 2011-02-25 21:53:26 +01:00
Idan Kamara
1bd9258272 Fix stderr print when ffmpeg fails 2011-02-25 22:30:22 +02:00
Ricardo Garcia
c076845454 Bump version number 2011-02-25 20:12:32 +01:00
Ricardo Garcia
afd233c05c Update User-Agent string 2011-02-25 20:11:53 +01:00
Ricardo Garcia
3072fab115 Add an audio extracting PostProcessor using ffmpeg (closes #2) 2011-02-25 19:06:58 +01:00
Ricardo Garcia
87cbd21323 Fix date parsing for YouTube (patch by Drake Wyrm) 2011-02-25 19:05:35 +01:00
Ricardo Garcia
ef9f8451c8 Add Gergely Imreh to the author list 2011-02-20 18:01:57 +01:00
Gergely Imreh
9f5f960213 Facebook info extractor
This IE should be full-featured.

Public videos can be downloaded without login, e.g:
https://www.facebook.com/video/video.php?v=696729990595

Private videos need login, and are subject to a login rate limit of
a couple of tries per minute.
2011-02-20 23:57:50 +08:00
Gergely Imreh
7cc3c6fd62 Fix possible missing parameter in playlist url extraction
The "playlist_prefix" parameter was missing when parsing playlist urls
that match the recently added format, e.g.:
http://www.youtube.com/user/stanforduniversity#g/c/9D558D49CA734A02
For these URLs (basically, for every playlist type so far, except the
artist list) playlist_prefix has to be equal to "p" for correct
extraction.
2011-02-13 19:02:56 +08:00
Ricardo Garcia
d119b54df6 Support more common YouTube playlist URLs 2011-02-12 20:19:20 +01:00
Gergely Imreh
f74e22ae28 Enable artist playlists in YoutubePlaylistIE
Artist playlist pages have different format compared to user playlists,
thus more format checking is needed to construct the correct URL.

From the artist playlist this method downloads all videos listed below the
"Videos by [Artist Name]" header, plus usually there's one more
video on the side, titled "Youtube Mix for [Artist Name]", which
has a link format that currently cannot be distinguished from the other
videos in the list.
2011-01-31 19:00:51 +08:00
2 changed files with 342 additions and 8 deletions

View File

@@ -1 +1 @@
2011.01.30
2011.02.25c

View File

@@ -6,6 +6,7 @@
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Gergely Imreh
# License: Public domain code
import cookielib
import ctypes
@@ -37,7 +38,7 @@ except ImportError:
from cgi import parse_qs
std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
@@ -1058,7 +1059,7 @@ class YoutubeIE(InfoExtractor):
mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
format_expressions = ['%d %B %Y', '%B %d %Y']
format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
@@ -2096,8 +2097,8 @@ class YahooSearchIE(InfoExtractor):
class YoutubePlaylistIE(InfoExtractor):
"""Information Extractor for YouTube playlists."""
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
_youtube_ie = None
@@ -2124,14 +2125,26 @@ class YoutubePlaylistIE(InfoExtractor):
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
return
# Single video case
if mobj.group(3) is not None:
self._youtube_ie.extract(mobj.group(3))
return
# Download playlist pages
playlist_id = mobj.group(1)
# prefix is 'p' as default for playlists but there are other types that need extra care
playlist_prefix = mobj.group(1)
if playlist_prefix == 'a':
playlist_access = 'artist'
else:
playlist_prefix = 'p'
playlist_access = 'view_play_list'
playlist_id = mobj.group(2)
video_ids = []
pagenum = 1
while True:
self.report_download_page(playlist_id, pagenum)
request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@@ -2327,6 +2340,229 @@ class DepositFilesIE(InfoExtractor):
except UnavailableVideoError, err:
self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook"""
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
_NETRC_MACHINE = 'facebook'
_available_formats = ['highqual', 'lowqual']
_video_extensions = {
'highqual': 'mp4',
'lowqual': 'mp4',
}
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(FacebookIE._VALID_URL, url) is not None)
def _reporter(self, message):
"""Add header and report message."""
self._downloader.to_screen(u'[facebook] %s' % message)
def report_login(self):
"""Report attempt to log in."""
self._reporter(u'Logging in')
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self._reporter(u'%s: Downloading video webpage' % video_id)
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self._reporter(u'%s: Extracting video information' % video_id)
def _parse_page(self, video_webpage):
"""Extract video information from page"""
# General data
data = {'title': r'class="video_title datawrap">(.*?)</',
'description': r'<div class="datawrap">(.*?)</div>',
'owner': r'\("video_owner_name", "(.*?)"\)',
'upload_date': r'data-date="(.*?)"',
'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
}
video_info = {}
for piece in data.keys():
mobj = re.search(data[piece], video_webpage)
if mobj is not None:
video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Video urls
video_urls = {}
for fmt in self._available_formats:
mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
if mobj is not None:
# URL is in a Javascript segment inside an escaped Unicode format within
# the generally utf-8 page
video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
video_info['video_urls'] = video_urls
return video_info
def _real_initialize(self):
if self._downloader is None:
return
useremail = None
password = None
downloader_params = self._downloader.params
# Attempt to use provided username and password or .netrc data
if downloader_params.get('username', None) is not None:
useremail = downloader_params['username']
password = downloader_params['password']
elif downloader_params.get('usenetrc', False):
try:
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
if info is not None:
useremail = info[0]
password = info[2]
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError), err:
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
return
if useremail is None:
return
# Log in
login_form = {
'email': useremail,
'pass': password,
'login': 'Log+In'
}
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
try:
self.report_login()
login_results = urllib2.urlopen(request).read()
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
return
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
video_id = mobj.group('ID')
# Get video webpage
self.report_video_webpage_download(video_id)
request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
try:
page = urllib2.urlopen(request)
video_webpage = page.read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
return
# Start extracting information
self.report_information_extraction(video_id)
# Extract information
video_info = self._parse_page(video_webpage)
# uploader
if 'owner' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = video_info['owner']
# title
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = video_info['title']
video_title = video_title.decode('utf-8')
video_title = sanitize_title(video_title)
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
simple_title = simple_title.strip(ur'_')
# thumbnail image
if 'thumbnail' not in video_info:
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
video_thumbnail = ''
else:
video_thumbnail = video_info['thumbnail']
# upload date
upload_date = u'NA'
if 'upload_date' in video_info:
upload_time = video_info['upload_date']
timetuple = email.utils.parsedate_tz(upload_time)
if timetuple is not None:
try:
upload_date = time.strftime('%Y%m%d', timetuple[0:9])
except:
pass
# description
video_description = 'No description available.'
if (self._downloader.params.get('forcedescription', False) and
'description' in video_info):
video_description = video_info['description']
url_map = video_info['video_urls']
if len(url_map.keys()) > 0:
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
format_limit = self._downloader.params.get('format_limit', None)
if format_limit is not None and format_limit in self._available_formats:
format_list = self._available_formats[self._available_formats.index(format_limit):]
else:
format_list = self._available_formats
existing_formats = [x for x in format_list if x in url_map]
if len(existing_formats) == 0:
self._downloader.trouble(u'ERROR: no known formats available for video')
return
if req_format is None:
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
elif req_format == '-1':
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
else:
# Specific format
if req_format not in url_map:
self._downloader.trouble(u'ERROR: requested format not available')
return
video_url_list = [(req_format, url_map[req_format])] # Specific format
for format_param, video_real_url in video_url_list:
# At this point we have a new video
self._downloader.increment_downloads()
# Extension
video_extension = self._video_extensions.get(format_param, 'mp4')
# Find the video URL in fmt_url_map or conn paramters
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_real_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'upload_date': upload_date,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description.decode('utf-8'),
'player_url': None,
})
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
"""Post Processor class.
@@ -2373,6 +2609,88 @@ class PostProcessor(object):
"""
return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that turns a downloaded file into an audio-only file.

    ffprobe is used to find out the audio codec of the downloaded file and
    ffmpeg to copy or transcode the audio stream into a new file stored
    next to the original, which is removed once the conversion succeeds.

    preferredcodec may be 'best' (the default when None is given), 'aac'
    or 'mp3'.
    """

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name of the audio stream in path.

        Returns None when ffprobe cannot be run, exits with a non-zero
        status or reports no audio stream.
        """
        try:
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(['ffprobe', '-show_streams', '--', path],
                        stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
            finally:
                # Do not leak the descriptor used to silence ffprobe
                devnull.close()
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None

        audio_codec = None
        for line in output.split('\n'):
            stripped = line.strip()
            if stripped == '[STREAM]':
                # A codec name only counts for the stream it was listed in
                audio_codec = None
            elif line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif stripped == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of path to out_path.

        codec is passed to -acodec and more_opts is a list of extra
        options placed before the output file. Returns True if ffmpeg
        exited with status 0 and False otherwise, including when it could
        not be run at all.
        """
        try:
            devnull = open(os.path.devnull, 'w')
            try:
                ret = subprocess.call(['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path],
                        stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract the audio of information['filepath'].

        Returns the information dictionary with 'filepath' pointing to the
        audio file, or None if the codec could not be detected, ffmpeg
        failed or the original file could not be removed.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec in ('best', filecodec):
            if filecodec in ('aac', 'mp3'):
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension

        if new_path == path:
            # The download already carries the target extension. Running
            # ffmpeg would make it write over its own input, and the
            # cleanup below would then delete the only copy of the file.
            self._downloader.to_screen(u'[ffmpeg] Source and destination are the same file, skipping: %s' % new_path)
            return information

        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
### MAIN PROGRAM ###
if __name__ == '__main__':
try:
@@ -2405,7 +2723,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2011.01.30',
version='2011.02.25c',
conflict_handler='resolve',
)
@@ -2497,6 +2815,13 @@ if __name__ == '__main__':
help='do not use the Last-modified header to set the file modification time', default=True)
parser.add_option_group(filesystem)
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help='"best", "aac" or "mp3"; best by default')
parser.add_option_group(postproc)
(opts, args) = parser.parse_args()
# Open appropriate CookieJar
@@ -2568,6 +2893,9 @@ if __name__ == '__main__':
raise ValueError
except (TypeError, ValueError), err:
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3']:
parser.error(u'invalid audio format specified')
# Information extractors
youtube_ie = YoutubeIE()
@@ -2582,6 +2910,7 @@ if __name__ == '__main__':
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
generic_ie = GenericIE()
# File downloader
@@ -2633,11 +2962,16 @@ if __name__ == '__main__':
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# PostProcessors
if opts.extractaudio:
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# Update version
if opts.update_self:
update_self(fd, sys.argv[0])