Compare commits

..

13 Commits

Author SHA1 Message Date
Philipp Hagemeister
56eca2e956 release 2014.04.04.1 2014-04-04 00:25:43 +02:00
Philipp Hagemeister
2ad4d1ba07 [morningstar] Add new extractor (Fixes #2687) 2014-04-04 00:25:35 +02:00
Philipp Hagemeister
4853de808b release 2014.04.04 2014-04-04 00:06:06 +02:00
Philipp Hagemeister
6ff5f12218 [motorsport] Add extractor (Fixes #2688) 2014-04-04 00:05:43 +02:00
Philipp Hagemeister
52a180684f [README] Fix VALID_URL in extractor example 2014-04-03 23:25:23 +02:00
Philipp Hagemeister
b21e25702f Merge pull request #2681 from phihag/readme-dev-instructions
[README] Improve developer instructions
2014-04-03 23:06:15 +02:00
Jaime Marquínez Ferrándiz
983af2600f [wimp] Detect youtube videos (fixes #2686) 2014-04-03 20:44:51 +02:00
Philipp Hagemeister
f34e6a2cd6 [comedycentral:shows] Do no include 6-digit identifier in display ID 2014-04-03 18:39:00 +02:00
Philipp Hagemeister
a9f304031b release 2014.04.03.3 2014-04-03 16:21:54 +02:00
Philipp Hagemeister
9271bc8355 [cnet] Add new extractor (Fixes #2679) 2014-04-03 16:21:21 +02:00
Philipp Hagemeister
d1b3e3dd75 [README] Add md5 to code example 2014-04-03 15:59:04 +02:00
Philipp Hagemeister
968ed2a777 [comedycentral] Add test for #2677 2014-04-03 15:31:04 +02:00
Philipp Hagemeister
5fbd672c38 [README] Improve developer instructions
Add a longer tutorial that should cover everything needed to start developing IEs.

Fixes #2676
2014-04-03 14:46:24 +02:00
10 changed files with 285 additions and 10 deletions

View File

@@ -371,7 +371,67 @@ If you want to create a build of youtube-dl yourself, you'll need
### Adding support for a new site
If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class YourExtractorIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://yourextractor.com/watch/42',
'md5': 'TODO: md5 sum of the first 10KiB of the video file',
'info_dict': {
'id': '42',
'ext': 'mp4',
'title': 'Video title goes here',
# TODO more properties, either as:
# * A value
# * MD5 checksum; start the string with md5:
# * A regular expression; start the string with re:
# * Any Python type (for example int or float)
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# TODO more code goes here, for example ...
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
return {
'id': video_id,
'title': title,
# TODO more properties (see youtube_dl/extractor/common.py)
}
5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
$ git add youtube_dl/extractor/__init__.py
$ git add youtube_dl/extractor/yourextractor.py
$ git commit -m '[yourextractor] Add new extractor'
$ git push origin yourextractor
10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
In any case, thank you very much for your contributions!
# BUGS

View File

@@ -153,6 +153,9 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch(
'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
['ComedyCentralShows'])
self.assertMatch(
'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
['ComedyCentralShows'])
if __name__ == '__main__':
unittest.main()

View File

@@ -40,6 +40,7 @@ from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cmt import CMTIE
from .cnet import CNETIE
from .cnn import (
CNNIE,
CNNBlogsIE,
@@ -153,6 +154,8 @@ from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
from .mooshare import MooshareIE
from .morningstar import MorningstarIE
from .motorsport import MotorsportIE
from .mtv import (
MTVIE,
MTVIggyIE,

View File

@@ -0,0 +1,70 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
)
class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'md5': '041233212a0d06b179c87cbcca1577b8',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
'ext': 'mp4',
'title': 'Hands-on with Microsoft Windows 8.1 Update',
'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
'thumbnail': 're:^http://.*/flmswindows8.jpg$',
'uploader_id': 'sarah.mitroff@cbsinteractive.com',
'uploader': 'Sarah Mitroff',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
r"<div class=\"cnetVideoPlayer\" data-cnet-video-options='([^']+)'",
webpage, 'data json')
data = json.loads(data_json)
vdata = data['video']
video_id = vdata['id']
title = vdata['headline']
description = vdata.get('dek')
thumbnail = vdata.get('image', {}).get('path')
author = vdata.get('author')
if author:
uploader = '%s %s' % (author['firstName'], author['lastName'])
uploader_id = author.get('email')
else:
uploader = None
uploader_id = None
formats = [{
'format_id': '%s-%s-%s' % (
f['type'], f['format'],
int_or_none(f.get('bitrate'), 1000, default='')),
'url': f['uri'],
'tbr': int_or_none(f.get('bitrate'), 1000),
} for f in vdata['files']['data']]
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
'description': description,
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
}

View File

@@ -41,7 +41,7 @@ class ComedyCentralShowsIE(InfoExtractor):
_VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|https?://(:www\.)?
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
(full-episodes/(?P<episode>.*)|
(full-episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
(?:(?:guests/[^/]+|videos)/[^/]+/(?P<videotitle>[^/?#]+))
|(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))

View File

@@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import json
import re
import time
from .common import InfoExtractor
from ..utils import (
compat_parse_qs,
compat_str,
int_or_none,
)
class MorningstarIE(InfoExtractor):
IE_DESC = 'morningstar.com'
_VALID_URL = r'https?://(?:www\.)?morningstar\.com/cover/videocenter\.aspx\?id=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
'info_dict': {
'id': '615869',
'ext': 'mp4',
'title': 'Get Ahead of the Curve on 2013 Taxes',
'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
video_url = self._html_search_regex(
r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
webpage, 'video URL')
thumbnail = self._html_search_regex(
r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
webpage, 'thumbnail', fatal=False)
description = self._html_search_regex(
r'<div id="mstarDeck".*?>(.*?)</div>',
webpage, 'description', fatal=False)
return {
'id': video_id,
'title': title,
'url': video_url,
'thumbnail': thumbnail,
'description': description,
}

View File

@@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import json
import re
import time
from .common import InfoExtractor
from ..utils import (
compat_parse_qs,
compat_str,
int_or_none,
)
class MotorsportIE(InfoExtractor):
IE_DESC = 'motorsport.com'
_VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
'info_dict': {
'id': '7063',
'ext': 'mp4',
'title': 'Red Bull Racing: 2014 Rules Explained',
'duration': 207,
'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations which are arguably the most complex the sport has ever seen.',
'uploader': 'rainiere',
'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
flashvars_code = self._html_search_regex(
r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
flashvars = compat_parse_qs(flashvars_code)
params = json.loads(flashvars['parameters'][0])
e = compat_str(int(time.time()) + 24 * 60 * 60)
base_video_url = params['location'] + '?e=' + e
s = 'h3hg713fh32'
h = hashlib.md5(s + base_video_url).hexdigest()
video_url = base_video_url + '&h=' + h
uploader = self._html_search_regex(
r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
'uploader', fatal=False)
return {
'id': params['video_id'],
'display_id': display_id,
'title': params['title'],
'url': video_url,
'description': params.get('description'),
'thumbnail': params.get('main_thumb'),
'duration': int_or_none(params.get('duration')),
'uploader': uploader,
}

View File

@@ -3,11 +3,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .youtube import YoutubeIE
class WimpIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
_TEST = {
_TESTS = [{
'url': 'http://www.wimp.com/maruexhausted/',
'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'info_dict': {
@@ -16,7 +17,20 @@ class WimpIE(InfoExtractor):
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
}
}, {
# youtube video
'url': 'http://www.wimp.com/clowncar/',
'info_dict': {
'id': 'cG4CEr2aiSg',
'ext': 'mp4',
'title': 'Basset hound clown car...incredible!',
'description': 'md5:8d228485e0719898c017203f900b3a35',
'uploader': 'Gretchen Hoey',
'uploader_id': 'gretchenandjeff1',
'upload_date': '20140303',
},
'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -24,6 +38,13 @@ class WimpIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
'_type': 'url',
'url': video_url,
'ie_key': YoutubeIE.ie_key(),
}
return {
'id': video_id,
@@ -31,4 +52,4 @@ class WimpIE(InfoExtractor):
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
}
}

View File

@@ -1176,12 +1176,12 @@ class HEADRequest(compat_urllib_request.Request):
return "HEAD"
def int_or_none(v, scale=1):
return v if v is None else (int(v) // scale)
def int_or_none(v, scale=1, default=None):
return default if v is None else (int(v) // scale)
def float_or_none(v, scale=1):
return v if v is None else (float(v) / scale)
def float_or_none(v, scale=1, default=None):
return default if v is None else (float(v) / scale)
def parse_duration(s):

View File

@@ -1,2 +1,2 @@
__version__ = '2014.04.03.2'
__version__ = '2014.04.04.1'