Compare commits

...

75 Commits

Author SHA1 Message Date
Philipp Hagemeister
c11726364e release 2014.03.24 2014-03-24 10:17:35 +01:00
Philipp Hagemeister
c577d735c6 release 2013.03.24.2 2014-03-24 02:24:31 +01:00
Philipp Hagemeister
9f0375f61a release 2013.03.24.1 2014-03-24 02:22:12 +01:00
Philipp Hagemeister
5e114e4bfe [soundcloud] Always add streaming formats 2014-03-24 02:21:17 +01:00
Philipp Hagemeister
83622b6d2f [soundcloud] Simplify string literals 2014-03-24 02:15:31 +01:00
Philipp Hagemeister
3d87426c2d release 2013.03.24 2014-03-24 01:42:14 +01:00
Philipp Hagemeister
ce328530a9 Merge remote-tracking branch 'origin/master' 2014-03-24 01:42:11 +01:00
Philipp Hagemeister
f70daac108 [RTS] Add extractor (Fixes #2608) 2014-03-24 01:41:14 +01:00
Philipp Hagemeister
912b38b428 [instagram] Fix info_dict key name 2014-03-24 01:40:09 +01:00
Philipp Hagemeister
6e25c58ed7 Merge pull request #2567 from jaimeMF/sphinx-docs
Add initial sphinx docs
2014-03-24 00:50:32 +01:00
Philipp Hagemeister
51fb2e98d2 [radiofrance] Modernize 2014-03-23 17:43:33 +01:00
Philipp Hagemeister
38d63d846e [extractor/common] Clarify preference key in formats 2014-03-23 17:41:43 +01:00
Philipp Hagemeister
07cec9776e release 2014.03.23 2014-03-23 16:06:41 +01:00
Philipp Hagemeister
ea38e55fff [instagram] Add support for user profiles (Fixes #2606) 2014-03-23 16:06:07 +01:00
Philipp Hagemeister
257cfebfe6 [test] Move expect_info_dict out of test_download 2014-03-23 15:52:21 +01:00
Philipp Hagemeister
6eefe53329 [utils] Simplify setproctitle 2014-03-23 14:28:22 +01:00
Philipp Hagemeister
1986025d2b [xbef] (Add extractor) 2014-03-23 14:04:36 +01:00
Philipp Hagemeister
c9aa111b4f [worldstarhiphop] Modernize 2014-03-23 13:49:15 +01:00
Philipp Hagemeister
bfcb6e3917 Merge remote-tracking branch 'fiocfun/xtube-user-extractor' 2014-03-23 13:36:14 +01:00
Sergey M․
2c1396073e [metacafe] Remove accidently inserted comment string 2014-03-23 05:16:02 +07:00
Sergey M․
401983c6a0 [metacafe] More modernize 2014-03-23 05:13:15 +07:00
Sergey M․
391dc3ee07 [metacafe] Replace cbs test 2014-03-23 05:08:11 +07:00
Sergey M․
be3b8fa30f [metacafe] Modernize 2014-03-23 05:05:31 +07:00
fiocfun
9f5809b3e8 [xtube] user playlist extractor 2014-03-23 00:16:35 +06:00
Sergey M․
0320ddc192 [pornhub] Fix uploader extraction and extract counts 2014-03-22 21:30:22 +07:00
Philipp Hagemeister
56dd55721c Remove unused imports and clarify variable names 2014-03-22 15:17:32 +01:00
Philipp Hagemeister
231f76b530 [toypics] Separate user and video extraction (#2601) 2014-03-22 15:15:01 +01:00
Philipp Hagemeister
55442a7812 Merge remote-tracking branch 'fiocfun/toypics-support' 2014-03-22 14:24:44 +01:00
Philipp Hagemeister
43b81eb98a [youtube] Remove useless resolution fields from format definitions
These can be - and are - calculated automatically by the YoutubeDL core.
2014-03-22 14:22:41 +01:00
Philipp Hagemeister
bfd718793c Merge remote-tracking branch 'hurda/patch-1' 2014-03-22 14:21:04 +01:00
Philipp Hagemeister
a9c2896e22 Make missing test definition fields an error
If the result is not testable (for example, because a description changes often), either pass in a type or a regular expression (a string starting with 're:')
2014-03-22 14:20:07 +01:00
hurda
278229d195 itag 160 is 144p, not 192p 2014-03-22 12:15:45 +01:00
Philipp Hagemeister
fa154d1dbe [videolectures.net] Make description optional 2014-03-22 12:10:56 +01:00
Jaime Marquínez Ferrándiz
7e2ede9891 [generic] Run TED detection before JW Player detection
Otherwise it overwrittes the `mobj` variable.
2014-03-22 10:20:44 +01:00
fiocfun
74af99fc2f toypics.net support 2014-03-22 04:07:44 +06:00
Jaime Marquínez Ferrándiz
0f2a2ba14b Merge remote-tracking branch 'dstftw/generic-webpage-unescape'
Conflicts:
	youtube_dl/extractor/generic.py
2014-03-21 22:14:24 +01:00
Jaime Marquínez Ferrándiz
e24b5a8610 [ooyala] Modernize 2014-03-21 21:55:51 +01:00
Jaime Marquínez Ferrándiz
750f9020ae [generic] Recognize more Ooyala embedded videos (#2569) 2014-03-21 21:51:33 +01:00
Jaime Marquínez Ferrándiz
f82863851e Add an extractor for on.aol.com 2014-03-21 19:54:44 +01:00
Jaime Marquínez Ferrándiz
933a5b3792 Add extractor for Engadget and 5min (closes #2465)
engadget.com uses the generic 5min.com service.
2014-03-21 19:13:46 +01:00
Sergey M․
aa488e1385 [xtube] Fix formats extraction 2014-03-21 23:58:40 +07:00
Philipp Hagemeister
d77650525d release 2014.03.21.5 2014-03-21 14:52:57 +01:00
Philipp Hagemeister
3e50c29984 release 2014.03.21.4 2014-03-21 14:38:55 +01:00
Philipp Hagemeister
64e7ad6045 [videolectures] (New extractor) 2014-03-21 14:38:41 +01:00
Philipp Hagemeister
23f4a93bb4 [daum] Modernize 2014-03-21 14:38:41 +01:00
Jaime Marquínez Ferrándiz
6f13b055f1 [cspan] Fix typo in a comment 2014-03-21 08:01:20 +01:00
Philipp Hagemeister
1f91bd15c3 release 2014.03.21.3 2014-03-21 02:10:35 +01:00
Philipp Hagemeister
11a15be4ce [cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00
Philipp Hagemeister
14e17e18cb release 2014.03.21.2 2014-03-21 01:42:45 +01:00
Philipp Hagemeister
1b124d1942 [parliamentliveuk] Add extractor 2014-03-21 01:42:28 +01:00
Philipp Hagemeister
747373d4ae release 2014.03.21.1 2014-03-21 01:00:27 +01:00
Philipp Hagemeister
18d367c0a5 Remove legacy InfoExtractors file 2014-03-21 01:00:06 +01:00
Philipp Hagemeister
a1a530b067 [pbs] Add support for video ratings 2014-03-21 00:59:51 +01:00
Philipp Hagemeister
cb9722cb3f [viki] Modernize 2014-03-21 00:53:18 +01:00
Philipp Hagemeister
773c0b4bb8 [pbs] Add support for widget URLs (Fixes #2594) 2014-03-21 00:46:32 +01:00
Philipp Hagemeister
23c322a531 release 2014.03.21 2014-03-21 00:37:23 +01:00
Philipp Hagemeister
7e8c0af004 Add --prefer-insecure option (Fixes #2364) 2014-03-21 00:37:10 +01:00
Philipp Hagemeister
d2983ccb25 [ninegag] Modernize and remove unused import 2014-03-21 00:37:10 +01:00
Philipp Hagemeister
f24e9833dc [youporn] Modernize 2014-03-21 00:37:10 +01:00
Sergey M․
bc2bdf5709 [kontrtube] Modernize 2014-03-20 23:05:57 +07:00
Philipp Hagemeister
627a209f74 release 2014.03.20 2014-03-20 16:35:54 +01:00
Philipp Hagemeister
1a4895453a [YoutubeDL] Improve error message 2014-03-20 16:33:46 +01:00
Philipp Hagemeister
aab74fa106 [ted] Simplify embed code (#2587) 2014-03-20 16:33:23 +01:00
Philipp Hagemeister
2bd9efd4c2 Merge remote-tracking branch 'anovicecodemonkey/TEDIEimprovements' 2014-03-20 16:24:34 +01:00
Jaime Marquínez Ferrándiz
39a743fb9b [arte] Modernize tests and fix _VALID_REGEX 2014-03-20 09:14:43 +01:00
Jaime Marquínez Ferrándiz
4966a0b22d [arte] Add extractor for concert.arte.tv (closes #2588) 2014-03-20 09:11:47 +01:00
anovicecodemonkey
fc26023120 [TEDIE] Add support for embeded TED video URLs 2014-03-20 01:04:21 +10:30
anovicecodemonkey
8d7c0cca13 [generic] Add support for embeded TED videos 2014-03-20 00:56:32 +10:30
Sergey M․
f66ede4328 [arte.tv:+7] Fix _VALID_URL 2014-03-19 21:23:55 +07:00
Philipp Hagemeister
cc88b90ec8 [desvscripts/release] Bump the number of password tries to accomodate stubby-fingered @phihag 2014-03-18 15:02:37 +01:00
Philipp Hagemeister
b6c5fa9a0b release 2014.03.18.1 2014-03-18 14:42:59 +01:00
Jaime Marquínez Ferrándiz
685052fc7b Add initial sphinx docs
With an initial guide for using youtube_dl from python programs.
2014-03-15 19:08:09 +01:00
Sergey M․
d95e35d659 [generic] Add nowvideo test hidden behind percent encoding 2014-03-15 04:39:53 +07:00
Sergey M․
1439073049 [generic] Add comment for unescaping webpage contents 2014-03-15 04:38:49 +07:00
Sergey M
1f7659dbe9 [generic] Unescape webpage contents 2014-03-15 04:21:17 +07:00
47 changed files with 1496 additions and 368 deletions

View File

@@ -3,3 +3,5 @@ include test/*.py
include test/*.json
include youtube-dl.bash-completion
include youtube-dl.1
recursive-include docs *
prune docs/_build

View File

@@ -36,6 +36,9 @@ which means you can modify it, redistribute it or use it however you like.
an empty string (--proxy "") for direct
connection
--no-check-certificate Suppress HTTPS certificate validation.
--prefer-insecure Use an unencrypted connection to retrieve
information about the video. (Currently
supported only for YouTube)
--cache-dir DIR Location in the filesystem where youtube-dl
can store some downloaded information
permanently. By default $XDG_CACHE_HOME

View File

@@ -70,7 +70,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz"
git checkout HEAD -- youtube-dl youtube-dl.exe
/bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..."
for f in $RELEASE_FILES; do gpg --detach-sig "build/$version/$f"; done
for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
scp -r "build/$version" ytdl@yt-dl.org:html/tmp/
ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/"
ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"

1
docs/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
_build/

177
docs/Makefile Normal file
View File

@@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

71
docs/conf.py Normal file
View File

@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#
# youtube-dl documentation build configuration file, created by
# sphinx-quickstart on Fri Mar 14 21:05:43 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# Allows to import youtube_dl
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# -- General configuration ------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'youtube-dl'
copyright = u'2014, Ricardo Garcia Gonzalez'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
import youtube_dl
version = youtube_dl.__version__
# The full version, including alpha/beta/rc tags.
release = version
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Output file base name for HTML help builder.
htmlhelp_basename = 'youtube-dldoc'

23
docs/index.rst Normal file
View File

@@ -0,0 +1,23 @@
Welcome to youtube-dl's documentation!
======================================
*youtube-dl* is a command-line program to download videos from YouTube.com and more sites.
It can also be used in Python code.
Developer guide
---------------
This section contains information for using *youtube-dl* from Python programs.
.. toctree::
:maxdepth: 2
module_guide
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

67
docs/module_guide.rst Normal file
View File

@@ -0,0 +1,67 @@
Using the ``youtube_dl`` module
===============================
When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors:
.. code-block:: python
>>> from youtube_dl import YoutubeDL
>>> ydl = YoutubeDL()
>>> ydl.add_default_info_extractors()
Extracting video information
----------------------------
You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary:
.. code-block:: python
>>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
[youtube] Setting language
[youtube] BaW_jenozKc: Downloading webpage
[youtube] BaW_jenozKc: Downloading video info webpage
[youtube] BaW_jenozKc: Extracting video information
>>> info['title']
'youtube-dl test video "\'/\\ä↭𝕐'
>>> info['height'], info['width']
(720, 1280)
If you want to download or play the video you can get its url:
.. code-block:: python
>>> info['url']
'https://...'
Extracting playlist information
-------------------------------
The playlist information is extracted in a similar way, but the dictionary is a bit different:
.. code-block:: python
>>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False)
[TED] open_source_open_world: Downloading playlist webpage
...
>>> playlist['title']
'Open-source, open world'
You can access the videos in the playlist with the ``entries`` field:
.. code-block:: python
>>> for video in playlist['entries']:
... print('Video #%d: %s' % (video['playlist_index'], video['title']))
Video #1: How Arduino is open-sourcing imagination
Video #2: The year open data went worldwide
Video #3: Massive-scale online collaboration
Video #4: The art of asking
Video #5: How cognitive surplus will change the world
Video #6: The birth of Wikipedia
Video #7: Coding a better government
Video #8: The era of open innovation
Video #9: The currency of the new economy is trust

View File

@@ -9,7 +9,10 @@ import sys
import youtube_dl.extractor
from youtube_dl import YoutubeDL
from youtube_dl.utils import preferredencoding
from youtube_dl.utils import (
compat_str,
preferredencoding,
)
def get_params(override=None):
@@ -83,3 +86,45 @@ def gettestcases():
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
def expect_info_dict(self, expected_dict, got_dict):
for info_field, expected in expected_dict.items():
if isinstance(expected, compat_str) and expected.startswith('re:'):
got = got_dict.get(info_field)
match_str = expected[len('re:'):]
match_rex = re.compile(match_str)
self.assertTrue(
isinstance(got, compat_str) and match_rex.match(got),
u'field %s (value: %r) should match %r' % (info_field, got, match_str))
elif isinstance(expected, type):
got = got_dict.get(info_field)
self.assertTrue(isinstance(got, expected),
u'Expected type %r, but got value %r of type %r' % (expected, got, type(got)))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(got_dict.get(info_field))
else:
got = got_dict.get(info_field)
self.assertEqual(expected, got,
u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# Check for the presence of mandatory fields
for key in ('id', 'url', 'title', 'ext'):
self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
# Check for mandatory fields that are automatically set by YoutubeDL
for key in ['webpage_url', 'extractor', 'extractor_key']:
self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
# Are checkable fields missing from the test case definition?
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
for key, value in got_dict.items()
if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
if missing_keys:
sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
self.assertFalse(
missing_keys,
'Missing keys in test definition: %s' % (
', '.join(sorted(missing_keys))))

View File

@@ -141,6 +141,7 @@ class TestAllURLsMatching(unittest.TestCase):
def test_pbs(self):
# https://github.com/rg3/youtube-dl/issues/2350
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
if __name__ == '__main__':
unittest.main()

View File

@@ -9,16 +9,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
get_params,
gettestcases,
try_rm,
expect_info_dict,
md5,
report_warning
try_rm,
report_warning,
)
import hashlib
import io
import json
import re
import socket
import youtube_dl.YoutubeDL
@@ -135,40 +135,8 @@ def generator(test_case):
self.assertEqual(md5_for_file, tc['md5'])
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, expected) in tc.get('info_dict', {}).items():
if isinstance(expected, compat_str) and expected.startswith('re:'):
got = info_dict.get(info_field)
match_str = expected[len('re:'):]
match_rex = re.compile(match_str)
self.assertTrue(
isinstance(got, compat_str) and match_rex.match(got),
u'field %s (value: %r) should match %r' % (info_field, got, match_str))
elif isinstance(expected, type):
got = info_dict.get(info_field)
self.assertTrue(isinstance(got, expected),
u'Expected type %r, but got value %r of type %r' % (expected, got, type(got)))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(info_dict.get(info_field))
else:
got = info_dict.get(info_field)
self.assertEqual(expected, got,
u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# Check for the presence of mandatory fields
for key in ('id', 'url', 'title', 'ext'):
self.assertTrue(key in info_dict.keys() and info_dict[key])
# Check for mandatory fields that are automatically set by YoutubeDL
for key in ['webpage_url', 'extractor', 'extractor_key']:
self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
# If checkable fields are missing from the test case, print the info_dict
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
for key, value in info_dict.items()
if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
expect_info_dict(self, tc.get('info_dict', {}), info_dict)
finally:
try_rm_tcs_files()

View File

@@ -9,8 +9,10 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL
from test.helper import (
expect_info_dict,
FakeYDL,
)
from youtube_dl.extractor import (
AcademicEarthCourseIE,
@@ -37,6 +39,9 @@ from youtube_dl.extractor import (
GoogleSearchIE,
GenericIE,
TEDIE,
ToypicsUserIE,
XTubeUserIE,
InstagramUserIE,
)
@@ -269,5 +274,46 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Who are the hackers?')
self.assertTrue(len(result['entries']) >= 6)
def test_toypics_user(self):
dl = FakeYDL()
ie = ToypicsUserIE(dl)
result = ie.extract('http://videos.toypics.net/Mikey')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'Mikey')
self.assertTrue(len(result['entries']) >= 17)
def test_xtube_user(self):
dl = FakeYDL()
ie = XTubeUserIE(dl)
result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'greenshowers')
self.assertTrue(len(result['entries']) >= 155)
def test_InstagramUser(self):
dl = FakeYDL()
ie = InstagramUserIE(dl)
result = ie.extract('http://instagram.com/porsche')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'porsche')
self.assertTrue(len(result['entries']) >= 2)
test_video = next(
e for e in result['entries']
if e['id'] == '614605558512799803_462752227')
dl.add_default_extra_info(test_video, ie, '(irrelevant URL)')
dl.process_video_result(test_video, download=False)
EXPECTED = {
'id': '614605558512799803_462752227',
'ext': 'mp4',
'title': '#Porsche Intelligent Performance.',
'thumbnail': 're:^https?://.*\.jpg',
'uploader': 'Porsche',
'uploader_id': 'porsche',
'timestamp': 1387486713,
'upload_date': '20131219',
}
expect_info_dict(self, EXPECTED, test_video)
if __name__ == '__main__':
unittest.main()

View File

@@ -35,6 +35,7 @@ from youtube_dl.utils import (
url_basename,
urlencode_postdata,
xpath_with_ns,
parse_iso8601,
)
if sys.version_info < (3, 0):
@@ -266,5 +267,10 @@ class TestUtil(unittest.TestCase):
data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
self.assertTrue(isinstance(data, bytes))
def test_parse_iso8601(self):
self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
if __name__ == '__main__':
unittest.main()

View File

@@ -1,4 +0,0 @@
# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
from .extractor.common import InfoExtractor, SearchInfoExtractor
from .extractor import gen_extractors, get_info_extractor

View File

@@ -148,6 +148,8 @@ class YoutubeDL(object):
again.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
proxy: URL of the proxy server to use
socket_timeout: Time to wait for unresponsive hosts, in seconds
bidi_workaround: Work around buggy terminals without bidirectional text
@@ -510,13 +512,7 @@ class YoutubeDL(object):
'_type': 'compat_list',
'entries': ie_result,
}
self.add_extra_info(ie_result,
{
'extractor': ie.IE_NAME,
'webpage_url': url,
'webpage_url_basename': url_basename(url),
'extractor_key': ie.ie_key(),
})
self.add_default_extra_info(ie_result, ie, url)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
@@ -533,7 +529,15 @@ class YoutubeDL(object):
else:
raise
else:
self.report_error('no suitable InfoExtractor: %s' % url)
self.report_error('no suitable InfoExtractor for URL %s' % url)
def add_default_extra_info(self, ie_result, ie, url):
self.add_extra_info(ie_result, {
'extractor': ie.IE_NAME,
'webpage_url': url,
'webpage_url_basename': url_basename(url),
'extractor_key': ie.ie_key(),
})
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""

View File

@@ -237,6 +237,9 @@ def parseOpts(overrideArguments=None):
'--proxy', dest='proxy', default=None, metavar='URL',
help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option(
'--prefer-insecure', action='store_true', dest='prefer_insecure',
help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
@@ -257,7 +260,6 @@ def parseOpts(overrideArguments=None):
action='store_true',
help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
selection.add_option(
'--playlist-start',
dest='playliststart', metavar='NUMBER', default=1, type=int,
@@ -756,6 +758,7 @@ def _real_main(argv=None):
'download_archive': download_archive_fn,
'cookiefile': opts.cookiefile,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
'bidi_workaround': opts.bidi_workaround,

View File

@@ -2,6 +2,7 @@ from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
from .aol import AolIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
@@ -10,6 +11,7 @@ from .arte import (
ArteTvIE,
ArteTVPlus7IE,
ArteTVCreativeIE,
ArteTVConcertIE,
ArteTVFutureIE,
ArteTVDDCIE,
)
@@ -63,6 +65,7 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .eitb import EitbIE
from .elpais import ElPaisIE
from .engadget import EngadgetIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
@@ -71,6 +74,7 @@ from .facebook import FacebookIE
from .faz import FazIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
@@ -108,7 +112,7 @@ from .imdb import (
)
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
from .ivi import (
@@ -173,6 +177,7 @@ from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .parliamentliveuk import ParliamentLiveUKIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .playvid import PlayvidIE
@@ -190,6 +195,7 @@ from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .rts import RTSIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
@@ -234,6 +240,7 @@ from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
@@ -258,6 +265,7 @@ from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import (
@@ -278,10 +286,11 @@ from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
from .xbef import XBefIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .xtube import XTubeIE
from .xtube import XTubeUserIE, XTubeIE
from .yahoo import (
YahooIE,
YahooNewsIE,

View File

@@ -0,0 +1,28 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .fivemin import FiveMinIE
class AolIE(InfoExtractor):
IE_NAME = 'on.aol.com'
_VALID_URL = r'http://on\.aol\.com/video/.*-(?P<id>\d+)($|\?)'
_TEST = {
'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
'md5': '18ef68f48740e86ae94b98da815eec42',
'info_dict': {
'id': '518167793',
'ext': 'mp4',
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
},
'add_ie': ['FiveMin'],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
self.to_screen('Downloading 5min.com video %s' % video_id)
return FiveMinIE._build_result(video_id)

View File

@@ -131,7 +131,7 @@ class ArteTvIE(InfoExtractor):
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
@classmethod
def _extract_url_info(cls, url):
@@ -202,6 +202,8 @@ class ArteTVPlus7IE(InfoExtractor):
re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
# The version with sourds/mal subtitles has also lower relevance
re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
# Prefer http downloads over m3u8
0 if f['url'].endswith('m3u8') else 1,
)
formats = sorted(formats, key=sort_key)
def _format(format_info):
@@ -242,8 +244,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
'file': '050489-002.mp4',
'info_dict': {
'id': '050489-002',
'ext': 'mp4',
'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
},
}
@@ -255,8 +258,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
'file': '050940-003.mp4',
'info_dict': {
'id': '050940-003',
'ext': 'mp4',
'title': 'Les champignons au secours de la planète',
},
}
@@ -270,7 +274,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
class ArteTVDDCIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
_VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
@@ -284,3 +288,20 @@ class ArteTVDDCIE(ArteTVPlus7IE):
javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
return self._extract_from_json_url(json_url, video_id, lang)
class ArteTVConcertIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:concert'
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
_TEST = {
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
'md5': '9ea035b7bd69696b67aa2ccaaa218161',
'info_dict': {
'id': '186',
'ext': 'mp4',
'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"',
'upload_date': '20140128',
'description': 'md5:486eb08f991552ade77439fe6d82c305',
},
}

View File

@@ -74,7 +74,7 @@ class InfoExtractor(object):
"http", "https", "rtsp", "rtmp", "m3u8" or so.
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field.
by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
* quality Order number of the video quality of this

View File

@@ -10,9 +10,9 @@ from ..utils import (
class CSpanIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
_VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
IE_DESC = 'C-SPAN'
_TEST = {
_TESTS = [{
'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
'md5': '8e44ce11f0f725527daccc453f553eb0',
'info_dict': {
@@ -22,13 +22,24 @@ class CSpanIE(InfoExtractor):
'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
},
'skip': 'Regularly fails on travis, for unknown reasons',
}
}, {
'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
# For whatever reason, the served video alternates between
# two different ones
#'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
'info_dict': {
'id': '340723',
'ext': 'mp4',
'title': 'International Health Care Models',
'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
}
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_id = mobj.group('id')
webpage = self._download_webpage(url, page_id)
video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id')
video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
description = self._html_search_regex(
[

View File

@@ -1,25 +1,28 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
determine_ext,
)
class DaumIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
IE_NAME = u'daum.net'
IE_NAME = 'daum.net'
_TEST = {
u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
u'file': u'52554690.mp4',
u'info_dict': {
u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
u'upload_date': u'20130831',
u'duration': 3868,
'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
'info_dict': {
'id': '52554690',
'ext': 'mp4',
'title': 'DOTA 2GETHER 시즌2 6회 - 2부',
'description': 'DOTA 2GETHER 시즌2 6회 - 2부',
'upload_date': '20130831',
'duration': 3868,
},
}
@@ -30,14 +33,14 @@ class DaumIE(InfoExtractor):
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
webpage, u'full id')
webpage, 'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
u'Downloading video info')
'Downloading video info')
urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
video_id, u'Downloading video formats info')
video_id, 'Downloading video formats info')
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
@@ -53,7 +56,6 @@ class DaumIE(InfoExtractor):
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
'ext': determine_ext(format_url),
'format_id': profile,
})

View File

@@ -0,0 +1,43 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .fivemin import FiveMinIE
from ..utils import (
url_basename,
)
class EngadgetIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://www.engadget.com/
(?:video/5min/(?P<id>\d+)|
[\d/]+/.*?)
'''
_TEST = {
'url': 'http://www.engadget.com/video/5min/518153925/',
'md5': 'c6820d4828a5064447a4d9fc73f312c9',
'info_dict': {
'id': '518153925',
'ext': 'mp4',
'title': 'Samsung Galaxy Tab Pro 8.4 Review',
},
'add_ie': ['FiveMin'],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
if video_id is not None:
return FiveMinIE._build_result(video_id)
else:
title = url_basename(url)
webpage = self._download_webpage(url, title)
ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
return {
'_type': 'playlist',
'title': title,
'entries': [FiveMinIE._build_result(id) for id in ids]
}

View File

@@ -0,0 +1,56 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_str,
)
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
(?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
5min:)
(?P<id>\d+)
'''
_TEST = {
# From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/
'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791',
'md5': '4f7b0b79bf1a470e5004f7112385941d',
'info_dict': {
'id': '518013791',
'ext': 'mp4',
'title': 'iPad Mini with Retina Display Review',
},
}
@classmethod
def _build_result(cls, video_id):
return cls.url_result('5min:%s' % video_id, cls.ie_key())
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info = self._download_json(
'https://syn.5min.com/handlers/SenseHandler.ashx?func=GetResults&'
'playlist=%s&url=https' % video_id,
video_id)['binding'][0]
second_id = compat_str(int(video_id[:-2]) + 1)
formats = []
for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
if any(r['ID'] == quality for r in info['Renditions']):
formats.append({
'format_id': compat_str(quality),
'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
'height': height,
})
return {
'id': video_id,
'title': info['Title'],
'formats': formats,
}

View File

@@ -102,6 +102,20 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get
},
},
# second style of embedded ooyala videos
{
'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html',
'info_dict': {
'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk',
'ext': 'mp4',
'title': 'Behind-the-scenes: Financial Review Sunday ',
'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.',
},
'params': {
# m3u8 download
'skip_download': True,
},
},
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -159,7 +173,30 @@ class GenericIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
}
},
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
'info_dict': {
'id': '981',
'ext': 'mp4',
'title': 'My web playroom',
'uploader': 'Ze Frank',
'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
}
},
# nowvideo embed hidden behind percent encoding
{
'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
'md5': '2baf4ddd70f697d94b1c18cf796d5107',
'info_dict': {
'id': '06e53103ca9aa',
'ext': 'flv',
'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
'description': 'No description',
},
},
]
def report_download_webpage(self, video_id):
@@ -311,6 +348,11 @@ class GenericIE(InfoExtractor):
except compat_xml_parse_error:
pass
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
webpage = compat_urllib_parse.unquote(webpage)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
@@ -412,9 +454,10 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
return OoyalaIE._build_url_result(mobj.group(1))
return OoyalaIE._build_url_result(mobj.group('ec'))
# Look for Aparat videos
mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@@ -476,6 +519,12 @@ class GenericIE(InfoExtractor):
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
@@ -487,6 +536,7 @@ class GenericIE(InfoExtractor):
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)

View File

@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
)
class InstagramIE(InfoExtractor):
@@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor):
'uploader_id': uploader_id,
'description': desc,
}
class InstagramUserIE(InfoExtractor):
_VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('username')
entries = []
page_count = 0
media_url = 'http://instagram.com/%s/media' % uploader_id
while True:
page = self._download_json(
media_url, uploader_id,
note='Downloading page %d ' % (page_count + 1),
)
page_count += 1
for it in page['items']:
if it.get('type') != 'video':
continue
like_count = int_or_none(it.get('likes', {}).get('count'))
user = it.get('user', {})
formats = [{
'format_id': k,
'height': v.get('height'),
'width': v.get('width'),
'url': v['url'],
} for k, v in it['videos'].items()]
self._sort_formats(formats)
thumbnails_el = it.get('images', {})
thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
title = it.get('caption', {}).get('text', it['id'])
entries.append({
'id': it['id'],
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'webpage_url': it.get('link'),
'uploader': user.get('full_name'),
'uploader_id': user.get('username'),
'like_count': like_count,
'timestamp': int_or_none(it.get('created_time')),
})
if not page['items']:
break
max_id = page['items'][-1]['id']
media_url = (
'http://instagram.com/%s/media?max_id=%s' % (
uploader_id, max_id))
return {
'_type': 'playlist',
'entries': entries,
'id': uploader_id,
'title': uploader_id,
}

View File

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import int_or_none
class KontrTubeIE(InfoExtractor):
@@ -32,27 +33,26 @@ class KontrTubeIE(InfoExtractor):
video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
'video title')
title = self._html_search_regex(
r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title')
description = self._html_search_meta('description', webpage, 'video description')
mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
webpage)
mobj = re.search(
r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
'view count', fatal=False)
view_count = int(view_count) if view_count is not None else None
view_count = self._html_search_regex(
r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
comment_count = None
comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
fatal=False)
comment_str = self._html_search_regex(
r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
if comment_str.startswith('комментариев нет'):
comment_count = 0
else:
mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
if mobj:
comment_count = int(mobj.group('total'))
comment_count = mobj.group('total')
return {
'id': video_id,
@@ -61,6 +61,6 @@ class KontrTubeIE(InfoExtractor):
'title': title,
'description': description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
}

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -9,104 +11,103 @@ from ..utils import (
ExtractorError,
)
class MetacafeIE(InfoExtractor):
"""Information Extractor for metacafe.com."""
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
class MetacafeIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
IE_NAME = 'metacafe'
_TESTS = [
# Youtube video
{
u"add_ie": ["Youtube"],
u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
u"file": u"_aUehQsCQtM.mp4",
u"info_dict": {
u"upload_date": u"20090102",
u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!",
u"description": u"md5:2439a8ef6d5a70e380c22f5ad323e5a8",
u"uploader": u"PBS",
u"uploader_id": u"PBS"
}
},
# Normal metacafe video
{
u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
u'info_dict': {
u'id': u'11121940',
u'ext': u'mp4',
u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
u'uploader': u'ign',
u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
# Youtube video
{
'add_ie': ['Youtube'],
'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
'info_dict': {
'id': '_aUehQsCQtM',
'ext': 'mp4',
'upload_date': '20090102',
'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
'uploader': 'PBS',
'uploader_id': 'PBS'
}
},
},
# AnyClip video
{
u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
u"file": u"an-dVVXnuY7Jh77J.mp4",
u"info_dict": {
u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
u"uploader": u"anyclip",
u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
# Normal metacafe video
{
'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
'info_dict': {
'id': '11121940',
'ext': 'mp4',
'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
'uploader': 'ign',
'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
},
},
},
# age-restricted video
{
u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
u'info_dict': {
u'id': u'5186653',
u'ext': u'mp4',
u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
u'uploader': u'Dwayne Pipe',
u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
u'age_limit': 18,
# AnyClip video
{
'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
'info_dict': {
'id': 'an-dVVXnuY7Jh77J',
'ext': 'mp4',
'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
'uploader': 'anyclip',
'description': 'md5:38c711dd98f5bb87acf973d573442e67',
},
},
},
# cbs video
{
u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
u'info_dict': {
u'id': u'0rOxMBabDXN6',
u'ext': u'flv',
u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
u'duration': 129,
# age-restricted video
{
'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
'info_dict': {
'id': '5186653',
'ext': 'mp4',
'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
'uploader': 'Dwayne Pipe',
'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
'age_limit': 18,
},
},
u'params': {
# rtmp download
u'skip_download': True,
# cbs video
{
'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
'info_dict': {
'id': '8VD4r_Zws8VP',
'ext': 'flv',
'title': 'Open: This is Face the Nation, February 9',
'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
'duration': 96,
},
'params': {
# rtmp download
'skip_download': True,
},
},
},
]
def report_disclaimer(self):
"""Report disclaimer retrieval."""
self.to_screen(u'Retrieving disclaimer')
self.to_screen('Retrieving disclaimer')
def _real_initialize(self):
# Retrieve disclaimer
self.report_disclaimer()
self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
# Confirm age
disclaimer_form = {
'filters': '0',
'submit': "Continue - I'm over 18",
}
}
request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.report_age_confirmation()
self._download_webpage(request, None, False, u'Unable to confirm age')
self._download_webpage(request, None, False, 'Unable to confirm age')
def _real_extract(self, url):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group(1)
@@ -153,23 +154,24 @@ class MetacafeIE(InfoExtractor):
else:
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
raise ExtractorError('Unable to extract media URL')
vardict = compat_parse_qs(mobj.group(1))
if 'mediaData' not in vardict:
raise ExtractorError(u'Unable to extract media URL')
mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
raise ExtractorError('Unable to extract media URL')
mobj = re.search(
r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
raise ExtractorError('Unable to extract media URL')
mediaURL = mobj.group('mediaURL').replace('\\/', '/')
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
video_ext = determine_ext(video_url)
video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title')
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, u'uploader nickname', fatal=False)
webpage, 'uploader nickname', fatal=False)
if re.search(r'"contentRating":"restricted"', webpage) is not None:
age_limit = 18
@@ -177,14 +179,12 @@ class MetacafeIE(InfoExtractor):
age_limit = 0
return {
'_type': 'video',
'id': video_id,
'url': video_url,
'id': video_id,
'url': video_url,
'description': description,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
'title': video_title,
'thumbnail':thumbnail,
'ext': video_ext,
'ext': video_ext,
'age_limit': age_limit,
}

View File

@@ -1,6 +1,5 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
@@ -12,8 +11,9 @@ class NineGagIE(InfoExtractor):
_TEST = {
"url": "http://9gag.tv/v/1912",
"file": "1912.mp4",
"info_dict": {
"id": "1912",
"ext": "mp4",
"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
"title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
"view_count": int,

View File

@@ -1,20 +1,23 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import unescapeHTML
class OoyalaIE(InfoExtractor):
_VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
_TEST = {
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4',
u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c',
u'info_dict': {
u'title': u'Explaining Data Recovery from Hard Drives and SSDs',
u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
'info_dict': {
'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
'ext': 'mp4',
'title': 'Explaining Data Recovery from Hard Drives and SSDs',
'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
},
}
@@ -28,13 +31,14 @@ class OoyalaIE(InfoExtractor):
ie=cls.ie_key())
def _extract_result(self, info, more_info):
return {'id': info['embedCode'],
'ext': 'mp4',
'title': unescapeHTML(info['title']),
'url': info.get('ipad_url') or info['url'],
'description': unescapeHTML(more_info['description']),
'thumbnail': more_info['promo'],
}
return {
'id': info['embedCode'],
'ext': 'mp4',
'title': unescapeHTML(info['title']),
'url': info.get('ipad_url') or info['url'],
'description': unescapeHTML(more_info['description']),
'thumbnail': more_info['promo'],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,22 +46,23 @@ class OoyalaIE(InfoExtractor):
player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
player = self._download_webpage(player_url, embedCode)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
player, u'mobile player url')
player, 'mobile player url')
mobile_player = self._download_webpage(mobile_url, embedCode)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
mobile_player, u'info').replace('\\"','"')
videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"')
mobile_player, 'info').replace('\\"','"')
videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"')
videos_info = json.loads(videos_info)
videos_more_info =json.loads(videos_more_info)
if videos_more_info.get('lineup'):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
return {'_type': 'playlist',
'id': embedCode,
'title': unescapeHTML(videos_more_info['title']),
'entries': videos,
}
return {
'_type': 'playlist',
'id': embedCode,
'title': unescapeHTML(videos_more_info['title']),
'entries': videos,
}
else:
return self._extract_result(videos_info[0], videos_more_info)

View File

@@ -0,0 +1,53 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class ParliamentLiveUKIE(InfoExtractor):
IE_NAME = 'parliamentlive.tv'
IE_DESC = 'UK parliament videos'
_VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia',
'info_dict': {
'id': '15121',
'ext': 'asf',
'title': 'hoc home affairs committee, 18 mar 2014.pm',
'description': 'md5:033b3acdf83304cd43946b2d5e5798d1',
},
'params': {
'skip_download': True, # Requires mplayer (mms)
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
asx_url = self._html_search_regex(
r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage,
'metadata URL')
asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata')
video_url = asx.find('.//REF').attrib['HREF']
title = self._search_regex(
r'''(?x)player\.setClipDetails\(
(?:(?:[0-9]+|"[^"]+"),\s*){2}
"([^"]+",\s*"[^"]+)"
''',
webpage, 'title').replace('", "', ', ')
description = self._html_search_regex(
r'(?s)<span id="MainContentPlaceHolder_CaptionsBlock_WitnessInfo">(.*?)</span>',
webpage, 'description')
return {
'id': video_id,
'ext': 'asf',
'url': video_url,
'title': title,
'description': description,
}

View File

@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
US_RATINGS,
)
class PBSIE(InfoExtractor):
@@ -13,7 +16,7 @@ class PBSIE(InfoExtractor):
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player
video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
'''
@@ -57,6 +60,11 @@ class PBSIE(InfoExtractor):
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
rating_str = info.get('rating')
if rating_str is not None:
rating_str = rating_str.rpartition('-')[2]
age_limit = US_RATINGS.get(rating_str)
return {
'id': video_id,
'title': info['title'],
@@ -65,4 +73,5 @@ class PBSIE(InfoExtractor):
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
'duration': info.get('duration'),
'age_limit': age_limit,
}

View File

@@ -8,6 +8,7 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
str_to_int,
)
from ..aes import (
aes_decrypt_text
@@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor):
}
}
def _extract_count(self, pattern, webpage, name):
count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
if count:
count = str_to_int(count)
return count
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
@@ -37,11 +44,19 @@ class PornHubIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False)
video_uploader = self._html_search_regex(
r'(?s)<div class="video-info-row">\s*From:&nbsp;.+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = compat_urllib_parse.unquote(thumbnail)
view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
comment_count = self._extract_count(
r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
@@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor):
'uploader': video_uploader,
'title': video_title,
'thumbnail': thumbnail,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
}

View File

@@ -1,4 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -6,16 +8,17 @@ from .common import InfoExtractor
class RadioFranceIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
IE_NAME = u'radiofrance'
IE_NAME = 'radiofrance'
_TEST = {
u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
u'file': u'one-one.ogg',
u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
u'info_dict': {
u"title": u"One to one",
u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
u"uploader": u"Thomas Hercouët",
'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
'info_dict': {
'id': 'one-one',
'ext': 'ogg',
"title": "One to one",
"description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
"uploader": "Thomas Hercouët",
},
}
@@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor):
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title')
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
webpage, u'description', fatal=False)
webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
webpage, u'uploader', fatal=False)
webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
webpage, u'audio URLs')
webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
'preference': i,
}
for fm in
re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
for i, fm in
enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
# No sorting, we don't know any more about these formats
self._sort_formats(formats)
return {
'id': video_id,

View File

@@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
unescapeHTML,
)
class RTSIE(InfoExtractor):
IE_DESC = 'RTS.ch'
_VALID_URL = r'^https?://(?:www\.)?rts\.ch/archives/tv/[^/]+/(?P<id>[0-9]+)-.*?\.html'
_TEST = {
'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
'md5': '753b877968ad8afaeddccc374d4256a5',
'info_dict': {
'id': '3449373',
'ext': 'mp4',
'duration': 1488,
'title': 'Les Enfants Terribles',
'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
'uploader': 'Divers',
'upload_date': '19680921',
'timestamp': -40280400,
},
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
all_info = self._download_json(
'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id)
info = all_info['video']['JSONinfo']
upload_timestamp = parse_iso8601(info.get('broadcast_date'))
duration = parse_duration(info.get('duration'))
thumbnail = unescapeHTML(info.get('preview_image_url'))
formats = [{
'format_id': fid,
'url': furl,
'tbr': int_or_none(self._search_regex(
r'-([0-9]+)k\.', furl, 'bitrate', default=None)),
} for fid, furl in info['streams'].items()]
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'title': info['title'],
'description': info.get('intro'),
'duration': duration,
'uploader': info.get('programName'),
'timestamp': upload_timestamp,
}

View File

@@ -100,7 +100,7 @@ class SoundcloudIE(InfoExtractor):
def report_resolve(self, video_id):
"""Report information extraction."""
self.to_screen(u'%s: Resolving id' % video_id)
self.to_screen('%s: Resolving id' % video_id)
@classmethod
def _resolv_url(cls, url):
@@ -124,45 +124,46 @@ class SoundcloudIE(InfoExtractor):
'description': info['description'],
'thumbnail': thumbnail,
}
formats = []
if info.get('downloadable', False):
# We can build a direct link to the song
format_url = (
'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
track_id, self._CLIENT_ID))
result['formats'] = [{
formats.append({
'format_id': 'download',
'ext': info.get('original_format', 'mp3'),
'url': format_url,
'vcodec': 'none',
}]
else:
# We have to retrieve the url
streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
stream_json = self._download_webpage(
streams_url,
track_id, 'Downloading track url')
'preference': 10,
})
formats = []
format_dict = json.loads(stream_json)
for key, stream_url in format_dict.items():
if key.startswith(u'http'):
formats.append({
'format_id': key,
'ext': ext,
'url': stream_url,
'vcodec': 'none',
})
elif key.startswith(u'rtmp'):
# The url doesn't have an rtmp app, we have to extract the playpath
url, path = stream_url.split('mp3:', 1)
formats.append({
'format_id': key,
'url': url,
'play_path': 'mp3:' + path,
'ext': ext,
'vcodec': 'none',
})
# We have to retrieve the url
streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
stream_json = self._download_webpage(
streams_url,
track_id, 'Downloading track url')
format_dict = json.loads(stream_json)
for key, stream_url in format_dict.items():
if key.startswith('http'):
formats.append({
'format_id': key,
'ext': ext,
'url': stream_url,
'vcodec': 'none',
})
elif key.startswith('rtmp'):
# The url doesn't have an rtmp app, we have to extract the playpath
url, path = stream_url.split('mp3:', 1)
formats.append({
'format_id': key,
'url': url,
'play_path': 'mp3:' + path,
'ext': ext,
'vcodec': 'none',
})
if not formats:
# We fallback to the stream_url in the original info, this
@@ -188,7 +189,7 @@ class SoundcloudIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
raise ExtractorError('Invalid URL: %s' % url)
track_id = mobj.group('track_id')
token = None
@@ -226,7 +227,7 @@ class SoundcloudSetIE(SoundcloudIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
raise ExtractorError('Invalid URL: %s' % url)
# extract uploader (which is in the url)
uploader = mobj.group(1)
@@ -243,7 +244,7 @@ class SoundcloudSetIE(SoundcloudIE):
info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
return
self.report_extraction(full_title)

View File

@@ -11,7 +11,9 @@ from ..utils import (
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL = r'''(?x)http://www\.ted\.com/
_VALID_URL = r'''(?x)
(?P<proto>https?://)
(?P<type>www|embed)(?P<urlmain>\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
@@ -19,6 +21,7 @@ class TEDIE(SubtitlesInfoExtractor):
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
.*)$
'''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
@@ -48,6 +51,9 @@ class TEDIE(SubtitlesInfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
if m.group('type') == 'embed':
desktop_url = m.group('proto') + 'www' + m.group('urlmain')
return self.url_result(desktop_url, 'TED')
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url, name)

View File

@@ -0,0 +1,75 @@
from .common import InfoExtractor
import re
class ToypicsIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
_VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
_TEST = {
'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
'md5': '16e806ad6d6f58079d210fe30985e08b',
'info_dict': {
'id': '514',
'ext': 'mp4',
'title': 'Chance-Bulge\'d, 2',
'age_limit': 18,
'uploader': 'kidsune',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
title = self._html_search_regex(
r'<title>Toypics - ([^<]+)</title>', page, 'title')
username = self._html_search_regex(
r'toypics.net/([^/"]+)" class="user-name">', page, 'username')
return {
'id': video_id,
'url': video_url,
'title': title,
'uploader': username,
'age_limit': 18,
}
class ToypicsUserIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
_VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
username = mobj.group('username')
profile_page = self._download_webpage(
url, username, note='Retrieving profile page')
video_count = int(self._search_regex(
r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
'video count'))
PAGE_SIZE = 8
urls = []
page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
for n in range(1, page_count + 1):
lpage_url = url + '/public/%d' % n
lpage = self._download_webpage(
lpage_url, username,
note='Downloading page %d/%d' % (n, page_count))
urls.extend(
re.findall(
r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
lpage))
return {
'_type': 'playlist',
'id': username,
'entries': [{
'_type': 'url',
'url': eurl,
'ie_key': 'Toypics',
} for eurl in urls]
}

View File

@@ -0,0 +1,70 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
int_or_none,
parse_duration,
unified_strdate,
)
class VideoLecturesNetIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
IE_NAME = 'videolectures.net'
_TEST = {
'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
'info_dict': {
'id': 'promogram_igor_mekjavic_eng',
'ext': 'mp4',
'title': 'Automatics, robotics and biocybernetics',
'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
'upload_date': '20130627',
'duration': 565,
'thumbnail': 're:http://.*\.jpg',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
smil = self._download_xml(smil_url, video_id)
title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
description = (
None if description_el is None
else description_el.attrib['content'])
upload_date = unified_strdate(
find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
switch = smil.find('.//switch')
duration = parse_duration(switch.attrib.get('dur'))
thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
thumbnail = (
None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
formats = [{
'url': v.attrib['src'],
'width': int_or_none(v.attrib.get('width')),
'height': int_or_none(v.attrib.get('height')),
'filesize': int_or_none(v.attrib.get('size')),
'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
'ext': v.attrib.get('ext'),
} for v in switch.findall('./video')
if v.attrib.get('proto') == 'http']
return {
'id': video_id,
'title': title,
'description': description,
'upload_date': upload_date,
'duration': duration,
'thumbnail': thumbnail,
'formats': formats,
}

View File

@@ -1,29 +1,33 @@
from __future__ import unicode_literals
import re
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
US_RATINGS,
)
from .subtitles import SubtitlesInfoExtractor
class VikiIE(SubtitlesInfoExtractor):
IE_NAME = u'viki'
IE_NAME = 'viki'
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
u'file': u'1023585v.mp4',
u'md5': u'a21454021c2646f5433514177e2caa5f',
u'info_dict': {
u'title': u'Heirs Episode 14',
u'uploader': u'SBS',
u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
u'upload_date': u'20131121',
u'age_limit': 13,
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'md5': 'a21454021c2646f5433514177e2caa5f',
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
'title': 'Heirs Episode 14',
'uploader': 'SBS',
'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
'upload_date': '20131121',
'age_limit': 13,
},
u'skip': u'Blocked in the US',
'skip': 'Blocked in the US',
}
def _real_extract(self, url):
@@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor):
rating_str = self._html_search_regex(
r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
u'rating information', default='').strip()
RATINGS = {
'G': 0,
'PG': 10,
'PG-13': 13,
'R': 16,
'NC': 18,
}
age_limit = RATINGS.get(rating_str)
'rating information', default='').strip()
age_limit = US_RATINGS.get(rating_str)
info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
info_webpage = self._download_webpage(
info_url, video_id, note=u'Downloading info page')
info_url, video_id, note='Downloading info page')
if re.match(r'\s*<div\s+class="video-error', info_webpage):
raise ExtractorError(
u'Video %s is blocked from your location.' % video_id,
'Video %s is blocked from your location.' % video_id,
expected=True)
video_url = self._html_search_regex(
r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
upload_date_str = self._html_search_regex(
r'"created_at":"([^"]+)"', info_webpage, u'upload date')
r'"created_at":"([^"]+)"', info_webpage, 'upload date')
upload_date = (
unified_strdate(upload_date_str)
if upload_date_str is not None

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@@ -7,14 +9,14 @@ class WorldStarHipHopIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
_TEST = {
"url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
"file": "wshh6a7q1ny0G34ZwuIO.mp4",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
"id": "wshh6a7q1ny0G34ZwuIO",
"ext": "mp4",
"title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
@@ -23,41 +25,32 @@ class WorldStarHipHopIE(InfoExtractor):
m_vevo_id = re.search(r'videoId=(.*?)&amp?',
webpage_src)
if m_vevo_id is not None:
self.to_screen(u'Vevo video detected:')
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
webpage_src, u'video URL')
video_url = self._search_regex(
r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
if 'youtube' in video_url:
self.to_screen(u'Youtube video detected:')
return self.url_result(video_url, ie='Youtube')
if 'mp4' in video_url:
ext = 'mp4'
else:
ext = 'flv'
video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
video_title = self._html_search_regex(
r"<title>(.*)</title>", webpage_src, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
webpage_src, u'thumbnail', fatal=False)
thumbnail = self._html_search_regex(
r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
fatal=False)
if not thumbnail:
_title = r"""candytitles.*>(.*)</span>"""
mobj = re.search(_title, webpage_src)
if mobj is not None:
video_title = mobj.group(1)
results = [{
'id': video_id,
'url' : video_url,
'title' : video_title,
'thumbnail' : thumbnail,
'ext' : ext,
}]
return results
return {
'id': video_id,
'url': video_url,
'title': video_title,
'thumbnail': thumbnail,
}

View File

@@ -0,0 +1,50 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
)
class XBefIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
'md5': 'a478b565baff61634a98f5e5338be995',
'info_dict': {
'id': '5119',
'ext': 'mp4',
'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
'age_limit': 18,
'thumbnail': 're:^http://.*\.jpg',
}
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
config_url_enc = self._download_webpage(
'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
note='Retrieving config URL')
config_url = compat_urllib_parse.unquote(config_url_enc)
config = self._download_xml(
config_url, video_id, note='Retrieving config')
video_url = config.find('./file').text
thumbnail = config.find('./image').text
return {
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'age_limit': 18,
}

View File

@@ -1,11 +1,10 @@
from __future__ import unicode_literals
import os
import re
import json
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
parse_duration,
str_to_int,
@@ -42,7 +41,6 @@ class XTubeIE(InfoExtractor):
r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
duration = parse_duration(self._html_search_regex(
r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
view_count = self._html_search_regex(
@@ -54,12 +52,18 @@ class XTubeIE(InfoExtractor):
if comment_count:
comment_count = str_to_int(comment_count)
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
format[0] += 'p'
format[1] += 'k'
format = "-".join(format)
player_quality_option = json.loads(self._html_search_regex(
r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
formats = [
{
'url': furl,
'format_id': format_id,
'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
} for format_id, furl in player_quality_option.items()
]
self._sort_formats(formats)
return {
'id': video_id,
@@ -69,9 +73,42 @@ class XTubeIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'url': video_url,
'ext': extension,
'format': format,
'format_id': format,
'formats': formats,
'age_limit': 18,
}
class XTubeUserIE(InfoExtractor):
IE_DESC = 'XTube user profile'
_VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
username = mobj.group('username')
profile_page = self._download_webpage(
url, username, note='Retrieving profile page')
video_count = int(self._search_regex(
r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page,
'video count'))
PAGE_SIZE = 25
urls = []
page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
for n in range(1, page_count + 1):
lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
lpage = self._download_webpage(
lpage_url, username,
note='Downloading page %d/%d' % (n, page_count))
urls.extend(
re.findall(r'addthis:url="([^"]+)"', lpage))
return {
'_type': 'playlist',
'id': username,
'entries': [{
'_type': 'url',
'url': eurl,
'ie_key': 'XTube',
} for eurl in urls]
}

View File

@@ -1,3 +1,6 @@
from __future__ import unicode_literals
import json
import re
import sys
@@ -17,24 +20,25 @@ from ..aes import (
class YouPornIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
u'file': u'505835.mp4',
u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
u'info_dict': {
u"upload_date": u"20101221",
u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
u"uploader": u"Ask Dan And Jennifer",
u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
u"age_limit": 18,
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
'info_dict': {
'id': '505835',
'ext': 'mp4',
'upload_date': '20101221',
'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
'uploader': 'Ask Dan And Jennifer',
'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
url = 'http://www.' + mobj.group('url')
url = mobj.group('proto') + 'www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
# Get JSON parameters
json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
try:
params = json.loads(json_params)
except:
@@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor):
# Get all of the links from the page
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
webpage, u'download list').strip()
webpage, 'download list').strip()
LINK_RE = r'<a href="([^"]+)">'
links = re.findall(LINK_RE, download_list_html)
@@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor):
resolution = format_parts[0]
height = int(resolution[:-len('p')])
bitrate = int(format_parts[1][:-len('k')])
format = u'-'.join(format_parts) + u'-' + dn
format = '-'.join(format_parts) + '-' + dn
formats.append({
'url': video_url,

View File

@@ -176,32 +176,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# 3d videos
'82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
# Apple HTTP Live Streaming
'92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
'93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
'94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
'95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
'96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
'132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
'151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
# DASH mp4 video
'133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
@@ -215,13 +215,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
'242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
'243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
'244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
'245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
'246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
'247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
'248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
@@ -1130,14 +1130,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
def _real_extract(self, url):
proto = (
u'http' if self._downloader.params.get('prefer_insecure', False)
else u'https')
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -1162,7 +1166,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'asv': 3,
'sts':'1588',
})
video_info_url = 'https://www.youtube.com/get_video_info?' + data
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
@@ -1170,7 +1174,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
age_gate = False
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
@@ -1445,7 +1449,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import calendar
import contextlib
import ctypes
import datetime
@@ -501,13 +502,13 @@ def orderedSet(iterable):
res.append(el)
return res
def unescapeHTML(s):
"""
@param s a string
"""
assert type(s) == type(u'')
result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
def unescapeHTML(s):
if s is None:
return None
assert type(s) == compat_str
result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
return result
@@ -761,8 +762,37 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_response = http_response
def parse_iso8601(date_str):
""" Return a UNIX timestamp from the given date """
if date_str is None:
return None
m = re.search(
r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
date_str)
if not m:
timezone = datetime.timedelta()
else:
date_str = date_str[:-len(m.group(0))]
if not m.group('sign'):
timezone = datetime.timedelta()
else:
sign = 1 if m.group('sign') == '+' else -1
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
return calendar.timegm(dt.timetuple())
def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD"""
if date_str is None:
return None
upload_date = None
#Replace commas
date_str = date_str.replace(',', ' ')
@@ -1122,11 +1152,11 @@ def setproctitle(title):
libc = ctypes.cdll.LoadLibrary("libc.so.6")
except OSError:
return
title = title
buf = ctypes.create_string_buffer(len(title) + 1)
buf.value = title.encode('utf-8')
title_bytes = title.encode('utf-8')
buf = ctypes.create_string_buffer(len(title_bytes))
buf.value = title_bytes
try:
libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
libc.prctl(15, buf, 0, 0, 0)
except AttributeError:
return # Strange libc, just skip this
@@ -1289,3 +1319,12 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':
return getpass.getpass(prompt, *args, **kwargs)
else:
compat_getpass = getpass.getpass
US_RATINGS = {
'G': 0,
'PG': 10,
'PG-13': 13,
'R': 16,
'NC': 18,
}

View File

@@ -1,2 +1,2 @@
__version__ = '2014.03.18'
__version__ = '2014.03.24'