aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST.in1
-rw-r--r--Makefile3
-rw-r--r--README.md71
-rwxr-xr-xdevscripts/release.sh6
-rw-r--r--docs/.gitignore1
-rw-r--r--docs/Makefile177
-rw-r--r--docs/conf.py71
-rw-r--r--docs/index.rst23
-rw-r--r--docs/module_guide.rst67
-rw-r--r--test/helper.py47
-rw-r--r--test/test_YoutubeDL.py74
-rw-r--r--test/test_all_urls.py20
-rw-r--r--test/test_download.py47
-rw-r--r--test/test_playlists.py55
-rw-r--r--test/test_utils.py14
-rw-r--r--youtube_dl/YoutubeDL.py52
-rw-r--r--youtube_dl/__init__.py38
-rw-r--r--youtube_dl/downloader/common.py5
-rw-r--r--youtube_dl/downloader/f4m.py1
-rw-r--r--youtube_dl/downloader/hls.py6
-rw-r--r--youtube_dl/downloader/http.py2
-rw-r--r--youtube_dl/extractor/__init__.py24
-rw-r--r--youtube_dl/extractor/addanime.py31
-rw-r--r--youtube_dl/extractor/appletrailers.py29
-rw-r--r--youtube_dl/extractor/arte.py145
-rw-r--r--youtube_dl/extractor/auengine.py21
-rw-r--r--youtube_dl/extractor/bloomberg.py36
-rw-r--r--youtube_dl/extractor/breakcom.py7
-rw-r--r--youtube_dl/extractor/brightcove.py2
-rw-r--r--youtube_dl/extractor/byutv.py50
-rw-r--r--youtube_dl/extractor/c56.py29
-rw-r--r--youtube_dl/extractor/canalplus.py2
-rw-r--r--youtube_dl/extractor/cinemassacre.py46
-rw-r--r--youtube_dl/extractor/clipfish.py37
-rw-r--r--youtube_dl/extractor/clipsyndicate.py23
-rw-r--r--youtube_dl/extractor/cnet.py75
-rw-r--r--youtube_dl/extractor/comedycentral.py139
-rw-r--r--youtube_dl/extractor/common.py13
-rw-r--r--youtube_dl/extractor/cspan.py30
-rw-r--r--youtube_dl/extractor/dailymotion.py11
-rw-r--r--youtube_dl/extractor/discovery.py5
-rw-r--r--youtube_dl/extractor/ehow.py43
-rw-r--r--youtube_dl/extractor/franceculture.py77
-rw-r--r--youtube_dl/extractor/generic.py75
-rw-r--r--youtube_dl/extractor/huffpost.py3
-rw-r--r--youtube_dl/extractor/ign.py102
-rw-r--r--youtube_dl/extractor/instagram.py68
-rw-r--r--youtube_dl/extractor/keezmovies.py26
-rw-r--r--youtube_dl/extractor/kickstarter.py46
-rw-r--r--youtube_dl/extractor/metacritic.py3
-rw-r--r--youtube_dl/extractor/mooshare.py2
-rw-r--r--youtube_dl/extractor/morningstar.py47
-rw-r--r--youtube_dl/extractor/motorsport.py63
-rw-r--r--youtube_dl/extractor/musicplayon.py75
-rw-r--r--youtube_dl/extractor/nba.py8
-rw-r--r--youtube_dl/extractor/niconico.py81
-rw-r--r--youtube_dl/extractor/ntv.py157
-rw-r--r--youtube_dl/extractor/oe1.py40
-rw-r--r--youtube_dl/extractor/ooyala.py2
-rw-r--r--youtube_dl/extractor/pornhd.py77
-rw-r--r--youtube_dl/extractor/pyvideo.py61
-rw-r--r--youtube_dl/extractor/radiofrance.py34
-rw-r--r--youtube_dl/extractor/roxwel.py52
-rw-r--r--youtube_dl/extractor/rts.py151
-rw-r--r--youtube_dl/extractor/rutube.py28
-rw-r--r--youtube_dl/extractor/slashdot.py24
-rw-r--r--youtube_dl/extractor/smotri.py51
-rw-r--r--youtube_dl/extractor/soundcloud.py69
-rw-r--r--youtube_dl/extractor/ted.py43
-rw-r--r--youtube_dl/extractor/tf1.py40
-rw-r--r--youtube_dl/extractor/urort.py61
-rw-r--r--youtube_dl/extractor/veoh.py119
-rw-r--r--youtube_dl/extractor/vice.py38
-rw-r--r--youtube_dl/extractor/vk.py2
-rw-r--r--youtube_dl/extractor/washingtonpost.py103
-rw-r--r--youtube_dl/extractor/wat.py52
-rw-r--r--youtube_dl/extractor/wdr.py86
-rw-r--r--youtube_dl/extractor/wimp.py27
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py43
-rw-r--r--youtube_dl/extractor/xbef.py50
-rw-r--r--youtube_dl/extractor/xtube.py38
-rw-r--r--youtube_dl/extractor/yahoo.py38
-rw-r--r--youtube_dl/extractor/youtube.py136
-rw-r--r--youtube_dl/jsinterp.py116
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py5
-rw-r--r--youtube_dl/utils.py63
-rw-r--r--youtube_dl/version.py2
87 files changed, 3015 insertions, 1048 deletions
diff --git a/MANIFEST.in b/MANIFEST.in
index 8f8af7a7f..d43cc1f3b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,3 +3,4 @@ include test/*.py
include test/*.json
include youtube-dl.bash-completion
include youtube-dl.1
+recursive-include docs Makefile conf.py *.rst
diff --git a/Makefile b/Makefile
index c6d09932b..f7d917d09 100644
--- a/Makefile
+++ b/Makefile
@@ -72,8 +72,9 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
--exclude '__pycache' \
--exclude '.git' \
--exclude 'testdata' \
+ --exclude 'docs/_build' \
-- \
- bin devscripts test youtube_dl \
+ bin devscripts test youtube_dl docs \
CHANGELOG LICENSE README.md README.txt \
Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \
youtube-dl
diff --git a/README.md b/README.md
index a10b13055..1ba1486d2 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,9 @@ which means you can modify it, redistribute it or use it however you like.
--user-agent UA specify a custom user agent
--referer REF specify a custom referer, use if the video
access is restricted to one domain
+ --add-header FIELD:VALUE specify a custom HTTP header and its value,
+ separated by a colon ':'. You can use this
+ option multiple times
--list-extractors List all supported extractors and the URLs
they would handle
--extractor-descriptions Output descriptions of all supported
@@ -62,6 +65,7 @@ which means you can modify it, redistribute it or use it however you like.
configuration in ~/.config/youtube-dl.conf
(%APPDATA%/youtube-dl/config.txt on
Windows)
+ --encoding ENCODING Force the specified encoding (experimental)
## Video Selection:
--playlist-start NUMBER playlist video to start at (default is 1)
@@ -166,6 +170,7 @@ which means you can modify it, redistribute it or use it however you like.
## Verbosity / Simulation Options:
-q, --quiet activates quiet mode
+ --no-warnings Ignore warnings
-s, --simulate do not download the video and do not write
anything to disk
--skip-download do not download the video
@@ -177,7 +182,9 @@ which means you can modify it, redistribute it or use it however you like.
--get-duration simulate, quiet but print video length
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
- -j, --dump-json simulate, quiet but print JSON information
+ -j, --dump-json simulate, quiet but print JSON information.
+ See --output for a description of available
+ keys.
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
@@ -364,7 +371,67 @@ If you want to create a build of youtube-dl yourself, you'll need
### Adding support for a new site
-If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+
+1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
+2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
+3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
+ # coding: utf-8
+ from __future__ import unicode_literals
+
+ import re
+
+ from .common import InfoExtractor
+
+
+ class YourExtractorIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://yourextractor.com/watch/42',
+ 'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+ 'info_dict': {
+ 'id': '42',
+ 'ext': 'mp4',
+ 'title': 'Video title goes here',
+ # TODO more properties, either as:
+ # * A value
+ # * MD5 checksum; start the string with md5:
+ # * A regular expression; start the string with re:
+ # * Any Python type (for example int or float)
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ # TODO more code goes here, for example ...
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ # TODO more properties (see youtube_dl/extractor/common.py)
+ }
+
+
+5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
+8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
+
+ $ git add youtube_dl/extractor/__init__.py
+ $ git add youtube_dl/extractor/yourextractor.py
+ $ git commit -m '[yourextractor] Add new extractor'
+ $ git push origin yourextractor
+
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
# BUGS
diff --git a/devscripts/release.sh b/devscripts/release.sh
index aa3119c42..2974a7c3e 100755
--- a/devscripts/release.sh
+++ b/devscripts/release.sh
@@ -22,6 +22,12 @@ fi
if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
version="$1"
+major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p')
+if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then
+ echo "$version does not start with today's date!"
+ exit 1
+fi
+
if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi
useless_files=$(find youtube_dl -type f -not -name '*.py')
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 000000000..69fa449dd
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+_build/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..712218045
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..4a04ad779
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#
+# youtube-dl documentation build configuration file, created by
+# sphinx-quickstart on Fri Mar 14 21:05:43 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+# Allows to import youtube_dl
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# -- General configuration ------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'youtube-dl'
+copyright = u'2014, Ricardo Garcia Gonzalez'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+import youtube_dl
+version = youtube_dl.__version__
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'youtube-dldoc'
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 000000000..b746ff95b
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,23 @@
+Welcome to youtube-dl's documentation!
+======================================
+
+*youtube-dl* is a command-line program to download videos from YouTube.com and more sites.
+It can also be used in Python code.
+
+Developer guide
+---------------
+
+This section contains information for using *youtube-dl* from Python programs.
+
+.. toctree::
+ :maxdepth: 2
+
+ module_guide
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/module_guide.rst b/docs/module_guide.rst
new file mode 100644
index 000000000..03d72882e
--- /dev/null
+++ b/docs/module_guide.rst
@@ -0,0 +1,67 @@
+Using the ``youtube_dl`` module
+===============================
+
+When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors:
+
+.. code-block:: python
+
+ >>> from youtube_dl import YoutubeDL
+ >>> ydl = YoutubeDL()
+ >>> ydl.add_default_info_extractors()
+
+Extracting video information
+----------------------------
+
+You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary:
+
+.. code-block:: python
+
+ >>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
+ [youtube] Setting language
+ [youtube] BaW_jenozKc: Downloading webpage
+ [youtube] BaW_jenozKc: Downloading video info webpage
+ [youtube] BaW_jenozKc: Extracting video information
+ >>> info['title']
+ 'youtube-dl test video "\'/\\ä↭𝕐'
+ >>> info['height'], info['width']
+ (720, 1280)
+
+If you want to download or play the video you can get its url:
+
+.. code-block:: python
+
+ >>> info['url']
+ 'https://...'
+
+Extracting playlist information
+-------------------------------
+
+The playlist information is extracted in a similar way, but the dictionary is a bit different:
+
+.. code-block:: python
+
+ >>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False)
+ [TED] open_source_open_world: Downloading playlist webpage
+ ...
+ >>> playlist['title']
+ 'Open-source, open world'
+
+
+
+You can access the videos in the playlist with the ``entries`` field:
+
+.. code-block:: python
+
+ >>> for video in playlist['entries']:
+ ... print('Video #%d: %s' % (video['playlist_index'], video['title']))
+
+ Video #1: How Arduino is open-sourcing imagination
+ Video #2: The year open data went worldwide
+ Video #3: Massive-scale online collaboration
+ Video #4: The art of asking
+ Video #5: How cognitive surplus will change the world
+ Video #6: The birth of Wikipedia
+ Video #7: Coding a better government
+ Video #8: The era of open innovation
+ Video #9: The currency of the new economy is trust
+
diff --git a/test/helper.py b/test/helper.py
index 17de951c5..8739f816c 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -9,7 +9,10 @@ import sys
import youtube_dl.extractor
from youtube_dl import YoutubeDL
-from youtube_dl.utils import preferredencoding
+from youtube_dl.utils import (
+ compat_str,
+ preferredencoding,
+)
def get_params(override=None):
@@ -83,3 +86,45 @@ def gettestcases():
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+def expect_info_dict(self, expected_dict, got_dict):
+ for info_field, expected in expected_dict.items():
+ if isinstance(expected, compat_str) and expected.startswith('re:'):
+ got = got_dict.get(info_field)
+ match_str = expected[len('re:'):]
+ match_rex = re.compile(match_str)
+
+ self.assertTrue(
+ isinstance(got, compat_str) and match_rex.match(got),
+ u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+ elif isinstance(expected, type):
+ got = got_dict.get(info_field)
+ self.assertTrue(isinstance(got, expected),
+ u'Expected type %r, but got value %r of type %r' % (expected, got, type(got)))
+ else:
+ if isinstance(expected, compat_str) and expected.startswith('md5:'):
+ got = 'md5:' + md5(got_dict.get(info_field))
+ else:
+ got = got_dict.get(info_field)
+ self.assertEqual(expected, got,
+ u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+
+ # Check for the presence of mandatory fields
+ for key in ('id', 'url', 'title', 'ext'):
+ self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
+ # Check for mandatory fields that are automatically set by YoutubeDL
+ for key in ['webpage_url', 'extractor', 'extractor_key']:
+ self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
+
+ # Are checkable fields missing from the test case definition?
+ test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
+ for key, value in got_dict.items()
+ if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
+ missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
+ if missing_keys:
+ sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
+ self.assertFalse(
+ missing_keys,
+ 'Missing keys in test definition: %s' % (
+ ', '.join(sorted(missing_keys))))
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 1f3ccaea0..2902dbec7 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -26,16 +26,27 @@ class YDL(FakeYDL):
self.msgs.append(msg)
+def _make_result(formats, **kwargs):
+ res = {
+ 'formats': formats,
+ 'id': 'testid',
+ 'title': 'testttitle',
+ 'extractor': 'testex',
+ }
+ res.update(**kwargs)
+ return res
+
+
class TestFormatSelection(unittest.TestCase):
def test_prefer_free_formats(self):
# Same resolution => download webm
ydl = YDL()
ydl.params['prefer_free_formats'] = True
formats = [
- {'ext': 'webm', 'height': 460},
- {'ext': 'mp4', 'height': 460},
+ {'ext': 'webm', 'height': 460, 'url': 'x'},
+ {'ext': 'mp4', 'height': 460, 'url': 'y'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict)
@@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL()
ydl.params['prefer_free_formats'] = True
formats = [
- {'ext': 'webm', 'height': 720},
- {'ext': 'mp4', 'height': 1080},
+ {'ext': 'webm', 'height': 720, 'url': 'a'},
+ {'ext': 'mp4', 'height': 1080, 'url': 'b'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -60,9 +71,9 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL()
ydl.params['prefer_free_formats'] = False
formats = [
- {'ext': 'webm', 'height': 720},
- {'ext': 'mp4', 'height': 720},
- {'ext': 'flv', 'height': 720},
+ {'ext': 'webm', 'height': 720, 'url': '_'},
+ {'ext': 'mp4', 'height': 720, 'url': '_'},
+ {'ext': 'flv', 'height': 720, 'url': '_'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL()
ydl.params['prefer_free_formats'] = False
formats = [
- {'ext': 'flv', 'height': 720},
- {'ext': 'webm', 'height': 720},
+ {'ext': 'flv', 'height': 720, 'url': '_'},
+ {'ext': 'webm', 'height': 720, 'url': '_'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase):
{'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3},
{'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4},
]
- info_dict = {
- 'formats': formats, 'extractor': 'test', 'id': 'testvid'}
+ info_dict = _make_result(formats)
ydl = YDL()
ydl.process_ie_result(info_dict)
@@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection(self):
formats = [
- {'format_id': '35', 'ext': 'mp4', 'preference': 1},
- {'format_id': '45', 'ext': 'webm', 'preference': 2},
- {'format_id': '47', 'ext': 'webm', 'preference': 3},
- {'format_id': '2', 'ext': 'flv', 'preference': 4},
+ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+ {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'},
+ {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'},
+ {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': '20/47'})
ydl.process_ie_result(info_dict.copy())
@@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection_audio(self):
formats = [
- {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'},
- {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'},
- {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'},
- {'format_id': 'vid', 'ext': 'mp4', 'preference': 4},
+ {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio'})
ydl.process_ie_result(info_dict.copy())
@@ -172,10 +182,10 @@ class TestFormatSelection(unittest.TestCase):
self.assertEqual(downloaded['format_id'], 'audio-low')
formats = [
- {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1},
- {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2},
+ {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+ {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio/worstaudio/best'})
ydl.process_ie_result(info_dict.copy())
@@ -184,11 +194,11 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection_video(self):
formats = [
- {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none'},
- {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none'},
- {'format_id': 'vid', 'ext': 'mp4', 'preference': 3},
+ {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'},
+ {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'},
+ {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': 'bestvideo'})
ydl.process_ie_result(info_dict.copy())
@@ -217,10 +227,12 @@ class TestFormatSelection(unittest.TestCase):
for f1id, f2id in zip(order, order[1:]):
f1 = YoutubeIE._formats[f1id].copy()
f1['format_id'] = f1id
+ f1['url'] = 'url:' + f1id
f2 = YoutubeIE._formats[f2id].copy()
f2['format_id'] = f2id
+ f2['url'] = 'url:' + f2id
- info_dict = {'formats': [f1, f2], 'extractor': 'youtube'}
+ info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL()
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
@@ -228,7 +240,7 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id)
- info_dict = {'formats': [f2, f1], 'extractor': 'youtube'}
+ info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL()
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 39ac8b8a1..577f6ac32 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -143,5 +143,25 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
+ def test_ComedyCentralShows(self):
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
+ ['ComedyCentralShows'])
+
+ def test_yahoo_https(self):
+ # https://github.com/rg3/youtube-dl/issues/2701
+ self.assertMatch(
+ 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
+ ['Yahoo'])
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_download.py b/test/test_download.py
index 815f5bb09..f171c10ba 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -9,16 +9,16 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
get_params,
gettestcases,
- try_rm,
+ expect_info_dict,
md5,
- report_warning
+ try_rm,
+ report_warning,
)
import hashlib
import io
import json
-import re
import socket
import youtube_dl.YoutubeDL
@@ -135,45 +135,8 @@ def generator(test_case):
self.assertEqual(md5_for_file, tc['md5'])
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
- for (info_field, expected) in tc.get('info_dict', {}).items():
- if isinstance(expected, compat_str) and expected.startswith('re:'):
- got = info_dict.get(info_field)
- match_str = expected[len('re:'):]
- match_rex = re.compile(match_str)
-
- self.assertTrue(
- isinstance(got, compat_str) and match_rex.match(got),
- u'field %s (value: %r) should match %r' % (info_field, got, match_str))
- elif isinstance(expected, type):
- got = info_dict.get(info_field)
- self.assertTrue(isinstance(got, expected),
- u'Expected type %r, but got value %r of type %r' % (expected, got, type(got)))
- else:
- if isinstance(expected, compat_str) and expected.startswith('md5:'):
- got = 'md5:' + md5(info_dict.get(info_field))
- else:
- got = info_dict.get(info_field)
- self.assertEqual(expected, got,
- u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
-
- # Check for the presence of mandatory fields
- for key in ('id', 'url', 'title', 'ext'):
- self.assertTrue(key in info_dict.keys() and info_dict[key])
- # Check for mandatory fields that are automatically set by YoutubeDL
- for key in ['webpage_url', 'extractor', 'extractor_key']:
- self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
-
- # Are checkable fields missing from the test case definition?
- test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
- for key, value in info_dict.items()
- if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
- missing_keys = set(test_info_dict.keys()) - set(tc.get('info_dict', {}).keys())
- if missing_keys:
- sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
- self.assertFalse(
- missing_keys,
- 'Missing keys in test definition: %s' % (
- ','.join(sorted(missing_keys))))
+
+ expect_info_dict(self, tc.get('info_dict', {}), info_dict)
finally:
try_rm_tcs_files()
diff --git a/test/test_playlists.py b/test/test_playlists.py
index 4c9c34057..5fb679aa1 100644
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -9,8 +9,10 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL
-
+from test.helper import (
+ expect_info_dict,
+ FakeYDL,
+)
from youtube_dl.extractor import (
AcademicEarthCourseIE,
@@ -38,6 +40,9 @@ from youtube_dl.extractor import (
GenericIE,
TEDIE,
ToypicsUserIE,
+ XTubeUserIE,
+ InstagramUserIE,
+ CSpanIE,
)
@@ -278,5 +283,51 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['id'], 'Mikey')
self.assertTrue(len(result['entries']) >= 17)
+ def test_xtube_user(self):
+ dl = FakeYDL()
+ ie = XTubeUserIE(dl)
+ result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'greenshowers')
+ self.assertTrue(len(result['entries']) >= 155)
+
+ def test_InstagramUser(self):
+ dl = FakeYDL()
+ ie = InstagramUserIE(dl)
+ result = ie.extract('http://instagram.com/porsche')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'porsche')
+ self.assertTrue(len(result['entries']) >= 2)
+ test_video = next(
+ e for e in result['entries']
+ if e['id'] == '614605558512799803_462752227')
+ dl.add_default_extra_info(test_video, ie, '(irrelevant URL)')
+ dl.process_video_result(test_video, download=False)
+ EXPECTED = {
+ 'id': '614605558512799803_462752227',
+ 'ext': 'mp4',
+ 'title': '#Porsche Intelligent Performance.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Porsche',
+ 'uploader_id': 'porsche',
+ 'timestamp': 1387486713,
+ 'upload_date': '20131219',
+ }
+ expect_info_dict(self, EXPECTED, test_video)
+
+ def test_CSpan_playlist(self):
+ dl = FakeYDL()
+ ie = CSpanIE(dl)
+ result = ie.extract(
+ 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '342759')
+ self.assertEqual(
+ result['title'], 'General Motors Ignition Switch Recall')
+ self.assertEqual(len(result['entries']), 9)
+ whole_duration = sum(e['duration'] for e in result['entries'])
+ self.assertEqual(whole_duration, 14855)
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py
index 7ee74e36c..2348c0415 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
import io
+import json
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
@@ -35,6 +36,8 @@ from youtube_dl.utils import (
url_basename,
urlencode_postdata,
xpath_with_ns,
+ parse_iso8601,
+ strip_jsonp,
)
if sys.version_info < (3, 0):
@@ -266,5 +269,16 @@ class TestUtil(unittest.TestCase):
data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
self.assertTrue(isinstance(data, bytes))
+ def test_parse_iso8601(self):
+ self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
+ self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
+ self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
+
+ def test_strip_jsonp(self):
+ stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
+ d = json.loads(stripped)
+ self.assertEqual(d, [{"id": "532cb", "x": 3}])
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index c5d08b0bb..5794fdbe9 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -8,6 +8,7 @@ import datetime
import errno
import io
import json
+import locale
import os
import platform
import re
@@ -94,6 +95,7 @@ class YoutubeDL(object):
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
+ no_warnings: Do not print out anything for warnings.
forceurl: Force printing final URL.
forcetitle: Force printing title.
forceid: Force printing ID.
@@ -158,6 +160,7 @@ class YoutubeDL(object):
include_ads: Download ads as well
default_search: Prepend this string if an input url is not valid.
'auto' for elaborate guessing
+ encoding: Use this encoding instead of the system-specified.
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -376,6 +379,8 @@ class YoutubeDL(object):
if self.params.get('logger') is not None:
self.params['logger'].warning(message)
else:
+ if self.params.get('no_warnings'):
+ return
if self._err_file.isatty() and os.name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
else:
@@ -512,13 +517,7 @@ class YoutubeDL(object):
'_type': 'compat_list',
'entries': ie_result,
}
- self.add_extra_info(ie_result,
- {
- 'extractor': ie.IE_NAME,
- 'webpage_url': url,
- 'webpage_url_basename': url_basename(url),
- 'extractor_key': ie.ie_key(),
- })
+ self.add_default_extra_info(ie_result, ie, url)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
@@ -537,6 +536,14 @@ class YoutubeDL(object):
else:
self.report_error('no suitable InfoExtractor for URL %s' % url)
+ def add_default_extra_info(self, ie_result, ie, url):
+ self.add_extra_info(ie_result, {
+ 'extractor': ie.IE_NAME,
+ 'webpage_url': url,
+ 'webpage_url_basename': url_basename(url),
+ 'extractor_key': ie.ie_key(),
+ })
+
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""
Take the result of the ie(may be modified) and resolve all unresolved
@@ -695,6 +702,11 @@ class YoutubeDL(object):
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
+ if 'id' not in info_dict:
+ raise ExtractorError('Missing "id" field in extractor result')
+ if 'title' not in info_dict:
+ raise ExtractorError('Missing "title" field in extractor result')
+
if 'playlist' not in info_dict:
# It isn't part of a playlist
info_dict['playlist'] = None
@@ -726,6 +738,9 @@ class YoutubeDL(object):
# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
+ if 'url' not in format:
+ raise ExtractorError('Missing "url" key in result (index %d)' % i)
+
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
if format.get('format') is None:
@@ -736,7 +751,7 @@ class YoutubeDL(object):
)
# Automatically determine file extension if missing
if 'ext' not in format:
- format['ext'] = determine_ext(format['url'])
+ format['ext'] = determine_ext(format['url']).lower()
format_limit = self.params.get('format_limit', None)
if format_limit:
@@ -861,7 +876,7 @@ class YoutubeDL(object):
try:
dn = os.path.dirname(encodeFilename(filename))
- if dn != '' and not os.path.exists(dn):
+ if dn and not os.path.exists(dn):
os.makedirs(dn)
except (OSError, IOError) as err:
self.report_error('unable to create directory ' + compat_str(err))
@@ -1195,6 +1210,9 @@ class YoutubeDL(object):
def print_debug_header(self):
if not self.params.get('verbose'):
return
+
+ write_string('[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' %
+ (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, self.get_encoding()))
write_string('[debug] youtube-dl version ' + __version__ + '\n')
try:
sp = subprocess.Popen(
@@ -1259,3 +1277,19 @@ class YoutubeDL(object):
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = []
self._opener = opener
+
+ def encode(self, s):
+ if isinstance(s, bytes):
+ return s # Already encoded
+
+ try:
+ return s.encode(self.get_encoding())
+ except UnicodeEncodeError as err:
+ err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
+ raise
+
+ def get_encoding(self):
+ encoding = self.params.get('encoding')
+ if encoding is None:
+ encoding = preferredencoding()
+ return encoding
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index a4cbdb0bd..aba8b4537 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -51,6 +51,7 @@ __authors__ = (
'David Wagner',
'Juan C. Olivares',
'Mattias Harrysson',
+ 'phaer',
)
__license__ = 'Public Domain'
@@ -227,6 +228,9 @@ def parseOpts(overrideArguments=None):
general.add_option('--referer',
dest='referer', help='specify a custom referer, use if the video access is restricted to one domain',
metavar='REF', default=None)
+ general.add_option('--add-header',
+ dest='headers', help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', action="append",
+ metavar='FIELD:VALUE')
general.add_option('--list-extractors',
action='store_true', dest='list_extractors',
help='List all supported extractors and the URLs they would handle', default=False)
@@ -238,7 +242,7 @@ def parseOpts(overrideArguments=None):
help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option(
- '--prefer-insecure', action='store_true', dest='prefer_insecure',
+ '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
@@ -252,13 +256,17 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--bidi-workaround', dest='bidi_workaround', action='store_true',
help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
- general.add_option('--default-search',
- dest='default_search', metavar='PREFIX',
- help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
+ general.add_option(
+ '--default-search',
+ dest='default_search', metavar='PREFIX',
+ help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
general.add_option(
'--ignore-config',
action='store_true',
help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
+ general.add_option(
+ '--encoding', dest='encoding', metavar='ENCODING',
+ help='Force the specified encoding (experimental)')
selection.add_option(
'--playlist-start',
@@ -361,6 +369,10 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
+ verbosity.add_option(
+ '--no-warnings',
+ dest='no_warnings', action='store_true', default=False,
+ help='Ignore warnings')
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
@@ -388,7 +400,7 @@ def parseOpts(overrideArguments=None):
help='simulate, quiet but print output format', default=False)
verbosity.add_option('-j', '--dump-json',
action='store_true', dest='dumpjson',
- help='simulate, quiet but print JSON information', default=False)
+ help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress',
@@ -532,8 +544,6 @@ def parseOpts(overrideArguments=None):
write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
- write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' %
- (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))
return parser, opts, args
@@ -556,6 +566,16 @@ def _real_main(argv=None):
if opts.referer is not None:
std_headers['Referer'] = opts.referer
+ # Custom HTTP headers
+ if opts.headers is not None:
+ for h in opts.headers:
+ if h.find(':', 1) < 0:
+ parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h)
+ key, value = h.split(':', 1)
+ if opts.verbose:
+ write_string(u'[debug] Adding header from command line option %s:%s\n'%(key, value))
+ std_headers[key] = value
+
# Dump user agent
if opts.dump_user_agent:
compat_print(std_headers['User-Agent'])
@@ -657,7 +677,7 @@ def _real_main(argv=None):
date = DateRange.day(opts.date)
else:
date = DateRange(opts.dateafter, opts.datebefore)
- if opts.default_search not in ('auto', None) and ':' not in opts.default_search:
+ if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search:
parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
# Do not download videos when there are audio-only formats
@@ -695,6 +715,7 @@ def _real_main(argv=None):
'password': opts.password,
'videopassword': opts.videopassword,
'quiet': (opts.quiet or any_printing),
+ 'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
@@ -767,6 +788,7 @@ def _real_main(argv=None):
'include_ads': opts.include_ads,
'default_search': opts.default_search,
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
+ 'encoding': opts.encoding,
}
with YoutubeDL(ydl_opts) as ydl:
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 5a068aa8b..917f3450e 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -4,9 +4,10 @@ import sys
import time
from ..utils import (
+ compat_str,
encodeFilename,
- timeconvert,
format_bytes,
+ timeconvert,
)
@@ -173,7 +174,7 @@ class FileDownloader(object):
return
os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
except (IOError, OSError) as err:
- self.report_error(u'unable to rename file: %s' % str(err))
+ self.report_error(u'unable to rename file: %s' % compat_str(err))
def try_utime(self, filename, last_modified_hdr):
"""Try to set the last-modified time of the given file."""
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 4e6abfe10..e6be6ae6c 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -297,6 +297,7 @@ class F4mFD(FileDownloader):
break
frags_filenames.append(frag_filename)
+ dest_stream.close()
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
self.try_rename(tmpfilename, filename)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index fa983462b..9d407fe6e 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -13,8 +13,10 @@ class HlsFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
- '-bsf:a', 'aac_adtstoasc', tmpfilename]
+ args = [
+ '-y', '-i', url, '-f', 'mp4', '-c', 'copy',
+ '-bsf:a', 'aac_adtstoasc',
+ encodeFilename(tmpfilename, for_subprocess=True)]
for program in ['avconv', 'ffmpeg']:
try:
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 348097dab..cc8b9c9a7 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -23,6 +23,8 @@ class HttpFD(FileDownloader):
headers = {'Youtubedl-no-compression': 'True'}
if 'user_agent' in info_dict:
headers['Youtubedl-user-agent'] = info_dict['user_agent']
+ if 'http_referer' in info_dict:
+ headers['Referer'] = info_dict['http_referer']
basic_request = compat_urllib_request.Request(url, None, headers)
request = compat_urllib_request.Request(url, None, headers)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index b8c843515..66f71edf6 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -14,6 +14,7 @@ from .arte import (
ArteTVConcertIE,
ArteTVFutureIE,
ArteTVDDCIE,
+ ArteTVEmbedIE,
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
@@ -25,6 +26,7 @@ from .bloomberg import BloombergIE
from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .byutv import BYUtvIE
from .c56 import C56IE
from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
@@ -38,6 +40,7 @@ from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cmt import CMTIE
+from .cnet import CNETIE
from .cnn import (
CNNIE,
CNNBlogsIE,
@@ -81,6 +84,7 @@ from .fktv import (
)
from .flickr import FlickrIE
from .fourtube import FourTubeIE
+from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
PluzzIE,
@@ -112,7 +116,7 @@ from .imdb import (
)
from .ina import InaIE
from .infoq import InfoQIE
-from .instagram import InstagramIE
+from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
from .ivi import (
@@ -150,10 +154,13 @@ from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
from .mooshare import MooshareIE
+from .morningstar import MorningstarIE
+from .motorsport import MotorsportIE
from .mtv import (
MTVIE,
MTVIggyIE,
)
+from .musicplayon import MusicPlayOnIE
from .muzu import MuzuTVIE
from .myspace import MySpaceIE
from .myspass import MySpassIE
@@ -175,6 +182,8 @@ from .normalboots import NormalbootsIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
+from .ntv import NTVIE
+from .oe1 import OE1IE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .parliamentliveuk import ParliamentLiveUKIE
@@ -195,6 +204,7 @@ from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
+from .rts import RTSIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
@@ -205,7 +215,6 @@ from .rutv import RUTVIE
from .savefrom import SaveFromIE
from .servingsys import ServingSysIE
from .sina import SinaIE
-from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
from .smotri import (
SmotriIE,
@@ -254,13 +263,13 @@ from .udemy import (
UdemyCourseIE
)
from .unistra import UnistraIE
+from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
-from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
@@ -279,16 +288,21 @@ from .vine import VineIE
from .viki import VikiIE
from .vk import VKIE
from .vube import VubeIE
+from .washingtonpost import WashingtonPostIE
from .wat import WatIE
-from .wdr import WDRIE
+from .wdr import (
+ WDRIE,
+ WDRMausIE,
+)
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
+from .xbef import XBefIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
-from .xtube import XTubeIE
+from .xtube import XTubeUserIE, XTubeIE
from .yahoo import (
YahooIE,
YahooNewsIE,
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index a3a1b999d..fcf296057 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -14,14 +16,14 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
_VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
- IE_NAME = u'AddAnime'
_TEST = {
- u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
- u'file': u'24MR3YO5SAS9.mp4',
- u'md5': u'72954ea10bc979ab5e2eb288b21425a0',
- u'info_dict': {
- u"description": u"One Piece 606",
- u"title": u"One Piece 606"
+ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+ 'md5': '72954ea10bc979ab5e2eb288b21425a0',
+ 'info_dict': {
+ 'id': '24MR3YO5SAS9',
+ 'ext': 'mp4',
+ 'description': 'One Piece 606',
+ 'title': 'One Piece 606',
}
}
@@ -38,10 +40,10 @@ class AddAnimeIE(InfoExtractor):
redir_webpage = ee.cause.read().decode('utf-8')
action = self._search_regex(
r'<form id="challenge-form" action="([^"]+)"',
- redir_webpage, u'Redirect form')
+ redir_webpage, 'Redirect form')
vc = self._search_regex(
r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
- redir_webpage, u'redirect vc value')
+ redir_webpage, 'redirect vc value')
av = re.search(
r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
redir_webpage)
@@ -52,19 +54,19 @@ class AddAnimeIE(InfoExtractor):
parsed_url = compat_urllib_parse_urlparse(url)
av_val = av_res + len(parsed_url.netloc)
confirm_url = (
- parsed_url.scheme + u'://' + parsed_url.netloc +
+ parsed_url.scheme + '://' + parsed_url.netloc +
action + '?' +
compat_urllib_parse.urlencode({
'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
self._download_webpage(
confirm_url, video_id,
- note=u'Confirming after redirect')
+ note='Confirming after redirect')
webpage = self._download_webpage(url, video_id)
formats = []
for format_id in ('normal', 'hq'):
rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
- video_url = self._search_regex(rex, webpage, u'video file URLx',
+ video_url = self._search_regex(rex, webpage, 'video file URLx',
fatal=False)
if not video_url:
continue
@@ -72,14 +74,13 @@ class AddAnimeIE(InfoExtractor):
'format_id': format_id,
'url': video_url,
})
- if not formats:
- raise ExtractorError(u'Cannot find any video format!')
+ self._sort_formats(formats)
video_title = self._og_search_title(webpage)
video_description = self._og_search_description(webpage)
return {
'_type': 'video',
- 'id': video_id,
+ 'id': video_id,
'formats': formats,
'title': video_title,
'description': video_description
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 922cede05..dc8657b67 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -6,7 +6,6 @@ import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- determine_ext,
)
@@ -16,9 +15,10 @@ class AppleTrailersIE(InfoExtractor):
"url": "http://trailers.apple.com/trailers/wb/manofsteel/",
"playlist": [
{
- "file": "manofsteel-trailer4.mov",
"md5": "d97a8e575432dbcb81b7c3acb741f8a8",
"info_dict": {
+ "id": "manofsteel-trailer4",
+ "ext": "mov",
"duration": 111,
"title": "Trailer 4",
"upload_date": "20130523",
@@ -26,9 +26,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-trailer3.mov",
"md5": "b8017b7131b721fb4e8d6f49e1df908c",
"info_dict": {
+ "id": "manofsteel-trailer3",
+ "ext": "mov",
"duration": 182,
"title": "Trailer 3",
"upload_date": "20130417",
@@ -36,9 +37,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-trailer.mov",
"md5": "d0f1e1150989b9924679b441f3404d48",
"info_dict": {
+ "id": "manofsteel-trailer",
+ "ext": "mov",
"duration": 148,
"title": "Trailer",
"upload_date": "20121212",
@@ -46,15 +48,16 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-teaser.mov",
"md5": "5fe08795b943eb2e757fa95cb6def1cb",
"info_dict": {
+ "id": "manofsteel-teaser",
+ "ext": "mov",
"duration": 93,
"title": "Teaser",
"upload_date": "20120721",
"uploader_id": "wb",
},
- }
+ },
]
}
@@ -65,16 +68,16 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie')
uploader_id = mobj.group('company')
- playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
+ playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
- s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+ s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m):
- return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+ return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s)
- s = u'<html>' + s + u'</html>'
+ s = '<html>' + s + '</html>'
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
@@ -82,7 +85,7 @@ class AppleTrailersIE(InfoExtractor):
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE,
- on_click, u'trailer info')
+ on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json)
title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@@ -98,8 +101,7 @@ class AppleTrailersIE(InfoExtractor):
first_url = trailer_info['url']
trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
- settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
- settings = json.loads(settings_json)
+ settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
formats = []
for format in settings['metadata']['sizes']:
@@ -107,7 +109,6 @@ class AppleTrailersIE(InfoExtractor):
format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
formats.append({
'url': format_url,
- 'ext': determine_ext(format_url),
'format': format['type'],
'width': format['width'],
'height': int(format['height']),
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 979481b21..646377e4b 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
@@ -19,114 +18,41 @@ from ..utils import (
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
-class ArteTvIE(InfoExtractor):
- _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
- _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
- _LIVE_URL = r'index-[0-9]+\.html$'
+class ArteTvIE(InfoExtractor):
+ _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
IE_NAME = 'arte.tv'
- @classmethod
- def suitable(cls, url):
- return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
-
- # TODO implement Live Stream
- # from ..utils import compat_urllib_parse
- # def extractLiveStream(self, url):
- # video_lang = url.split('/')[-4]
- # info = self.grep_webpage(
- # url,
- # r'src="(.*?/videothek_js.*?\.js)',
- # 0,
- # [
- # (1, 'url', 'Invalid URL: %s' % url)
- # ]
- # )
- # http_host = url.split('/')[2]
- # next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
- # info = self.grep_webpage(
- # next_url,
- # r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
- # '(http://.*?\.swf).*?' +
- # '(rtmp://.*?)\'',
- # re.DOTALL,
- # [
- # (1, 'path', 'could not extract video path: %s' % url),
- # (2, 'player', 'could not extract video player: %s' % url),
- # (3, 'url', 'could not extract video url: %s' % url)
- # ]
- # )
- # video_url = '%s/%s' % (info.get('url'), info.get('path'))
-
def _real_extract(self, url):
- mobj = re.match(self._VIDEOS_URL, url)
- if mobj is not None:
- id = mobj.group('id')
- lang = mobj.group('lang')
- return self._extract_video(url, id, lang)
-
- mobj = re.match(self._LIVEWEB_URL, url)
- if mobj is not None:
- name = mobj.group('name')
- lang = mobj.group('lang')
- return self._extract_liveweb(url, name, lang)
-
- if re.search(self._LIVE_URL, url) is not None:
- raise ExtractorError('Arte live streams are not yet supported, sorry')
- # self.extractLiveStream(url)
- # return
-
- raise ExtractorError('No video found')
-
- def _extract_video(self, url, video_id, lang):
- """Extract from videos.arte.tv"""
+ mobj = re.match(self._VALID_URL, url)
+ lang = mobj.group('lang')
+ video_id = mobj.group('id')
+
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml_doc = self._download_xml(
ref_xml_url, video_id, note='Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
- config_xml = self._download_webpage(
+ config = self._download_xml(
config_xml_url, video_id, note='Downloading configuration')
- video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
- def _key(m):
- quality = m.group('quality')
- if quality == 'hd':
- return 2
- else:
- return 1
- # We pick the best quality
- video_urls = sorted(video_urls, key=_key)
- video_url = list(video_urls)[-1].group('url')
-
- title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
- thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
- config_xml, 'thumbnail')
- return {'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'url': video_url,
- 'ext': 'flv',
- }
-
- def _extract_liveweb(self, url, name, lang):
- """Extract form http://liveweb.arte.tv/"""
- webpage = self._download_webpage(url, name)
- video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id')
- config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
- video_id, 'Downloading information')
- event_doc = config_doc.find('event')
- url_node = event_doc.find('video').find('urlHd')
- if url_node is None:
- url_node = event_doc.find('urlSd')
-
- return {'id': video_id,
- 'title': event_doc.find('name%s' % lang.capitalize()).text,
- 'url': url_node.text.replace('MP4', 'mp4'),
- 'ext': 'flv',
- 'thumbnail': self._og_search_thumbnail(webpage),
- }
+ formats = [{
+ 'forma_id': q.attrib['quality'],
+ 'url': q.text,
+ 'ext': 'flv',
+ 'quality': 2 if q.attrib['quality'] == 'hd' else 1,
+ } for q in config.findall('./urls/url')]
+ self._sort_formats(formats)
+
+ title = config.find('.//name').text
+ thumbnail = config.find('.//firstThumbnailUrl').text
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
class ArteTVPlus7IE(InfoExtractor):
@@ -152,9 +78,7 @@ class ArteTVPlus7IE(InfoExtractor):
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
- json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
- self.report_extraction(video_id)
- info = json.loads(json_info)
+ info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
info_dict = {
@@ -176,6 +100,8 @@ class ArteTVPlus7IE(InfoExtractor):
l = 'F'
elif lang == 'de':
l = 'A'
+ else:
+ l = lang
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
@@ -305,3 +231,22 @@ class ArteTVConcertIE(ArteTVPlus7IE):
'description': 'md5:486eb08f991552ade77439fe6d82c305',
},
}
+
+
+class ArteTVEmbedIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:embed'
+ _VALID_URL = r'''(?x)
+ http://www\.arte\.tv
+ /playerv2/embed\.php\?json_url=
+ (?P<json_url>
+ http://arte\.tv/papi/tvguide/videos/stream/player/
+ (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
+ )
+ '''
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ lang = mobj.group('lang')
+ json_url = mobj.group('json_url')
+ return self._extract_from_json_url(json_url, video_id, lang)
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
index c6f30e626..20bf12550 100644
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -11,22 +11,24 @@ from ..utils import (
class AUEngineIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P<id>[^&]+).*?'
+
_TEST = {
'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
- 'file': 'lfvlytY6.mp4',
'md5': '48972bdbcf1a3a2f5533e62425b41d4f',
'info_dict': {
+ 'id': 'lfvlytY6',
+ 'ext': 'mp4',
'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]'
}
}
- _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
+
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
- webpage, 'title')
+ title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
title = title.strip()
links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
links = map(compat_urllib_parse.unquote, links)
@@ -39,14 +41,15 @@ class AUEngineIE(InfoExtractor):
elif '/videos/' in link:
video_url = link
if not video_url:
- raise ExtractorError(u'Could not find video URL')
+ raise ExtractorError('Could not find video URL')
ext = '.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
'thumbnail': thumbnail,
+ 'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf',
}
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index 2415ce403..25fb79e14 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -1,22 +1,21 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
-from .ooyala import OoyalaIE
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = {
- u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
- u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4',
- u'info_dict': {
- u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies',
- u'description': u'md5:abc86e5236f9f0e4866c59ad36736686',
- },
- u'params': {
- # Requires ffmpeg (m3u8 manifest)
- u'skip_download': True,
+ 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
+ 'md5': '7bf08858ff7c203c870e8a6190e221e5',
+ 'info_dict': {
+ 'id': 'qurhIVlJSB6hzkVi229d8g',
+ 'ext': 'flv',
+ 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
+ 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
},
}
@@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
- embed_code = self._search_regex(
- r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
- 'embed code')
- return OoyalaIE._build_url_result(embed_code)
+ f4m_url = self._search_regex(
+ r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
+ 'f4m url')
+ title = re.sub(': Video$', '', self._og_search_title(webpage))
+
+ return {
+ 'id': name.split('-')[-1],
+ 'title': title,
+ 'url': f4m_url,
+ 'ext': 'flv',
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 85635d1cc..1bfc9f35b 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -27,9 +27,10 @@ class BreakIE(InfoExtractor):
webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
- m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
- if m_youtube is not None:
- return self.url_result(m_youtube.group(1), 'Youtube')
+ youtube_id = info.get('youtubeId')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
final_url = video_url + '?' + info['AuthToken']
return {
'id': video_id,
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 83eec84d3..339d60ff0 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor):
object_str = object_str.replace('<--', '<!--')
object_str = fix_xml_ampersands(object_str)
- object_doc = xml.etree.ElementTree.fromstring(object_str)
+ object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None:
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
new file mode 100644
index 000000000..91c6398f5
--- /dev/null
+++ b/youtube_dl/extractor/byutv.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+)
+
+
+class BYUtvIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+ 'info_dict': {
+ 'id': 'granite-flats-talking',
+ 'ext': 'mp4',
+ 'description': 'md5:1a7ae3e153359b7cc355ef3963441e5f',
+ 'title': 'Talking',
+ 'thumbnail': 're:^https?://.*promo.*'
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+
+ webpage = self._download_webpage(url, video_id)
+ episode_code = self._search_regex(
+ r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information')
+ episode_json = re.sub(
+ r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code)
+ ep = json.loads(episode_json)
+
+ if ep['providerType'] == 'Ooyala':
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'Ooyala',
+ 'url': 'ooyala:%s' % ep['providerId'],
+ 'id': video_id,
+ 'title': ep['title'],
+ 'description': ep.get('description'),
+ 'thumbnail': ep.get('imageThumbnail'),
+ }
+ else:
+ raise ExtractorError('Unsupported provider %s' % ep['provider'])
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index 690bc7c25..cb96c3876 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -2,39 +2,46 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
class C56IE(InfoExtractor):
- _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+ _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
_TEST = {
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
- 'file': '93440716.flv',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
+ 'id': '93440716',
+ 'ext': 'flv',
'title': '网事知多少 第32期:车怒',
+ 'duration': 283.813,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
- info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
- text_id, 'Downloading video info')
- info = json.loads(info_page)['info']
- formats = [{
- 'format_id': f['type'],
- 'filesize': int(f['filesize']),
- 'url': f['url']
- } for f in info['rfiles']]
+
+ page = self._download_json(
+ 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
+
+ info = page['info']
+
+ formats = [
+ {
+ 'format_id': f['type'],
+ 'filesize': int(f['filesize']),
+ 'url': f['url']
+ } for f in info['rfiles']
+ ]
self._sort_formats(formats)
return {
'id': info['vid'],
'title': info['Subject'],
+ 'duration': int(info['duration']) / 1000.0,
'formats': formats,
'thumbnail': info.get('bimg') or info.get('img'),
}
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 7cdcd8399..49dfd881e 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -28,7 +28,7 @@ class CanalplusIE(InfoExtractor):
video_id = mobj.groupdict().get('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
- video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
+ video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
doc = self._download_xml(info_url,video_id,
u'Downloading video info')
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index bfbffefdc..2301f61b6 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -9,12 +9,12 @@ from ..utils import (
class CinemassacreIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?'
+ _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
_TESTS = [
{
'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
'file': '19911.mp4',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
+ 'md5': '782f8504ca95a0eba8fc9177c373eec7',
'info_dict': {
'upload_date': '20121110',
'title': '“Angry Video Game Nerd: The Movie” – Trailer',
@@ -24,7 +24,7 @@ class CinemassacreIE(InfoExtractor):
{
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
'file': '521be8ef82b16.mp4',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
+ 'md5': 'dec39ee5118f8d9cc067f45f9cbe3a35',
'info_dict': {
'upload_date': '20131002',
'title': 'The Mummy’s Hand (1940)',
@@ -34,8 +34,9 @@ class CinemassacreIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, None) # Don't know video id yet
+ webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
@@ -43,33 +44,36 @@ class CinemassacreIE(InfoExtractor):
playerdata_url = mobj.group('embed_url')
video_id = mobj.group('video_id')
- video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
- webpage, 'title')
- video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
+ video_title = self._html_search_regex(
+ r'<title>(?P<title>.+?)\|', webpage, 'title')
+ video_description = self._html_search_regex(
+ r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, 'description', flags=re.DOTALL, fatal=False)
- if len(video_description) == 0:
- video_description = None
playerdata = self._download_webpage(playerdata_url, video_id)
- sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file')
- hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file')
+ sd_url = self._html_search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
+ hd_url = self._html_search_regex(
+ r'file: \'([^\']+)\', label: \'HD\'', playerdata, 'hd_file',
+ default=None)
video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
- formats = [
- {
- 'url': sd_url,
- 'ext': 'mp4',
- 'format': 'sd',
- 'format_id': 'sd',
- },
- {
+ formats = [{
+ 'url': sd_url,
+ 'ext': 'mp4',
+ 'format': 'sd',
+ 'format_id': 'sd',
+ 'quality': 1,
+ }]
+ if hd_url:
+ formats.append({
'url': hd_url,
'ext': 'mp4',
'format': 'hd',
'format_id': 'hd',
- },
- ]
+ 'quality': 2,
+ })
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 43efb08bf..669919a2c 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,22 +1,28 @@
+from __future__ import unicode_literals
+
import re
import time
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+)
class ClipfishIE(InfoExtractor):
- IE_NAME = u'clipfish'
+ IE_NAME = 'clipfish'
_VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
_TEST = {
- u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- u'file': u'3966754.mp4',
- u'md5': u'2521cd644e862936cf2e698206e47385',
- u'info_dict': {
- u'title': u'FIFA 14 - E3 2013 Trailer',
- u'duration': 82,
+ 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+ 'md5': '2521cd644e862936cf2e698206e47385',
+ 'info_dict': {
+ 'id': '3966754',
+ 'ext': 'mp4',
+ 'title': 'FIFA 14 - E3 2013 Trailer',
+ 'duration': 82,
},
u'skip': 'Blocked in the US'
}
@@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
video_url = doc.find('filename').text
if video_url is None:
xml_bytes = xml.etree.ElementTree.tostring(doc)
- raise ExtractorError(u'Cannot find video URL in document %r' %
+ raise ExtractorError('Cannot find video URL in document %r' %
xml_bytes)
thumbnail = doc.find('imageurl').text
- duration_str = doc.find('duration').text
- m = re.match(
- r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
- duration_str)
- if m:
- duration = (
- (int(m.group('hours')) * 60 * 60) +
- (int(m.group('minutes')) * 60) +
- (int(m.group('seconds')))
- )
- else:
- duration = None
+ duration = parse_duration(doc.find('duration').text)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index 9ab6a4ab6..02a1667fa 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor):
_VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
- u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
- u'info_dict': {
- u'id': u'4629301',
- u'ext': u'mp4',
- u'title': u'Brick Briscoe',
- u'duration': 612,
+ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+ 'md5': '4d7d549451bad625e0ff3d7bd56d776c',
+ 'info_dict': {
+ 'id': '4629301',
+ 'ext': 'mp4',
+ 'title': 'Brick Briscoe',
+ 'duration': 612,
+ 'thumbnail': 're:^https?://.+\.jpg',
},
}
@@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor):
video_id = mobj.group('id')
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
- video_id, u'Downlaoding player')
+ video_id, 'Downlaoding player')
# it includes a required token
- flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+ flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
- video_id, u'Downloading video info',
+ video_id, 'Downloading video info',
transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
new file mode 100644
index 000000000..f5ab443d2
--- /dev/null
+++ b/youtube_dl/extractor/cnet.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class CNETIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
+ _TEST = {
+ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
+ 'md5': '041233212a0d06b179c87cbcca1577b8',
+ 'info_dict': {
+ 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
+ 'ext': 'mp4',
+ 'title': 'Hands-on with Microsoft Windows 8.1 Update',
+ 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
+ 'thumbnail': 're:^http://.*/flmswindows8.jpg$',
+ 'uploader_id': 'sarah.mitroff@cbsinteractive.com',
+ 'uploader': 'Sarah Mitroff',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+ data_json = self._html_search_regex(
+ r"<div class=\"cnetVideoPlayer\" data-cnet-video-options='([^']+)'",
+ webpage, 'data json')
+ data = json.loads(data_json)
+ vdata = data['video']
+ if not vdata:
+ vdata = data['videos'][0]
+ if not vdata:
+ raise ExtractorError('Cannot find video data')
+
+ video_id = vdata['id']
+ title = vdata['headline']
+ description = vdata.get('dek')
+ thumbnail = vdata.get('image', {}).get('path')
+ author = vdata.get('author')
+ if author:
+ uploader = '%s %s' % (author['firstName'], author['lastName'])
+ uploader_id = author.get('email')
+ else:
+ uploader = None
+ uploader_id = None
+
+ formats = [{
+ 'format_id': '%s-%s-%s' % (
+ f['type'], f['format'],
+ int_or_none(f.get('bitrate'), 1000, default='')),
+ 'url': f['uri'],
+ 'tbr': int_or_none(f.get('bitrate'), 1000),
+ } for f in vdata['files']['data']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index d50fcdbdb..0c99887a2 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -7,8 +7,8 @@ from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
-
ExtractorError,
+ float_or_none,
unified_strdate,
)
@@ -32,31 +32,34 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
class ComedyCentralShowsIE(InfoExtractor):
- IE_DESC = 'The Daily Show / Colbert Report'
+ IE_DESC = 'The Daily Show / The Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
- _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
- |(https?://)?(www\.)?
- (?P<showname>thedailyshow|colbertnation)\.com/
- (full-episodes/(?P<episode>.*)|
+ _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+ |https?://(:www\.)?
+ (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+ (full-episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
- (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
- |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
+ (?:(?:guests/[^/]+|videos)/[^/]+/(?P<videotitle>[^/?#]+))
+ |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
+ |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
+ )|
(?P<interview>
- extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
- $"""
+ extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
+ (?:[?#].*|$)'''
_TEST = {
- 'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
- 'file': '422212.mp4',
+ 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
'info_dict': {
- "upload_date": "20121214",
- "description": "Kristen Stewart",
- "uploader": "thedailyshow",
- "title": "thedailyshow-kristen-stewart part 1"
+ 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
+ 'ext': 'mp4',
+ 'upload_date': '20121213',
+ 'description': 'Kristen Stewart learns to let loose in "On the Road."',
+ 'uploader': 'thedailyshow',
+ 'title': 'thedailyshow kristen-stewart part 1',
}
}
@@ -79,11 +82,6 @@ class ComedyCentralShowsIE(InfoExtractor):
'400': (384, 216),
}
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
@staticmethod
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
@@ -99,14 +97,16 @@ class ComedyCentralShowsIE(InfoExtractor):
if mobj.group('shortname'):
if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = 'http://www.thedailyshow.com/full-episodes/'
+ url = 'http://thedailyshow.cc.com/full-episodes/'
else:
- url = 'http://www.colbertnation.com/full-episodes/'
+ url = 'http://thecolbertreport.cc.com/full-episodes/'
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
assert mobj is not None
if mobj.group('clip'):
- if mobj.group('showname') == 'thedailyshow':
+ if mobj.group('videotitle'):
+ epTitle = mobj.group('videotitle')
+ elif mobj.group('showname') == 'thedailyshow':
epTitle = mobj.group('tdstitle')
else:
epTitle = mobj.group('cntitle')
@@ -120,9 +120,9 @@ class ComedyCentralShowsIE(InfoExtractor):
epTitle = mobj.group('showname')
else:
epTitle = mobj.group('episode')
+ show_name = mobj.group('showname')
- self.report_extraction(epTitle)
- webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
+ webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
if dlNewest:
url = htmlHandle.geturl()
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -130,71 +130,86 @@ class ComedyCentralShowsIE(InfoExtractor):
raise ExtractorError('Invalid redirected URL: ' + url)
if mobj.group('episode') == '':
raise ExtractorError('Redirected URL is still not specific: ' + url)
- epTitle = mobj.group('episode')
+ epTitle = mobj.group('episode').rpartition('/')[-1]
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
-
if len(mMovieParams) == 0:
# The Colbert Report embeds the information in a without
# a URL prefix; so extract the alternate reference
# and then add the URL prefix manually.
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
+ altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
if len(altMovieParams) == 0:
raise ExtractorError('unable to find Flash URL in webpage ' + url)
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
uri = mMovieParams[0][1]
- indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- idoc = self._download_xml(indexUrl, epTitle,
- 'Downloading show index',
- 'unable to download episode index')
-
- results = []
-
- itemEls = idoc.findall('.//item')
- for partNum,itemEl in enumerate(itemEls):
- mediaId = itemEl.findall('./guid')[0].text
- shortMediaId = mediaId.split(':')[-1]
- showId = mediaId.split(':')[-2].replace('.com', '')
- officialTitle = itemEl.findall('./title')[0].text
- officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
-
- configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
- compat_urllib_parse.urlencode({'uri': mediaId}))
- cdoc = self._download_xml(configUrl, epTitle,
- 'Downloading configuration for %s' % shortMediaId)
+ # Correct cc.com in uri
+ uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+
+ index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
+ idoc = self._download_xml(
+ index_url, epTitle,
+ 'Downloading show index', 'Unable to download episode index')
+
+ title = idoc.find('./channel/title').text
+ description = idoc.find('./channel/description').text
+
+ entries = []
+ item_els = idoc.findall('.//item')
+ for part_num, itemEl in enumerate(item_els):
+ upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text)
+ thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
+
+ content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
+ duration = float_or_none(content.attrib.get('duration'))
+ mediagen_url = content.attrib['url']
+ guid = itemEl.find('./guid').text.rpartition(':')[-1]
+
+ cdoc = self._download_xml(
+ mediagen_url, epTitle,
+ 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
turls.append(finfo)
- if len(turls) == 0:
- self._downloader.report_error('unable to download ' + mediaId + ': No videos found')
- continue
-
formats = []
for format, rtmp_video_url in turls:
w, h = self._video_dimensions.get(format, (None, None))
formats.append({
+ 'format_id': 'vhttp-%s' % format,
'url': self._transform_rtmp_url(rtmp_video_url),
'ext': self._video_extensions.get(format, 'mp4'),
- 'format_id': format,
'height': h,
'width': w,
})
+ formats.append({
+ 'format_id': 'rtmp-%s' % format,
+ 'url': rtmp_video_url,
+ 'ext': self._video_extensions.get(format, 'mp4'),
+ 'height': h,
+ 'width': w,
+ })
+ self._sort_formats(formats)
- effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1)
- results.append({
- 'id': shortMediaId,
+ virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
+ entries.append({
+ 'id': guid,
+ 'title': virtual_id,
'formats': formats,
- 'uploader': showId,
- 'upload_date': officialDate,
- 'title': effTitle,
- 'thumbnail': None,
- 'description': compat_str(officialTitle),
+ 'uploader': show_name,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'description': description,
})
- return results
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': show_name + ' ' + title,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 647720c8a..da4193734 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -74,7 +74,7 @@ class InfoExtractor(object):
"http", "https", "rtsp", "rtmp", "m3u8" or so.
* preference Order number of this format. If this field is
present and not None, the formats get sorted
- by this field.
+ by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
* quality Order number of the video quality of this
@@ -252,6 +252,17 @@ class InfoExtractor(object):
outf.write(webpage_bytes)
content = webpage_bytes.decode(encoding, 'replace')
+
+ if (u'<title>Access to this site is blocked</title>' in content and
+ u'Websense' in content[:512]):
+ msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+ blocked_iframe = self._html_search_regex(
+ r'<iframe src="([^"]+)"', content,
+ u'Websense information URL', default=None)
+ if blocked_iframe:
+ msg += u' Visit %s for more details' % blocked_iframe
+ raise ExtractorError(msg, expected=True)
+
return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 2a8eda9ef..b6552c542 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ int_or_none,
unescapeHTML,
find_xpath_attr,
)
@@ -54,18 +55,29 @@ class CSpanIE(InfoExtractor):
info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
data = self._download_json(info_url, video_id)
- url = unescapeHTML(data['video']['files'][0]['path']['#text'])
-
- doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+ doc = self._download_xml(
+ 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
video_id)
- def find_string(s):
- return find_xpath_attr(doc, './/string', 'name', s).text
+ title = find_xpath_attr(doc, './/string', 'name', 'title').text
+ thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
+
+ files = data['video']['files']
+
+ entries = [{
+ 'id': '%s_%d' % (video_id, partnum + 1),
+ 'title': (
+ title if len(files) == 1 else
+ '%s part %d' % (title, partnum + 1)),
+ 'url': unescapeHTML(f['path']['#text']),
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': int_or_none(f.get('length', {}).get('#text')),
+ } for partnum, f in enumerate(files)]
return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': title,
'id': video_id,
- 'title': find_string('title'),
- 'url': url,
- 'description': description,
- 'thumbnail': find_string('poster'),
}
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 10b97d8ca..5504d93eb 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -8,7 +8,6 @@ from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_urllib_request,
compat_str,
- get_element_by_attribute,
get_element_by_id,
orderedSet,
str_to_int,
@@ -180,7 +179,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = u'dailymotion:playlist'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
- _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
+ _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
def _extract_entries(self, id):
@@ -190,10 +189,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
webpage = self._download_webpage(request,
id, u'Downloading page %s' % pagenum)
- playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
- video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
+ video_ids.extend(re.findall(r'data-id="(.+?)"', webpage))
- if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+ if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
for video_id in orderedSet(video_ids)]
@@ -212,8 +210,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = u'dailymotion:user'
- _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
- _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index 885944c5e..2ae6ecc12 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -10,9 +10,10 @@ class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
_TEST = {
'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'file': '614784.mp4',
'md5': 'e12614f9ee303a6ccef415cb0793eba2',
'info_dict': {
+ 'id': '614784',
+ 'ext': 'mp4',
'title': 'MythBusters: Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -34,7 +35,7 @@ class DiscoveryIE(InfoExtractor):
formats = []
for f in info['mp4']:
formats.append(
- {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])})
+ {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
return {
'id': info['contentId'],
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index 2bb77aec6..f8f49a013 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,23 +1,25 @@
+from __future__ import unicode_literals
+
import re
from ..utils import (
compat_urllib_parse,
- determine_ext
)
from .common import InfoExtractor
class EHowIE(InfoExtractor):
- IE_NAME = u'eHow'
- _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
+ IE_NAME = 'eHow'
+ _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
_TEST = {
- u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
- u'file': u'12245069.flv',
- u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
- u'info_dict': {
- u"title": u"Hardwood Flooring Basics",
- u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
- u"uploader": u"Erick Nathan"
+ 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
+ 'md5': '9809b4e3f115ae2088440bcb4efbf371',
+ 'info_dict': {
+ 'id': '12245069',
+ 'ext': 'flv',
+ 'title': 'Hardwood Flooring Basics',
+ 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
+ 'uploader': 'Erick Nathan',
}
}
@@ -26,21 +28,16 @@ class EHowIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
- webpage, u'video URL')
- final_url = compat_urllib_parse.unquote(video_url)
- uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
- webpage, u'uploader')
+ webpage, 'video URL')
+ final_url = compat_urllib_parse.unquote(video_url)
+ uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
- ext = determine_ext(final_url)
return {
- '_type': 'video',
- 'id': video_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'id': video_id,
+ 'url': final_url,
+ 'title': title,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
- 'uploader': uploader,
+ 'uploader': uploader,
}
-
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
new file mode 100644
index 000000000..898e0dda7
--- /dev/null
+++ b/youtube_dl/extractor/franceculture.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ compat_urlparse,
+)
+
+
+class FranceCultureIE(InfoExtractor):
+ _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
+ 'info_dict': {
+ 'id': '4795174',
+ 'ext': 'mp3',
+ 'title': 'Rendez-vous au pays des geeks',
+ 'vcodec': 'none',
+ 'uploader': 'Colette Fellous',
+ 'upload_date': '20140301',
+ 'duration': 3601,
+ 'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
+ 'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ baseurl = mobj.group('baseurl')
+
+ webpage = self._download_webpage(url, video_id)
+ params_code = self._search_regex(
+ r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
+ webpage, 'parameter code')
+ params = compat_parse_qs(params_code)
+ video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])
+
+ title = self._html_search_regex(
+ r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
+ uploader = self._html_search_regex(
+ r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
+ webpage, 'uploader', fatal=False)
+ thumbnail_part = self._html_search_regex(
+ r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
+ 'thumbnail', fatal=False)
+ if thumbnail_part is None:
+ thumbnail = None
+ else:
+ thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)
+ description = self._html_search_regex(
+ r'(?s)<p class="desc">(.*?)</p>', webpage, 'description')
+
+ info = json.loads(params['infoData'][0])[0]
+ duration = info.get('media_length')
+ upload_date_candidate = info.get('media_section5')
+ upload_date = (
+ upload_date_candidate
+ if (upload_date_candidate is not None and
+ re.match(r'[0-9]{8}$', upload_date_candidate))
+ else None)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 4d649fe71..6a2ce0d6d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -25,6 +25,7 @@ from ..utils import (
from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
+from .smotri import SmotriIE
class GenericIE(InfoExtractor):
@@ -81,6 +82,17 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Brightcove'],
},
+ {
+ 'url': 'http://www.championat.com/video/football/v/87/87499.html',
+ 'md5': 'fb973ecf6e4a78a67453647444222983',
+ 'info_dict': {
+ 'id': '3414141473001',
+ 'ext': 'mp4',
+ 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
+ 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
+ 'uploader': 'Championat',
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -102,20 +114,6 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get
},
},
- # second style of embedded ooyala videos
- {
- 'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html',
- 'info_dict': {
- 'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk',
- 'ext': 'mp4',
- 'title': 'Behind-the-scenes: Financial Review Sunday ',
- 'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -197,6 +195,36 @@ class GenericIE(InfoExtractor):
'description': 'No description',
},
},
+ # arte embed
+ {
+ 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
+ 'md5': '7653032cbb25bf6c80d80f217055fa43',
+ 'info_dict': {
+ 'id': '048195-004_PLUS7-F',
+ 'ext': 'flv',
+ 'title': 'X:enius',
+ 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
+ 'upload_date': '20140320',
+ },
+ 'params': {
+ 'skip_download': 'Requires rtmpdump'
+ }
+ },
+ # smotri embed
+ {
+ 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
+ 'md5': 'ec40048448e9284c9a1de77bb188108b',
+ 'info_dict': {
+ 'id': 'v27008541fad',
+ 'ext': 'mp4',
+ 'title': 'Крым и Севастополь вошли в состав России',
+ 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
+ 'duration': 900,
+ 'upload_date': '20140318',
+ 'uploader': 'rbctv_2012_4',
+ 'uploader_id': 'rbctv_2012_4',
+ },
+ },
]
def report_download_webpage(self, video_id):
@@ -285,13 +313,16 @@ class GenericIE(InfoExtractor):
if not parsed_url.scheme:
default_search = self._downloader.params.get('default_search')
if default_search is None:
- default_search = 'auto'
+ default_search = 'auto_warning'
- if default_search == 'auto':
+ if default_search in ('auto', 'auto_warning'):
if '/' in url:
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
else:
+ if default_search == 'auto_warning':
+ self._downloader.report_warning(
+ 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url)
return self.url_result('ytsearch:' + url)
else:
assert ':' in default_search
@@ -525,6 +556,18 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
+ # Look for embedded arte.tv player
+ mobj = re.search(
+ r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+
+ # Look for embedded smotri.com player
+ smotri_url = SmotriIE._extract_url(webpage)
+ if smotri_url:
+ return self.url_result(smotri_url, 'Smotri')
+
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index 0d1ea6802..94e7cf790 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -21,9 +21,10 @@ class HuffPostIE(InfoExtractor):
_TEST = {
'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
- 'file': '52dd3e4b02a7602131000677.mp4',
'md5': '55f5e8981c1c80a64706a44b74833de8',
'info_dict': {
+ 'id': '52dd3e4b02a7602131000677',
+ 'ext': 'mp4',
'title': 'Legalese It! with @MikeSacksHP',
'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
'duration': 1549,
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 381af91e4..cfeaa4146 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -1,10 +1,8 @@
+from __future__ import unicode_literals
+
import re
-import json
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
class IGNIE(InfoExtractor):
@@ -14,52 +12,57 @@ class IGNIE(InfoExtractor):
"""
_VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)'
- IE_NAME = u'ign.com'
+ IE_NAME = 'ign.com'
_CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
- _DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>',
- r'id="my_show_video">.*?<p>(.*?)</p>',
- ]
+ _DESCRIPTION_RE = [
+ r'<span class="page-object-description">(.+?)</span>',
+ r'id="my_show_video">.*?<p>(.*?)</p>',
+ ]
_TESTS = [
{
- u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
- u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
- u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
- u'info_dict': {
- u'title': u'The Last of Us Review',
- u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
+ 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+ 'md5': 'eac8bdc1890980122c3b66f14bdd02e9',
+ 'info_dict': {
+ 'id': '8f862beef863986b2785559b9e1aa599',
+ 'ext': 'mp4',
+ 'title': 'The Last of Us Review',
+ 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
}
},
{
- u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
- u'playlist': [
+ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+ 'playlist': [
{
- u'file': u'5ebbd138523268b93c9141af17bec937.mp4',
- u'info_dict': {
- u'title': u'GTA 5 Video Review',
- u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+ 'info_dict': {
+ 'id': '5ebbd138523268b93c9141af17bec937',
+ 'ext': 'mp4',
+ 'title': 'GTA 5 Video Review',
+ 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
},
},
{
- u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
- u'info_dict': {
- u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
- u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
+ 'info_dict': {
+ 'id': '638672ee848ae4ff108df2a296418ee2',
+ 'ext': 'mp4',
+ 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+ 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
},
},
],
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
},
]
def _find_video_id(self, webpage):
- res_id = [r'data-video-id="(.+?)"',
- r'<object id="vid_(.+?)"',
- r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
- ]
+ res_id = [
+ r'data-video-id="(.+?)"',
+ r'<object id="vid_(.+?)"',
+ r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+ ]
return self._search_regex(res_id, webpage, 'video id')
def _real_extract(self, url):
@@ -68,7 +71,7 @@ class IGNIE(InfoExtractor):
page_type = mobj.group('type')
webpage = self._download_webpage(url, name_or_id)
if page_type == 'articles':
- video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url')
+ video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, 'video url')
return self.url_result(video_url, ie='IGN')
elif page_type != 'video':
multiple_urls = re.findall(
@@ -80,41 +83,37 @@ class IGNIE(InfoExtractor):
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE,
- webpage, 'video description',
- flags=re.DOTALL)
+ webpage, 'video description', flags=re.DOTALL)
result['description'] = description
return result
def _get_video_info(self, video_id):
config_url = self._CONFIG_URL_TEMPLATE % video_id
- config = json.loads(self._download_webpage(config_url, video_id,
- u'Downloading video info'))
+ config = self._download_json(config_url, video_id)
media = config['playlist']['media']
- video_url = media['url']
- return {'id': media['metadata']['videoId'],
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'title': media['metadata']['title'],
- 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
- }
+ return {
+ 'id': media['metadata']['videoId'],
+ 'url': media['url'],
+ 'title': media['metadata']['title'],
+ 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
+ }
class OneUPIE(IGNIE):
- """Extractor for 1up.com, it uses the ign videos system."""
-
_VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
_TEST = {
- u'url': u'http://gamevideos.1up.com/video/id/34976',
- u'file': u'34976.mp4',
- u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
- u'info_dict': {
- u'title': u'Sniper Elite V2 - Trailer',
- u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
+ 'url': 'http://gamevideos.1up.com/video/id/34976',
+ 'md5': '68a54ce4ebc772e4b71e3123d413163d',
+ 'info_dict': {
+ 'id': '34976',
+ 'ext': 'mp4',
+ 'title': 'Sniper Elite V2 - Trailer',
+ 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf',
}
}
@@ -123,7 +122,6 @@ class OneUPIE(IGNIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- id = mobj.group('name_or_id')
result = super(OneUPIE, self)._real_extract(url)
- result['id'] = id
+ result['id'] = mobj.group('name_or_id')
return result
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 63141af27..b5372bf7a 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+)
class InstagramIE(InfoExtractor):
@@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor):
'uploader_id': uploader_id,
'description': desc,
}
+
+
+class InstagramUserIE(InfoExtractor):
+ _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+ IE_DESC = 'Instagram user profile'
+ IE_NAME = 'instagram:user'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ uploader_id = mobj.group('username')
+
+ entries = []
+ page_count = 0
+ media_url = 'http://instagram.com/%s/media' % uploader_id
+ while True:
+ page = self._download_json(
+ media_url, uploader_id,
+ note='Downloading page %d ' % (page_count + 1),
+ )
+ page_count += 1
+
+ for it in page['items']:
+ if it.get('type') != 'video':
+ continue
+ like_count = int_or_none(it.get('likes', {}).get('count'))
+ user = it.get('user', {})
+
+ formats = [{
+ 'format_id': k,
+ 'height': v.get('height'),
+ 'width': v.get('width'),
+ 'url': v['url'],
+ } for k, v in it['videos'].items()]
+ self._sort_formats(formats)
+
+ thumbnails_el = it.get('images', {})
+ thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
+
+ title = it.get('caption', {}).get('text', it['id'])
+
+ entries.append({
+ 'id': it['id'],
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'webpage_url': it.get('link'),
+ 'uploader': user.get('full_name'),
+ 'uploader_id': user.get('username'),
+ 'like_count': like_count,
+ 'timestamp': int_or_none(it.get('created_time')),
+ })
+
+ if not page['items']:
+ break
+ max_id = page['items'][-1]['id']
+ media_url = (
+ 'http://instagram.com/%s/media?max_id=%s' % (
+ uploader_id, max_id))
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': uploader_id,
+ 'title': uploader_id,
+ }
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 29658a7d6..75b63cffb 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import re
@@ -11,22 +13,22 @@ from ..aes import (
aes_decrypt_text
)
+
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
_TEST = {
- u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
- u'file': u'1214711.mp4',
- u'md5': u'6e297b7e789329923fcf83abb67c9289',
- u'info_dict': {
- u"title": u"Petite Asian Lady Mai Playing In Bathtub",
- u"age_limit": 18,
+ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
+ 'file': '1214711.mp4',
+ 'md5': '6e297b7e789329923fcf83abb67c9289',
+ 'info_dict': {
+ 'title': 'Petite Asian Lady Mai Playing In Bathtub',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):
embedded_url = mobj.group(1)
return self.url_result(embedded_url)
- video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
- if webpage.find('encrypted=true')!=-1:
- password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
+ video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title')
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, 'video_url'))
+ if 'encrypted=true' in webpage:
+ password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, 'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 50bc883ef..961dd1aa6 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -1,37 +1,39 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
class KickStarterIE(InfoExtractor):
- _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*'
+ _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*'
_TEST = {
- u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
- u"file": u"1404461844.mp4",
- u"md5": u"c81addca81327ffa66c642b5d8b08cab",
- u"info_dict": {
- u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
+ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location',
+ 'md5': 'c81addca81327ffa66c642b5d8b08cab',
+ 'info_dict': {
+ 'id': '1404461844',
+ 'ext': 'mp4',
+ 'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
+ 'description': 'A unique motocross documentary that examines the '
+ 'life and mind of one of sports most elite athletes: Josh Grant.',
},
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
- webpage_src = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'data-video="(.*?)">',
- webpage_src, u'video URL')
- if 'mp4' in video_url:
- ext = 'mp4'
- else:
- ext = 'flv'
- video_title = self._html_search_regex(r"<title>(.*?)</title>",
- webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip()
+ video_url = self._search_regex(r'data-video-url="(.*?)"',
+ webpage, 'video URL')
+ video_title = self._html_search_regex(r'<title>(.*?)</title>',
+ webpage, 'title').rpartition('— Kickstarter')[0].strip()
- results = [{
- 'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'ext': ext,
- }]
- return results
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 465ac4916..07f072924 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -13,8 +13,9 @@ class MetacriticIE(InfoExtractor):
_TEST = {
'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
- 'file': '3698222.mp4',
'info_dict': {
+ 'id': '3698222',
+ 'ext': 'mp4',
'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
'duration': 221,
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
index f1875add5..7d21ea18f 100644
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@@ -14,7 +14,7 @@ from ..utils import (
class MooshareIE(InfoExtractor):
IE_NAME = 'mooshare'
IE_DESC = 'Mooshare.biz'
- _VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})'
+ _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
_TESTS = [
{
diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py
new file mode 100644
index 000000000..4f7a5d2e4
--- /dev/null
+++ b/youtube_dl/extractor/morningstar.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MorningstarIE(InfoExtractor):
+ IE_DESC = 'morningstar.com'
+ _VALID_URL = r'https?://(?:www\.)?morningstar\.com/cover/videocenter\.aspx\?id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
+ 'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
+ 'info_dict': {
+ 'id': '615869',
+ 'ext': 'mp4',
+ 'title': 'Get Ahead of the Curve on 2013 Taxes',
+ 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
+ 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
+ webpage, 'video URL')
+ thumbnail = self._html_search_regex(
+ r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'<div id="mstarDeck".*?>(.*?)</div>',
+ webpage, 'description', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
new file mode 100644
index 000000000..dc727be10
--- /dev/null
+++ b/youtube_dl/extractor/motorsport.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ compat_str,
+ int_or_none,
+)
+
+
+class MotorsportIE(InfoExtractor):
+ IE_DESC = 'motorsport.com'
+ _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
+ 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
+ 'info_dict': {
+ 'id': '7063',
+ 'ext': 'mp4',
+ 'title': 'Red Bull Racing: 2014 Rules Explained',
+ 'duration': 207,
+ 'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
+ 'uploader': 'rainiere',
+ 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+ flashvars_code = self._html_search_regex(
+ r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
+ flashvars = compat_parse_qs(flashvars_code)
+ params = json.loads(flashvars['parameters'][0])
+
+ e = compat_str(int(time.time()) + 24 * 60 * 60)
+ base_video_url = params['location'] + '?e=' + e
+ s = 'h3hg713fh32'
+ h = hashlib.md5(s + base_video_url).hexdigest()
+ video_url = base_video_url + '&h=' + h
+
+ uploader = self._html_search_regex(
+ r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
+ 'uploader', fatal=False)
+
+ return {
+ 'id': params['video_id'],
+ 'display_id': display_id,
+ 'title': params['title'],
+ 'url': video_url,
+ 'description': params.get('description'),
+ 'thumbnail': params.get('main_thumb'),
+ 'duration': int_or_none(params.get('duration')),
+ 'uploader': uploader,
+ }
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
new file mode 100644
index 000000000..42d7a82a5
--- /dev/null
+++ b/youtube_dl/extractor/musicplayon.py
@@ -0,0 +1,75 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MusicPlayOnIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://en.musicplayon.com/play?v=433377',
+ 'info_dict': {
+ 'id': '433377',
+ 'ext': 'mp4',
+ 'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
+ 'description': 'Rick Ross Interview On Chelsea Lately',
+ 'duration': 342,
+ 'uploader': 'ultrafish',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(page)
+ description = self._og_search_description(page)
+ thumbnail = self._og_search_thumbnail(page)
+ duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
+ view_count = self._og_search_property('count', page, fatal=False)
+ uploader = self._html_search_regex(
+ r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
+
+ formats = [
+ {
+ 'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
+ 'ext': 'mp4',
+ }
+ ]
+
+ manifest = self._download_webpage(
+ 'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')
+
+ for entry in manifest.split('#')[1:]:
+ if entry.startswith('EXT-X-STREAM-INF:'):
+ meta, url, _ = entry.split('\n')
+ params = dict(param.split('=') for param in meta.split(',')[1:])
+ formats.append({
+ 'url': url,
+ 'ext': 'mp4',
+ 'tbr': int(params['BANDWIDTH']),
+ 'width': int(params['RESOLUTION'].split('x')[0]),
+ 'height': int(params['RESOLUTION'].split('x')[-1]),
+ 'format_note': params['NAME'].replace('"', '').strip(),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'duration': int_or_none(duration),
+ 'view_count': int_or_none(view_count),
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 7e421610e..633b42f72 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -6,12 +6,13 @@ from .common import InfoExtractor
class NBAIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
_TEST = {
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
- 'file': u'0021200253-okc-bkn-recap.nba.mp4',
'md5': u'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': {
+ 'id': '0021200253-okc-bkn-recap.nba',
+ 'ext': 'mp4',
'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'title': 'Thunder vs. Nets',
},
@@ -19,7 +20,7 @@ class NBAIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
@@ -33,7 +34,6 @@ class NBAIE(InfoExtractor):
return {
'id': shortened_video_id,
'url': video_url,
- 'ext': 'mp4',
'title': title,
'description': description,
}
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 46774317c..517a72561 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -1,12 +1,10 @@
# encoding: utf-8
+from __future__ import unicode_literals
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
@@ -18,57 +16,54 @@ from ..utils import (
class NiconicoIE(InfoExtractor):
- IE_NAME = u'niconico'
- IE_DESC = u'ニコニコ動画'
+ IE_NAME = 'niconico'
+ IE_DESC = 'ニコニコ動画'
_TEST = {
- u'url': u'http://www.nicovideo.jp/watch/sm22312215',
- u'file': u'sm22312215.mp4',
- u'md5': u'd1a75c0823e2f629128c43e1212760f9',
- u'info_dict': {
- u'title': u'Big Buck Bunny',
- u'uploader': u'takuya0301',
- u'uploader_id': u'2698420',
- u'upload_date': u'20131123',
- u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+ 'url': 'http://www.nicovideo.jp/watch/sm22312215',
+ 'md5': 'd1a75c0823e2f629128c43e1212760f9',
+ 'info_dict': {
+ 'id': 'sm22312215',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny',
+ 'uploader': 'takuya0301',
+ 'uploader_id': '2698420',
+ 'upload_date': '20131123',
+ 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
},
- u'params': {
- u'username': u'ydl.niconico@gmail.com',
- u'password': u'youtube-dl',
+ 'params': {
+ 'username': 'ydl.niconico@gmail.com',
+ 'password': 'youtube-dl',
},
}
_VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
_NETRC_MACHINE = 'niconico'
- # If True it will raise an error if no login info is provided
- _LOGIN_REQUIRED = True
def _real_initialize(self):
self._login()
def _login(self):
(username, password) = self._get_login_info()
- # No authentication to be performed
if username is None:
- if self._LOGIN_REQUIRED:
- raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return False
+ # Login is required
+ raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Log in
login_form_strs = {
- u'mail': username,
- u'password': password,
+ 'mail': username,
+ 'password': password,
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
request = compat_urllib_request.Request(
- u'https://secure.nicovideo.jp/secure/login', login_data)
+ 'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage(
- request, u'', note=u'Logging in', errnote=u'Unable to log in')
+ request, None, note='Logging in', errnote='Unable to log in')
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
+ self._downloader.report_warning('unable to log in: bad username or password')
return False
return True
@@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor):
video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
- note=u'Downloading video info page')
+ note='Downloading video info page')
# Get flv info
flv_info_webpage = self._download_webpage(
- u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
- video_id, u'Downloading flv info')
+ 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+ video_id, 'Downloading flv info')
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
@@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor):
url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
try:
user_info = self._download_xml(
- url, video_id, note=u'Downloading user information')
+ url, video_id, note='Downloading user information')
video_uploader = user_info.find('.//nickname').text
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+ except ExtractorError as err:
+ self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
return {
- 'id': video_id,
- 'url': video_real_url,
- 'title': video_title,
- 'ext': video_extension,
- 'format': video_format,
- 'thumbnail': video_thumbnail,
+ 'id': video_id,
+ 'url': video_real_url,
+ 'title': video_title,
+ 'ext': video_extension,
+ 'format': video_format,
+ 'thumbnail': video_thumbnail,
'description': video_description,
- 'uploader': video_uploader,
+ 'uploader': video_uploader,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
- 'view_count': video_view_count,
+ 'view_count': video_view_count,
'webpage_url': video_webpage_url,
}
diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py
new file mode 100644
index 000000000..8447a9b86
--- /dev/null
+++ b/youtube_dl/extractor/ntv.py
@@ -0,0 +1,157 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unescapeHTML
+)
+
+
+class NTVIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.ntv.ru/novosti/863142/',
+ 'info_dict': {
+ 'id': '746000',
+ 'ext': 'flv',
+ 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+ 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+ 'duration': 136,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/video/novosti/750370/',
+ 'info_dict': {
+ 'id': '750370',
+ 'ext': 'flv',
+ 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+ 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+ 'duration': 172,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
+ 'info_dict': {
+ 'id': '747480',
+ 'ext': 'flv',
+ 'title': '«Сегодня». 21 марта 2014 года. 16:00 ',
+ 'description': '«Сегодня». 21 марта 2014 года. 16:00 ',
+ 'duration': 1496,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/kino/Koma_film',
+ 'info_dict': {
+ 'id': '750783',
+ 'ext': 'flv',
+ 'title': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
+ 'description': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
+ 'duration': 28,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
+ 'info_dict': {
+ 'id': '751482',
+ 'ext': 'flv',
+ 'title': '«Дело врачей»: «Деревце жизни»',
+ 'description': '«Дело врачей»: «Деревце жизни»',
+ 'duration': 2590,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ _VIDEO_ID_REGEXES = [
+ r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
+ r'<video embed=[^>]+><id>(\d+)</id>',
+ r'<video restriction[^>]+><key>(\d+)</key>'
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id, 'Downloading page')
+
+ for pattern in self._VIDEO_ID_REGEXES:
+ mobj = re.search(pattern, page)
+ if mobj:
+ break
+
+ if not mobj:
+ raise ExtractorError('No media links available for %s' % video_id)
+
+ video_id = mobj.group(1)
+
+ player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
+ title = unescapeHTML(player.find('./data/title').text)
+ description = unescapeHTML(player.find('./data/description').text)
+
+ video = player.find('./data/video')
+ video_id = video.find('./id').text
+ thumbnail = video.find('./splash').text
+ duration = int(video.find('./totaltime').text)
+ view_count = int(video.find('./views').text)
+ puid22 = video.find('./puid22').text
+
+ apps = {
+ '4': 'video1',
+ '7': 'video2',
+ }
+
+ app = apps[puid22] if puid22 in apps else apps['4']
+
+ formats = []
+ for format_id in ['', 'hi', 'webm']:
+ file = video.find('./%sfile' % format_id)
+ if file is None:
+ continue
+ size = video.find('./%ssize' % format_id)
+ formats.append({
+ 'url': 'rtmp://media.ntv.ru/%s' % app,
+ 'app': app,
+ 'play_path': file.text,
+ 'rtmp_conn': 'B:1',
+ 'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
+ 'page_url': 'http://www.ntv.ru',
+ 'flash_ver': 'LNX 11,2,202,341',
+ 'rtmp_live': True,
+ 'ext': 'flv',
+ 'filesize': int(size.text),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py
new file mode 100644
index 000000000..38971ab4d
--- /dev/null
+++ b/youtube_dl/extractor/oe1.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+import re
+
+from .common import InfoExtractor
+
+# audios on oe1.orf.at are only available for 7 days, so we can't
+# add tests.
+
+
+class OE1IE(InfoExtractor):
+ IE_DESC = 'oe1.orf.at'
+ _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ show_id = mobj.group('id')
+
+ data = self._download_json(
+ 'http://oe1.orf.at/programm/%s/konsole' % show_id,
+ show_id
+ )
+
+ timestamp = datetime.datetime.strptime('%s %s' % (
+ data['item']['day_label'],
+ data['item']['time']
+ ), '%d.%m.%Y %H:%M')
+ unix_timestamp = calendar.timegm(timestamp.utctimetuple())
+
+ return {
+ 'id': show_id,
+ 'title': data['item']['title'],
+ 'url': data['item']['url_stream'],
+ 'ext': 'mp3',
+ 'description': data['item'].get('info'),
+ 'timestamp': unix_timestamp
+ }
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index e20327791..13f12824c 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -7,7 +7,7 @@ from ..utils import unescapeHTML
class OoyalaIE(InfoExtractor):
- _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
+ _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
_TEST = {
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 58f9c690e..718fe9aba 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -1,44 +1,81 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..utils import int_or_none
class PornHdIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
- 'file': '1962.flv',
- 'md5': '35272469887dca97abd30abecc6cdf75',
+ 'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
- "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
- "age_limit": 18,
+ 'id': '1962',
+ 'ext': 'mp4',
+ 'title': 'Sierra loves doing laundry',
+ 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('video_id')
- video_title = mobj.group('video_title')
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- next_url = self._html_search_regex(
- r'&hd=(http.+?)&', webpage, 'video URL')
- next_url = compat_urllib_parse.unquote(next_url)
+ title = self._og_search_title(webpage)
+ TITLE_SUFFIX = ' porn HD Video | PornHD.com '
+ if title.endswith(TITLE_SUFFIX):
+ title = title[:-len(TITLE_SUFFIX)]
+
+ description = self._html_search_regex(
+ r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+ view_count = int_or_none(self._html_search_regex(
+ r'(\d+) views </span>', webpage, 'view count', fatal=False))
+
+ formats = [
+ {
+ 'url': format_url,
+ 'ext': format.lower(),
+ 'format_id': '%s-%s' % (format.lower(), quality.lower()),
+ 'quality': 1 if quality.lower() == 'high' else 0,
+ } for format, quality, format_url in re.findall(
+ r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
+ ]
+
+ mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
+ if mobj:
+ flashvars = json.loads(mobj.group('flashvars'))
+ formats.extend([
+ {
+ 'url': flashvars['hashlink'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-low',
+ 'quality': 0,
+ },
+ {
+ 'url': flashvars['hd'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-high',
+ 'quality': 1,
+ }
+ ])
+ thumbnail = flashvars['urlWallpaper']
+ else:
+ thumbnail = self._og_search_thumbnail(webpage)
- video_url = self._download_webpage(
- next_url, video_id, note='Retrieving video URL',
- errnote='Could not retrieve video URL')
- age_limit = 18
+ self._sort_formats(formats)
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
- 'age_limit': age_limit,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 33054591b..d2d909136 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import os
@@ -5,45 +7,50 @@ from .common import InfoExtractor
class PyvideoIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
- _TESTS = [{
- u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
- u'file': u'24_4WWkSmNo.mp4',
- u'md5': u'de317418c8bc76b1fd8633e4f32acbc6',
- u'info_dict': {
- u"title": u"Become a logging expert in 30 minutes",
- u"description": u"md5:9665350d466c67fb5b1598de379021f7",
- u"upload_date": u"20130320",
- u"uploader": u"NextDayVideo",
- u"uploader_id": u"NextDayVideo",
+ _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
+
+ _TESTS = [
+ {
+ 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
+ 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+ 'info_dict': {
+ 'id': '24_4WWkSmNo',
+ 'ext': 'mp4',
+ 'title': 'Become a logging expert in 30 minutes',
+ 'description': 'md5:9665350d466c67fb5b1598de379021f7',
+ 'upload_date': '20130320',
+ 'uploader': 'NextDayVideo',
+ 'uploader_id': 'NextDayVideo',
+ },
+ 'add_ie': ['Youtube'],
},
- u'add_ie': ['Youtube'],
- },
- {
- u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
- u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12',
- u'info_dict': {
- u'id': u'2542',
- u'ext': u'm4v',
- u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
+ {
+ 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
+ 'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
+ 'info_dict': {
+ 'id': '2542',
+ 'ext': 'm4v',
+ 'title': 'Gloriajw-SpotifyWithErikBernhardsson182',
+ },
},
- },
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+
webpage = self._download_webpage(url, video_id)
- m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
+ m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
if m_youtube is not None:
return self.url_result(m_youtube.group(1), 'Youtube')
- title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>',
- webpage, u'title', flags=re.DOTALL)
- video_url = self._search_regex([r'<source src="(.*?)"',
- r'<dt>Download</dt>.*?<a href="(.+?)"'],
- webpage, u'video url', flags=re.DOTALL)
+ title = self._html_search_regex(
+ r'<div class="section">.*?<h3>([^>]+?)</h3>', webpage, 'title', flags=re.DOTALL)
+ video_url = self._search_regex(
+ [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'],
+ webpage, 'video url', flags=re.DOTALL)
+
return {
'id': video_id,
'title': os.path.splitext(title)[0],
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
index 34652f6c1..09352ed82 100644
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@@ -1,4 +1,6 @@
# coding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -6,16 +8,17 @@ from .common import InfoExtractor
class RadioFranceIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
- IE_NAME = u'radiofrance'
+ IE_NAME = 'radiofrance'
_TEST = {
- u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
- u'file': u'one-one.ogg',
- u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
- u'info_dict': {
- u"title": u"One to one",
- u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
- u"uploader": u"Thomas Hercouët",
+ 'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+ 'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+ 'info_dict': {
+ 'id': 'one-one',
+ 'ext': 'ogg',
+ "title": "One to one",
+ "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+ "uploader": "Thomas Hercouët",
},
}
@@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor):
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title')
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
- webpage, u'description', fatal=False)
+ webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
- webpage, u'uploader', fatal=False)
+ webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
- webpage, u'audio URLs')
+ webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
+ 'preference': i,
}
- for fm in
- re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
+ for i, fm in
+ enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
- # No sorting, we don't know any more about these formats
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py
index d339e6cb5..41638c1d0 100644
--- a/youtube_dl/extractor/roxwel.py
+++ b/youtube_dl/extractor/roxwel.py
@@ -1,5 +1,6 @@
+from __future__ import unicode_literals
+
import re
-import json
from .common import InfoExtractor
from ..utils import unified_strdate, determine_ext
@@ -9,41 +10,44 @@ class RoxwelIE(InfoExtractor):
_VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
_TEST = {
- u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html',
- u'file': u'passionpittakeawalklive.flv',
- u'md5': u'd9dea8360a1e7d485d2206db7fe13035',
- u'info_dict': {
- u'title': u'Take A Walk (live)',
- u'uploader': u'Passion Pit',
- u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
+ 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html',
+ 'info_dict': {
+ 'id': 'passionpittakeawalklive',
+ 'ext': 'flv',
+ 'title': 'Take A Walk (live)',
+ 'uploader': 'Passion Pit',
+ 'uploader_id': 'passionpit',
+ 'upload_date': '20120928',
+ 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
},
- u'skip': u'Requires rtmpdump',
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
filename = mobj.group('filename')
info_url = 'http://www.roxwel.com/api/videos/%s' % filename
- info_page = self._download_webpage(info_url, filename,
- u'Downloading video info')
+ info = self._download_json(info_url, filename)
- self.report_extraction(filename)
- info = json.loads(info_page)
rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
best_rate = rtmp_rates[-1]
url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
- rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url')
+ rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url')
ext = determine_ext(rtmp_url)
if ext == 'f4v':
rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
- return {'id': filename,
- 'title': info['title'],
- 'url': rtmp_url,
- 'ext': 'flv',
- 'description': info['description'],
- 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
- 'uploader': info['artist'],
- 'uploader_id': info['artistname'],
- 'upload_date': unified_strdate(info['dbdate']),
- }
+ return {
+ 'id': filename,
+ 'title': info['title'],
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'description': info['description'],
+ 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+ 'uploader': info['artist'],
+ 'uploader_id': info['artistname'],
+ 'upload_date': unified_strdate(info['dbdate']),
+ }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
new file mode 100644
index 000000000..40224d761
--- /dev/null
+++ b/youtube_dl/extractor/rts.py
@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ unescapeHTML,
+ compat_str,
+)
+
+
+class RTSIE(InfoExtractor):
+ IE_DESC = 'RTS.ch'
+ _VALID_URL = r'^https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-.*?\.html'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
+ 'md5': '753b877968ad8afaeddccc374d4256a5',
+ 'info_dict': {
+ 'id': '3449373',
+ 'ext': 'mp4',
+ 'duration': 1488,
+ 'title': 'Les Enfants Terribles',
+ 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
+ 'uploader': 'Divers',
+ 'upload_date': '19680921',
+ 'timestamp': -40280400,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
+ 'md5': 'c197f0b2421995c63a64cc73d800f42e',
+ 'info_dict': {
+ 'id': '5738317',
+ 'ext': 'mp4',
+ 'duration': 55,
+ 'title': 'Bande de lancement de Passe-moi les jumelles',
+ 'description': '',
+ 'uploader': 'Passe-moi les jumelles',
+ 'upload_date': '20140404',
+ 'timestamp': 1396635300,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
+ 'md5': 'b4326fecd3eb64a458ba73c73e91299d',
+ 'info_dict': {
+ 'id': '5745975',
+ 'ext': 'mp4',
+ 'duration': 48,
+ 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
+ 'description': 'Hockey - Playoff',
+ 'uploader': 'Hockey',
+ 'upload_date': '20140403',
+ 'timestamp': 1396556882,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ 'skip': 'Blocked outside Switzerland',
+ },
+ {
+ 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
+ 'md5': '9bb06503773c07ce83d3cbd793cebb91',
+ 'info_dict': {
+ 'id': '5745356',
+ 'ext': 'mp4',
+ 'duration': 33,
+ 'title': 'Londres cachée par un épais smog',
+ 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
+ 'uploader': 'Le Journal en continu',
+ 'upload_date': '20140403',
+ 'timestamp': 1396537322,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
+ 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
+ 'info_dict': {
+ 'id': '5706148',
+ 'ext': 'mp3',
+ 'duration': 123,
+ 'title': '"Urban Hippie", de Damien Krisl',
+ 'description': 'Des Hippies super glam.',
+ 'upload_date': '20140403',
+ 'timestamp': 1396551600,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ def download_json(video_id):
+ return self._download_json(
+ 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id)
+
+ all_info = download_json(video_id)
+
+ # video_id extracted out of URL is not always a real id
+ if 'video' not in all_info and 'audio' not in all_info:
+ page = self._download_webpage(url, video_id)
+ video_id = self._html_search_regex(r'<(?:video|audio) data-id="(\d+)"', page, 'video id')
+ all_info = download_json(video_id)
+
+ info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
+
+ upload_timestamp = parse_iso8601(info.get('broadcast_date'))
+ duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
+ if isinstance(duration, compat_str):
+ duration = parse_duration(duration)
+ view_count = info.get('plays')
+ thumbnail = unescapeHTML(info.get('preview_image_url'))
+
+ def extract_bitrate(url):
+ return int_or_none(self._search_regex(
+ r'-([0-9]+)k\.', url, 'bitrate', default=None))
+
+ formats = [{
+ 'format_id': fid,
+ 'url': furl,
+ 'tbr': extract_bitrate(furl),
+ } for fid, furl in info['streams'].items()]
+
+ if 'media' in info:
+ formats.extend([{
+ 'format_id': '%s-%sk' % (media['ext'], media['rate']),
+ 'url': 'http://download-video.rts.ch/%s' % media['url'],
+ 'tbr': media['rate'] or extract_bitrate(media['url']),
+ } for media in info['media'] if media.get('rate')])
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': info['title'],
+ 'description': info.get('intro'),
+ 'duration': duration,
+ 'view_count': view_count,
+ 'uploader': info.get('programName'),
+ 'timestamp': upload_timestamp,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 4922dd764..f1ce66433 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
import itertools
from .common import InfoExtractor
@@ -20,8 +19,9 @@ class RutubeIE(InfoExtractor):
_TEST = {
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
- 'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4',
'info_dict': {
+ 'id': '3eac3b4561676c17df9132a9a1e62e3e',
+ 'ext': 'mp4',
'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ',
'duration': 80,
@@ -38,15 +38,15 @@ class RutubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
-
- api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id,
- video_id, 'Downloading video JSON')
- video = json.loads(api_response)
-
- api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
- video_id, 'Downloading trackinfo JSON')
- trackinfo = json.loads(api_response)
-
+
+ video = self._download_json(
+ 'http://rutube.ru/api/video/%s/?format=json' % video_id,
+ video_id, 'Downloading video JSON')
+
+ trackinfo = self._download_json(
+ 'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
+ video_id, 'Downloading trackinfo JSON')
+
# Some videos don't have the author field
author = trackinfo.get('author') or {}
m3u8_url = trackinfo['video_balancer'].get('m3u8')
@@ -79,10 +79,9 @@ class RutubeChannelIE(InfoExtractor):
def _extract_videos(self, channel_id, channel_title=None):
entries = []
for pagenum in itertools.count(1):
- api_response = self._download_webpage(
+ page = self._download_json(
self._PAGE_TEMPLATE % (channel_id, pagenum),
channel_id, 'Downloading page %s' % pagenum)
- page = json.loads(api_response)
results = page['results']
if not results:
break
@@ -108,10 +107,9 @@ class RutubeMovieIE(RutubeChannelIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
movie_id = mobj.group('id')
- api_response = self._download_webpage(
+ movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON')
- movie = json.loads(api_response)
movie_name = movie['name']
return self._extract_videos(movie_id, movie_name)
diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py
deleted file mode 100644
index d68646d24..000000000
--- a/youtube_dl/extractor/slashdot.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import re
-
-from .common import InfoExtractor
-
-
-class SlashdotIE(InfoExtractor):
- _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
-
- _TEST = {
- u'add_ie': ['Ooyala'],
- u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
- u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
- u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
- u'info_dict': {
- u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url')
- return self.url_result(ooyala_url, 'Ooyala')
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 540c55703..13e7e71cb 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -13,22 +13,24 @@ from ..utils import (
compat_urllib_request,
ExtractorError,
url_basename,
+ int_or_none,
)
class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
- _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+ _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
_NETRC_MACHINE = 'smotri'
_TESTS = [
# real video id 2610366
{
'url': 'http://smotri.com/video/view/?id=v261036632ab',
- 'file': 'v261036632ab.mp4',
'md5': '2a7b08249e6f5636557579c368040eb9',
'info_dict': {
+ 'id': 'v261036632ab',
+ 'ext': 'mp4',
'title': 'катастрофа с камер видеонаблюдения',
'uploader': 'rbc2008',
'uploader_id': 'rbc08',
@@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor):
# real video id 57591
{
'url': 'http://smotri.com/video/view/?id=v57591cb20',
- 'file': 'v57591cb20.flv',
'md5': '830266dfc21f077eac5afd1883091bcd',
'info_dict': {
+ 'id': 'v57591cb20',
+ 'ext': 'flv',
'title': 'test',
'uploader': 'Support Photofile@photofile',
'uploader_id': 'support-photofile',
@@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor):
# video-password
{
'url': 'http://smotri.com/video/view/?id=v1390466a13c',
- 'file': 'v1390466a13c.mp4',
'md5': 'f6331cef33cad65a0815ee482a54440b',
'info_dict': {
+ 'id': 'v1390466a13c',
+ 'ext': 'mp4',
'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
'uploader': 'timoxa40',
'uploader_id': 'timoxa40',
@@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor):
# age limit + video-password
{
'url': 'http://smotri.com/video/view/?id=v15408898bcf',
- 'file': 'v15408898bcf.flv',
'md5': '91e909c9f0521adf5ee86fbe073aad70',
'info_dict': {
+ 'id': 'v15408898bcf',
+ 'ext': 'flv',
'title': 'этот ролик не покажут по ТВ',
'uploader': 'zzxxx',
'uploader_id': 'ueggb',
@@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor):
'params': {
'videopassword': '333'
}
- }
+ },
+ # swf player
+ {
+ 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
+ 'md5': '4d47034979d9390d14acdf59c4935bc2',
+ 'info_dict': {
+ 'id': 'v9188090500',
+ 'ext': 'mp4',
+ 'title': 'Shakira - Don\'t Bother',
+ 'uploader': 'HannahL',
+ 'uploader_id': 'lisaha95',
+ 'upload_date': '20090331',
+ 'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
+ 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
+ },
+ },
]
_SUCCESS = 0
@@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor):
_PASSWORD_DETECTED = 2
_VIDEO_NOT_FOUND = 3
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
+ webpage)
+ if mobj is not None:
+ return mobj.group('url')
+
+ mobj = re.search(
+ r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
+ <div\s+class="video_image">[^<]+</div>\s*
+ <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
+ if mobj is not None:
+ return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
+
def _search_meta(self, name, html, display_name=None):
if display_name is None:
display_name = name
@@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor):
# Video JSON does not provide enough meta data
# We will extract some from the video web page instead
- video_page_url = 'http://' + mobj.group('url')
+ video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
# Warning if video is unavailable
@@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor):
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'duration': video_duration,
- 'view_count': video_view_count,
+ 'view_count': int_or_none(video_view_count),
'age_limit': 18 if adult_content else 0,
'video_page_url': video_page_url
}
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 2f254f023..8893699aa 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -100,7 +100,7 @@ class SoundcloudIE(InfoExtractor):
def report_resolve(self, video_id):
"""Report information extraction."""
- self.to_screen(u'%s: Resolving id' % video_id)
+ self.to_screen('%s: Resolving id' % video_id)
@classmethod
def _resolv_url(cls, url):
@@ -124,45 +124,46 @@ class SoundcloudIE(InfoExtractor):
'description': info['description'],
'thumbnail': thumbnail,
}
+ formats = []
if info.get('downloadable', False):
# We can build a direct link to the song
format_url = (
'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
track_id, self._CLIENT_ID))
- result['formats'] = [{
+ formats.append({
'format_id': 'download',
'ext': info.get('original_format', 'mp3'),
'url': format_url,
'vcodec': 'none',
- }]
- else:
- # We have to retrieve the url
- streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
- 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
- stream_json = self._download_webpage(
- streams_url,
- track_id, 'Downloading track url')
-
- formats = []
- format_dict = json.loads(stream_json)
- for key, stream_url in format_dict.items():
- if key.startswith(u'http'):
- formats.append({
- 'format_id': key,
- 'ext': ext,
- 'url': stream_url,
- 'vcodec': 'none',
- })
- elif key.startswith(u'rtmp'):
- # The url doesn't have an rtmp app, we have to extract the playpath
- url, path = stream_url.split('mp3:', 1)
- formats.append({
- 'format_id': key,
- 'url': url,
- 'play_path': 'mp3:' + path,
- 'ext': ext,
- 'vcodec': 'none',
- })
+ 'preference': 10,
+ })
+
+ # We have to retrieve the url
+ streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
+ stream_json = self._download_webpage(
+ streams_url,
+ track_id, 'Downloading track url')
+
+ format_dict = json.loads(stream_json)
+ for key, stream_url in format_dict.items():
+ if key.startswith('http'):
+ formats.append({
+ 'format_id': key,
+ 'ext': ext,
+ 'url': stream_url,
+ 'vcodec': 'none',
+ })
+ elif key.startswith('rtmp'):
+ # The url doesn't have an rtmp app, we have to extract the playpath
+ url, path = stream_url.split('mp3:', 1)
+ formats.append({
+ 'format_id': key,
+ 'url': url,
+ 'play_path': 'mp3:' + path,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
if not formats:
# We fallback to the stream_url in the original info, this
@@ -188,7 +189,7 @@ class SoundcloudIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
track_id = mobj.group('track_id')
token = None
@@ -226,7 +227,7 @@ class SoundcloudSetIE(SoundcloudIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
# extract uploader (which is in the url)
uploader = mobj.group(1)
@@ -243,7 +244,7 @@ class SoundcloudSetIE(SoundcloudIE):
info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
- self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
+ self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
return
self.report_extraction(full_title)
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index ad1a46c33..a8d8e8b29 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -18,12 +18,14 @@ class TEDIE(SubtitlesInfoExtractor):
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
+ |
+ (?P<type_watch>watch)/[^/]+/[^/]+
)
(/lang/(.*?))? # The url may contain the language
- /(?P<name>\w+) # Here goes the name and then ".html"
+ /(?P<name>[\w-]+) # Here goes the name and then ".html"
.*)$
'''
- _TEST = {
+ _TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
@@ -36,7 +38,17 @@ class TEDIE(SubtitlesInfoExtractor):
'actively fooling us.'),
'uploader': 'Dan Dennett',
}
- }
+ }, {
+ 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
+ 'md5': '226f4fb9c62380d11b7995efa4c87994',
+ 'info_dict': {
+ 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+ 'ext': 'mp4',
+ 'title': 'Vishal Sikka: The beauty and power of algorithms',
+ 'thumbnail': 're:^https?://.+\.jpg',
+ 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
+ }
+ }]
_FORMATS_PREFERENCE = {
'low': 1,
@@ -57,6 +69,8 @@ class TEDIE(SubtitlesInfoExtractor):
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url, name)
+ elif m.group('type_watch'):
+ return self._watch_info(url, name)
else:
return self._playlist_videos_info(url, name)
@@ -123,3 +137,26 @@ class TEDIE(SubtitlesInfoExtractor):
else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
+
+ def _watch_info(self, url, name):
+ webpage = self._download_webpage(url, name)
+
+ config_json = self._html_search_regex(
+ r"data-config='([^']+)", webpage, 'config')
+ config = json.loads(config_json)
+ video_url = config['video']['url']
+ thumbnail = config.get('image', {}).get('url')
+
+ title = self._html_search_regex(
+ r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
+ description = self._html_search_regex(
+ r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
+ webpage, 'description', fatal=False)
+
+ return {
+ 'id': name,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 2c5c88be8..fdae17b1b 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -1,33 +1,37 @@
# coding: utf-8
+from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
+
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
+ _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
_TEST = {
- u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
- u'file': u'10635995.mp4',
- u'md5': u'2e378cc28b9957607d5e88f274e637d8',
- u'info_dict': {
- u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
- u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+ 'info_dict': {
+ 'id': '10635995',
+ 'ext': 'mp4',
+ 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
+ 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
},
- u'skip': u'Sometimes wat serves the whole file with the --test option',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- id = mobj.group(1)
- webpage = self._download_webpage(url, id)
- embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"',
- webpage, 'embed url')
- embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page')
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ embed_url = self._html_search_regex(
+ r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed player page')
wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
- wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info')
- wat_info = json.loads(wat_info)['media']
- wat_url = wat_info['url']
- return self.url_result(wat_url, 'Wat')
+ wat_info = self._download_json(
+ 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
+ return self.url_result(wat_info['media']['url'], 'Wat')
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
new file mode 100644
index 000000000..5d06fcc9e
--- /dev/null
+++ b/youtube_dl/extractor/urort.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+)
+
+
+class UrortIE(InfoExtractor):
+ IE_DESC = 'NRK P3 Urørt'
+ _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+
+ _TEST = {
+ 'url': 'https://urort.p3.no/#!/Band/Gerilja',
+ 'md5': '5ed31a924be8a05e47812678a86e127b',
+ 'info_dict': {
+ 'id': '33124-4',
+ 'ext': 'mp3',
+ 'title': 'The Bomb',
+ 'thumbnail': 're:^https?://.+\.jpg',
+ 'like_count': int,
+ 'uploader': 'Gerilja',
+ 'uploader_id': 'Gerilja',
+ 'upload_date': '20100323',
+ },
+ 'params': {
+ 'matchtitle': '^The Bomb$', # To test, we want just one video
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+
+ fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
+ json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
+ songs = self._download_json(json_url, playlist_id)
+ print(songs[0])
+
+ entries = [{
+ 'id': '%d-%s' % (s['BandId'], s['$id']),
+ 'title': s['Title'],
+ 'url': s['TrackUrl'],
+ 'ext': 'mp3',
+ 'uploader_id': playlist_id,
+ 'uploader': s.get('BandName', playlist_id),
+ 'like_count': s.get('LikeCount'),
+ 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+ 'upload_date': unified_strdate(s.get('Released')),
+ } for s in songs]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index c90feefd2..d16993daf 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -4,26 +4,99 @@ import re
import json
from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import (
+ compat_urllib_request,
+ int_or_none,
+)
class VeohIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
-
- _TEST = {
- 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- 'file': '56314296.mp4',
- 'md5': '620e68e6a3cff80086df3348426c9ca3',
- 'info_dict': {
- 'title': 'Straight Backs Are Stronger',
- 'uploader': 'LUMOback',
- 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
+ 'info_dict': {
+ 'id': '56314296',
+ 'ext': 'mp4',
+ 'title': 'Straight Backs Are Stronger',
+ 'uploader': 'LUMOback',
+ 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ },
+ },
+ {
+ 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+ 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+ 'info_dict': {
+ 'id': '27701988',
+ 'ext': 'mp4',
+ 'title': 'Chile workers cover up to avoid skin damage',
+ 'description': 'md5:2bd151625a60a32822873efc246ba20d',
+ 'uploader': 'afp-news',
+ 'duration': 123,
+ },
+ },
+ {
+ 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+ 'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+ 'note': 'Embedded ooyala video',
+ 'info_dict': {
+ 'id': '69525809',
+ 'ext': 'mp4',
+ 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+ 'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+ 'uploader': 'newsy-videos',
+ },
+ },
+ ]
+
+ def _extract_formats(self, source):
+ formats = []
+ link = source.get('aowPermalink')
+ if link:
+ formats.append({
+ 'url': link,
+ 'ext': 'mp4',
+ 'format_id': 'aow',
+ })
+ link = source.get('fullPreviewHashLowPath')
+ if link:
+ formats.append({
+ 'url': link,
+ 'format_id': 'low',
+ })
+ link = source.get('fullPreviewHashHighPath')
+ if link:
+ formats.append({
+ 'url': link,
+ 'format_id': 'high',
+ })
+ return formats
+
+ def _extract_video(self, source):
+ return {
+ 'id': source.get('videoId'),
+ 'title': source.get('title'),
+ 'description': source.get('description'),
+ 'thumbnail': source.get('highResImage') or source.get('medResImage'),
+ 'uploader': source.get('username'),
+ 'duration': int_or_none(source.get('length')),
+ 'view_count': int_or_none(source.get('views')),
+ 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
+ 'formats': self._extract_formats(source),
}
- }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+
+ if video_id.startswith('v'):
+ rsp = self._download_xml(
+ r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML')
+ if rsp.get('stat') == 'ok':
+ return self._extract_video(rsp.find('./videoList/video'))
+
webpage = self._download_webpage(url, video_id)
age_limit = 0
if 'class="adultwarning-container"' in webpage:
@@ -33,24 +106,16 @@ class VeohIE(InfoExtractor):
request.add_header('Cookie', 'confirmedAdult=true')
webpage = self._download_webpage(request, video_id)
- m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+ m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage)
if m_youtube is not None:
youtube_id = m_youtube.group(1)
self.to_screen('%s: detected Youtube video.' % video_id)
return self.url_result(youtube_id, 'Youtube')
- self.report_extraction(video_id)
- info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
- info = json.loads(info)
- video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+ info = json.loads(
+ self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\''))
- return {
- 'id': info['videoId'],
- 'title': info['title'],
- 'url': video_url,
- 'uploader': info['username'],
- 'thumbnail': info.get('highResImage') or info.get('medResImage'),
- 'description': info['description'],
- 'view_count': info['views'],
- 'age_limit': age_limit,
- }
+ video = self._extract_video(info)
+ video['age_limit'] = age_limit
+
+ return video
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
deleted file mode 100644
index 87812d6af..000000000
--- a/youtube_dl/extractor/vice.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..utils import ExtractorError
-
-
-class ViceIE(InfoExtractor):
- _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
-
- _TEST = {
- u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4',
- u'info_dict': {
- u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- },
- u'params': {
- # Requires ffmpeg (m3u8 manifest)
- u'skip_download': True,
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- webpage = self._download_webpage(url, name)
- try:
- ooyala_url = self._og_search_video_url(webpage)
- except ExtractorError:
- try:
- embed_code = self._search_regex(
- r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage,
- u'ooyala embed code')
- ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
- except ExtractorError:
- raise ExtractorError(u'The page doesn\'t contain a video', expected=True)
- return self.url_result(ooyala_url, ie='Ooyala')
-
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 3b3bec92f..8b1432fec 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
- _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
+ _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644
index 000000000..cb8f0887d
--- /dev/null
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+ 'playlist': [{
+ 'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+ 'info_dict': {
+ 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+ 'ext': 'mp4',
+ 'title': 'Breaking Points: The Paper Mine',
+ 'duration': 1287,
+ 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+ 'uploader': 'The Washington Post',
+ 'timestamp': 1395527908,
+ 'upload_date': '20140322',
+ },
+ }, {
+ 'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+ 'info_dict': {
+ 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+ 'ext': 'mp4',
+ 'title': 'The town bureaucracy sustains',
+ 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+ 'duration': 2217,
+ 'timestamp': 1395528005,
+ 'upload_date': '20140322',
+ 'uploader': 'The Washington Post',
+ },
+ }]
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ page_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, page_id)
+ title = self._og_search_title(webpage)
+ uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
+ entries = []
+ for i, uuid in enumerate(uuids, start=1):
+ vinfo_all = self._download_json(
+ 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
+ page_id,
+ transform_source=strip_jsonp,
+ note='Downloading information of video %d/%d' % (i, len(uuids))
+ )
+ vinfo = vinfo_all[0]['contentConfig']
+ uploader = vinfo.get('credits', {}).get('source')
+ timestamp = int_or_none(
+ vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
+
+ formats = [{
+ 'format_id': (
+ '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
+ if s.get('width')
+ else s.get('type')),
+ 'vbr': s.get('bitrate') if s.get('width') != 0 else None,
+ 'width': s.get('width'),
+ 'height': s.get('height'),
+ 'acodec': s.get('audioCodec'),
+ 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
+ 'filesize': s.get('fileSize'),
+ 'url': s.get('url'),
+ 'ext': 'mp4',
+ 'protocol': {
+ 'MP4': 'http',
+ 'F4F': 'f4m',
+ }.get(s.get('type'))
+ } for s in vinfo.get('streams', [])]
+ source_media_url = vinfo.get('sourceMediaURL')
+ if source_media_url:
+ formats.append({
+ 'format_id': 'source_media',
+ 'url': source_media_url,
+ })
+ self._sort_formats(formats)
+ entries.append({
+ 'id': uuid,
+ 'title': vinfo['title'],
+ 'description': vinfo.get('blurb'),
+ 'uploader': uploader,
+ 'formats': formats,
+ 'duration': int_or_none(vinfo.get('videoDuration'), 100),
+ 'timestamp': timestamp,
+ })
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': page_id,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 4fab6c6e8..a584e0896 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -1,37 +1,37 @@
# coding: utf-8
+from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-
from ..utils import (
unified_strdate,
)
class WatIE(InfoExtractor):
- _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
+ _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
IE_NAME = 'wat.tv'
_TEST = {
- u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
- u'file': u'10631273.mp4',
- u'md5': u'd8b2231e1e333acd12aad94b80937e19',
- u'info_dict': {
- u'title': u'World War Z - Philadelphia VOST',
- u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ 'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+ 'info_dict': {
+ 'id': '10631273',
+ 'ext': 'mp4',
+ 'title': 'World War Z - Philadelphia VOST',
+ 'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
},
- u'skip': u'Sometimes wat serves the whole file with the --test option',
}
-
+
def download_video_info(self, real_id):
# 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them
- info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
- info = json.loads(info)
+ info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
return info['media']
-
def _real_extract(self, url):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
@@ -56,17 +56,17 @@ class WatIE(InfoExtractor):
entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
return self.playlist_result(entries, real_id, video_info['title'])
+ upload_date = None
+ if 'date_diffusion' in first_chapter:
+ upload_date = unified_strdate(first_chapter['date_diffusion'])
# Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url
- info = {'id': real_id,
- 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
- 'ext': 'mp4',
- 'title': first_chapter['title'],
- 'thumbnail': first_chapter['preview'],
- 'description': first_chapter['description'],
- 'view_count': video_info['views'],
- }
- if 'date_diffusion' in first_chapter:
- info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
-
- return info
+ return {
+ 'id': real_id,
+ 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+ 'title': first_chapter['title'],
+ 'thumbnail': first_chapter['preview'],
+ 'description': first_chapter['description'],
+ 'view_count': video_info['views'],
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index 500b9146f..63691aa67 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -4,9 +4,10 @@ import re
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
+ compat_parse_qs,
compat_urlparse,
determine_ext,
+ unified_strdate,
)
@@ -111,4 +112,85 @@ class WDRIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
- } \ No newline at end of file
+ }
+
+
+class WDRMausIE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
+ IE_DESC = 'Sendung mit der Maus'
+ _TESTS = [{
+ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
+ 'info_dict': {
+ 'id': 'aktuelle-sendung',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^http://.+\.jpg',
+ 'upload_date': 're:^[0-9]{8}$',
+ 'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
+ }
+ }, {
+ 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
+ 'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
+ 'info_dict': {
+ 'id': '40_jahre_maus',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^http://.+\.jpg',
+ 'upload_date': '20131007',
+ 'title': '12.03.2011 - 40 Jahre Maus',
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ param_code = self._html_search_regex(
+ r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')
+
+ title_date = self._search_regex(
+ r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
+ webpage, 'air date')
+ title_str = self._html_search_regex(
+ r'<h1>(.*?)</h1>', webpage, 'title')
+ title = '%s - %s' % (title_date, title_str)
+ upload_date = unified_strdate(
+ self._html_search_meta('dc.date', webpage))
+
+ fields = compat_parse_qs(param_code)
+ video_url = fields['firstVideo'][0]
+ thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
+
+ formats = [{
+ 'format_id': 'rtmp',
+ 'url': video_url,
+ }]
+
+ jscode = self._download_webpage(
+ 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
+ video_id, fatal=False,
+ note='Downloading URL translation table',
+ errnote='Could not download URL translation table')
+ if jscode:
+ for m in re.finditer(
+ r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
+ jscode):
+ if video_url.startswith(m.group('stream')):
+ http_url = video_url.replace(
+ m.group('stream'), m.group('dl'))
+ formats.append({
+ 'format_id': 'http',
+ 'url': http_url,
+ })
+ break
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
+
+# TODO test _1 \ No newline at end of file
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 79fd53e0c..c27dda944 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -3,11 +3,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
class WimpIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.wimp.com/maruexhausted/',
'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'info_dict': {
@@ -16,7 +17,20 @@ class WimpIE(InfoExtractor):
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
- }
+ }, {
+ # youtube video
+ 'url': 'http://www.wimp.com/clowncar/',
+ 'info_dict': {
+ 'id': 'cG4CEr2aiSg',
+ 'ext': 'mp4',
+ 'title': 'Basset hound clown car...incredible!',
+ 'description': 'md5:8d228485e0719898c017203f900b3a35',
+ 'uploader': 'Gretchen Hoey',
+ 'uploader_id': 'gretchenandjeff1',
+ 'upload_date': '20140303',
+ },
+ 'add_ie': ['Youtube'],
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -24,6 +38,13 @@ class WimpIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
+ if YoutubeIE.suitable(video_url):
+ self.to_screen('Found YouTube video')
+ return {
+ '_type': 'url',
+ 'url': video_url,
+ 'ie_key': YoutubeIE.ie_key(),
+ }
return {
'id': video_id,
@@ -31,4 +52,4 @@ class WimpIE(InfoExtractor):
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index fc9237a3f..4e89acd81 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -7,14 +9,14 @@ class WorldStarHipHopIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
_TEST = {
"url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
- "file": "wshh6a7q1ny0G34ZwuIO.mp4",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
+ "id": "wshh6a7q1ny0G34ZwuIO",
+ "ext": "mp4",
"title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
}
-
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
@@ -23,41 +25,32 @@ class WorldStarHipHopIE(InfoExtractor):
m_vevo_id = re.search(r'videoId=(.*?)&amp?',
webpage_src)
-
if m_vevo_id is not None:
- self.to_screen(u'Vevo video detected:')
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
- video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
- webpage_src, u'video URL')
+ video_url = self._search_regex(
+ r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
if 'youtube' in video_url:
- self.to_screen(u'Youtube video detected:')
return self.url_result(video_url, ie='Youtube')
- if 'mp4' in video_url:
- ext = 'mp4'
- else:
- ext = 'flv'
-
- video_title = self._html_search_regex(r"<title>(.*)</title>",
- webpage_src, u'title')
+ video_title = self._html_search_regex(
+ r"<title>(.*)</title>", webpage_src, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
- webpage_src, u'thumbnail', fatal=False)
-
+ thumbnail = self._html_search_regex(
+ r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
+ fatal=False)
if not thumbnail:
_title = r"""candytitles.*>(.*)</span>"""
mobj = re.search(_title, webpage_src)
if mobj is not None:
video_title = mobj.group(1)
- results = [{
- 'id': video_id,
- 'url' : video_url,
- 'title' : video_title,
- 'thumbnail' : thumbnail,
- 'ext' : ext,
- }]
- return results
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'thumbnail': thumbnail,
+ }
+
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
new file mode 100644
index 000000000..71bd7c463
--- /dev/null
+++ b/youtube_dl/extractor/xbef.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+)
+
+
+class XBefIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
+ 'md5': 'a478b565baff61634a98f5e5338be995',
+ 'info_dict': {
+ 'id': '5119',
+ 'ext': 'mp4',
+ 'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
+ 'age_limit': 18,
+ 'thumbnail': 're:^http://.*\.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
+
+ config_url_enc = self._download_webpage(
+ 'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
+ note='Retrieving config URL')
+ config_url = compat_urllib_parse.unquote(config_url_enc)
+ config = self._download_xml(
+ config_url, video_id, note='Retrieving config')
+
+ video_url = config.find('./file').text
+ thumbnail = config.find('./image').text
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18,
+ }
+
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 033a9d893..b293e2665 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -75,4 +75,40 @@ class XTubeIE(InfoExtractor):
'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
- } \ No newline at end of file
+ }
+
+class XTubeUserIE(InfoExtractor):
+ IE_DESC = 'XTube user profile'
+ _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ username = mobj.group('username')
+
+ profile_page = self._download_webpage(
+ url, username, note='Retrieving profile page')
+
+ video_count = int(self._search_regex(
+ r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page,
+ 'video count'))
+
+ PAGE_SIZE = 25
+ urls = []
+ page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+ for n in range(1, page_count + 1):
+ lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
+ lpage = self._download_webpage(
+ lpage_url, username,
+ note='Downloading page %d/%d' % (n, page_count))
+ urls.extend(
+ re.findall(r'addthis:url="([^"]+)"', lpage))
+
+ return {
+ '_type': 'playlist',
+ 'id': username,
+ 'entries': [{
+ '_type': 'url',
+ 'url': eurl,
+ 'ie_key': 'XTube',
+ } for eurl in urls]
+ }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index d92d14f71..e2cf1ae56 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,22 +15,24 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen'
- _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
+ _VALID_URL = r'https?://screen\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
+ 'id': '214727115',
+ 'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
},
},
{
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- 'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': {
+ 'id': '103000935',
+ 'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
@@ -60,10 +62,9 @@ class YahooIE(InfoExtractor):
'env': 'prod',
'format': 'json',
})
- query_result_json = self._download_webpage(
+ query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, 'Downloading video info')
- query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
@@ -86,7 +87,6 @@ class YahooIE(InfoExtractor):
else:
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
-
formats.append(format_info)
self._sort_formats(formats)
@@ -134,27 +134,25 @@ class YahooSearchIE(SearchInfoExtractor):
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
- for pagenum in itertools.count(0):
+ entries = []
+ for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
- webpage = self._download_webpage(result_url, query,
- note='Downloading results page '+str(pagenum+1))
- info = json.loads(webpage)
+ info = self._download_json(result_url, query,
+ note='Downloading results page '+str(pagenum+1))
m = info['m']
results = info['results']
for (i, r) in enumerate(results):
- if (pagenum * 30) +i >= n:
+ if (pagenum * 30) + i >= n:
break
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
- res['entries'].append(e)
- if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
+ entries.append(e)
+ if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
break
- return res
+ return {
+ '_type': 'playlist',
+ 'id': query,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 3a3a5a39e..334a61833 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,13 +7,13 @@ import itertools
import json
import os.path
import re
-import string
import struct
import traceback
import zlib
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
from ..utils import (
compat_chr,
compat_parse_qs,
@@ -438,113 +438,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode,
- u'Initial JS player signature function name')
-
- functions = {}
-
- def argidx(varname):
- return string.lowercase.index(varname)
-
- def interpret_statement(stmt, local_vars, allow_recursion=20):
- if allow_recursion < 0:
- raise ExtractorError(u'Recursion limit reached')
-
- if stmt.startswith(u'var '):
- stmt = stmt[len(u'var '):]
- ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
- r'=(?P<expr>.*)$', stmt)
- if ass_m:
- if ass_m.groupdict().get('index'):
- def assign(val):
- lvar = local_vars[ass_m.group('out')]
- idx = interpret_expression(ass_m.group('index'),
- local_vars, allow_recursion)
- assert isinstance(idx, int)
- lvar[idx] = val
- return val
- expr = ass_m.group('expr')
- else:
- def assign(val):
- local_vars[ass_m.group('out')] = val
- return val
- expr = ass_m.group('expr')
- elif stmt.startswith(u'return '):
- assign = lambda v: v
- expr = stmt[len(u'return '):]
- else:
- raise ExtractorError(
- u'Cannot determine left side of statement in %r' % stmt)
-
- v = interpret_expression(expr, local_vars, allow_recursion)
- return assign(v)
-
- def interpret_expression(expr, local_vars, allow_recursion):
- if expr.isdigit():
- return int(expr)
-
- if expr.isalpha():
- return local_vars[expr]
-
- m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
- if m:
- member = m.group('member')
- val = local_vars[m.group('in')]
- if member == 'split("")':
- return list(val)
- if member == 'join("")':
- return u''.join(val)
- if member == 'length':
- return len(val)
- if member == 'reverse()':
- return val[::-1]
- slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
- if slice_m:
- idx = interpret_expression(
- slice_m.group('idx'), local_vars, allow_recursion-1)
- return val[idx:]
-
- m = re.match(
- r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
- if m:
- val = local_vars[m.group('in')]
- idx = interpret_expression(m.group('idx'), local_vars,
- allow_recursion-1)
- return val[idx]
-
- m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
- if m:
- a = interpret_expression(m.group('a'),
- local_vars, allow_recursion)
- b = interpret_expression(m.group('b'),
- local_vars, allow_recursion)
- return a % b
-
- m = re.match(
- r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
- if m:
- fname = m.group('func')
- if fname not in functions:
- functions[fname] = extract_function(fname)
- argvals = [int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]
- return functions[fname](argvals)
- raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
- def extract_function(funcname):
- func_m = re.search(
- r'function ' + re.escape(funcname) +
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
- jscode)
- argnames = func_m.group('args').split(',')
-
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in func_m.group('code').split(';'):
- res = interpret_statement(stmt, local_vars)
- return res
- return resf
-
- initial_function = extract_function(funcname)
+ u'Initial JS player signature function name')
+
+ jsi = JSInterpreter(jscode)
+ initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
@@ -1549,12 +1446,15 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
break
more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
- r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
+ r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+ page, u'title')
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
@@ -1712,7 +1612,7 @@ class YoutubeUserIE(InfoExtractor):
class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
+ _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_MAX_RESULTS = 1000
IE_NAME = u'youtube:search'
_SEARCH_KEY = 'ytsearch'
@@ -1723,9 +1623,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
video_ids = []
pagenum = 0
limit = n
+ PAGE_SIZE = 50
- while (50 * pagenum) < limit:
- result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
+ while (PAGE_SIZE * pagenum) < limit:
+ result_url = self._API_URL % (
+ compat_urllib_parse.quote_plus(query.encode('utf-8')),
+ (PAGE_SIZE * pagenum) + 1)
data_json = self._download_webpage(
result_url, video_id=u'query "%s"' % query,
note=u'Downloading page %s' % (pagenum + 1),
@@ -1836,11 +1739,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = []
paging = 0
for i in itertools.count(1):
- info = self._download_webpage(self._FEED_TEMPLATE % paging,
+ info = self._download_json(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
- info = json.loads(info)
- feed_html = info['feed_html']
+ feed_html = info.get('feed_html') or info.get('content_html')
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend(
@@ -1852,7 +1754,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+ IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
_FEED_NAME = 'subscriptions'
_PLAYLIST_TITLE = u'Youtube Subscriptions'
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644
index 000000000..449482d3c
--- /dev/null
+++ b/youtube_dl/jsinterp.py
@@ -0,0 +1,116 @@
+from __future__ import unicode_literals
+
+import re
+
+from .utils import (
+ ExtractorError,
+)
+
+
+class JSInterpreter(object):
+ def __init__(self, code):
+ self.code = code
+ self._functions = {}
+
+ def interpret_statement(self, stmt, local_vars, allow_recursion=20):
+ if allow_recursion < 0:
+ raise ExtractorError('Recursion limit reached')
+
+ if stmt.startswith('var '):
+ stmt = stmt[len('var '):]
+ ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
+ r'=(?P<expr>.*)$', stmt)
+ if ass_m:
+ if ass_m.groupdict().get('index'):
+ def assign(val):
+ lvar = local_vars[ass_m.group('out')]
+ idx = self.interpret_expression(
+ ass_m.group('index'), local_vars, allow_recursion)
+ assert isinstance(idx, int)
+ lvar[idx] = val
+ return val
+ expr = ass_m.group('expr')
+ else:
+ def assign(val):
+ local_vars[ass_m.group('out')] = val
+ return val
+ expr = ass_m.group('expr')
+ elif stmt.startswith('return '):
+ assign = lambda v: v
+ expr = stmt[len('return '):]
+ else:
+ raise ExtractorError(
+ 'Cannot determine left side of statement in %r' % stmt)
+
+ v = self.interpret_expression(expr, local_vars, allow_recursion)
+ return assign(v)
+
+ def interpret_expression(self, expr, local_vars, allow_recursion):
+ if expr.isdigit():
+ return int(expr)
+
+ if expr.isalpha():
+ return local_vars[expr]
+
+ m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
+ if m:
+ member = m.group('member')
+ val = local_vars[m.group('in')]
+ if member == 'split("")':
+ return list(val)
+ if member == 'join("")':
+ return u''.join(val)
+ if member == 'length':
+ return len(val)
+ if member == 'reverse()':
+ return val[::-1]
+ slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
+ if slice_m:
+ idx = self.interpret_expression(
+ slice_m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx:]
+
+ m = re.match(
+ r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
+ if m:
+ val = local_vars[m.group('in')]
+ idx = self.interpret_expression(
+ m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx]
+
+ m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
+ if m:
+ a = self.interpret_expression(
+ m.group('a'), local_vars, allow_recursion)
+ b = self.interpret_expression(
+ m.group('b'), local_vars, allow_recursion)
+ return a % b
+
+ m = re.match(
+ r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+ if m:
+ fname = m.group('func')
+ if fname not in self._functions:
+ self._functions[fname] = self.extract_function(fname)
+ argvals = [int(v) if v.isdigit() else local_vars[v]
+ for v in m.group('args').split(',')]
+ return self._functions[fname](argvals)
+ raise ExtractorError('Unsupported JS expression %r' % expr)
+
+ def extract_function(self, funcname):
+ func_m = re.search(
+ (r'(?:function %s|%s\s*=\s*function)' % (
+ re.escape(funcname), re.escape(funcname))) +
+ r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+ self.code)
+ if func_m is None:
+ raise ExtractorError('Could not find JS function %r' % funcname)
+ argnames = func_m.group('args').split(',')
+
+ def resf(args):
+ local_vars = dict(zip(argnames, args))
+ for stmt in func_m.group('code').split(';'):
+ res = self.interpret_statement(stmt, local_vars)
+ return res
+ return resf
+
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index c22f2cdc6..98b5eccb4 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -53,8 +53,9 @@ class FFmpegPostProcessor(PostProcessor):
if self._downloader.params.get('verbose', False):
self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout,stderr = p.communicate()
+ bcmd = [self._downloader.encode(c) for c in cmd]
+ p = subprocess.Popen(bcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace')
msg = stderr.strip().split('\n')[-1]
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 3574fc615..5f1f664c8 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import calendar
import contextlib
import ctypes
import datetime
@@ -501,13 +502,13 @@ def orderedSet(iterable):
res.append(el)
return res
+
def unescapeHTML(s):
- """
- @param s a string
- """
- assert type(s) == type(u'')
+ if s is None:
+ return None
+ assert type(s) == compat_str
- result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
+ result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
return result
@@ -538,7 +539,6 @@ def encodeFilename(s, for_subprocess=False):
encoding = 'utf-8'
return s.encode(encoding, 'ignore')
-
def decodeOption(optval):
if optval is None:
return optval
@@ -761,6 +761,31 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_response = http_response
+def parse_iso8601(date_str):
+ """ Return a UNIX timestamp from the given date """
+
+ if date_str is None:
+ return None
+
+ m = re.search(
+ r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
+ date_str)
+ if not m:
+ timezone = datetime.timedelta()
+ else:
+ date_str = date_str[:-len(m.group(0))]
+ if not m.group('sign'):
+ timezone = datetime.timedelta()
+ else:
+ sign = 1 if m.group('sign') == '+' else -1
+ timezone = datetime.timedelta(
+ hours=sign * int(m.group('hours')),
+ minutes=sign * int(m.group('minutes')))
+
+ dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
+ return calendar.timegm(dt.timetuple())
+
+
def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD"""
@@ -1126,11 +1151,11 @@ def setproctitle(title):
libc = ctypes.cdll.LoadLibrary("libc.so.6")
except OSError:
return
- title = title
- buf = ctypes.create_string_buffer(len(title) + 1)
- buf.value = title.encode('utf-8')
+ title_bytes = title.encode('utf-8')
+ buf = ctypes.create_string_buffer(len(title_bytes))
+ buf.value = title_bytes
try:
- libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
+ libc.prctl(15, buf, 0, 0, 0)
except AttributeError:
return # Strange libc, just skip this
@@ -1151,8 +1176,12 @@ class HEADRequest(compat_urllib_request.Request):
return "HEAD"
-def int_or_none(v, scale=1):
- return v if v is None else (int(v) // scale)
+def int_or_none(v, scale=1, default=None):
+ return default if v is None else (int(v) // scale)
+
+
+def float_or_none(v, scale=1, default=None):
+ return default if v is None else (float(v) / scale)
def parse_duration(s):
@@ -1160,7 +1189,7 @@ def parse_duration(s):
return None
m = re.match(
- r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
+ r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
if not m:
return None
res = int(m.group('secs'))
@@ -1235,8 +1264,8 @@ class PagedList(object):
def uppercase_escape(s):
return re.sub(
- r'\\U([0-9a-fA-F]{8})',
- lambda m: compat_chr(int(m.group(1), base=16)), s)
+ r'\\U[0-9a-fA-F]{8}',
+ lambda m: m.group(0).decode('unicode-escape'), s)
try:
struct.pack(u'!I', 0)
@@ -1302,3 +1331,7 @@ US_RATINGS = {
'R': 16,
'NC': 18,
}
+
+
+def strip_jsonp(code):
+ return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 266930a7f..f7dd8a5f5 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2014.03.21.5'
+__version__ = '2014.04.04.5'