diff options
73 files changed, 2191 insertions, 686 deletions
diff --git a/.gitignore b/.gitignore index 0422adf44..26dbde73d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc *.pyo +*.class *~ *.DS_Store wine-py2exe/ @@ -32,4 +33,4 @@ test/testdata .tox youtube-dl.zsh .idea -.idea/*
\ No newline at end of file +.idea/* @@ -160,3 +160,4 @@ Erwin de Haan Jens Wille Robin Houtevelts Patrick Griffis +Aidan Rowe diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d15267d7e..c996f03ab 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ -**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: +**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` -$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj +$ youtube-dl -v <your command line> [debug] System config: [] [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] @@ -92,7 +92,9 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. + +After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` @@ -140,16 +142,17 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. 
Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. -8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want. +8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. 
This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. +9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). +10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/__init__.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! 
@@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete + find . -name "*.class" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -44,7 +45,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py + $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py tar: youtube-dl.tar.gz @@ -409,13 +409,18 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. 
+ +For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` ---extract-audio +-x --no-mtime --proxy 127.0.0.1:3128 +-o ~/Movies/%(title)s.%(ext)s ``` +Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. + You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. ### Authentication with `.netrc` file @@ -440,7 +445,11 @@ On Windows you may also need to setup the `%HOME%` environment variable manually # OUTPUT TEMPLATE -The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: +The `-o` option allows users to indicate a template for the output file names. + +**tl;dr:** [navigate me to examples](#output-template-examples). + +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: - `id`: Video identifier - `title`: Video title @@ -449,6 +458,7 @@ The `-o` option allows users to indicate a template for the output file names. 
T - `alt_title`: A secondary title of the video - `display_id`: An alternative identifier for the video - `uploader`: Full name of the video uploader + - `license`: License name the video is licensed under - `creator`: The main artist who created the video - `release_date`: The date (YYYYMMDD) when the video was released - `timestamp`: UNIX timestamp of the moment the video became available @@ -513,7 +523,9 @@ The current default template is `%(title)s-%(id)s.%(ext)s`. In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: -Examples (note on Windows you may need to use double quotes instead of single): +#### Output template examples + +Note on Windows you may need to use double quotes instead of single. ```bash $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc @@ -525,6 +537,9 @@ youtube-dl_test_video_.mp4 # A simple file name # Download YouTube playlist videos in separate directory indexed by video order in a playlist $ youtube-dl -o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re +# Download all playlists of YouTube channel/user keeping each playlist in separate directory: +$ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/user/TheLinuxFoundation/playlists + # Download Udemy course keeping each chapter in separate directory under MyVideos directory in your home $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ @@ -543,6 +558,8 @@ But sometimes you may want to download in a different format, for example when y The general syntax for format selection is `--format FORMAT` or shorter `-f FORMAT` where `FORMAT` is 
a *selector expression*, i.e. an expression that describes format or formats you would like to download. +**tl;dr:** [navigate me to examples](#format-selection-examples). + The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download best quality format of particular file extension served as a single file, e.g. `-f webm` will download best quality format with `webm` extension served as a single file. @@ -588,11 +605,14 @@ You can merge the video and audio of two formats into a single file using `-f <v Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. 
you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see [#5447](https://github.com/rg3/youtube-dl/issues/5447), [#5456](https://github.com/rg3/youtube-dl/issues/5456)). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. 
-Examples (note on Windows you may need to use double quotes instead of single): +#### Format selection examples + +Note on Windows you may need to use double quotes instead of single. + ```bash # Download best mp4 format available or any other best if no mp4 available $ youtube-dl -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' @@ -733,7 +753,7 @@ means you're using an outdated version of Python. Please update to Python 2.6 or ### What is this binary file? Where has the code gone? -Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`. +Since June 2012 ([#342](https://github.com/rg3/youtube-dl/issues/342)) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`. ### The exe throws a *Runtime error from Visual C++* @@ -816,7 +836,9 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. + +After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. 
[Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` @@ -864,16 +886,17 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. -8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want. +8. 
Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. +9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). +10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/__init__.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! 
@@ -935,9 +958,9 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>. Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). -**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: +**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` -$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj +$ youtube-dl -v <your command line> [debug] System config: [] [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b384a3165..43403233d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -30,6 +30,7 @@ - **AlJazeera** - **Allocine** - **AlphaPorno** + - **AnimeOnDemand** - **anitube.se** - **AnySex** - **Aparat** @@ -49,6 +50,7 @@ - **arte.tv:ddc** - **arte.tv:embed** - **arte.tv:future** + - **arte.tv:magazine** - **AtresPlayer** - **ATTTechChannel** - **AudiMedia** @@ -75,6 +77,7 @@ - **BleacherReportCMS** - **blinkx** - **Bloomberg** + - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek - **Break** @@ -360,7 +363,7 @@ - **MySpace:album** - **MySpass** - **Myvi** - - **myvideo** + - **myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - 
**NationalGeographic** @@ -410,6 +413,7 @@ - **NowTV** (Currently broken) - **NowTVList** - **nowvideo**: NowVideo + - **Noz** - **npo**: npo.nl and ntr.nl - **npo.nl:live** - **npo.nl:radio** @@ -460,6 +464,7 @@ - **PornHd** - **PornHub** - **PornHubPlaylist** + - **PornHubUserVideos** - **Pornotube** - **PornoVoisines** - **PornoXO** @@ -522,6 +527,7 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** + - **ScreenJunkies** - **ScreenwaveMedia** - **SenateISVP** - **ServingSys** @@ -555,7 +561,6 @@ - **southpark.de** - **southpark.nl** - **southparkstudios.dk** - - **Space** - **SpankBang** - **Spankwire** - **Spiegel** @@ -615,6 +620,7 @@ - **TMZ** - **TMZArticle** - **TNAFlix** + - **TNAFlixNetworkEmbed** - **toggle** - **tou.tv** - **Toypics**: Toypics user profile @@ -655,6 +661,7 @@ - **twitch:video** - **twitch:vod** - **twitter** + - **twitter:amplify** - **twitter:card** - **Ubu** - **udemy** @@ -664,6 +671,7 @@ - **Urort**: NRK P3 Urørt - **ustream** - **ustream:channel** + - **Ustudio** - **Varzesh3** - **Vbox7** - **VeeHD** @@ -679,7 +687,7 @@ - **video.mit.edu** - **VideoDetective** - **videofy.me** - - **VideoMega** (Currently broken) + - **VideoMega** - **videomore** - **videomore:season** - **videomore:video** diff --git a/test/helper.py b/test/helper.py index bdd7acca4..f2d878212 100644 --- a/test/helper.py +++ b/test/helper.py @@ -11,8 +11,11 @@ import sys import youtube_dl.extractor from youtube_dl import YoutubeDL -from youtube_dl.utils import ( +from youtube_dl.compat import ( + compat_os_name, compat_str, +) +from youtube_dl.utils import ( preferredencoding, write_string, ) @@ -42,7 +45,7 @@ def report_warning(message): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' - if sys.stderr.isatty() and os.name != 'nt': + if sys.stderr.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 
'WARNING:' diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 02caf5908..59f7ab49d 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -234,7 +234,7 @@ class TestFormatSelection(unittest.TestCase): def test_youtube_format_selection(self): order = [ - '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13', + '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', # Apple HTTP Live Streaming '96', '95', '94', '93', '92', '132', '151', # 3D diff --git a/test/test_http.py b/test/test_http.py index f2e305b6f..fc59b1aed 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -52,7 +52,12 @@ class TestHTTP(unittest.TestCase): ('localhost', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) - self.port = self.httpd.socket.getsockname()[1] + if os.name == 'java': + # In Jython SSLSocket is not a subclass of socket.socket + sock = self.httpd.socket.sock + else: + sock = self.httpd.socket + self.port = sock.getsockname()[1] self.server_thread = threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True self.server_thread.start() diff --git a/test/test_iqiyi_sdk_interpreter.py b/test/test_iqiyi_sdk_interpreter.py new file mode 100644 index 000000000..9d95cb606 --- /dev/null +++ b/test/test_iqiyi_sdk_interpreter.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL +from youtube_dl.extractor import IqiyiIE + + +class IqiyiIEWithCredentials(IqiyiIE): + def _get_login_info(self): + return 'foo', 'bar' + + +class WarningLogger(object): + def __init__(self): + self.messages = [] + + def warning(self, msg): + self.messages.append(msg) + + def debug(self, msg): + pass + + def 
error(self, msg): + pass + + +class TestIqiyiSDKInterpreter(unittest.TestCase): + def test_iqiyi_sdk_interpreter(self): + ''' + Test the functionality of IqiyiSDKInterpreter by trying to log in + + If `sign` is incorrect, /validate call throws an HTTP 556 error + ''' + logger = WarningLogger() + ie = IqiyiIEWithCredentials(FakeYDL({'logger': logger})) + ie._login() + self.assertTrue('unable to log in:' in logger.messages[0]) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index a1e416dd5..2bcf8ecf0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ import xml.etree.ElementTree from youtube_dl.utils import ( age_restricted, args_to_str, + encode_base_n, clean_html, DateRange, detect_exe_version, @@ -35,6 +36,7 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, parse_duration, @@ -59,6 +61,7 @@ from youtube_dl.utils import ( lowercase_escape, url_basename, urlencode_postdata, + update_url_query, version_tuple, xpath_with_ns, xpath_element, @@ -74,6 +77,8 @@ from youtube_dl.utils import ( ) from youtube_dl.compat import ( compat_etree_fromstring, + compat_urlparse, + compat_parse_qs, ) @@ -248,6 +253,7 @@ class TestUtil(unittest.TestCase): self.assertEqual( unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') + self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214') self.assertEqual(unified_strdate('25-09-2014'), '20140925') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) @@ -451,6 +457,40 @@ class TestUtil(unittest.TestCase): data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'}) self.assertTrue(isinstance(data, bytes)) + def test_update_url_query(self): + def query_dict(url): + return compat_parse_qs(compat_urlparse.urlparse(url).query) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})), + 
query_dict('http://example.com/path?quality=HD&format=mp4')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})), + query_dict('http://example.com/path?system=LINUX&system=WINDOWS')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': 'id,formats,subtitles'})), + query_dict('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})), + query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path?manifest=f4m', {'manifest': []})), + query_dict('http://example.com/path')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})), + query_dict('http://example.com/path?system=LINUX')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'fields': b'id,formats,subtitles'})), + query_dict('http://example.com/path?fields=id,formats,subtitles')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'width': 1080, 'height': 720})), + query_dict('http://example.com/path?width=1080&height=720')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'bitrate': 5020.43})), + query_dict('http://example.com/path?bitrate=5020.43')) + self.assertEqual(query_dict(update_url_query( + 'http://example.com/path', {'test': '第二行тест'})), + query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) + def test_dict_get(self): FALSE_VALUES = { 'none': None, @@ -792,6 +832,24 @@ The first line {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), ['--check-certificate=true']) + def test_ohdave_rsa_encrypt(self): + N = 
0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd + e = 65537 + + self.assertEqual( + ohdave_rsa_encrypt(b'aa111222', e, N), + '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881') + + def test_encode_base_n(self): + self.assertEqual(encode_base_n(0, 30), '0') + self.assertEqual(encode_base_n(80, 30), '2k') + + custom_table = '9876543210ZYXWVUTSRQPONMLKJIHGFEDCBA' + self.assertEqual(encode_base_n(0, 30, custom_table), '9') + self.assertEqual(encode_base_n(80, 30, custom_table), '7P') + + self.assertRaises(ValueError, encode_base_n, 0, 70) + self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f4324039c..dcc867e45 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,9 +24,6 @@ import time import tokenize import traceback -if os.name == 'nt': - import ctypes - from .compat import ( compat_basestring, compat_cookiejar, @@ -34,6 +31,7 @@ from .compat import ( compat_get_terminal_size, compat_http_client, compat_kwargs, + compat_os_name, compat_str, compat_tokenize_tokenize, compat_urllib_error, @@ -95,6 +93,9 @@ from .postprocessor import ( ) from .version import __version__ +if compat_os_name == 'nt': + import ctypes + class YoutubeDL(object): """YoutubeDL class. 
@@ -450,7 +451,7 @@ class YoutubeDL(object): def to_console_title(self, message): if not self.params.get('consoletitle', False): return - if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) @@ -521,7 +522,7 @@ class YoutubeDL(object): else: if self.params.get('no_warnings'): return - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' @@ -533,7 +534,7 @@ class YoutubeDL(object): Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' @@ -1631,7 +1632,7 @@ class YoutubeDL(object): self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return - if success: + if success and filename != '-': # Fixup content fixup_policy = self.params.get('fixup') if fixup_policy is None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f5f064241..79b389840 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -355,6 +355,7 @@ def _real_main(argv=None): 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, 'fixup': opts.fixup, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 
b497da696..2771fb5fa 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -326,6 +326,9 @@ def compat_ord(c): return ord(c) +compat_os_name = os._name if os.name == 'java' else os.name + + if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser @@ -346,7 +349,7 @@ else: # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib # for different platforms with correct environment variables decoding. - if os.name == 'posix': + if compat_os_name == 'posix': def compat_expanduser(path): """Expand ~ and ~user constructions. If user or $HOME is unknown, do nothing.""" @@ -370,7 +373,7 @@ else: userhome = pwent.pw_dir userhome = userhome.rstrip('/') return (userhome + path[i:]) or '/' - elif os.name == 'nt' or os.name == 'ce': + elif compat_os_name == 'nt' or compat_os_name == 'ce': def compat_expanduser(path): """Expand ~ and ~user constructs. @@ -556,6 +559,7 @@ __all__ = [ 'compat_itertools_count', 'compat_kwargs', 'compat_ord', + 'compat_os_name', 'compat_parse_qs', 'compat_print', 'compat_shlex_split', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2d5154051..f39db58f6 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,6 +5,7 @@ import re import sys import time +from ..compat import compat_os_name from ..utils import ( encodeFilename, error_to_compat_str, @@ -219,7 +220,7 @@ class FileDownloader(object): if self.params.get('progress_with_newline', False): self.to_screen(fullmsg) else: - if os.name == 'nt': + if compat_os_name == 'nt': prev_len = getattr(self, '_report_progress_prev_line_length', 0) if prev_len > len(fullmsg): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bfc2008be..08b3dc673 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -45,6 +45,7 @@ from .arte import ( ArteTVFutureIE, ArteTVCinemaIE, ArteTVDDCIE, + ArteTVMagazineIE, 
ArteTVEmbedIE, ) from .atresplayer import AtresPlayerIE @@ -73,6 +74,7 @@ from .bleacherreport import ( ) from .blinkx import BlinkxIE from .bloomberg import BloombergIE +from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE @@ -351,10 +353,9 @@ from .la7 import LA7IE from .laola1tv import Laola1TvIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE -from .letv import ( - LetvIE, - LetvTvIE, - LetvPlaylistIE, +from .leeco import ( + LeIE, + LePlaylistIE, LetvCloudIE, ) from .libsyn import LibsynIE @@ -505,6 +506,7 @@ from .npr import NprIE from .nrk import ( NRKIE, NRKPlaylistIE, + NRKSkoleIE, NRKTVIE, ) from .ntvde import NTVDeIE @@ -555,6 +557,7 @@ from .pornhd import PornHdIE from .pornhub import ( PornHubIE, PornHubPlaylistIE, + PornHubUserVideosIE, ) from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE @@ -668,7 +671,6 @@ from .southpark import ( SouthParkEsIE, SouthParkNlIE ) -from .space import SpaceIE from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE @@ -736,6 +738,7 @@ from .tmz import ( TMZArticleIE, ) from .tnaflix import ( + TNAFlixNetworkEmbedIE, TNAFlixIE, EMPFlixIE, MovieFapIE, @@ -797,7 +800,11 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE, TwitterIE +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, +) from .ubu import UbuIE from .udemy import ( UdemyIE, @@ -808,6 +815,7 @@ from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import UstudioIE from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 62ed0c918..be40f85b4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ 
b/youtube_dl/extractor/appletrailers.py @@ -12,7 +12,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): IE_NAME = 'appletrailers' - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { @@ -73,6 +73,9 @@ class AppleTrailersIE(InfoExtractor): }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, + }, { + 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/', + 'only_matching': True, }] _JSON_RE = r'iTunes.playURL\((.*?)\);' diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 6ed855a57..efde7e207 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html' + _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): @@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' 
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])' @classmethod def _extract_url_info(cls, url): @@ -102,13 +102,32 @@ class ArteTVPlus7IE(InfoExtractor): iframe_url = find_iframe_url(webpage, None) if not iframe_url: embed_url = self._html_search_regex( - r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url') - player = self._download_json( - embed_url, video_id, 'Downloading player page') - iframe_url = find_iframe_url(player['html']) - json_url = compat_parse_qs( - compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] - return self._extract_from_json_url(json_url, video_id, lang) + r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) + if embed_url: + player = self._download_json( + embed_url, video_id, 'Downloading player page') + iframe_url = find_iframe_url(player['html']) + # en and es URLs produce react-based pages with different layout (e.g. + # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) + if not iframe_url: + program = self._search_regex( + r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', + webpage, 'program', default=None) + if program: + embed_html = self._parse_json(program, video_id) + if embed_html: + iframe_url = find_iframe_url(embed_html['embed_html']) + if iframe_url: + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] + if json_url: + return self._extract_from_json_url(json_url, video_id, lang) + # Differend kind of embed URL (e.g. 
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) + embed_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', + webpage, 'embed url', group='url') + return self.url_result(embed_url) def _extract_from_json_url(self, json_url, video_id, lang): info = self._download_json(json_url, video_id) @@ -116,7 +135,7 @@ class ArteTVPlus7IE(InfoExtractor): upload_date_str = player_info.get('shootingDate') if not upload_date_str: - upload_date_str = player_info.get('VDA', '').split(' ')[0] + upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] title = player_info['VTI'].strip() subtitle = player_info.get('VSU', '').strip() @@ -132,27 +151,30 @@ class ArteTVPlus7IE(InfoExtractor): } qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) + LANGS = { + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + } + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - - langcode = { - 'fr': 'F', - 'de': 'A', - }.get(lang, lang) - lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode] - lang_pref = ( - None if versionCode is None else ( - 10 if any(re.match(r, versionCode) for r in lang_rexs) - else -10)) + langcode = LANGS.get(lang, lang) + lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] + lang_pref = None + if versionCode: + matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] + lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) source_pref = 0 if versionCode is not None: # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A)', versionCode): + if re.match(r'VO-ST(F|A|E)', versionCode): source_pref -= 10 # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A)-STM\1', versionCode): + elif re.match(r'VO?(F|A|E)-STM\1', versionCode): source_pref -= 9 format = { 'format_id': format_id, @@ -185,7 
+207,7 @@ class ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/(?:magazine?/)?(?P<id>[^?#]+)' + _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:magazine?/)?(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', @@ -209,7 +231,7 @@ class ArteTVCreativeIE(ArteTVPlus7IE): class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' - _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(?P<id>.+)' + _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', @@ -217,6 +239,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): 'id': '050940-028-A', 'ext': 'mp4', 'title': 'Les écrevisses aussi peuvent être anxieuses', + 'upload_date': '20140902', }, }, { 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', @@ -226,7 +249,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' - _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' + _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' def _real_extract(self, url): video_id, lang = self._extract_url_info(url) @@ -244,7 +267,7 @@ class ArteTVDDCIE(ArteTVPlus7IE): class ArteTVConcertIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:concert' - _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)' + _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', @@ -261,7 +284,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): class ArteTVCinemaIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:cinema' - _VALID_URL = 
r'https?://cinema\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)' + _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' _TEST = { 'url': 'http://cinema.arte.tv/de/node/38291', @@ -276,6 +299,37 @@ class ArteTVCinemaIE(ArteTVPlus7IE): } +class ArteTVMagazineIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:magazine' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' + + _TESTS = [{ + # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..." + 'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium', + 'md5': '2a9369bcccf847d1c741e51416299f25', + 'info_dict': { + 'id': '065965-000-A', + 'ext': 'mp4', + 'title': 'Trepalium - Extrait Ep.01', + 'upload_date': '20160121', + }, + }, { + # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium" + 'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium', + 'md5': 'fedc64fc7a946110fe311634e79782ca', + 'info_dict': { + 'id': '054813-004_PLUS7-F', + 'ext': 'mp4', + 'title': 'Trepalium (4/6)', + 'description': 'md5:10057003c34d54e95350be4f9b05cb40', + 'upload_date': '20160218', + }, + }, { + 'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis', + 'only_matching': True, + }] + + class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py new file mode 100644 index 000000000..122a1cbb6 --- /dev/null +++ b/youtube_dl/extractor/bokecc.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + 
r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', + webpage, 'player params') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'info_dict': { + 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index cb96c3876..cac8fdcba 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -4,12 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import js_to_json class C56IE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' IE_NAME = '56.com' - _TEST = { + _TESTS = [{ 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', 'md5': 'e59995ac63d0457783ea05f93f12a866', 'info_dict': { @@ -18,12 
+19,29 @@ class C56IE(InfoExtractor): 'title': '网事知多少 第32期:车怒', 'duration': 283.813, }, - } + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + page = self._download_json( 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 8f864699f..7319ee1b7 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from .theplatform import ThePlatformIE -from ..utils import parse_duration +from ..utils import ( + parse_duration, + find_xpath_attr, +) class CBSNewsIE(ThePlatformIE): @@ -46,6 +49,15 @@ class CBSNewsIE(ThePlatformIE): }, ] + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') + return { + 'en': [{ + 'ext': 'ttml', + 'url': closed_caption_e.attrib['value'], + }] + } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + def _real_extract(self, url): video_id = self._match_id(url) @@ -61,12 +73,6 @@ class CBSNewsIE(ThePlatformIE): thumbnail = item.get('mediaImage') or item.get('thumbnail') subtitles = {} - if 'mpxRefId' in video_info: - subtitles['en'] = [{ - 'ext': 'ttml', - 'url': 
'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId'], - }] - formats = [] for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: pid = item.get('media' + format_id) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f411ea763..07bd2cbe2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,13 +15,14 @@ import math from ..compat import ( compat_cookiejar, compat_cookies, + compat_etree_fromstring, compat_getpass, compat_http_client, + compat_os_name, + compat_str, compat_urllib_error, compat_urllib_parse, compat_urlparse, - compat_str, - compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -46,6 +47,7 @@ from ..utils import ( xpath_with_ns, determine_protocol, parse_duration, + mimetype2ext, ) @@ -156,12 +158,14 @@ class InfoExtractor(object): thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. + license: License name the video is licensed under. creator: The main artist who created the video. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {language: subformats}. 
"subformats" is a list sorted from @@ -424,7 +428,7 @@ class InfoExtractor(object): self.to_screen('Saving request to ' + filename) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if os.name == 'nt': + if compat_os_name == 'nt': absfilepath = os.path.abspath(filename) if len(absfilepath) > 259: filename = '\\\\?\\' + absfilepath @@ -593,7 +597,7 @@ class InfoExtractor(object): if mobj: break - if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name @@ -899,6 +903,16 @@ class InfoExtractor(object): item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), formats) + @staticmethod + def _remove_duplicate_formats(formats): + format_urls = set() + unique_formats = [] + for f in formats: + if f['url'] not in format_urls: + format_urls.add(f['url']) + unique_formats.append(f) + formats[:] = unique_formats + def _is_valid_url(self, url, video_id, item='video'): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid @@ -1022,11 +1036,21 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() - # A Media Playlist Tag MUST NOT appear in a Master Playlist - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 - if '#EXT-X-TARGETDURATION' in m3u8_doc: + + # We should try extracting formats only from master playlists [1], i.e. + # playlists that describe available qualities. On the other hand media + # playlists [2] should be returned as is since they contain just the media + # without qualities renditions. 
+ # Fortunately, master playlist can be easily distinguished from media + # playlist based on particular tags availability. As of [1, 2] master + # playlist tags MUST NOT appear in a media playist and vice versa. + # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist + # and MUST NOT appear in master playlist thus we can clearly detect media + # playlist with this criterion. + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, 'format_id': m3u8_id, @@ -1073,19 +1097,29 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, } - codecs = last_info.get('CODECS') - if codecs: - # TODO: looks like video codec is not always necessarily goes first - va_codecs = codecs.split(',') - if va_codecs[0]: - f['vcodec'] = va_codecs[0] - if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + codecs = last_info.get('CODECS') + if codecs: + vcodec, acodec = [None] * 2 + va_codecs = codecs.split(',') + if len(va_codecs) == 1: + # Audio only entries usually come with single codec and + # no resolution. For more robustness we also check it to + # be mp4 audio. 
+ if not resolution and va_codecs[0].startswith('mp4a'): + vcodec, acodec = 'none', va_codecs[0] + else: + vcodec = va_codecs[0] + else: + vcodec, acodec = va_codecs[:2] + f.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1277,16 +1311,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) - if not ext: - type_ = textstream.get('type') - SUBTITLES_TYPES = { - 'text/vtt': 'vtt', - 'text/srt': 'srt', - 'application/smptett+xml': 'tt', - } - if type_ in SUBTITLES_TYPES: - ext = SUBTITLES_TYPES[type_] + ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, @@ -1598,6 +1623,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def mark_watched(self, *args, **kwargs): + if (self._downloader.params.get('mark_watched', False) and + (self._get_login_info()[0] is not None or + self._downloader.params.get('cookiefile') is not None)): + self._mark_watched(*args, **kwargs) + + def _mark_watched(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7ae9f2359..2e6226ea0 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -122,10 +122,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - view_count = str_to_int(self._search_regex( - 
[r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"', - r'video_views_count[^>]+>\s+([\d\.,]+)'], - webpage, 'view count', fatal=False)) + view_count_str = self._search_regex( + (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', + r'video_views_count[^>]+>\s+([\s\d\,.]+)'), + webpage, 'view count', fatal=False) + if view_count_str: + view_count_str = re.sub(r'\s', '', view_count_str) + view_count = str_to_int(view_count_str) comment_count = int_or_none(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', webpage, 'comment count', fatal=False)) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 373b3b4b4..bdc768c78 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:c93d6692dde6fe33809a46edcbecca44', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.douyutv.com/85982', 'info_dict': { @@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Romm not found', + }, { + 'url': 'http://www.douyutv.com/17732', + 'info_dict': { + 'id': '17732', + 'display_id': '17732', + 'ext': 'flv', + 'title': 're:^清晨醒脑!T-ara根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': '7师傅', + 'uploader_id': '431925', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 6cda56a7f..a638c827c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,6 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import json +import re import time from .common import InfoExtractor @@ -8,44 +10,125 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'http://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ + 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', + 'info_dict': { + 'id': '1255600', + 'display_id': 'stagione-1-episodio-25', + 'ext': 'mp4', + 'title': 'Episodio 25', + 'description': 'md5:cae5f40ad988811b197d2d27a53227eb', + 'duration': 2761, + 'timestamp': 1454701800, + 'upload_date': '20160205', + 'creator': 'RTIT', + 'series': 'Take me out', + 'season_number': 1, + 'episode_number': 25, + 'age_limit': 0, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { 'id': '3172', - 'ext': 'mp4', 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', + 'ext': 'flv', 'title': 'Svensken lär sig njuta av livet', + 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', 'duration': 2650, + 'timestamp': 1365454320, + 'upload_date': '20130408', + 'creator': 'Kanal 5 (Home)', + 'series': 'Nugammalt - 77 händelser som format Sverige', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 0, }, - } + }, { + 'url': 
'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'info_dict': { + 'id': '70816', + 'display_id': 'season-6-episode-12', + 'ext': 'flv', + 'title': 'Episode 12', + 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', + 'duration': 2563, + 'timestamp': 1429696800, + 'upload_date': '20150422', + 'creator': 'Kanal 4', + 'series': 'Mig og min mor', + 'season_number': 6, + 'episode_number': 12, + 'age_limit': 0, + }, + }, { + 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', + 'only_matching': True, + }] def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + domain = mobj.group('domain') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( - r'data-video-id="(\d+)"', webpage, 'video id') + r'data-video-id=["\'](\d+)', webpage, 'video id') info = self._download_json( - 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id, + 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), video_id)['data'][0] - self._set_cookie( - 'secure.dplay.se', 'dsc-geo', - '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000)) - # TODO: consider adding support for 'stream_type=hds', it seems to - # require setting some cookies - manifest_url = self._download_json( - 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id, - video_id, 'Getting manifest url for hls stream')['hls'] - formats = self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native') + title = info['title'] + + PROTOCOLS = ('hls', 'hds') + formats = [] + + def extract_formats(protocol, manifest_url): + if protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)) + elif protocol == 'hds': + formats.extend(self._extract_f4m_formats( + manifest_url + 
'&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', + video_id, f4m_id=protocol, fatal=False)) + + domain_tld = domain.split('.')[-1] + if domain_tld in ('se', 'dk'): + for protocol in PROTOCOLS: + self._set_cookie( + 'secure.dplay.%s' % domain_tld, 'dsc-geo', + json.dumps({ + 'countryCode': domain_tld.upper(), + 'expiry': (time.time() + 20 * 60) * 1000, + })) + stream = self._download_json( + 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s' + % (domain_tld, video_id, protocol), video_id, + 'Downloading %s stream JSON' % protocol, fatal=False) + if stream and stream.get(protocol): + extract_formats(protocol, stream[protocol]) + else: + for protocol in PROTOCOLS: + if info.get(protocol): + extract_formats(protocol, info[protocol]) return { 'id': video_id, 'display_id': display_id, - 'title': info['title'], - 'formats': formats, + 'title': title, + 'description': info.get('video_metadata_longDescription'), 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), + 'timestamp': int_or_none(info.get('video_publish_date')), + 'creator': info.get('video_metadata_homeChannel'), + 'series': info.get('video_metadata_show'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ed237f081..6c6c3b1bd 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -34,8 +34,9 @@ class FacebookIE(InfoExtractor): video/video\.php| photo\.php| video\.php| - video/embed - )\?(?:.*?)(?:v|video_id)=| + video/embed| + story\.php + )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)? 
)| facebook: @@ -92,6 +93,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'facebook:544765982287235', 'only_matching': True, + }, { + 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', + 'only_matching': True, }] def _login(self): @@ -186,7 +190,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json(self._search_regex( r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) - for item in server_js_data['instances']: + for item in server_js_data.get('instances', []): if item[1][0] == 'VideoConfig': video_data = video_data_list2dict(item[2][0]['videoData']) break @@ -208,10 +212,13 @@ class FacebookIE(InfoExtractor): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: + preference = -10 if format_id == 'progressive' else 0 + if quality == 'hd': + preference += 5 formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, - 'preference': -10 if format_id == 'progressive' else 0, + 'preference': preference, }) dash_manifest = f[0].get('dash_manifest') if dash_manifest: diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 6f9b003c2..fd535457d 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -52,7 +52,7 @@ class FazIE(InfoExtractor): formats = [] for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): encoding = xpath_element(encodings, code) - if encoding: + if encoding is not None: encoding_url = xpath_text(encoding, 'FILENAME') if encoding_url: formats.append({ diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 8e60cf60f..3f4ac3093 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -289,7 +289,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_id, catalogue = self._html_search_regex( - 
r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 45adbb7a3..ca745ae41 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE @@ -1573,6 +1574,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'VK') + # Look for embedded Odnoklassniki player + mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Odnoklassniki') + # Look for embedded ivi player mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) if mobj is not None: @@ -1628,6 +1634,11 @@ class GenericIE(InfoExtractor): if xhamster_urls: return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + # Look for embedded TNAFlixNetwork player + tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) + if tnaflix_urls: + return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index f354c9c7a..37be34091 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -82,7 +82,7 @@ class GoogleDriveIE(InfoExtractor): 
return { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 02e1e428e..b61b2dc4e 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor): for f_url, f_name in extra_formats] format_pages.append(player_page) - quality = qualities(['SD', '480p', '720p']) + quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] for format_page in format_pages: json_data = self._search_regex( diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 12fb5e8e1..9622f198a 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor): 'url': self._proto_relative_url(thumbnail) } for thumbnail in video.get('thumbnails', [])] - tags = [tag['title'] for tag in video.get('tags', [])] + tags = [tag['title'] for tag in video.get('tags') or []] return { 'id': video.get('id') or video_id, diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 016af2084..cca0b8a93 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,15 +4,12 @@ from __future__ import unicode_literals import base64 -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_parse_qs, -) +from ..compat import compat_urllib_parse_unquote from ..utils import determine_ext +from .bokecc import BokeCCBaseIE -class InfoQIE(InfoExtractor): +class InfoQIE(BokeCCBaseIE): _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ @@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor): }, }] - def _extract_bokecc_videos(self, webpage, video_id): - # TODO: bokecc.com is a Chinese video cloud platform - # It should have an 
independent extractor but I don't have other - # examples using bokecc - player_params_str = self._html_search_regex( - r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', - webpage, 'player params', default=None) - - player_params = compat_parse_qs(player_params_str) - - info_xml = self._download_xml( - 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( - player_params['siteid'][0], player_params['vid'][0]), video_id) - - return [{ - 'format_id': 'bokecc', - 'url': quality.find('./copy').attrib['playurl'], - 'preference': int(quality.attrib['value']), - } for quality in info_xml.findall('./video/quality')] - def _extract_rtmp_videos(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor): if '/cn/' in url: # for China videos, HTTP video URL exists but always fails with 403 - formats = self._extract_bokecc_videos(webpage, video_id) + formats = self._extract_bokecc_formats(webpage, video_id) else: formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 9046705a5..d3bee3a19 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -2,32 +2,173 @@ from __future__ import unicode_literals import hashlib +import itertools import math import os import random +import re import time import uuid from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, ) from ..utils import ( + decode_packed_codes, ExtractorError, + ohdave_rsa_encrypt, + remove_start, sanitized_Request, urlencode_postdata, url_basename, ) +def md5_text(text): + return hashlib.md5(text.encode('utf-8')).hexdigest() + + +class IqiyiSDK(object): + def __init__(self, target, ip, timestamp): + self.target = target + self.ip = ip + self.timestamp = timestamp + + @staticmethod + def split_sum(data): + return 
compat_str(sum(map(lambda p: int(p, 16), list(data)))) + + @staticmethod + def digit_sum(num): + if isinstance(num, int): + num = compat_str(num) + return compat_str(sum(map(int, num))) + + def even_odd(self): + even = self.digit_sum(compat_str(self.timestamp)[::2]) + odd = self.digit_sum(compat_str(self.timestamp)[1::2]) + return even, odd + + def preprocess(self, chunksize): + self.target = md5_text(self.target) + chunks = [] + for i in range(32 // chunksize): + chunks.append(self.target[chunksize * i:chunksize * (i + 1)]) + if 32 % chunksize: + chunks.append(self.target[32 - 32 % chunksize:]) + return chunks, list(map(int, self.ip.split('.'))) + + def mod(self, modulus): + chunks, ip = self.preprocess(32) + self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip)) + + def split(self, chunksize): + modulus_map = { + 4: 256, + 5: 10, + 8: 100, + } + + chunks, ip = self.preprocess(chunksize) + ret = '' + for i in range(len(chunks)): + ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else '' + if chunksize == 8: + ret += ip_part + chunks[i] + else: + ret += chunks[i] + ip_part + self.target = ret + + def handle_input16(self): + self.target = md5_text(self.target) + self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:]) + + def handle_input8(self): + self.target = md5_text(self.target) + ret = '' + for i in range(4): + part = self.target[8 * i:8 * (i + 1)] + ret += self.split_sum(part) + part + self.target = ret + + def handleSum(self): + self.target = md5_text(self.target) + self.target = self.split_sum(self.target) + self.target + + def date(self, scheme): + self.target = md5_text(self.target) + d = time.localtime(self.timestamp) + strings = { + 'y': compat_str(d.tm_year), + 'm': '%02d' % d.tm_mon, + 'd': '%02d' % d.tm_mday, + } + self.target += ''.join(map(lambda c: strings[c], list(scheme))) + + def split_time_even_odd(self): + even, odd = self.even_odd() + self.target = odd + 
md5_text(self.target) + even + + def split_time_odd_even(self): + even, odd = self.even_odd() + self.target = even + md5_text(self.target) + odd + + def split_ip_time_sum(self): + chunks, ip = self.preprocess(32) + self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp) + + def split_time_ip_sum(self): + chunks, ip = self.preprocess(32) + self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip)) + + +class IqiyiSDKInterpreter(object): + def __init__(self, sdk_code): + self.sdk_code = sdk_code + + def run(self, target, ip, timestamp): + self.sdk_code = decode_packed_codes(self.sdk_code) + + functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code) + + sdk = IqiyiSDK(target, ip, timestamp) + + other_functions = { + 'handleSum': sdk.handleSum, + 'handleInput8': sdk.handle_input8, + 'handleInput16': sdk.handle_input16, + 'splitTimeEvenOdd': sdk.split_time_even_odd, + 'splitTimeOddEven': sdk.split_time_odd_even, + 'splitIpTimeSum': sdk.split_ip_time_sum, + 'splitTimeIpSum': sdk.split_time_ip_sum, + } + for function in functions: + if re.match(r'mod\d+', function): + sdk.mod(int(function[3:])) + elif re.match(r'date[ymd]{3}', function): + sdk.date(function[4:]) + elif re.match(r'split\d+', function): + sdk.split(int(function[5:])) + elif function in other_functions: + other_functions[function]() + else: + raise ExtractorError('Unknown funcion %s' % function) + + return sdk.target + + class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _NETRC_MACHINE = 'iqiyi' + _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', 'md5': '2cb594dc2781e6c941a110d8f358118b', @@ -125,6 +266,13 @@ class IqiyiIE(InfoExtractor): }, }], 'expected_warnings': ['Needs a VIP account for full video'], + }, { + 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', + 'info_dict': { + 'id': '202918101', + 'title': '灌篮高手 国语版', + }, + 'playlist_count': 101, }] _FORMATS_MAP = [ @@ 
-136,9 +284,63 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] + def _real_initialize(self): + self._login() + @staticmethod - def md5_text(text): - return hashlib.md5(text.encode('utf-8')).hexdigest() + def _rsa_fun(data): + # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js + N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd + e = 65537 + + return ohdave_rsa_encrypt(data, e, N) + + def _login(self): + (username, password) = self._get_login_info() + + # No authentication to be performed + if not username: + return True + + data = self._download_json( + 'http://kylin.iqiyi.com/get_token', None, + note='Get token for logging', errnote='Unable to get token for logging') + sdk = data['sdk'] + timestamp = int(time.time()) + target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % ( + username, self._rsa_fun(password.encode('utf-8'))) + + interp = IqiyiSDKInterpreter(sdk) + sign = interp.run(target, data['ip'], timestamp) + + validation_params = { + 'target': target, + 'server': 'BEA3AA1908656AABCCFF76582C4C6660', + 'token': data['token'], + 'bird_src': 'f8d91d57af224da7893dd397d52d811a', + 'sign': sign, + 'bird_t': timestamp, + } + validation_result = self._download_json( + 'http://kylin.iqiyi.com/validate?' 
+ compat_urllib_parse.urlencode(validation_params), None, + note='Validate credentials', errnote='Unable to validate credentials') + + MSG_MAP = { + 'P00107': 'please login via the web interface and enter the CAPTCHA code', + 'P00117': 'bad username or password', + } + + code = validation_result['code'] + if code != 'A00000': + msg = MSG_MAP.get(code) + if not msg: + msg = 'error %s' % code + if validation_result.get('msg'): + msg += ': ' + validation_result['msg'] + self._downloader.report_warning('unable to log in: ' + msg) + return False + + return True def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): auth_params = { @@ -199,7 +401,7 @@ class IqiyiIE(InfoExtractor): note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) )['t'] t = str(int(math.floor(int(tm) / (600.0)))) - return self.md5_text(t + mg + x) + return md5_text(t + mg + x) video_urls_dict = {} need_vip_warning_report = True @@ -278,16 +480,16 @@ class IqiyiIE(InfoExtractor): tail = tm + tvid param = { 'key': 'fvip', - 'src': self.md5_text('youtube-dl'), + 'src': md5_text('youtube-dl'), 'tvId': tvid, 'vid': video_id, 'vinfo': 1, 'tm': tm, - 'enc': self.md5_text(enc_key + tail), + 'enc': md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), 'um': 0, - 'authkey': self.md5_text(self.md5_text('') + tail), + 'authkey': md5_text(md5_text('') + tail), 'k_tag': 1, } @@ -296,24 +498,62 @@ class IqiyiIE(InfoExtractor): raw_data = self._download_json(api_url, video_id) return raw_data - def get_enc_key(self, swf_url, video_id): + def get_enc_key(self, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite enc_key = '6ab6d0280511493ba85594779759d4ed' return enc_key + def _extract_playlist(self, webpage): + PAGE_SIZE = 50 + + links = re.findall( + r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"', + webpage) + if not links: + return + + album_id = 
self._search_regex( + r'albumId\s*:\s*(\d+),', webpage, 'album ID') + album_title = self._search_regex( + r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False) + + entries = list(map(self.url_result, links)) + + # Start from 2 because links in the first page are already on webpage + for page_num in itertools.count(2): + pagelist_page = self._download_webpage( + 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE), + album_id, + note='Download playlist page %d' % page_num, + errnote='Failed to download playlist page %d' % page_num) + pagelist = self._parse_json( + remove_start(pagelist_page, 'var tvInfoJs='), album_id) + vlist = pagelist['data']['vlist'] + for item in vlist: + entries.append(self.url_result(item['vurl'])) + if len(vlist) < PAGE_SIZE: + break + + return self.playlist_result(entries, album_id, album_title) + def _real_extract(self, url): webpage = self._download_webpage( url, 'temp_id', note='download video page') + + # There's no simple way to determine whether an URL is a playlist or not + # So detect it + playlist_result = self._extract_playlist(webpage) + if playlist_result: + return playlist_result + tvid = self._search_regex( r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - swf_url = self._search_regex( - r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL') _uuid = uuid.uuid4().hex - enc_key = self.get_enc_key(swf_url, video_id) + enc_key = self.get_enc_key(video_id) raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 8e90d5986..6770685d7 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,33 +7,9 @@ from .common import InfoExtractor from ..utils import int_or_none -class JWPlatformIE(InfoExtractor): - _VALID_URL = 
r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' - _TEST = { - 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', - 'info_dict': { - 'id': 'nPripu9l', - 'ext': 'mov', - 'title': 'Big Buck Bunny Trailer', - 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', - 'upload_date': '20081127', - 'timestamp': 1227796140, - } - } - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) - video_data = json_data['playlist'][0] +class JWPlatformBaseIE(InfoExtractor): + def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): + video_data = jwplayer_data['playlist'][0] subtitles = {} for track in video_data['tracks']: if track['kind'] == 'captions': @@ -43,7 +19,7 @@ class JWPlatformIE(InfoExtractor): for source in video_data['sources']: source_url = self._proto_relative_url(source['file']) source_type = source.get('type') or '' - if source_type == 'application/vnd.apple.mpegurl': + if source_type in ('application/vnd.apple.mpegurl', 'hls'): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) elif source_type.startswith('audio'): @@ -61,10 +37,39 @@ class JWPlatformIE(InfoExtractor): return { 'id': video_id, - 'title': video_data['title'], + 'title': video_data['title'] if require_title else video_data.get('title'), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), 'subtitles': subtitles, 
'formats': formats, } + + +class JWPlatformIE(JWPlatformBaseIE): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + return self._parse_jwplayer_data(json_data, video_id) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index f641edef8..700e44b63 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -68,6 +68,7 @@ class KuwoIE(KuwoBaseIE): 'id': '6446136', 'ext': 'mp3', 'title': '心', + 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a', 'creator': 'IU', 'upload_date': '20150518', }, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py index 9665ece89..df47e88ba 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/leeco.py @@ -1,36 +1,39 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import datetime +import hashlib import re import time -import base64 -import hashlib from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_ord, compat_str, + compat_urllib_parse, ) from ..utils import ( determine_ext, + encode_data_uri, ExtractorError, + int_or_none, + orderedSet, parse_iso8601, 
sanitized_Request, - int_or_none, str_or_none, - encode_data_uri, url_basename, ) -class LetvIE(InfoExtractor): +class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html' + _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + + _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' _TESTS = [{ - 'url': 'http://www.letv.com/ptv/vplay/22005890.html', + 'url': 'http://www.le.com/ptv/vplay/22005890.html', 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', @@ -42,7 +45,7 @@ class LetvIE(InfoExtractor): 'hls_prefer_native': True, }, }, { - 'url': 'http://www.letv.com/ptv/vplay/1415246.html', + 'url': 'http://www.le.com/ptv/vplay/1415246.html', 'info_dict': { 'id': '1415246', 'ext': 'mp4', @@ -54,7 +57,7 @@ class LetvIE(InfoExtractor): }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', - 'url': 'http://www.letv.com/ptv/vplay/1118082.html', + 'url': 'http://www.le.com/ptv/vplay/1118082.html', 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', @@ -94,17 +97,16 @@ class LetvIE(InfoExtractor): return encrypted_data encrypted_data = encrypted_data[5:] - _loc4_ = bytearray() - while encrypted_data: - b = compat_ord(encrypted_data[0]) - _loc4_.extend([b // 16, b & 0x0f]) - encrypted_data = encrypted_data[1:] + _loc4_ = bytearray(2 * len(encrypted_data)) + for idx, val in enumerate(encrypted_data): + b = compat_ord(val) + _loc4_[2 * idx] = b // 16 + _loc4_[2 * idx + 1] = b % 16 idx = len(_loc4_) - 11 _loc4_ = _loc4_[idx:] + _loc4_[:idx] - _loc7_ = bytearray() - while _loc4_: - _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) - _loc4_ = _loc4_[2:] + _loc7_ = bytearray(len(encrypted_data)) + for i in range(len(encrypted_data)): + _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1] return bytes(_loc7_) @@ -117,10 +119,10 @@ class LetvIE(InfoExtractor): 'splatid': 101, 'format': 1, 'tkey': self.calc_time_key(int(time.time())), - 'domain': 
'www.letv.com' + 'domain': 'www.le.com' } play_json_req = sanitized_Request( - 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) + 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) ) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: @@ -193,26 +195,51 @@ class LetvIE(InfoExtractor): } -class LetvTvIE(InfoExtractor): - _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html' +class LePlaylistIE(InfoExtractor): + _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' + _TESTS = [{ - 'url': 'http://www.letv.com/tv/46177.html', + 'url': 'http://www.le.com/tv/46177.html', 'info_dict': { 'id': '46177', 'title': '美人天下', 'description': 'md5:395666ff41b44080396e59570dbac01c' }, 'playlist_count': 35 + }, { + 'url': 'http://tv.le.com/izt/wuzetian/index.html', + 'info_dict': { + 'id': 'wuzetian', + 'title': '武媚娘传奇', + 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' + }, + # This playlist contains some extra videos other than the drama itself + 'playlist_mincount': 96 + }, { + 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', + # This series is moved to http://www.le.com/tv/10005297.html + 'only_matching': True, + }, { + 'url': 'http://www.le.com/comic/92063.html', + 'only_matching': True, + }, { + 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) + def _real_extract(self, url): playlist_id = self._match_id(url) page = self._download_webpage(url, playlist_id) - media_urls = list(set(re.findall( - r'http://www.letv.com/ptv/vplay/\d+.html', page))) - entries = [self.url_result(media_url, ie='Letv') - for media_url in media_urls] + # Currently old domain names are still used in playlists + media_ids = orderedSet(re.findall( + 
r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page)) + entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le') + for media_id in media_ids] title = self._html_search_meta('keywords', page, fatal=False).split(',')[0] @@ -222,31 +249,9 @@ class LetvTvIE(InfoExtractor): playlist_description=description) -class LetvPlaylistIE(LetvTvIE): - _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html' - _TESTS = [{ - 'url': 'http://tv.letv.com/izt/wuzetian/index.html', - 'info_dict': { - 'id': 'wuzetian', - 'title': '武媚娘传奇', - 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' - }, - # This playlist contains some extra videos other than the drama itself - 'playlist_mincount': 96 - }, { - 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', - 'info_dict': { - 'id': 'lswjzzjc', - # The title should be "劲舞青春", but I can't find a simple way to - # determine the playlist title - 'title': '乐视午间自制剧场', - 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' - }, - 'playlist_mincount': 7 - }] - - class LetvCloudIE(InfoExtractor): + # Most of *.letv.com is changed to *.le.com on 2016/01/02 + # but yuntv.letv.com is kept, so also keep the extractor name IE_DESC = '乐视云' _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' @@ -327,7 +332,7 @@ class LetvCloudIE(InfoExtractor): formats.append({ 'url': url, 'ext': determine_ext(decoded_url), - 'format_id': int_or_none(play_url.get('vtype')), + 'format_id': str_or_none(play_url.get('vtype')), 'format_note': str_or_none(play_url.get('definition')), 'width': int_or_none(play_url.get('vwidth')), 'height': int_or_none(play_url.get('vheight')), diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index f8cbca7b3..a8fd639cc 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor): _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _TESTS = [{ - 'url': 
'http://lifenews.ru/news/126342', - 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', + # single video embedded via video/source + 'url': 'http://lifenews.ru/news/98736', + 'md5': '77c95eaefaca216e32a76a343ad89d23', 'info_dict': { - 'id': '126342', + 'id': '98736', 'ext': 'mp4', - 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', - 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', - 'thumbnail': 're:http://.*\.jpg', - 'upload_date': '20140130', + 'title': 'Мужчина нашел дома архив оборонного завода', + 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', + 'upload_date': '20120805', } }, { - # video in <iframe> + # single video embedded via iframe 'url': 'http://lifenews.ru/news/152125', 'md5': '77d19a6f0886cd76bdbf44b4d971a273', 'info_dict': { @@ -42,15 +42,33 @@ class LifeNewsIE(InfoExtractor): 'upload_date': '20150402', } }, { + # two videos embedded via iframe 'url': 'http://lifenews.ru/news/153461', - 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', 'info_dict': { 'id': '153461', - 'ext': 'mp4', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', 'upload_date': '20150505', - } + }, + 'playlist': [{ + 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', + 'info_dict': { + 'id': '153461-video1', + 'ext': 'mp4', + 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', + 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'upload_date': '20150505', + }, + }, { + 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322', + 'info_dict': { + 'id': '153461-video2', + 'ext': 'mp4', + 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', + 
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', + 'upload_date': '20150505', + }, + }], }, { 'url': 'http://lifenews.ru/video/13035', 'only_matching': True, @@ -65,10 +83,14 @@ class LifeNewsIE(InfoExtractor): 'http://lifenews.ru/%s/%s' % (section, video_id), video_id, 'Downloading page') - videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage) - iframe_link = self._html_search_regex( - '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None) - if not videos and not iframe_link: + video_urls = re.findall( + r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) + + iframe_links = re.findall( + r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', + webpage) + + if not video_urls and not iframe_links: raise ExtractorError('No media links available for %s' % video_id) title = remove_end( @@ -95,31 +117,44 @@ class LifeNewsIE(InfoExtractor): 'upload_date': upload_date, } - def make_entry(video_id, media, video_number=None): + def make_entry(video_id, video_url, index=None): cur_info = dict(common_info) cur_info.update({ - 'id': video_id, - 'url': media[1], - 'thumbnail': media[0], - 'title': title if video_number is None else '%s-video%s' % (title, video_number), + 'id': video_id if not index else '%s-video%s' % (video_id, index), + 'url': video_url, + 'title': title if not index else '%s (Видео %s)' % (title, index), }) return cur_info - if iframe_link: - iframe_link = self._proto_relative_url(iframe_link, 'http:') - cur_info = dict(common_info) - cur_info.update({ - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': iframe_link, - }) + def make_video_entry(video_id, video_url, index=None): + video_url = compat_urlparse.urljoin(url, video_url) + return make_entry(video_id, video_url, index) + + def make_iframe_entry(video_id, video_url, 
index=None): + video_url = self._proto_relative_url(video_url, 'http:') + cur_info = make_entry(video_id, video_url, index) + cur_info['_type'] = 'url_transparent' return cur_info - if len(videos) == 1: - return make_entry(video_id, videos[0]) - else: - return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)] + if len(video_urls) == 1 and not iframe_links: + return make_video_entry(video_id, video_urls[0]) + + if len(iframe_links) == 1 and not video_urls: + return make_iframe_entry(video_id, iframe_links[0]) + + entries = [] + + if video_urls: + for num, video_url in enumerate(video_urls, 1): + entries.append(make_video_entry(video_id, video_url, num)) + + if iframe_links: + for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1): + entries.append(make_iframe_entry(video_id, iframe_link, num)) + + playlist = common_info.copy() + playlist.update(self.playlist_result(entries, video_id, title, description)) + return playlist class LifeEmbedIE(InfoExtractor): diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 38fb3d9e4..988436226 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -64,7 +64,7 @@ class LivestreamIE(InfoExtractor): def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') - base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' + base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' formats = [] video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 425fc9e2a..2338e7f96 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -14,7 +14,7 @@ from ..utils import ( class MDRIE(InfoExtractor): 
IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P<id>\d+)(?:_.+?)?\.html' _TESTS = [{ # MDR regularly deletes its videos @@ -60,6 +60,9 @@ class MDRIE(InfoExtractor): }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', 'only_matching': True, + }, { + 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +71,8 @@ class MDRIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_url = self._search_regex( - r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', - webpage, 'data url', group='url') + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', + webpage, 'data url', default=None, group='url').replace('\/', '/') doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 97d5da626..0b4787c1d 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, str_to_int, unified_strdate, ) @@ -12,55 +13,62 @@ from ..utils import ( class MotherlessIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' - _TESTS = [ - { - 'url': 'http://motherless.com/AC3FFE1', - 'md5': '310f62e325a9fafe64f68c0bccb6e75f', - 'info_dict': { - 'id': 'AC3FFE1', - 'ext': 'mp4', - 'title': 'Fucked in the ass while playing PS3', - 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], - 'upload_date': '20100913', - 'uploader_id': 'famouslyfuckedup', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } - }, - { - 'url': 
'http://motherless.com/532291B', - 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', - 'info_dict': { - 'id': '532291B', - 'ext': 'mp4', - 'title': 'Amazing girl playing the omegle game, PERFECT!', - 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'], - 'upload_date': '20140622', - 'uploader_id': 'Sulivana7x', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } + _TESTS = [{ + 'url': 'http://motherless.com/AC3FFE1', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', + 'info_dict': { + 'id': 'AC3FFE1', + 'ext': 'mp4', + 'title': 'Fucked in the ass while playing PS3', + 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], + 'upload_date': '20100913', + 'uploader_id': 'famouslyfuckedup', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } + }, { + 'url': 'http://motherless.com/532291B', + 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', + 'info_dict': { + 'id': '532291B', + 'ext': 'mp4', + 'title': 'Amazing girl playing the omegle game, PERFECT!', + 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', + 'game', 'hairy'], + 'upload_date': '20140622', + 'uploader_id': 'Sulivana7x', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, }, - { - 'url': 'http://motherless.com/g/cosplay/633979F', - 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', - 'info_dict': { - 'id': '633979F', - 'ext': 'mp4', - 'title': 'Turtlette', - 'categories': ['superheroine heroine superher'], - 'upload_date': '20140827', - 'uploader_id': 'shade0230', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } + 'skip': '404', + }, { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, } - ] + }, { + # no keywords + 'url': 
'http://motherless.com/8B4BBC1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '<title>404 - MOTHERLESS.COM<', + ">The page you're looking for cannot be found.<")): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( @@ -86,7 +94,7 @@ class MotherlessIE(InfoExtractor): r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') - categories = self._html_search_meta('keywords', webpage) + categories = self._html_search_meta('keywords', webpage, default=None) if categories: categories = [cat.strip() for cat in categories.split(',')] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e8bb527b8..ed068365d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -11,6 +11,7 @@ from ..utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + float_or_none, HEADRequest, sanitized_Request, unescapeHTML, @@ -110,7 +111,8 @@ class MTVServicesInfoExtractor(InfoExtractor): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) - mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) + mediagen_url = content_el.attrib['url'] # Remove the templates, like &device={device} mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: @@ -165,6 +167,7 @@ class MTVServicesInfoExtractor(InfoExtractor): 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, + 'duration': float_or_none(content_el.attrib.get('duration')), } def _get_feed_query(self, uri): diff --git 
a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index a071378b6..3e2b3e599 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,18 +1,26 @@ from __future__ import unicode_literals +import functools +import os.path import re from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( - parse_duration, int_or_none, + OnDemandPagedList, + parse_duration, + remove_start, xpath_text, xpath_attr, ) class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', 'md5': '9e7729d3010a9c71506fd1248f74e4f4', @@ -44,14 +52,101 @@ class NBAIE(InfoExtractor): 'timestamp': 1432134543, 'upload_date': '20150520', } + }, { + 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', + 'info_dict': { + 'id': '1455672027478-Doc_Feb16_720', + 'ext': 'mp4', + 'title': 'Practice: Doc Rivers - 2/16/16', + 'description': 'Head Coach Doc Rivers addresses the media following practice.', + 'upload_date': '20160217', + 'timestamp': 1455672000, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', + 'info_dict': { + 'id': 'timberwolves', + 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', + }, + 'playlist_count': 30, + 'params': { + # Download the whole playlist takes too long time + 'playlist_items': '1-30', + }, + }, { + 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', + 'info_dict': { + 'id': 'Wigginsmp4', + 'ext': 'mp4', + 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', + 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', + 'upload_date': '20141212', + 'timestamp': 1418418600, + }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, }] + _PAGE_SIZE = 30 + + def _fetch_page(self, team, video_id, page): + search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({ + 'type': 'teamvideo', + 'start': page * self._PAGE_SIZE + 1, + 'npp': (page + 1) * self._PAGE_SIZE + 1, + 'sort': 'recent', + 'output': 'json', + 'site': team, + }) + results = self._download_json( + search_url, video_id, note='Download page %d of playlist data' % page)['results'][0] + for item in results: + yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url'])) + + def _extract_playlist(self, orig_path, video_id, webpage): + team = orig_path.split('/')[0] + + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video because of --no-playlist') + video_path = self._search_regex( + r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path') + video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path) + return self.url_result(video_url) + + self.to_screen('Downloading playlist - add --no-playlist to just download video') + playlist_title = self._og_search_title(webpage, fatal=False) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, team, video_id), + self._PAGE_SIZE, use_cache=True) + + return self.playlist_result(entries, team, playlist_title) + def _real_extract(self, url): path, video_id = re.match(self._VALID_URL, url).groups() + orig_path = path if path.startswith('nba/'): path = path[3:] + + if 'video/' not in path: + webpage = self._download_webpage(url, video_id) + path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/') + + if path == '{{id}}': + return self._extract_playlist(orig_path, video_id, 
webpage) + + # See prepareContentId() of pkgCvp.js + if path.startswith('video/teams'): + path = 'video/channels/proxy/' + path[6:] + video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) - video_id = xpath_text(video_info, 'slug') + video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0] title = xpath_text(video_info, 'headline') description = xpath_text(video_info, 'description') duration = parse_duration(xpath_text(video_info, 'length')) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index a126f5054..3b21fbd4d 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, ExtractorError, @@ -87,7 +90,7 @@ class NRKIE(InfoExtractor): class NRKPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -126,6 +129,37 @@ class NRKPlaylistIE(InfoExtractor): entries, playlist_id, playlist_title, playlist_description) +class NRKSkoleIE(InfoExtractor): + IE_DESC = 'NRK Skole' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532', + 'md5': '04cd85877cc1913bce73c5d28a47e00f', + 'info_dict': { + 'id': '6021', + 'ext': 'flv', + 'title': 'Genetikk og eneggede tvillinger', + 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d', + 'duration': 399, + }, + }, { + 'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed', + 'only_matching': True, + }, { + 'url': 
'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(self._match_id(url)) + + webpage = self._download_webpage(url, video_id) + + nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id') + return self.url_result('nrk:%s' % nrk_id) + + class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index c54775d54..958eb398b 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -112,6 +112,7 @@ class ORFTVthekIE(InfoExtractor): % geo_str), fatal=False) + self._check_formats(formats, video_id) self._sort_formats(formats) upload_date = unified_strdate(sd['created_date']) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index cca012953..f43e3a146 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -338,6 +338,21 @@ class PBSIE(InfoExtractor): }, }, { + # Serves hd only via wigget/partnerplayer page + 'url': 'http://www.pbs.org/video/2365641075/', + 'info_dict': { + 'id': '2365641075', + 'ext': 'mp4', + 'title': 'FRONTLINE - Netanyahu at War', + 'duration': 6852, + 'thumbnail': 're:^https?://.*\.jpg$', + 'formats': 'mincount:8', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, + { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, }, @@ -437,34 +452,54 @@ class PBSIE(InfoExtractor): for vid_id in video_id] return self.playlist_result(entries, display_id) + info = None + redirects = [] + redirect_urls = set() + + def extract_redirect_urls(info): + for encoding_name in ('recommended_encoding', 'alternate_encoding'): + 
redirect = info.get(encoding_name) + if not redirect: + continue + redirect_url = redirect.get('url') + if redirect_url and redirect_url not in redirect_urls: + redirects.append(redirect) + redirect_urls.add(redirect_url) + try: - info = self._download_json( + video_info = self._download_json( 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, display_id, 'Downloading video info JSON') + extract_redirect_urls(video_info) + info = video_info except ExtractorError as e: + # videoInfo API may not work for some videos if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: raise - # videoInfo API may not work for some videos, fallback to portalplayer API + + # Player pages may also serve different qualities + for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( - 'http://player.pbs.org/portalplayer/%s' % video_id, display_id) - info = self._parse_json( - self._search_regex( - r'(?s)PBS\.videoData\s*=\s*({.+?});\n', - player, 'video data', default='{}'), - display_id, transform_source=js_to_json, fatal=False) + 'http://player.pbs.org/%s/%s' % (page, video_id), + display_id, 'Downloading %s page' % page, fatal=False) + if player: + video_info = self._parse_json( + self._search_regex( + r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + player, '%s video data' % page, default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if video_info: + extract_redirect_urls(video_info) + if not info: + info = video_info formats = [] - for encoding_name in ('recommended_encoding', 'alternate_encoding'): - redirect = info.get(encoding_name) - if not redirect: - continue - redirect_url = redirect.get('url') - if not redirect_url: - continue + for num, redirect in enumerate(redirects): + redirect_id = redirect.get('eeid') redirect_info = self._download_json( - redirect_url + '?format=json', display_id, - 'Downloading %s video url info' % encoding_name) + '%s?format=json' % redirect['url'], display_id, + 
'Downloading %s video url info' % (redirect_id or num)) if redirect_info['status'] == 'error': raise ExtractorError( @@ -483,8 +518,9 @@ class PBSIE(InfoExtractor): else: formats.append({ 'url': format_url, - 'format_id': redirect.get('eeid'), + 'format_id': redirect_id, }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) rating_str = info.get('rating') diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 91e574dc2..5a55c25e7 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -11,6 +11,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + int_or_none, sanitized_Request, str_to_int, ) @@ -23,13 +24,18 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', - 'md5': '882f488fa1f0026f023f33576004a2ed', + 'md5': '1e19b41231a02eba417839222ac9d58e', 'info_dict': { 'id': '648719015', 'ext': 'mp4', - 'uploader': 'Babes', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', - 'age_limit': 18 + 'uploader': 'Babes', + 'duration': 361, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, } }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', @@ -67,13 +73,23 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') + flashvars = self._parse_json( + self._search_regex( + r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + video_id) + if flashvars: + video_title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + else: + video_title, thumbnail, duration = [None] * 3 + + if not video_title: + 
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title') + video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) - thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False) - if thumbnail: - thumbnail = compat_urllib_parse_unquote(thumbnail) view_count = self._extract_count( r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') @@ -120,6 +136,7 @@ class PornHubIE(InfoExtractor): 'uploader': video_uploader, 'title': video_title, 'thumbnail': thumbnail, + 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, @@ -129,27 +146,20 @@ class PornHubIE(InfoExtractor): } -class PornHubPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.pornhub.com/playlist/6201671', - 'info_dict': { - 'id': '6201671', - 'title': 'P0p4', - }, - 'playlist_mincount': 35, - }] +class PornHubPlaylistBaseIE(InfoExtractor): + def _extract_entries(self, webpage): + return [ + self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key()) + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) + ] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') - for video_url in set(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) - ] + entries = self._extract_entries(webpage) playlist = self._parse_json( self._search_regex( @@ -158,3 +168,33 @@ class PornHubPlaylistIE(InfoExtractor): return self.playlist_result( entries, playlist_id, playlist.get('title'), playlist.get('description')) + + +class PornHubPlaylistIE(PornHubPlaylistBaseIE): + _VALID_URL = 
r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.pornhub.com/playlist/6201671', + 'info_dict': { + 'id': '6201671', + 'title': 'P0p4', + }, + 'playlist_mincount': 35, + }] + + +class PornHubUserVideosIE(PornHubPlaylistBaseIE): + _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' + _TESTS = [{ + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'info_dict': { + 'id': 'rushandlia', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + return self.playlist_result(self._extract_entries(webpage), user_id) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 603d7bd00..8a8c5d2a0 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -10,6 +10,7 @@ from ..utils import ( ExtractorError, float_or_none, remove_end, + remove_start, sanitized_Request, std_headers, struct_unpack, @@ -178,14 +179,14 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ - 'url': 'http://www.rtve.es/noticias/directo-la-1/', + 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { - 'id': 'directo-la-1', - 'ext': 'flv', - 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', }, 'params': { 'skip_download': 'live stream', @@ -198,23 +199,20 @@ class RTVELiveIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - player_url = self._search_regex( - r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL') - title = remove_end(self._og_search_title(webpage), ' en 
directo') + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + r'playerId=player([0-9]+)', webpage, 'internal video ID') + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) + m3u8_url = _decrypt_url(png) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') return { 'id': video_id, - 'ext': 'flv', 'title': title, - 'url': video_url, - 'app': 'rtve-live-live?ovpfv=2.1.2', - 'player_url': player_url, - 'rtmp_live': True, + 'formats': formats, + 'is_live': True, } diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 2cf210e0d..44b0bbee6 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -70,25 +70,27 @@ class ScreenwaveMediaIE(InfoExtractor): formats = [] for source in sources: - if source['type'] == 'hls': - formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) + file_ = source.get('file') + if not file_: + continue + if source.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4')) else: - file_ = source.get('file') - if not file_: - continue - format_label = source.get('label') format_id = self._search_regex( r'_(.+?)\.[^.]+$', file_, 'format id', default=None) + if not self._is_valid_url(file_, video_id, format_id or 'video'): + continue + format_label = source.get('label') height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_label, 'height', default=None)) formats.append({ - 'url': source['file'], + 'url': file_, 
'format_id': format_id, 'format': format_label, 'ext': source.get('type'), 'height': height, }) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id, diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py deleted file mode 100644 index ebb5d6ec0..000000000 --- a/youtube_dl/extractor/space.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..utils import RegexNotFoundError, ExtractorError - - -class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' - _TEST = { - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', - 'info_dict': { - 'id': '2780937028001', - 'ext': 'mp4', - 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video', - 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61', - 'uploader': 'TechMedia Networks', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - try: - # Some videos require the playerKey field, which isn't define in - # the BrightcoveExperience object - brightcove_url = self._og_search_video_url(webpage) - except RegexNotFoundError: - # Other videos works fine with the info from the object - brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if brightcove_url is None: - raise ExtractorError( - 'The webpage does not contain a video', expected=True) - return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6890021cf..9ee844684 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,8 +48,6 @@ class 
TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1', webpage, 'wat id', group='id') - wat_info = self._download_json( - 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id) - return self.url_result(wat_info['media']['url'], 'Wat') + return self.url_result('wat:%s' % wat_id, 'Wat') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 755f816ff..9a57b49df 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -21,6 +21,8 @@ from ..utils import ( sanitized_Request, unsmuggle_url, xpath_with_ns, + mimetype2ext, + find_xpath_attr, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -30,15 +32,11 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(InfoExtractor): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note) - try: - error_msg = next( - n.attrib['abstract'] - for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') - except StopIteration: - pass - else: - raise ExtractorError(error_msg, expected=True) + error_element = find_xpath_attr( + meta, _x('.//smil:ref'), 'src', + 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') + if error_element is not None: + raise ExtractorError(error_element.attrib['abstract'], expected=True) formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, @@ -68,7 +66,7 @@ class ThePlatformBaseIE(InfoExtractor): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') subtitles[lang] = [{ - 'ext': 'srt' if mime == 'text/srt' else 'ttml', 
+ 'ext': mimetype2ext(mime), 'url': src, }] diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 49516abca..79f036fe4 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -71,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id webpage = self._download_webpage(url, display_id) @@ -117,7 +117,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): title = self._html_search_regex( self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) - age_limit = self._rta_search(webpage) + age_limit = self._rta_search(webpage) or 18 duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) @@ -152,6 +152,36 @@ class TNAFlixNetworkBaseIE(InfoExtractor): } +class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' + + _TITLE_REGEX = r'<title>([^<]+)</title>' + + _TESTS = [{ + 'url': 'https://player.tnaflix.com/video/6538', + 'info_dict': { + 'id': '6538', + 'display_id': '6538', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://player.empflix.com/video/33051', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', + webpage)] + + class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 69882da63..8639293e3 100644 --- 
a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,6 +17,7 @@ from ..utils import ( encode_dict, ExtractorError, int_or_none, + orderedSet, parse_duration, parse_iso8601, sanitized_Request, @@ -281,17 +282,36 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): entries = [] offset = 0 limit = self._PAGE_LIMIT + broken_paging_detected = False + counter_override = None for counter in itertools.count(1): response = self._download_json( self._PLAYLIST_URL % (channel_id, offset, limit), - channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter)) + channel_id, + 'Downloading %s videos JSON page %s' + % (self._PLAYLIST_TYPE, counter_override or counter)) page_entries = self._extract_playlist_page(response) if not page_entries: break + total = int_or_none(response.get('_total')) + # Since the beginning of March 2016 twitch's paging mechanism + # is completely broken on the twitch side. It simply ignores + # a limit and returns the whole offset number of videos. + # Working around by just requesting all videos at once. 
+ if not broken_paging_detected and total and len(page_entries) > limit: + self.report_warning( + 'Twitch paging is broken on twitch side, requesting all videos at once', + channel_id) + broken_paging_detected = True + offset = total + counter_override = '(all at once)' + continue entries.extend(page_entries) + if broken_paging_detected or total and len(page_entries) >= total: + break offset += limit return self.playlist_result( - [self.url_result(entry) for entry in set(entries)], + [self.url_result(entry) for entry in orderedSet(entries)], channel_id, channel_name) def _extract_playlist_page(self, response): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a161f046b..67762a003 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -10,21 +10,26 @@ from ..utils import ( remove_end, int_or_none, ExtractorError, - sanitized_Request, ) -class TwitterCardIE(InfoExtractor): +class TwitterBaseIE(InfoExtractor): + def _get_vmap_video_url(self, vmap_url, video_id): + vmap_data = self._download_xml(vmap_url, video_id) + return xpath_text(vmap_data, './/MediaFile').strip() + + +class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': '4fa26a35f9d1bf4b646590ba8e84be19', + # MD5 checksums are different in different places 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', - 'title': 'TwitterCard', + 'title': 'Twitter Card', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 30.033, } @@ -35,14 +40,14 @@ class TwitterCardIE(InfoExtractor): 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'TwitterCard', + 'title': 'Twitter Card', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, }, { 'url': 
'https://twitter.com/i/cards/tfw/v1/654001591733886977', - 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'md5': 'd4724ffe6d2437886d004fa5de1043b3', 'info_dict': { 'id': 'dq4Oj5quskI', 'ext': 'mp4', @@ -62,49 +67,44 @@ class TwitterCardIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20151113', 'uploader_id': '1189339351084113920', - 'uploader': '@ArsenalTerje', - 'title': 'Vine by @ArsenalTerje', + 'uploader': 'ArsenalTerje', + 'title': 'Vine by ArsenalTerje', }, 'add_ie': ['Vine'], - } + }, { + 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', + 'md5': '3846d0a07109b5ab622425449b59049d', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Twitter web player', + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, ] def _real_extract(self, url): video_id = self._match_id(url) - # Different formats served for different User-Agents - USER_AGENTS = [ - 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4 - 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm - ] - config = None formats = [] - for user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage(request, video_id) - - iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) - - config = self._parse_json(self._html_search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config'), - video_id) - if 'playlist' not in config: - if 'vmapUrl' in config: - vmap_data = self._download_xml(config['vmapUrl'], video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() - formats.append({ - 'url': video_url, - }) - break # same video regardless of UA - continue - - video_url = config['playlist'][0]['source'] + duration = None + + webpage = 
self._download_webpage(url, video_id) + + iframe_url = self._html_search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + webpage, 'video iframe', default=None) + if iframe_url: + return self.url_result(iframe_url) + + config = self._parse_json(self._html_search_regex( + r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), + video_id) + + playlist = config.get('playlist') + if playlist: + video_url = playlist[0]['source'] f = { 'url': video_url, @@ -117,14 +117,50 @@ class TwitterCardIE(InfoExtractor): 'height': int(m.group('height')), }) formats.append(f) + + vmap_url = config.get('vmapUrl') or config.get('vmap_url') + if vmap_url: + formats.append({ + 'url': self._get_vmap_video_url(vmap_url, video_id), + }) + + media_info = None + + for entity in config.get('status', {}).get('entities', []): + if 'mediaInfo' in entity: + media_info = entity['mediaInfo'] + + if media_info: + for media_variant in media_info['variants']: + media_url = media_variant['url'] + if media_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) + elif media_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) + else: + vbr = int_or_none(media_variant.get('bitRate'), scale=1000) + a_format = { + 'url': media_url, + 'format_id': 'http-%d' % vbr if vbr else 'http', + 'vbr': vbr, + } + # Reported bitRate may be zero + if not a_format['vbr']: + del a_format['vbr'] + + formats.append(a_format) + + duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + self._sort_formats(formats) - thumbnail = config.get('posterImageUrl') - duration = float_or_none(config.get('duration')) + title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') + thumbnail = config.get('posterImageUrl') or config.get('image_src') + duration = float_or_none(config.get('duration')) or duration return { 
'id': video_id, - 'title': 'TwitterCard', + 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, @@ -138,7 +174,6 @@ class TwitterIE(InfoExtractor): _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - 'md5': 'db6612ec5d03355953c3ca9250c97e5e', 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', @@ -149,6 +184,9 @@ class TwitterIE(InfoExtractor): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -161,6 +199,7 @@ class TwitterIE(InfoExtractor): 'uploader': 'Gifs', 'uploader_id': 'giphz', }, + 'expected_warnings': ['height', 'width'], }, { 'url': 'https://twitter.com/starwars/status/665052190608723968', 'md5': '39b7199856dee6cd4432e72c74bc69d4', @@ -172,6 +211,36 @@ class TwitterIE(InfoExtractor): 'uploader_id': 'starwars', 'uploader': 'Star Wars', }, + }, { + 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', + 'info_dict': { + 'id': '705235433198714880', + 'ext': 'mp4', + 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', + 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. 
In one highlight."', + 'uploader_id': 'BTNBrentYarina', + 'uploader': 'Brent Yarina', + }, + 'params': { + # The same video as https://twitter.com/i/videos/tweet/705235433198714880 + # Test case of TwitterCardIE + 'skip_download': True, + }, + }, { + 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', + 'md5': '', + 'info_dict': { + 'id': '700207533655363584', + 'ext': 'mp4', + 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'jay', + 'uploader_id': 'jaydingeer', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -208,21 +277,91 @@ class TwitterIE(InfoExtractor): return info mobj = re.search(r'''(?x) - <video[^>]+class="animated-gif"[^>]+ - (?:data-height="(?P<height>\d+)")?[^>]+ - (?:data-width="(?P<width>\d+)")?[^>]+ - (?:poster="(?P<poster>[^"]+)")?[^>]*>\s* + <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s* <source[^>]+video-src="(?P<url>[^"]+)" ''', webpage) if mobj: + more_info = mobj.group('more_info') + height = int_or_none(self._search_regex( + r'data-height="(\d+)"', more_info, 'height', fatal=False)) + width = int_or_none(self._search_regex( + r'data-width="(\d+)"', more_info, 'width', fatal=False)) + thumbnail = self._search_regex( + r'poster="([^"]+)"', more_info, 'poster', fatal=False) info.update({ 'id': twid, 'url': mobj.group('url'), - 'height': int_or_none(mobj.group('height')), - 'width': int_or_none(mobj.group('width')), - 'thumbnail': mobj.group('poster'), + 'height': height, + 'width': width, + 'thumbnail': thumbnail, }) return info - raise ExtractorError('There\'s not video in this tweet.') + if 'class="PlayableMedia' in webpage: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid), + }) + + return info + + 
raise ExtractorError('There\'s no video in this tweet.') + + +class TwitterAmplifyIE(TwitterBaseIE): + IE_NAME = 'twitter:amplify' + _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' + + _TEST = { + 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', + 'info_dict': { + 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'ext': 'mp4', + 'title': 'Twitter Video', + 'thumbnail': 're:^https?://.*', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vmap_url = self._html_search_meta( + 'twitter:amplify:vmap', webpage, 'vmap url') + video_url = self._get_vmap_video_url(vmap_url, video_id) + + thumbnails = [] + thumbnail = self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail', fatal=False) + + def _find_dimension(target): + w = int_or_none(self._html_search_meta( + 'twitter:%s:width' % target, webpage, fatal=False)) + h = int_or_none(self._html_search_meta( + 'twitter:%s:height' % target, webpage, fatal=False)) + return w, h + + if thumbnail: + thumbnail_w, thumbnail_h = _find_dimension('image') + thumbnails.append({ + 'url': thumbnail, + 'width': thumbnail_w, + 'height': thumbnail_h, + }) + + video_w, video_h = _find_dimension('player') + formats = [{ + 'url': video_url, + 'width': video_w, + 'height': video_h, + }] + + return { + 'id': video_id, + 'title': 'Twitter Video', + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py new file mode 100644 index 000000000..cafc082b6 --- /dev/null +++ b/youtube_dl/extractor/ustudio.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, +) + + +class UstudioIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' + 
_TEST = { + 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', + 'md5': '58bbfca62125378742df01fc2abbdef6', + 'info_dict': { + 'id': 'Uxu2my9bgSph', + 'display_id': 'san_francisco_golden_gate_bridge', + 'ext': 'mp4', + 'title': 'San Francisco: Golden Gate Bridge', + 'description': 'md5:23925500697f2c6d4830e387ba51a9be', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20111107', + 'uploader': 'Tony Farley', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + config = self._download_xml( + 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, + display_id) + + def extract(kind): + return [{ + 'url': item.attrib['url'], + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] + + formats = extract('video') + self._sort_formats(formats) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + upload_date = unified_strdate(self._search_regex( + r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>', + webpage, 'upload date', fatal=False)) + uploader = self._search_regex( + r'Uploaded by\s*<a[^>]*>([^<]+)<', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnails': extract('image'), + 'upload_date': upload_date, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 5e2e7cbac..4f0dcd18c 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + decode_packed_codes, + 
sanitized_Request, +) class VideoMegaIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', @@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor): r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + + real_codes = decode_packed_codes(webpage) video_url = self._search_regex( - r'<source[^>]+?src="([^"]+)"', webpage, 'video URL') + r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL') return { 'id': video_id, diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 7c6e98026..3c78fb3d5 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -1,11 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import smuggle_url +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + decode_packed_codes, + js_to_json, +) -class VidziIE(InfoExtractor): +class VidziIE(JWPlatformBaseIE): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' _TEST = { 'url': 'http://vidzi.tv/cghql9yq6emu.html', @@ -14,7 +17,6 @@ class VidziIE(InfoExtractor): 'id': 'cghql9yq6emu', 'ext': 'mp4', 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - 'uploader': 'vidzi.tv', }, 'params': { # m3u8 download @@ -29,11 +31,12 @@ class VidziIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - # Vidzi now uses jwplayer, which can be handled by GenericIE - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(url, {'to_generic': True}), - 'ie_key': 'Generic', - } + code = decode_packed_codes(webpage).replace('\\\'', '\'') + jwplayer_data = self._parse_json( + self._search_regex(r'setup\(([^)]+)\)', 
code, 'jwplayer data'), + video_id, transform_source=js_to_json) + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict['title'] = title + + return info_dict diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 3049dffb6..9f282a1da 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -93,6 +93,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 'description': 'md5:2d3305bad981a06ff79f027f19865021', 'upload_date': '20121220', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, @@ -105,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'info_dict': { 'id': '68093876', 'ext': 'mp4', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus', 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', @@ -121,6 +123,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, 'description': None, @@ -135,6 +138,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl password protected test video', 'upload_date': '20130614', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, @@ -154,6 +158,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Key & Peele: Terrorist Interrogation', 'description': 'md5:8678b246399b070816b12313e8b4eb5c', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 
'atencio', 'uploader': 'Peter Atencio', 'upload_date': '20130927', @@ -169,6 +174,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', 'upload_date': '20131015', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, @@ -183,6 +189,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'Pier Solar OUYA Official Trailer', 'uploader': 'Tulio Gonçalves', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593', 'uploader_id': 'user28849593', }, }, @@ -195,6 +202,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', 'uploader': 'The DMCI', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', @@ -370,9 +378,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # Extract title video_title = config['video']['title'] - # Extract uploader and uploader_id - video_uploader = config['video']['owner']['name'] - video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None + # Extract uploader, uploader_url and uploader_id + video_uploader = config['video'].get('owner', {}).get('name') + video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None # Extract video thumbnail video_thumbnail = config['video'].get('thumbnail') @@ -473,6 +482,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return { 'id': video_id, 'uploader': video_uploader, + 'uploader_url': video_uploader_url, 'uploader_id': video_uploader_id, 'upload_date': video_upload_date, 'title': video_title, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 0805e3c08..670a438af 100644 --- 
a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -11,6 +11,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + int_or_none, orderedSet, sanitized_Request, str_to_int, @@ -152,6 +153,19 @@ class VKIE(InfoExtractor): }, }, { + # video key is extra_data not url\d+ + 'url': 'http://vk.com/video-110305615_171782105', + 'md5': 'e13fcda136f99764872e739d13fac1d1', + 'info_dict': { + 'id': '171782105', + 'ext': 'mp4', + 'title': 'S-Dance, репетиции к The way show', + 'uploader': 'THE WAY SHOW | 17 апреля', + 'upload_date': '20160207', + 'view_count': int, + }, + }, + { # removed video, just testing that we match the pattern 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 'only_matching': True, @@ -298,12 +312,17 @@ class VKIE(InfoExtractor): view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) - formats = [{ - 'format_id': k, - 'url': v, - 'width': int(k[len('url'):]), - } for k, v in data.items() - if k.startswith('url')] + formats = [] + for k, v in data.items(): + if not k.startswith('url') and k != 'extra_data' or not v: + continue + height = int_or_none(self._search_regex( + r'^url(\d+)', k, 'height', default=None)) + formats.append({ + 'format_id': k, + 'url': v, + 'height': height, + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 01891ac4c..2b6bae89b 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -73,11 +73,16 @@ class VRTIE(InfoExtractor): if mobj: formats.extend(self._extract_m3u8_formats( '%s/%s' % (mobj.group('server'), mobj.group('path')), - video_id, 'mp4', m3u8_id='hls')) + video_id, 'mp4', m3u8_id='hls', fatal=False)) mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage) if mobj: formats.extend(self._extract_f4m_formats( - '%s/manifest.f4m' % mobj.group('src'), video_id, f4m_id='hds')) + '%s/manifest.f4m' % mobj.group('src'), + video_id, f4m_id='hds', 
fatal=False)) + + if not formats and 'data-video-geoblocking="true"' in webpage: + self.raise_geo_restricted('This video is only available in Belgium') + self._sort_formats(formats) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index affcc52f6..37cf3d309 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,7 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html' + _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' IE_NAME = 'wat.tv' _TESTS = [ { @@ -54,10 +54,12 @@ class WatIE(InfoExtractor): def real_id_for_chapter(chapter): return chapter['tc_start'].split('-')[0] mobj = re.match(self._VALID_URL, url) - short_id = mobj.group('short_id') display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id or short_id) - real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') + real_id = mobj.group('real_id') + if not real_id: + short_id = mobj.group('short_id') + webpage = self._download_webpage(url, display_id or short_id) + real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') video_info = self.download_video_info(real_id) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 2037d9b3d..7aea47ed5 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor): _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' - _TESTS = [ - { - 'url': 'http://www.webofstories.com/play/hans.bethe/71', - 'md5': '373e4dd915f60cfe3116322642ddf364', - 'info_dict': { - 'id': '4536', - 'ext': 'mp4', - 'title': 'The temperature of 
the sun', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Hans Bethe talks about calculating the temperature of the sun', - 'duration': 238, - } + _TESTS = [{ + 'url': 'http://www.webofstories.com/play/hans.bethe/71', + 'md5': '373e4dd915f60cfe3116322642ddf364', + 'info_dict': { + 'id': '4536', + 'ext': 'mp4', + 'title': 'The temperature of the sun', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Hans Bethe talks about calculating the temperature of the sun', + 'duration': 238, + } + }, { + 'url': 'http://www.webofstories.com/play/55908', + 'md5': '2985a698e1fe3211022422c4b5ed962c', + 'info_dict': { + 'id': '55908', + 'ext': 'mp4', + 'title': 'The story of Gemmata obscuriglobus', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', + 'duration': 169, + }, + 'skip': 'notfound', + }, { + # malformed og:title meta + 'url': 'http://www.webofstories.com/play/54215?o=MS', + 'info_dict': { + 'id': '54215', + 'ext': 'mp4', + 'title': '"A Leg to Stand On"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Oliver Sacks talks about the death and resurrection of a limb', + 'duration': 97, }, - { - 'url': 'http://www.webofstories.com/play/55908', - 'md5': '2985a698e1fe3211022422c4b5ed962c', - 'info_dict': { - 'id': '55908', - 'ext': 'mp4', - 'title': 'The story of Gemmata obscuriglobus', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', - 'duration': 169, - } + 'params': { + 'skip_download': True, }, - ] + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + # Sometimes og:title meta is malformed + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title') description = self._html_search_meta('description', webpage) thumbnail = 
self._og_search_thumbnail(webpage) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index fdb16d91c..41061dd31 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -35,7 +35,8 @@ class WistiaIE(InfoExtractor): formats = [] thumbnails = [] - for atype, a in data['assets'].items(): + for a in data['assets']: + atype = a.get('type') if atype == 'still': thumbnails.append({ 'url': a['url'], diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index a3236e66c..94abdb4f3 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -17,7 +17,7 @@ class XFileShareIE(InfoExtractor): IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me' _VALID_URL = r'''(?x) https?://(?P<host>(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/ + (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/ (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? 
''' @@ -81,6 +81,13 @@ class XFileShareIE(InfoExtractor): 'ext': 'mp4', 'title': 'test' } + }, { + 'url': 'http://powerwatch.pw/duecjibvicbu', + 'info_dict': { + 'id': 'duecjibvicbu', + 'ext': 'mp4', + 'title': 'Big Buck Bunny trailer', + }, }] def _real_extract(self, url): @@ -112,6 +119,7 @@ class XFileShareIE(InfoExtractor): title = (self._search_regex( [r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', + r'h4-fine[^>]*>([^<]+)<', r'>Watch (.+) ', r'<h2 class="video-page-head">([^<]+)</h2>'], webpage, 'title', default=None) or self._og_search_title(webpage)).strip() diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 8cd3a0687..4075b8a4f 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -7,15 +7,17 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - parse_duration, + orderedSet, sanitized_Request, str_to_int, ) class XTubeIE(InfoExtractor): - _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/watch\.php\?.*\bv=)(?P<id>[^/?&#]+)' - _TEST = { + _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)' + + _TESTS = [{ + # old URL schema 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', 'info_dict': { @@ -27,63 +29,60 @@ class XTubeIE(InfoExtractor): 'duration': 450, 'age_limit': 18, } - } + }, { + # new URL schema + 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', + 'only_matching': True, + }, { + 'url': 'xtube:625837', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - - req = sanitized_Request('http://www.xtube.com/watch.php?v=%s' % video_id) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - video_title = self._html_search_regex( - r'<p class="title">([^<]+)', webpage, 'title') - 
video_uploader = self._html_search_regex( - [r"var\s+contentOwnerId\s*=\s*'([^']+)", - r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'], + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + if not display_id: + display_id = video_id + url = 'http://www.xtube.com/watch.php?v=%s' % video_id + + req = sanitized_Request(url) + req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') + webpage = self._download_webpage(req, display_id) + + flashvars = self._parse_json( + self._search_regex( + r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), + video_id)['flashvars'] + + title = flashvars.get('title') or self._search_regex( + r'<h1>([^<]+)</h1>', webpage, 'title') + video_url = compat_urllib_parse_unquote(flashvars['video_url']) + duration = int_or_none(flashvars.get('video_duration')) + + uploader = self._search_regex( + r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', webpage, 'uploader', fatal=False) - video_description = self._html_search_regex( - r'<p class="fieldsDesc">([^<]+)', - webpage, 'description', fatal=False) - duration = parse_duration(self._html_search_regex( - r'<span class="bold">Runtime:</span> ([^<]+)</p>', - webpage, 'duration', fatal=False)) - view_count = str_to_int(self._html_search_regex( - r'<span class="bold">Views:</span> ([\d,\.]+)</p>', + description = self._search_regex( + r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + view_count = str_to_int(self._search_regex( + r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( - r'<div id="commentBar">([\d,\.]+) Comments</div>', + r'>Comments? 
\(([\d,\.]+)\)<', webpage, 'comment count', fatal=False)) - formats = [] - for format_id, video_url in re.findall( - r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage): - fmt = { - 'url': compat_urllib_parse_unquote(video_url), - 'format_id': format_id, - } - m = re.search(r'^(?P<height>\d+)[pP]', format_id) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - - if not formats: - video_url = compat_urllib_parse_unquote(self._search_regex( - r'flashvars\.video_url\s*=\s*"([^"]+)"', - webpage, 'video URL')) - formats.append({'url': video_url}) - - self._sort_formats(formats) - return { 'id': video_id, - 'title': video_title, - 'uploader': video_uploader, - 'description': video_description, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'uploader': uploader, 'duration': duration, 'view_count': view_count, 'comment_count': comment_count, - 'formats': formats, 'age_limit': 18, } @@ -120,7 +119,8 @@ class XTubeUserIE(InfoExtractor): if not html: break - for _, video_id in re.findall(r'data-plid=(["\'])(.+?)\1', html): + for video_id in orderedSet([video_id for _, video_id in re.findall( + r'data-plid=(["\'])(.+?)\1', html)]): entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key())) page_count = int_or_none(page.get('pageCount')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 45ad88152..27e67feb4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import itertools import json import os.path +import random import re import time import traceback @@ -382,7 +383,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', + 'license': 'Standard YouTube License', 'description': 'test 
chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -401,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop', + 'license': 'Standard YouTube License', 'creator': 'Icona Pop', } }, @@ -422,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', + 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', 'age_limit': 18, } @@ -437,6 +444,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia', + 'license': 'Standard YouTube License', 'age_limit': 18, } }, @@ -449,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', + 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: 
https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -468,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', + 'license': 'Standard YouTube License', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -488,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', + 'license': 'Standard YouTube License', }, 'params': { 'youtube_include_dash_manifest': True, @@ -506,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', + 'license': 'Standard YouTube License', 'creator': 'Taylor Swift', }, 'params': { @@ -522,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -536,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', + 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -550,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 
'md5:33765bb339e1b47e7e72b5490139bb41', 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', + 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -562,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20100430', 'uploader_id': 'deadmau5', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', + 'license': 'Standard YouTube License', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, @@ -580,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20150827', 'uploader_id': 'olympic', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', + 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', @@ -597,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'upload_date': '20110310', 'uploader_id': 'AllenMeow', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫艾倫', + 'license': 'Standard YouTube License', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, @@ -629,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'upload_date': '20150625', 'uploader_id': 'dorappi2000', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', + 'license': 'Standard YouTube License', 'formats': 'mincount:33', }, }, @@ -644,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Airtek', 'description': 'Retransmisión en directo de la 
XVIII media maratón de Zaragoza.', 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', + 'license': 'Standard YouTube License', 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', }, 'params': { @@ -668,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -678,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -688,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -698,6 +734,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }], 'params': { @@ -731,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', + 'license': 'Standard YouTube License', 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', }, 'params': { @@ -760,6 +800,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { + # Video licensed under Creative Commons + 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', + 'info_dict': { + 'id': 'M4gD1WSo5mA', + 'ext': 'mp4', + 'title': 
'md5:e41008789470fc2533a3252216f1c1d1', + 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'upload_date': '20150127', + 'uploader_id': 'BerkmanCenter', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader': 'BerkmanCenter', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Channel-like uploader_url + 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', + 'info_dict': { + 'id': 'eQcmzGIKrzg', + 'ext': 'mp4', + 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', + 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'upload_date': '20151119', + 'uploader': 'Bernie 2016', + 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', 'only_matching': True, } @@ -975,40 +1051,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return {} try: args = player_config['args'] - caption_url = args['ttsurl'] - if not caption_url: - self._downloader.report_warning(err_msg) - return {} - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') + caption_url = args.get('ttsurl') + if caption_url: + timestamp = args['timestamp'] + # We get the available subtitles + list_params = 
compat_urllib_parse.urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, + }) + list_url = caption_url + '&' + list_params + caption_list = self._download_xml(list_url, video_id) + original_lang_node = caption_list.find('track') + if original_lang_node is None: + self._downloader.report_warning('Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats + return sub_lang_list + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 
20LmZk1hakA) + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + parsed_caption_url = compat_urlparse.urlparse(caption_url) + caption_qs = compat_parse_qs(parsed_caption_url.query) sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if not sub_lang: + continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], }) + sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( + query=compat_urllib_parse.urlencode(caption_qs, True))) sub_formats.append({ - 'url': caption_url + '&' + params, + 'url': sub_url, 'ext': ext, }) sub_lang_list[sub_lang] = sub_formats @@ -1019,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} + def _mark_watched(self, video_id, video_info): + playback_url = video_info.get('videostats_playback_base_url', [None])[0] + if not playback_url: + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. 
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1245,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # uploader_id video_uploader_id = None - mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage) + video_uploader_url = None + mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', + video_webpage) if mobj is not None: - video_uploader_id = mobj.group(1) + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') else: self._downloader.report_warning('unable to extract uploader nickname') @@ -1275,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + video_license = self._html_search_regex( + r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', + video_webpage, 'license', default=None) + m_music = re.search( r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) 
by (?P<creator>.+?)(?:\(.+?\))?</li', video_webpage) @@ -1348,6 +1482,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) + formats_spec = {} + fmt_list = video_info.get('fmt_list', [''])[0] + if fmt_list: + for fmt in fmt_list.split(','): + spec = fmt.split('/') + if len(spec) > 1: + width_height = spec[1].split('x') + if len(width_height) == 2: + formats_spec[spec[0]] = { + 'resolution': spec[1], + 'width': int_or_none(width_height[0]), + 'height': int_or_none(width_height[1]), + } formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) @@ -1416,6 +1563,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } if format_id in self._formats: dct.update(self._formats[format_id]) + if format_id in formats_spec: + dct.update(formats_spec[format_id]) # Some itags are not included in DASH manifest thus corresponding formats will # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). 
@@ -1528,11 +1677,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) + self.mark_watched(video_id, video_info) + return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, + 'uploader_url': video_uploader_url, 'upload_date': upload_date, + 'license': video_license, 'creator': video_creator, 'title': video_title, 'alt_title': video_alt_title, @@ -1701,13 +1854,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - + def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) if 'v' in query_dict: @@ -1718,6 +1865,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + video = self._check_download_just_video(url, playlist_id) + if video: + return video + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) @@ -2026,11 +2184,20 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeWatchLaterIE(YoutubePlaylistIE): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + _VALID_URL = 
r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - _TESTS = [] # override PlaylistIE tests + _TESTS = [{ + 'url': 'https://www.youtube.com/playlist?list=WL', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', + 'only_matching': True, + }] def _real_extract(self, url): + video = self._check_download_just_video(url, 'WL') + if video: + return video return self._extract_playlist('WL') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c619a75e2..81c22a627 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -137,6 +137,10 @@ class ZDFIE(InfoExtractor): formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/rg3/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) elif ext == 'f4m': diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3afa8bb6f..9dd7a8034 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -171,6 +171,14 @@ def parseOpts(overrideArguments=None): default=False, help='Do not extract the videos of a playlist, only list them.') general.add_option( + '--mark-watched', + action='store_true', dest='mark_watched', default=False, + help='Mark videos watched (YouTube only)') + general.add_option( + '--no-mark-watched', + action='store_false', dest='mark_watched', default=False, + help='Do not mark videos watched (YouTube only)') + general.add_option( '--no-color', '--no-colors', action='store_true', dest='no_color', default=False, diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index e19dbf73d..3bad5a266 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ 
b/youtube_dl/postprocessor/embedthumbnail.py @@ -40,7 +40,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): 'Skipping embedding the thumbnail because the file is missing.') return [], info - if info['ext'] == 'mp3': + if info['ext'] in ('mp3', 'mkv'): options = [ '-c', 'copy', '-map', '0', '-map', '1', '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 480d48d05..e39ca60aa 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -6,6 +6,7 @@ import sys import errno from .common import PostProcessor +from ..compat import compat_os_name from ..utils import ( check_executable, hyphenate_date, @@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor): raise XAttrMetadataError(e.errno, e.strerror) except ImportError: - if os.name == 'nt': + if compat_os_name == 'nt': # Write xattrs to NTFS Alternate Data Streams: # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 def write_xattr(path, key, value): @@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor): 'Unable to write extended attributes due to too long values.') else: msg = 'This filesystem doesn\'t support extended attributes. ' - if os.name == 'nt': + if compat_os_name == 'nt': msg += 'You need to use NTFS.' 
else: msg += '(You may have to enable them in your /etc/fstab)' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 672ce05ea..d431aa6b7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import base64 +import binascii import calendar import codecs import contextlib @@ -159,8 +160,6 @@ if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) - if val: - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: @@ -466,6 +465,10 @@ def encodeFilename(s, for_subprocess=False): if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: return s + # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible + if sys.platform.startswith('java'): + return s + return s.encode(get_subprocess_encoding(), 'ignore') @@ -904,9 +907,9 @@ def unified_strdate(date_str, day_first=True): '%d %b %Y', '%B %d %Y', '%b %d %Y', - '%b %dst %Y %I:%M%p', - '%b %dnd %Y %I:%M%p', - '%b %dth %Y %I:%M%p', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', '%Y/%m/%d', @@ -1216,13 +1219,23 @@ if sys.platform == 'win32': raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) else: - import fcntl + # Some platforms, such as Jython, is missing fcntl + try: + import fcntl - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + def _lock_file(f, exclusive): + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) + def _unlock_file(f): + fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: + UNSUPPORTED_MSG = 'file locking is not supported on this platform' + + def _lock_file(f, exclusive): + raise IOError(UNSUPPORTED_MSG) + + def 
_unlock_file(f): + raise IOError(UNSUPPORTED_MSG) class locked_file(object): @@ -1386,6 +1399,12 @@ def fix_xml_ampersands(xml_str): def setproctitle(title): assert isinstance(title, compat_str) + + # ctypes in Jython is not complete + # http://bugs.jython.org/issue2148 + if sys.platform.startswith('java'): + return + try: libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: @@ -1569,9 +1588,12 @@ class PagedList(object): class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize): + def __init__(self, pagefunc, pagesize, use_cache=False): self._pagefunc = pagefunc self._pagesize = pagesize + self._use_cache = use_cache + if use_cache: + self._cache = {} def getslice(self, start=0, end=None): res = [] @@ -1581,7 +1603,13 @@ class OnDemandPagedList(PagedList): if start >= nextfirstid: continue - page_results = list(self._pagefunc(pagenum)) + page_results = None + if self._use_cache: + page_results = self._cache.get(pagenum) + if page_results is None: + page_results = list(self._pagefunc(pagenum)) + if self._use_cache: + self._cache[pagenum] = page_results startv = ( start % self._pagesize @@ -1711,6 +1739,14 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +def update_url_query(url, query): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_parse_qs(parsed_url.query) + qs.update(query) + return compat_urlparse.urlunparse(parsed_url._replace( + query=compat_urllib_parse.urlencode(qs, True))) + + def encode_dict(d, encoding='utf-8'): def encode(v): return v.encode(encoding) if isinstance(v, compat_basestring) else v @@ -1835,11 +1871,21 @@ def error_to_compat_str(err): def mimetype2ext(mt): + ext = { + 'audio/mp4': 'm4a', + }.get(mt) + if ext is not None: + return ext + _, _, res = mt.rpartition('/') return { '3gpp': '3gp', + 'smptett+xml': 'tt', + 'srt': 'srt', + 'ttaf+xml': 'dfxp', 'ttml+xml': 'ttml', + 'vtt': 'vtt', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 
'wmv', @@ -2582,3 +2628,58 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): return None # No Proxy return compat_urllib_request.ProxyHandler.proxy_open( self, req, proxy, type) + + +def ohdave_rsa_encrypt(data, exponent, modulus): + ''' + Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ + + Input: + data: data to encrypt, bytes-like object + exponent, modulus: parameter e and N of RSA algorithm, both integer + Output: hex string of encrypted data + + Limitation: supports one block encryption only + ''' + + payload = int(binascii.hexlify(data[::-1]), 16) + encrypted = pow(payload, exponent, modulus) + return '%x' % encrypted + + +def encode_base_n(num, n, table=None): + FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + if not table: + table = FULL_TABLE[:n] + + if n > len(table): + raise ValueError('base %d exceeds table length %d' % (n, len(table))) + + if num == 0: + return table[0] + + ret = '' + while num: + ret = table[num % n] + ret + num = num // n + return ret + + +def decode_packed_codes(code): + mobj = re.search( + r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)", + code) + obfucasted_code, base, count, symbols = mobj.groups() + base = int(base) + count = int(count) + symbols = symbols.split('|') + symbol_table = {} + + while count: + count -= 1 + base_n_count = encode_base_n(count, base) + symbol_table[base_n_count] = symbols[count] or base_n_count + + return re.sub( + r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], + obfucasted_code) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9aca8001a..adafd601b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.02.13' +__version__ = '2016.03.01' |