110 files changed, 3771 insertions(+), 879 deletions(-)
diff --git a/.travis.yml b/.travis.yml index fb34299fc..511bee64c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python python: - "2.6" - "2.7" + - "3.2" - "3.3" - "3.4" before_install: @@ -113,3 +113,9 @@ Robin de Rooij Ryan Schmidt Leslie P. Polzer Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 351229f21..588b15bde 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,9 @@ If your report is shorter than two lines, it is almost certainly missing some of For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? @@ -2,7 +2,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe - find -name "*.pyc" -delete + find . -name "*.pyc" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -47,211 +47,109 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help print this help text and exit --version print program version and exit - -U, --update update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs + -U, --update update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed) + -i, --ignore-errors continue on download errors, for example to skip unavailable videos in a playlist + --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs --dump-user-agent display the current browser identification - --list-extractors List all supported extractors and the URLs - they would handle - --extractor-descriptions Output descriptions of all supported - extractors - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file /etc - /youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --flat-playlist Do not extract the videos of a playlist, - only list them. + --list-extractors List all supported extractors and the URLs they would handle + --extractor-descriptions Output descriptions of all supported extractors + --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". + Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The + default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. + --ignore-config Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: Do not read the user configuration + in ~/.config/youtube-dl/config (%APPDATA%/youtube-dl/config.txt on Windows) + --flat-playlist Do not extract the videos of a playlist, only list them. --no-color Do not emit color codes in output. ## Network Options: - --proxy URL Use the specified HTTP/HTTPS proxy. Pass in - an empty string (--proxy "") for direct - connection + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - (experimental) - -4, --force-ipv4 Make all connections via IPv4 - (experimental) - -6, --force-ipv6 Make all connections via IPv6 - (experimental) + --source-address IP Client-side IP address to bind to (experimental) + -4, --force-ipv4 Make all connections via IPv4 (experimental) + -6, --force-ipv6 Make all connections via IPv6 (experimental) + --cn-verification-proxy URL Use this proxy to verify the IP address for some Chinese sites. The default proxy specified by --proxy (or none, if the options is + not present) is used for the actual downloading. (experimental) ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) --playlist-end NUMBER playlist video to end at (default is last) - --playlist-items ITEM_SPEC playlist video items to download. Specify - indices of the videos in the playlist - seperated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. 
You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX download only matching titles (regex or - caseless sub-string) - --reject-title REGEX skip download for matching titles (regex or - caseless sub-string) + --playlist-items ITEM_SPEC playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" + if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will + download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. + --match-title REGEX download only matching titles (regex or caseless sub-string) + --reject-title REGEX skip download for matching titles (regex or caseless sub-string) --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) + --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) --date DATE download only videos uploaded in this date - --datebefore DATE download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER (Experimental) Generic video filter. - Specify any key (see help for -o for a list - of available keys) to match if the key is - present, !key to check if the key is not - present,key > NUMBER (like "comment_count > - 12", also works with >=, <, <=, !=, =) to - compare against a number, and & to require - multiple matches. Values which are not - known are excluded unless you put a - question mark (?) after the operator.For - example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & + --datebefore DATE download only videos uploaded on or before this date (i.e. inclusive) + --dateafter DATE download only videos uploaded on or after this date (i.e. inclusive) + --min-views COUNT Do not download any videos with less than COUNT views + --max-views COUNT Do not download any videos with more than COUNT views + --match-filter FILTER (Experimental) Generic video filter. Specify any key (see help for -o for a list of available keys) to match if the key is present, + !key to check if the key is not present,key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against + a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the + operator.For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike + functionality is not available at the given service), but who also have a description, use --match-filter "like_count > 100 & dislike_count <? 50 & description" . - --no-playlist If the URL refers to a video and a - playlist, download only the video. 
- --yes-playlist If the URL refers to a video and a - playlist, download the playlist. - --age-limit YEARS download only videos suitable for the given - age - --download-archive FILE Download only videos not listed in the - archive file. Record the IDs of all - downloaded videos in it. - --include-ads Download advertisements as well - (experimental) + --no-playlist If the URL refers to a video and a playlist, download only the video. + --yes-playlist If the URL refers to a video and a playlist, download the playlist. + --age-limit YEARS download only videos suitable for the given age + --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + --include-ads Download advertisements as well (experimental) ## Download Options: - -r, --rate-limit LIMIT maximum download rate in bytes per second - (e.g. 50K or 4.2M) - -R, --retries RETRIES number of retries (default is 10), or - "infinite". - --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) - (default is 1024) - --no-resize-buffer do not automatically adjust the buffer - size. By default, the buffer size is - automatically resized from an initial value - of SIZE. + -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. 50K or 4.2M) + -R, --retries RETRIES number of retries (default is 10), or "infinite". + --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) (default is 1024) + --no-resize-buffer do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE. --playlist-reverse Download playlist videos in reverse order - --xattr-set-filesize (experimental) set file xattribute - ytdl.filesize with expected filesize - --hls-prefer-native (experimental) Use the native HLS - downloader instead of ffmpeg. - --external-downloader COMMAND (experimental) Use the specified external - downloader. Currently supports - aria2c,curl,wget + --xattr-set-filesize (experimental) set file xattribute ytdl.filesize with expected filesize + --hls-prefer-native (experimental) Use the native HLS downloader instead of ffmpeg. + --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,wget + --external-downloader-args ARGS Give these arguments to the external downloader. ## Filesystem Options: - -a, --batch-file FILE file containing URLs to download ('-' for - stdin) + -a, --batch-file FILE file containing URLs to download ('-' for stdin) --id use only video ID in file name - -o, --output TEMPLATE output filename template. Use %(title)s to - get the title, %(uploader)s for the - uploader name, %(uploader_id)s for the - uploader nickname if different, - %(autonumber)s to get an automatically - incremented number, %(ext)s for the - filename extension, %(format)s for the - format description (like "22 - 1280x720" or - "HD"), %(format_id)s for the unique id of - the format (like Youtube's itags: "137"), - %(upload_date)s for the upload date - (YYYYMMDD), %(extractor)s for the provider - (youtube, metacafe, etc), %(id)s for the - video id, %(playlist_title)s, - %(playlist_id)s, or %(playlist)s (=title if - present, ID otherwise) for the playlist the - video is in, %(playlist_index)s for the - position in the playlist. %(height)s and - %(width)s for the width and height of the - video format. %(resolution)s for a textual - description of the resolution of the video - format. %% for a literal percent. Use - to - output to stdout. 
Can also be used to - download to a different directory, for - example with -o '/my/downloads/%(uploader)s - /%(title)s-%(id)s.%(ext)s' . - --autonumber-size NUMBER Specifies the number of digits in - %(autonumber)s when it is present in output - filename template or --auto-number option - is given - --restrict-filenames Restrict filenames to only ASCII - characters, and avoid "&" and spaces in - filenames - -A, --auto-number [deprecated; use -o - "%(autonumber)s-%(title)s.%(ext)s" ] number - downloaded files starting from 00000 - -t, --title [deprecated] use title in file name - (default) + -o, --output TEMPLATE output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(uploader_id)s for the uploader + nickname if different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(format)s for + the format description (like "22 - 1280x720" or "HD"), %(format_id)s for the unique id of the format (like Youtube's itags: "137"), + %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id, + %(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, + %(playlist_index)s for the position in the playlist. %(height)s and %(width)s for the width and height of the video format. + %(resolution)s for a textual description of the resolution of the video format. %% for a literal percent. Use - to output to stdout. + Can also be used to download to a different directory, for example with -o '/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . + --autonumber-size NUMBER Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given + --restrict-filenames Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames + -A, --auto-number [deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] number downloaded files starting from 00000 + -t, --title [deprecated] use title in file name (default) -l, --literal [deprecated] alias of --title -w, --no-overwrites do not overwrite files - -c, --continue force resume of partially downloaded files. - By default, youtube-dl will resume - downloads if possible. - --no-continue do not resume partially downloaded files - (restart from beginning) - --no-part do not use .part files - write directly - into output file - --no-mtime do not use the Last-modified header to set - the file modification time - --write-description write video description to a .description - file + -c, --continue force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible. + --no-continue do not resume partially downloaded files (restart from beginning) + --no-part do not use .part files - write directly into output file + --no-mtime do not use the Last-modified header to set the file modification time + --write-description write video description to a .description file --write-info-json write video metadata to a .info.json file - --write-annotations write video annotations to a .annotation - file - --load-info FILE json file containing the video information - (created with the "--write-json" option) - --cookies FILE file to read cookies from and dump cookie - jar in - --cache-dir DIR Location in the filesystem where youtube-dl - can store some downloaded information - permanently. By default $XDG_CACHE_HOME - /youtube-dl or ~/.cache/youtube-dl . 
At the - moment, only YouTube player files (for - videos with obfuscated signatures) are - cached, but that may change. + --write-annotations write video annotations to a .annotation file + --load-info FILE json file containing the video information (created with the "--write-json" option) + --cookies FILE file to read cookies from and dump cookie jar in + --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl + or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may + change. --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files ## Thumbnail images: --write-thumbnail write thumbnail image to disk --write-all-thumbnails write all thumbnail image formats to disk - --list-thumbnails Simulate and list all available thumbnail - formats + --list-thumbnails Simulate and list all available thumbnail formats ## Verbosity / Simulation Options: -q, --quiet activates quiet mode --no-warnings Ignore warnings - -s, --simulate do not download the video and do not write - anything to disk + -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video -g, --get-url simulate, quiet but print URL -e, --get-title simulate, quiet but print title @@ -261,155 +159,87 @@ which means you can modify it, redistribute it or use it however you like. --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information. - See --output for a description of available - keys. - -J, --dump-single-json simulate, quiet but print JSON information - for each command-line argument. If the URL - refers to a playlist, dump the whole - playlist information in a single line. - --print-json Be quiet and print the video information as - JSON (video is still being downloaded). + -j, --dump-json simulate, quiet but print JSON information. See --output for a description of available keys. + -J, --dump-single-json simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist + information in a single line. + --print-json Be quiet and print the video information as JSON (video is still being downloaded). --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar -v, --verbose print various debugging information - --dump-intermediate-pages print downloaded pages to debug problems - (very verbose) - --write-pages Write downloaded intermediary pages to - files in the current directory to debug - problems + --dump-pages print downloaded pages to debug problems (very verbose) + --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic - -C, --call-home Contact the youtube-dl server for - debugging. - --no-call-home Do NOT contact the youtube-dl server for - debugging. + -C, --call-home Contact the youtube-dl server for debugging. + --no-call-home Do NOT contact the youtube-dl server for debugging. ## Workarounds: --encoding ENCODING Force the specified encoding (experimental) --no-check-certificate Suppress HTTPS certificate validation. 
- --prefer-insecure Use an unencrypted connection to retrieve - information about the video. (Currently - supported only for YouTube) + --prefer-insecure Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube) --user-agent UA specify a custom user agent - --referer URL specify a custom referer, use if the video - access is restricted to one domain - --add-header FIELD:VALUE specify a custom HTTP header and its value, - separated by a colon ':'. You can use this - option multiple times - --bidi-workaround Work around terminals that lack - bidirectional text support. Requires bidiv - or fribidi executable in PATH - --sleep-interval SECONDS Number of seconds to sleep before each - download. + --referer URL specify a custom referer, use if the video access is restricted to one domain + --add-header FIELD:VALUE specify a custom HTTP header and its value, separated by a colon ':'. You can use this option multiple times + --bidi-workaround Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH + --sleep-interval SECONDS Number of seconds to sleep before each download. ## Video Format Options: - -f, --format FORMAT video format code, specify the order of - preference using slashes, as in -f 22/17/18 - . Instead of format codes, you can select - by extension for the extensions aac, m4a, - mp3, mp4, ogg, wav, webm. You can also use - the special names "best", "bestvideo", - "bestaudio", "worst". You can filter the - video results by putting a condition in - brackets, as in -f "best[height=720]" (or - -f "[filesize>10M]"). This works for - filesize, height, width, tbr, abr, vbr, - asr, and fps and the comparisons <, <=, >, - >=, =, != and for ext, acodec, vcodec, - container, and protocol and the comparisons - =, != . Formats for which the value is not - known are excluded unless you put a - question mark (?) after the operator. You - can combine format filters, so -f "[height - <=? 720][tbr>500]" selects up to 720p - videos (or videos where the height is not - known) with a bitrate of at least 500 - KBit/s. By default, youtube-dl will pick - the best quality. Use commas to download - multiple audio formats, such as -f - 136/137/mp4/bestvideo,140/m4a/bestaudio. - You can merge the video and audio of two - formats into a single file using -f <video- - format>+<audio-format> (requires ffmpeg or - avconv), for example -f + -f, --format FORMAT video format code, specify the order of preference using slashes, as in -f 22/17/18 . Instead of format codes, you can select by + extension for the extensions aac, m4a, mp3, mp4, ogg, wav, webm. You can also use the special names "best", "bestvideo", "bestaudio", + "worst". You can filter the video results by putting a condition in brackets, as in -f "best[height=720]" (or -f "[filesize>10M]"). + This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, + vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a + question mark (?) after the operator. You can combine format filters, so -f "[height <=? 720][tbr>500]" selects up to 720p videos + (or videos where the height is not known) with a bitrate of at least 500 KBit/s. By default, youtube-dl will pick the best quality. + Use commas to download multiple audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio. 
You can merge the video and audio + of two formats into a single file using -f <video-format>+<audio-format> (requires ffmpeg or avconv), for example -f bestvideo+bestaudio. --all-formats download all available video formats - --prefer-free-formats prefer free video formats unless a specific - one is requested + --prefer-free-formats prefer free video formats unless a specific one is requested --max-quality FORMAT highest quality format to download -F, --list-formats list all available formats - --youtube-skip-dash-manifest Do not download the DASH manifest on - YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. - bestvideo+bestaudio), output to given - container format. One of mkv, mp4, ogg, - webm, flv.Ignored if no merge is required + --youtube-skip-dash-manifest Do not download the DASH manifest on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no + merge is required ## Subtitle Options: --write-sub write subtitle file - --write-auto-sub write automatic subtitle file (youtube - only) - --all-subs downloads all the available subtitles of - the video + --write-auto-sub write automatic subtitle file (youtube only) + --all-subs downloads all the available subtitles of the video --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format, accepts formats - preference, for example: "ass/srt/best" - --sub-lang LANGS languages of the subtitles to download - (optional) separated by commas, use IETF - language tags like 'en,pt' + --sub-format FORMAT subtitle format, accepts formats preference, for example: "ass/srt/best" + --sub-lang LANGS languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' ## Authentication Options: -u, --username USERNAME login with this account ID - -p, --password PASSWORD account password. If this option is left - out, youtube-dl will ask interactively. + -p, --password PASSWORD account password. If this option is left out, youtube-dl will ask interactively. 
-2, --twofactor TWOFACTOR two-factor auth code -n, --netrc use .netrc authentication data --video-password PASSWORD video password (vimeo, smotri) ## Post-processing Options: - -x, --extract-audio convert video files to audio-only files - (requires ffmpeg or avconv and ffprobe or - avprobe) - --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", - "opus", or "wav"; "best" by default - --audio-quality QUALITY ffmpeg/avconv audio quality specification, - insert a value between 0 (better) and 9 - (worse) for VBR or a specific bitrate like - 128K (default 5) - --recode-video FORMAT Encode the video to another format if - necessary (currently supported: - mp4|flv|ogg|webm|mkv) - -k, --keep-video keeps the video file on disk after the - post-processing; the video is erased by - default - --no-post-overwrites do not overwrite post-processed files; the - post-processed files are overwritten by - default - --embed-subs embed subtitles in the video (only for mp4 - videos) + -x, --extract-audio convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) + --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default + --audio-quality QUALITY ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K + (default 5) + --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv) + -k, --keep-video keeps the video file on disk after the post-processing; the video is erased by default + --no-post-overwrites do not overwrite post-processed files; the post-processed files are overwritten by default + --embed-subs embed subtitles in the video (only for mp4 videos) --embed-thumbnail embed thumbnail in the audio as cover art --add-metadata write metadata to the video file - --xattrs write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. One of never (do nothing), warn (only - emit a warning), detect_or_warn(the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors (default) - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading, similar to find's -exec - syntax. Example: --exec 'adb push {} - /sdcard/Music/ && rm {}' - --convert-subtitles FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt) + --metadata-from-title FORMAT parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed + parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + %(title)s" matches a title like "Coldplay - Paradise" + --xattrs write metadata to the video file's xattrs (using dublin core and xdg standards) + --fixup POLICY Automatically correct known faults of the file. 
One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; + fix file if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) + --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. + --exec CMD Execute a command on the file after downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm + {}' + --convert-subtitles FORMAT Convert the subtitles to other format (currently supported: srt|ass|vtt) # CONFIGURATION @@ -529,6 +359,10 @@ YouTube requires an additional signature since September 2012 which is not suppo In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. +### HTTP Error 429: Too Many Requests or 402: Payment Required + +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address. + ### SyntaxError: Non-ASCII character ### The error @@ -573,6 +407,18 @@ A note on the service that they don't host the infringing content, but just link Support requests for services that **do** purchase the rights to distribute their content are perfectly fine though. If in doubt, you can simply include a source that mentions the legitimate purchase of content. +### How can I speed up work on my issue? + +(Also known as: Help, my important issue not being solved!) The youtube-dl core developer team is quite small. While we do our best to solve as many issues as possible, sometimes that can take quite a while. To speed up your issue, here's what you can do: + +First of all, please do report the issue [at our issue tracker](https://yt-dl.org/bugs). That allows us to coordinate all efforts by users and developers, and serves as a unified point. Unfortunately, the youtube-dl project has grown too large to use personal email as an effective communication channel. + +Please read the [bug reporting instructions](#bugs) below. A lot of bugs lack all the necessary information. If you can, offer proxy, VPN, or shell access to the youtube-dl developers. If you are able to, test the issue from multiple computers in multiple countries to exclude local censorship or misconfiguration issues. + +If nobody is interested in solving your issue, you are welcome to take matters into your own hands and submit a pull request (or coerce/pay somebody else to do so). + +Feel free to bump the issue from time to time by writing a small comment ("Issue is still present in youtube-dl version ...from France, but fixed from Belgium"), but please not more than once a month. Please do not declare your issue as `important` or `urgent`. + ### How can I detect whether a given URL is supported by youtube-dl? For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. 
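The FAQ entry above stops short of a programmatic check. As a rough sketch (not part of this diff), URL support can be probed with `gen_extractors` — the same helper the new `test/test_netrc.py` later in this diff imports; the `is_supported` name here is hypothetical:

```python
from __future__ import unicode_literals

from youtube_dl.extractor import gen_extractors


def is_supported(url):
    """Return True if a dedicated extractor claims this URL."""
    for ie in gen_extractors():
        # GenericIE matches nearly any URL, so it does not count as
        # dedicated support; its IE_NAME is 'generic'.
        if ie.suitable(url) and ie.IE_NAME != 'generic':
            return True
    return False
```

Even when such a check returns True the download can still fail (sites change their URL schemes), which is why the FAQ recommends simply reporting a bug when a listed service stops working.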
@@ -672,6 +518,7 @@ youtube-dl makes the best effort to be a good command-line program, and thus sho From a Python program, you can embed youtube-dl in a more powerful fashion, like this: ```python +from __future__ import unicode_literals import youtube_dl ydl_opts = {} @@ -684,6 +531,7 @@ Most likely, you'll want to use various options. For a list of what can be done, Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: ```python +from __future__ import unicode_literals import youtube_dl @@ -741,7 +589,9 @@ If your report is shorter than two lines, it is almost certainly missing some of For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? 
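The two `from __future__ import unicode_literals` hunks above prepend the import to the README's embedding examples. For context, the first example reads roughly as follows after this change — a minimal sketch reconstructed from the hunk, using the README's canonical test URL:

```python
from __future__ import unicode_literals
import youtube_dl

# With an empty options dict, youtube-dl picks the best available quality.
ydl_opts = {}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
```

The import matters on Python 2: string literals become unicode, which avoids the new "Parameter outtmpl is bytes" warning added to `youtube_dl/YoutubeDL.py` later in this diff.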
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 6a5bd9eda..7a219ebe9 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -28,7 +28,7 @@ for test in get_testcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() - except: + except Exception: print('\nFail: {0}'.format(test['name'])) continue diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..2e389fc8e --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): + return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): + cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] + prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + out, _ = prog.communicate(secret_msg) + return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text 16') +print(repr(r)) + +password = key + 16 * [0] +new_key = aes_encrypt(password, key_expansion(password)) * (32 // 16) +r = openssl_encode('aes-256-ctr', new_key, iv) +print('aes_decrypt_text 32') +print(repr(r)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 062cb3d62..fd59cc2be 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -2,6 +2,8 @@ - **1tv**: Первый канал - **1up.com** - **220.ro** + - **22tracks:genre** + - **22tracks:track** - **24video** - **3sat** - **4tube** @@ -47,6 +49,7 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **BeatportPro** - **Beeg** - **BehindKink** - **Bet** @@ -111,12 +114,14 @@ - **Discovery** - **divxstage**: DivxStage - **Dotsub** + - **DouyuTV** - **DRBonanza** - **Dropbox** - **DrTuber** - **DRTV** - **Dump** - **dvtv**: http://video.aktualne.cz/ + - **EaglePlatform** - **EbaumsWorld** - **EchoMsk** - **eHow** @@ -144,6 +149,7 @@ - **Firstpost** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) + - **FootyRoom** - **Foxgay** - **FoxNews** - **france2.fr:generation-quoi** @@ -161,6 +167,7 @@ - **GameSpot** - **GameStar** - **Gametrailers** + - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites - **GiantBomb** @@ -211,6 +218,7 @@ - **jpopsuki.tv** - **Jukebox** - **Kaltura** + - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** - **keek** @@ -225,6 +233,7 @@ - **Letv** - **LetvPlaylist** - **LetvTv** + - **Libsyn** - **lifenews**: LIFE | NEWS - **LiveLeak** - **livestream** @@ -304,6 +313,7 @@ - **npo.nl:radio** - **npo.nl:radio:fragment** - **NRK** + - **NRKPlaylist** - **NRKTV** - **ntv.ru** - **Nuvid** @@ -315,6 +325,7 @@ - **Ooyala** - **OpenFilm** - **orf:fm4**: radio FM4 + - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek - **parliamentlive.tv**: UK parliament videos @@ -322,10 +333,12 @@ - **PBS** - **Phoenix** - **Photobucket** + - **Pladform** - **PlanetaPlay** 
- **play.fm** - **played.to** - **Playvid** + - **Playwire** - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** @@ -334,6 +347,7 @@ - **PornHubPlaylist** - **Pornotube** - **PornoXO** + - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **Puls4** @@ -359,6 +373,7 @@ - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **RUHD** - **rutube**: Rutube videos @@ -367,6 +382,8 @@ - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU + - **safari**: safaribooksonline.com online video + - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos - **savefrom.net** @@ -409,6 +426,7 @@ - **SportBox** - **SportDeutschland** - **SRMediathek**: Saarländischer Rundfunk + - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **streamcloud.eu** @@ -478,6 +496,7 @@ - **Ubu** - **udemy** - **udemy:course** + - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt - **ustream** @@ -485,6 +504,7 @@ - **Vbox7** - **VeeHD** - **Veoh** + - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - **VGTV** @@ -505,6 +525,7 @@ - **Vidzi** - **vier** - **vier:videos** + - **Viewster** - **viki** - **vimeo** - **vimeo:album** @@ -551,6 +572,9 @@ - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - **Yam** + - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист + - **yandexmusic:track**: Яндекс.Музыка - Трек - **YesJapan** - **Ynet** - **YouJizz** @@ -569,7 +593,7 @@ - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **ZDF** - **ZDFChannel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 055e42555..652519831 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -14,6 +14,9 @@ from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import match_filter_func + +TEST_URL = 'http://localhost/sample.mp4' class YDL(FakeYDL): @@ -46,8 +49,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460, 'url': 'x'}, - {'ext': 'mp4', 'height': 460, 'url': 'y'}, + {'ext': 'webm', 'height': 460, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 460, 'url': TEST_URL}, ] info_dict = _make_result(formats) yie = YoutubeIE(ydl) @@ -60,8 +63,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720, 'url': 'a'}, - {'ext': 'mp4', 'height': 1080, 'url': 'b'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 1080, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -74,9 +77,9 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720, 'url': '_'}, - 
{'ext': 'mp4', 'height': 720, 'url': '_'}, - {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 720, 'url': TEST_URL}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -88,8 +91,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720, 'url': '_'}, - {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -133,10 +136,10 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, - {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, - {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -167,10 +170,10 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -185,8 +188,8 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -228,9 +231,9 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_video(self): formats = [ - {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, - {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -337,6 +340,8 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + +class 
TestYoutubeDL(unittest.TestCase): def test_subtitles(self): def s_formats(lang, autocaption=False): return [{ @@ -459,6 +464,73 @@ class TestFormatSelection(unittest.TestCase): self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(audiofile) + def test_match_filter(self): + class FilterYDL(YDL): + def __init__(self, *args, **kwargs): + super(FilterYDL, self).__init__(*args, **kwargs) + self.params['simulate'] = True + + def process_info(self, info_dict): + super(YDL, self).process_info(info_dict) + + def _match_entry(self, info_dict, incomplete): + res = super(FilterYDL, self)._match_entry(info_dict, incomplete) + if res is None: + self.downloaded_info_dicts.append(info_dict) + return res + + first = { + 'id': '1', + 'url': TEST_URL, + 'title': 'one', + 'extractor': 'TEST', + 'duration': 30, + 'filesize': 10 * 1024, + } + second = { + 'id': '2', + 'url': TEST_URL, + 'title': 'two', + 'extractor': 'TEST', + 'duration': 10, + 'description': 'foo', + 'filesize': 5 * 1024, + } + videos = [first, second] + + def get_videos(filter_=None): + ydl = FilterYDL({'match_filter': filter_}) + for v in videos: + ydl.process_ie_result(v, download=True) + return [v['id'] for v in ydl.downloaded_info_dicts] + + res = get_videos() + self.assertEqual(res, ['1', '2']) + + def f(v): + if v['id'] == '1': + return None + else: + return 'Video id is not 1' + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func('duration < 30') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description = foo') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description =? foo') + res = get_videos(f) + self.assertEqual(res, ['1', '2']) + + f = match_filter_func('filesize > 5KiB') + res = get_videos(f) + self.assertEqual(res, ['1']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..4dc7de7b5 --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + +class TestAES(unittest.TestCase): + def setUp(self): + self.key = self.iv = [0x20, 0x15] + 14 * [0] + self.secret_msg = b'Secret message goes here' + + def test_encrypt(self): + msg = b'message' + key = list(range(16)) + encrypted = aes_encrypt(bytes_to_intlist(msg), key) + decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + self.assertEqual(decrypted, msg) + + def test_cbc_decrypt(self): + data = bytes_to_intlist( + b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" + ) + decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_decrypt_text(self): + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' + ) + decrypted = (aes_decrypt_text(encrypted, password, 16)) + self.assertEqual(decrypted, self.secret_msg) + + password = 
intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' + ) + decrypted = (aes_decrypt_text(encrypted, password, 32)) + self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index e66264b4b..a9db42b30 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -59,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) @@ -104,11 +104,11 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): - self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo']) - self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) - self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user']) + self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259/53576664', ['vimeo']) + self.assertMatch('https://vimeo.com/user7108434', ['vimeo:user']) + self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user']) self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review']) # https://github.com/rg3/youtube-dl/issues/1930 diff --git a/test/test_execution.py b/test/test_execution.py index 60df187de..f31e51558 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# coding: utf-8 + from __future__ import unicode_literals import unittest @@ -27,5 +29,12 @@ class TestExecution(unittest.TestCase): def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + def test_cmdline_umlauts(self): + p = subprocess.Popen( + [sys.executable, 'youtube_dl/__main__.py', 'ä', '--version'], + cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) + _, stderr = p.communicate() + self.assertFalse(stderr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_http.py b/test/test_http.py index bd4d46fef..f2e305b6f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server +from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl import threading @@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase): r = ydl.extract_info('https://localhost:%d/video.html' % self.port) self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + +def _build_proxy_handler(name): + class 
HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + proxy_name = name + + def log_message(self, format, *args): + pass + + def do_GET(self): + self.send_response(200) + self.send_header('Content-Type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) + return HTTPTestRequestHandler + + +class TestProxy(unittest.TestCase): + def setUp(self): + self.proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('normal')) + self.port = self.proxy.socket.getsockname()[1] + self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) + self.proxy_thread.daemon = True + self.proxy_thread.start() + + self.cn_proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('cn')) + self.cn_port = self.cn_proxy.socket.getsockname()[1] + self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) + self.cn_proxy_thread.daemon = True + self.cn_proxy_thread.start() + + def test_proxy(self): + cn_proxy = 'localhost:{0}'.format(self.cn_port) + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + 'cn_verification_proxy': cn_proxy, + }) + url = 'http://foo.com/bar' + response = ydl.urlopen(url).read().decode('utf-8') + self.assertEqual(response, 'normal: {0}'.format(url)) + + req = compat_urllib_request.Request(url) + req.add_header('Ytdl-request-proxy', cn_proxy) + response = ydl.urlopen(req).read().decode('utf-8') + self.assertEqual(response, 'cn: {0}'.format(url)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_netrc.py b/test/test_netrc.py new file mode 100644 index 000000000..7cf3a6a2e --- /dev/null +++ b/test/test_netrc.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import ( + gen_extractors, +) + + +class TestNetRc(unittest.TestCase): + def test_netrc_present(self): + for ie in gen_extractors(): + if not hasattr(ie, '_login'): + continue + self.assertTrue( + hasattr(ie, '_NETRC_MACHINE'), + 'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py new file mode 100644 index 000000000..addb69d6f --- /dev/null +++ b/test/test_postprocessors.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.postprocessor import MetadataFromTitlePP + + +class TestMetadataFromTitle(unittest.TestCase): + def test_format_to_regex(self): + pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') + self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)') diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 3f2d8a2ba..891ee620b 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,6 +26,7 @@ from youtube_dl.extractor import ( VikiIE, ThePlatformIE, RTVEALaCartaIE, + FunnyOrDieIE, ) @@ -320,5 +321,17 @@ class TestRtveSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') +class TestFunnyOrDieSubtitles(BaseTestSubtitles): + url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' + IE 
= FunnyOrDieIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') + + if __name__ == '__main__': unittest.main() diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py index 7f816698e..6c1b7ec91 100644 --- a/test/test_unicode_literals.py +++ b/test/test_unicode_literals.py @@ -17,13 +17,22 @@ IGNORED_FILES = [ 'buildserver.py', ] +IGNORED_DIRS = [ + '.git', + '.tox', +] from test.helper import assertRegexpMatches class TestUnicodeLiterals(unittest.TestCase): def test_all_files(self): - for dirpath, _, filenames in os.walk(rootDir): + for dirpath, dirnames, filenames in os.walk(rootDir): + for ignore_dir in IGNORED_DIRS: + if ignore_dir in dirnames: + # If we remove the directory from dirnames os.walk won't + # recurse into it + dirnames.remove(ignore_dir) for basename in filenames: if not basename.endswith('.py'): continue diff --git a/test/test_utils.py b/test/test_utils.py index 3fba8ae11..abaf1ab73 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -24,6 +24,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + ExtractorError, find_xpath_attr, fix_xml_ampersands, InAdvancePagedList, @@ -38,6 +39,8 @@ from youtube_dl.utils import ( parse_iso8601, read_batch_urls, sanitize_filename, + sanitize_path, + sanitize_url_path_consecutive_slashes, shell_quote, smuggle_url, str_to_int, @@ -52,6 +55,7 @@ from youtube_dl.utils import ( urlencode_postdata, version_tuple, xpath_with_ns, + xpath_text, render_table, match_str, ) @@ -85,8 +89,11 @@ class TestUtil(unittest.TestCase): self.assertEqual( sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') + self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf') + self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf') forbidden = '"\0\\/' for fc in forbidden: @@ -128,6 +135,62 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') + def test_sanitize_path(self): + if sys.platform != 'win32': + return + + self.assertEqual(sanitize_path('abc'), 'abc') + self.assertEqual(sanitize_path('abc/def'), 'abc\\def') + self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') + self.assertEqual(sanitize_path('abc|def'), 'abc#def') + self.assertEqual(sanitize_path('<>:"|?*'), '#######') + self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def') + self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def') + + self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc') + self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc') + + self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc') + self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f') + self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + + self.assertEqual( + sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'), + 
'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s')
+
+        self.assertEqual(
+            sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'),
+            'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part')
+        self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#')
+        self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def')
+        self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#')
+
+        self.assertEqual(sanitize_path('../abc'), '..\\abc')
+        self.assertEqual(sanitize_path('../../abc'), '..\\..\\abc')
+        self.assertEqual(sanitize_path('./abc'), 'abc')
+        self.assertEqual(sanitize_path('./../abc'), '..\\abc')
+
+    def test_sanitize_url_path_consecutive_slashes(self):
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname//'),
+            'http://hostname/')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/'),
+            'http://hostname/')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/abc//'),
+            'http://hostname/abc/')
+
     def test_ordered_set(self):
         self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
         self.assertEqual(orderedSet([]), [])
@@ -137,6 +200,8 @@ class TestUtil(unittest.TestCase):
 
     def test_unescape_html(self):
         self.assertEqual(unescapeHTML('%20;'), '%20;')
+        self.assertEqual(unescapeHTML('&#x2F;'), '/')
+        self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(
             unescapeHTML('&eacute;'), 'é')
@@ -189,6 +254,17 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(find('media:song/media:author').text, 'The Author')
         self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3')
 
+    def test_xpath_text(self):
+        testxml = '''<root>
+            <div>
+                <p>Foo</p>
+            </div>
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+        self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
+        self.assertTrue(xpath_text(doc, 'div/bar') is None)
+        self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True)
+
     def test_smuggle_url(self):
         data = {"ö": "ö", "abc": [3]}
         url = 'https://foo.bar/baz?x=y#a'
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -1,8 +1,11 @@
 [tox]
-envlist = py26,py27,py33
+envlist = py26,py27,py33,py34
 [testenv]
 deps =
    nose
    coverage
-commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html
+defaultargs = test --exclude test_download.py --exclude test_age_restriction.py
+    --exclude test_subtitles.py --exclude test_write_annotations.py
+    --exclude test_youtube_lists.py
+commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html
 # test.test_download:TestDownload.test_NowVideo
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index d7c6db0ff..4fa2223ad 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -4,8 +4,10 @@ from __future__ import absolute_import, unicode_literals
 
 import collections
+import contextlib
 import datetime
 import errno
+import fileinput
 import io
 import itertools
 import json
@@ -52,12 +54,14 @@ from .utils
import ( MaxDownloadsReached, PagedList, parse_filesize, + PerRequestProxyHandler, PostProcessingError, platform_name, preferredencoding, render_table, SameFileError, sanitize_filename, + sanitize_path, std_headers, subtitles_filename, takewhile_inclusive, @@ -181,6 +185,8 @@ class YoutubeDL(object): prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use + cn_verification_proxy: URL of the proxy to use for IP address verification + on Chinese sites. (Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -247,10 +253,10 @@ class YoutubeDL(object): hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. The following parameters are not used by YoutubeDL itself, they are used by - the FileDownloader: + the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize. + xattr_set_filesize, external_downloader_args. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, @@ -317,8 +323,10 @@ class YoutubeDL(object): 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True - if '%(stitle)s' in self.params.get('outtmpl', ''): - self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') + if isinstance(params.get('outtmpl'), bytes): + self.report_warning( + 'Parameter outtmpl is bytes, but should be a unicode string. ' + 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') self._setup_opener() @@ -557,7 +565,7 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) + outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 @@ -624,7 +632,7 @@ class YoutubeDL(object): Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. extra_info is a dict containing the extra values to add to each result - ''' + ''' if ie_key: ies = [self.get_info_extractor(ie_key)] @@ -1080,8 +1088,7 @@ class YoutubeDL(object): if req_format is None: req_format = 'best' formats_to_download = [] - # The -1 is for supporting YoutubeIE - if req_format in ('-1', 'all'): + if req_format == 'all': formats_to_download = formats else: for rfstr in req_format.split(','): @@ -1208,9 +1215,6 @@ class YoutubeDL(object): if len(info_dict['title']) > 200: info_dict['title'] = info_dict['title'][:197] + '...' 
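The per-request proxy machinery that the `TestProxy` case above exercises rests on one idea: the opener's proxy handler inspects each request for an internal `Ytdl-request-proxy` header before falling back to the global `--proxy` setting, which is also how `cn_verification_proxy` requests are routed separately. A minimal sketch of such a handler, for orientation only (the class name is illustrative; the shipped `PerRequestProxyHandler` in `youtube_dl/utils.py` additionally installs default no-proxy openers so the override works even when no global proxy is configured):

```python
# Illustrative sketch, not the shipped implementation.
try:
    import urllib.request as urllib_request  # Python 3
except ImportError:
    import urllib2 as urllib_request  # Python 2


class PerRequestProxyHandlerSketch(urllib_request.ProxyHandler):
    def proxy_open(self, req, proxy, type):
        # A request may carry its own proxy in an internal header;
        # if present, it overrides the globally configured proxy.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']
        return urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
```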
- # Keep for backwards compatibility - info_dict['stitle'] = info_dict['title'] - if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] @@ -1256,7 +1260,7 @@ class YoutubeDL(object): return try: - dn = os.path.dirname(encodeFilename(filename)) + dn = os.path.dirname(sanitize_path(encodeFilename(filename))) if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: @@ -1452,8 +1456,11 @@ class YoutubeDL(object): return self._download_retcode def download_with_info_file(self, info_filename): - with io.open(info_filename, 'r', encoding='utf-8') as f: - info = json.load(f) + with contextlib.closing(fileinput.FileInput( + [info_filename], mode='r', + openhook=fileinput.hook_encoded('utf-8'))) as f: + # FileInput doesn't have a read method, we can't call json.load + info = json.loads('\n'.join(f)) try: self.process_ie_result(info, download=True) except DownloadError: @@ -1694,10 +1701,10 @@ class YoutubeDL(object): out = out.decode().strip() if re.match('[0-9a-f]+', out): self._write_string('[debug] Git HEAD: ' + out + '\n') - except: + except Exception: try: sys.exc_clear() - except: + except Exception: pass self._write_string('[debug] Python version %s - %s\n' % ( platform.python_version(), platform_name())) @@ -1757,13 +1764,20 @@ class YoutubeDL(object): # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) + proxy_handler = PerRequestProxyHandler(proxies) debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) + # The ssl context is only available in python 2.7.9 and 3.x + if hasattr(https_handler, '_context'): + if len(https_handler._context.get_ca_certs()) == 0: + self.report_warning( + 'No ssl certificates were loaded, urls that use https ' + 'won\'t work') ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) opener = compat_urllib_request.build_opener( - https_handler, proxy_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh) + # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play # (See https://github.com/rg3/youtube-dl/issues/1309 for details) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 49f382695..852b2fc3d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -9,6 +9,7 @@ import codecs import io import os import random +import shlex import sys @@ -212,6 +213,11 @@ def _real_main(argv=None): # PostProcessors postprocessors = [] # Add the metadata pp first, the other pps will copy it + if opts.metafromtitle: + postprocessors.append({ + 'key': 'MetadataFromTitle', + 'titleformat': opts.metafromtitle + }) if opts.addmetadata: postprocessors.append({'key': 'FFmpegMetadata'}) if opts.extractaudio: @@ -255,6 +261,9 @@ def _real_main(argv=None): xattr # Confuse flake8 except ImportError: parser.error('setting filesize xattr requested but python-xattr is not available') + external_downloader_args = None + if opts.external_downloader_args: + external_downloader_args = shlex.split(opts.external_downloader_args) match_filter = ( None if opts.match_filter is None else match_filter_func(opts.match_filter)) @@ -359,6 +368,8 @@ def _real_main(argv=None): 'no_color': opts.no_color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': 
opts.hls_prefer_native, + 'external_downloader_args': external_downloader_args, + 'cn_verification_proxy': opts.cn_verification_proxy, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b2bf149ef..973bcd320 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -389,7 +389,7 @@ else: stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = sp.communicate() lines, columns = map(int, out.split()) - except: + except Exception: pass return _terminal_size(columns, lines) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 3ae90021a..a0fc5ead0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -42,6 +42,8 @@ class FileDownloader(object): max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. (experimenatal) + external_downloader_args: A list of additional command-line arguments for the + external downloader. Subclasses of this one must re-define the real_download method. """ @@ -202,7 +204,7 @@ class FileDownloader(object): return try: os.utime(filename, (time.time(), filetime)) - except: + except Exception: pass return filetime @@ -316,7 +318,7 @@ class FileDownloader(object): ) continuedl_and_exists = ( - self.params.get('continuedl', False) and + self.params.get('continuedl', True) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False) ) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 51c41c704..1673b2382 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -51,6 +51,13 @@ class ExternalFD(FileDownloader): return [] return [command_option, source_address] + def _configuration_args(self, default=[]): + ex_args = self.params.get('external_downloader_args') + if ex_args is None: + return default + assert isinstance(ex_args, list) + return ex_args + def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ cmd = self._make_cmd(tmpfilename, info_dict) @@ -79,6 +86,7 @@ class CurlFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--interface') + cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -89,15 +97,16 @@ class WgetFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--bind-address') + cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd class Aria2cFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): - cmd = [ - self.exe, '-c', - '--min-split-size', '1M', '--max-connection-per-server', '4'] + cmd = [self.exe, '-c'] + cmd += self._configuration_args([ + '--min-split-size', '1M', '--max-connection-per-server', '4']) dn = os.path.dirname(tmpfilename) if dn: cmd += ['--dir', dn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 3dc796faa..4ab000d67 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -281,7 +281,7 @@ class F4mFD(FileDownloader): boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = base64.b64decode(node.text) + bootstrap = base64.b64decode(node.text.encode('ascii')) boot_info = read_bootstrap_info(bootstrap) return (boot_info, bootstrap_url) @@ -308,7 +308,7 @@ class F4mFD(FileDownloader): live = 
boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = base64.b64decode(metadata_node.text) + metadata = base64.b64decode(metadata_node.text.encode('ascii')) else: metadata = None diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 2e3dac825..d136bebd1 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -49,7 +49,7 @@ class HttpFD(FileDownloader): open_mode = 'wb' if resume_len != 0: - if self.params.get('continuedl', False): + if self.params.get('continuedl', True): self.report_resuming_byte(resume_len) request.add_header('Range', 'bytes=%d-' % resume_len) open_mode = 'ab' @@ -92,6 +92,8 @@ class HttpFD(FileDownloader): self._hook_progress({ 'filename': filename, 'status': 'finished', + 'downloaded_bytes': resume_len, + 'total_bytes': resume_len, }) return True else: @@ -218,12 +220,6 @@ class HttpFD(FileDownloader): if tmpfilename != '-': stream.close() - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'status': 'error', - }) if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, int(data_len)) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 89e98ae61..ddf5724ae 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -105,7 +105,7 @@ class RtmpFD(FileDownloader): protocol = info_dict.get('rtmp_protocol', None) real_time = info_dict.get('rtmp_real_time', False) no_resume = info_dict.get('no_resume', False) - continue_dl = info_dict.get('continuedl', False) + continue_dl = info_dict.get('continuedl', True) self.report_destination(filename) tmpfilename = self.temp_name(filename) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 370154773..0b9736f2d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,6 +37,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE @@ -105,17 +106,21 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE +from .dhm import DHMIE from .dotsub import DotsubIE +from .douyutv import DouyuTVIE from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import DRTVIE from .dvtv import DVTVIE from .dump import DumpIE +from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE from .dropbox import DropboxIE +from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .ehow import EHowIE @@ -150,6 +155,7 @@ from .fktv import ( ) from .flickr import FlickrIE from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE from .fourtube import FourTubeIE from .foxgay import FoxgayIE from .foxnews import FoxNewsIE @@ -174,6 +180,7 @@ from .gameone import ( from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gametrailers import GametrailersIE +from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE from .giantbomb import GiantBombIE @@ -228,6 +235,7 @@ from .jove import 
JoveIE from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE +from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .keezmovies import KeezMoviesIE @@ -244,6 +252,7 @@ from .letv import ( LetvTvIE, LetvPlaylistIE ) +from .libsyn import LibsynIE from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import ( @@ -303,6 +312,8 @@ from .nba import NBAIE from .nbc import ( NBCIE, NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, ) from .ndr import NDRIE from .ndtv import NDTVIE @@ -341,6 +352,7 @@ from .npo import ( ) from .nrk import ( NRKIE, + NRKPlaylistIE, NRKTVIE, ) from .ntvde import NTVDeIE @@ -355,6 +367,7 @@ from .orf import ( ORFTVthekIE, ORFOE1IE, ORFFM4IE, + ORFIPTVIE, ) from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE @@ -362,9 +375,11 @@ from .pbs import PBSIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE from .played import PlayedIE from .playfm import PlayFMIE from .playvid import PlayvidIE +from .playwire import PlaywireIE from .podomatic import PodomaticIE from .pornhd import PornHdIE from .pornhub import ( @@ -373,6 +388,7 @@ from .pornhub import ( ) from .pornotube import PornotubeIE from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .puls4 import Puls4IE @@ -398,7 +414,7 @@ from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, @@ -409,6 +425,10 @@ from .rutube import ( ) from .rutv import RUTVIE from .sandia import SandiaIE +from .safari import ( + SafariIE, + SafariCourseIE, +) from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE @@ -456,6 +476,7 @@ from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE from .srmediathek import SRMediathekIE +from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE @@ -514,6 +535,10 @@ from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE +from .twentytwotracks import ( + TwentyTwoTracksIE, + TwentyTwoTracksGenreIE +) from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -528,12 +553,15 @@ from .udemy import ( UdemyIE, UdemyCourseIE ) +from .ultimedia import UltimediaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE +from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE +from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE from .vgtv import VGTVIE @@ -551,6 +579,7 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE +from .viewster import ViewsterIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -607,6 +636,11 @@ from .yahoo import ( YahooSearchIE, ) from .yam import YamIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, +) from .yesjapan import YesJapanIE from 
.ynet import YnetIE from .youjizz import YouJizzIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 34b8b0115..39335b827 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -2,13 +2,12 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( ExtractorError, - xpath_text, float_or_none, + xpath_text, ) @@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor): 'title': 'American Dad - Putting Francine Out of Business', 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' }, + }, { + 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', + 'playlist': [ + { + 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', + 'ext': 'flv', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + }, + } + ], + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + }, }] @staticmethod @@ -80,6 +97,7 @@ class AdultSwimIE(InfoExtractor): for video in collection.get('videos'): if video.get('slug') == slug: return collection, video + return None, None def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -90,28 +108,30 @@ class AdultSwimIE(InfoExtractor): webpage = self._download_webpage(url, episode_path) # Extract the value of `bootstrappedData` from the Javascript in the page. - bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) - - try: - bootstrappedData = json.loads(bootstrappedDataJS) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % episode_path - raise ExtractorError(errmsg, cause=ve) + bootstrapped_data = self._parse_json(self._search_regex( + r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) # Downloading videos from a /videos/playlist/ URL needs to be handled differently. # NOTE: We are only downloading one video (the current one) not the playlist if is_playlist: - collections = bootstrappedData['playlists']['collections'] + collections = bootstrapped_data['playlists']['collections'] collection = self.find_collection_by_linkURL(collections, show_path) video_info = self.find_video_info(collection, episode_path) show_title = video_info['showTitle'] segment_ids = [video_info['videoPlaybackID']] else: - collections = bootstrappedData['show']['collections'] + collections = bootstrapped_data['show']['collections'] collection, video_info = self.find_collection_containing_video(collections, episode_path) - show = bootstrappedData['show'] + # Video wasn't found in the collections, let's try `slugged_video`. 
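+            # bootstrapped_data['slugged_video'] is the page's own video object;
+            # the fallback below applies only when its 'slug' matches episode_path.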
+            if video_info is None:
+                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
+                    video_info = bootstrapped_data['slugged_video']
+                else:
+                    raise ExtractorError('Unable to find video info')
+
+            show = bootstrapped_data['show']
             show_title = show['title']
             segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py
index 2b257ede7..e15c015fb 100644
--- a/youtube_dl/extractor/aftenposten.py
+++ b/youtube_dl/extractor/aftenposten.py
@@ -14,10 +14,10 @@ from ..utils import (
 
 class AftenpostenIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/([^/]+/)*(?P<id>[^/]+)-\d+\.html'
+    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://www.aftenposten.no/webtv/serier-og-programmer/sweatshopenglish/TRAILER-SWEATSHOP---I-cant-take-any-more-7800835.html?paging=&section=webtv_serierogprogrammer_sweatshop_sweatshopenglish',
+        'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
         'md5': 'fd828cd29774a729bf4d4425fe192972',
         'info_dict': {
             'id': '21039',
@@ -30,12 +30,7 @@ class AftenpostenIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        video_id = self._html_search_regex(
-            r'data-xs-id="(\d+)"', webpage, 'video id')
+        video_id = self._match_id(url)
 
         data = self._download_xml(
             'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 783b53e23..6a35ea463 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -50,6 +50,9 @@ class ARDMediathekIE(InfoExtractor):
         if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
             raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
 
+        if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
+            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available from 8 pm to 6 am.'
% video_id, expected=True) + if re.search(r'[\?&]rss($|[=&])', url): doc = parse_xml(webpage) if doc.tag == 'rss': diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 929dd3cc5..8273bd6c9 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -146,6 +146,7 @@ class ArteTVPlus7IE(InfoExtractor): formats.append(format) + self._check_formats(formats, video_id) self._sort_formats(formats) info_dict['formats'] = formats diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 7669e0e3d..29f8795d3 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -19,6 +19,7 @@ from ..utils import ( class AtresPlayerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' + _NETRC_MACHINE = 'atresplayer' _TESTS = [ { 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py new file mode 100644 index 000000000..3c7775d3e --- /dev/null +++ b/youtube_dl/extractor/beatportpro.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class BeatportProIE(InfoExtractor): + _VALID_URL = r'https?://pro\.beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', + 'md5': 'b3c34d8639a2f6a7f734382358478887', + 'info_dict': { + 'id': '5379371', + 'display_id': 'synesthesia-original-mix', + 'ext': 'mp4', + 'title': 'Froxic - Synesthesia (Original Mix)', + }, + }, { + 'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896', + 'md5': 'e44c3025dfa38c6577fbaeb43da43514', + 'info_dict': { + 'id': '3756896', + 'display_id': 'love-and-war-original-mix', + 'ext': 'mp3', + 'title': 'Wolfgang Gartner - Love & War (Original Mix)', + }, + }, { + 'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', + 'md5': 'a1fd8e8046de3950fd039304c186c05f', + 'info_dict': { + 'id': '4991738', + 'display_id': 'birds-original-mix', + 'ext': 'mp4', + 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + playables = self._parse_json( + self._search_regex( + r'window\.Playables\s*=\s*({.+?});', webpage, + 'playables info', flags=re.DOTALL), + track_id) + + track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + + title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + if track['mix']: + title += ' (' + track['mix'] + ')' + + formats = [] + for ext, info in track['preview'].items(): + if not info['url']: + continue + fmt = { + 'url': info['url'], + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } + if ext == 'mp3': + fmt['preference'] = 0 + fmt['acodec'] = 'mp3' + fmt['abr'] = 96 + fmt['asr'] = 44100 + elif ext == 'mp4': + fmt['preference'] = 1 + fmt['acodec'] = 'aac' + fmt['abr'] = 96 + fmt['asr'] = 44100 + formats.append(fmt) + self._sort_formats(formats) + + images = [] + for name, info in track['images'].items(): + image_url = info.get('url') + if 
name == 'dynamic' or not image_url: + continue + image = { + 'id': name, + 'url': image_url, + 'height': int_or_none(info.get('height')), + 'width': int_or_none(info.get('width')), + } + images.append(image) + + return { + 'id': compat_str(track.get('id')) or track_id, + 'display_id': track.get('slug') or display_id, + 'title': title, + 'formats': formats, + 'thumbnails': images, + } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 4bcc897c9..809287d14 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -41,7 +41,7 @@ class BreakIE(InfoExtractor): 'tbr': media['bitRate'], 'width': media['width'], 'height': media['height'], - } for media in info['media']] + } for media in info['media'] if media.get('mediaPurpose') == 'play'] if not formats: formats.append({ diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index abf8cc280..0fa720ee8 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -105,6 +105,7 @@ class CloudyIE(InfoExtractor): webpage = self._download_webpage(url, video_id) file_key = self._search_regex( - r'filekey\s*=\s*"([^"]+)"', webpage, 'file_key') + [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], + webpage, 'file_key') return self._extract_video(video_host, video_id, file_key) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 90ea07438..0a77e951c 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln|ktvk)(?:-ap)?|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -45,6 +45,9 @@ class CNNIE(InfoExtractor): 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7977fa8d0..e5245ec3f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -767,6 +767,10 @@ class InfoExtractor(object): formats) def _is_valid_url(self, url, video_id, item='video'): + url = self._proto_relative_url(url, scheme='http:') + # For now assume non HTTP(S) URLs always valid + if not (url.startswith('http://') or url.startswith('https://')): + return True try: self._request_webpage(url, video_id, 'Checking %s URL' % item) return True @@ -835,7 +839,7 @@ class InfoExtractor(object): m3u8_id=None): formats = [{ - 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])), + 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -879,8 +883,13 @@ class InfoExtractor(object): formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + format_id = [] + if m3u8_id: + format_id.append(m3u8_id) + last_media_name = last_media.get('NAME') if last_media else None + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { - 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])), + 'format_id': '-'.join(format_id), 'url': 
format_url(line.strip()), 'tbr': tbr, 'ext': ext, @@ -1053,6 +1062,9 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + def _subtitles_timecode(self, seconds): + return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f1da7d09b..6ded723c9 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -23,12 +23,12 @@ from ..utils import ( ) from ..aes import ( aes_cbc_decrypt, - inc, ) class CrunchyrollIE(InfoExtractor): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' + _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -101,13 +101,6 @@ class CrunchyrollIE(InfoExtractor): key = obfuscate_key(id) - class Counter: - __value = iv - - def next_value(self): - temp = self.__value - self.__value = inc(self.__value) - return temp decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) return zlib.decompress(decrypted_data) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 42b20a46d..47d58330b 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): def _build_request(url): """Build a request with the family filter disabled""" request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - request.add_header('Cookie', 'ff=off') + request.add_header('Cookie', 'family_filter=off; ff=off') return request @@ -46,13 +45,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _TESTS = [ { - 'url': 'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', - 'md5': '392c4b85a60a90dc4792da41ce3144eb', + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', 'info_dict': { - 'id': 'x33vw9', + 'id': 'x2iuewm', 'ext': 'mp4', - 'uploader': 'Amphora Alex and Van .', - 'title': 'Tutoriel de Youtubeur"DL DES VIDEO DE YOUTUBE"', + 'uploader': 'IGN', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', } }, # Vevo video @@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') + embed_request = self._build_request(embed_url) + embed_page = self._download_webpage( + embed_request, video_id, 'Downloading embed page') info = self._search_regex(r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE) info = json.loads(info) diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py new file mode 100644 index 000000000..3ed1f1663 --- /dev/null +++ b/youtube_dl/extractor/dhm.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + parse_duration, +) + + +class DHMIE(InfoExtractor): + IE_DESC = 
'Filmarchiv - Deutsches Historisches Museum' + _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/', + 'md5': '11c475f670209bf6acca0b2b7ef51827', + 'info_dict': { + 'id': 'the-marshallplan-at-work-in-west-germany', + 'ext': 'flv', + 'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE', + 'description': 'md5:1fabd480c153f97b07add61c44407c82', + 'duration': 660, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/', + 'md5': '09890226332476a3e3f6f2cb74734aa5', + 'info_dict': { + 'id': 'rolle-1', + 'ext': 'flv', + 'title': 'ROLLE 1', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r"file\s*:\s*'([^']+)'", webpage, 'playlist url') + + playlist = self._download_xml(playlist_url, video_id) + + track = playlist.find( + './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track') + + video_url = xpath_text( + track, './{http://xspf.org/ns/0/}location', + 'video url', fatal=True) + thumbnail = xpath_text( + track, './{http://xspf.org/ns/0/}image', + 'thumbnail') + + title = self._search_regex( + [r'dc:title="([^"]+)"', r'<title> »([^<]+)</title>'], + webpage, 'title').strip() + description = self._html_search_regex( + r'<p><strong>Description:</strong>(.+?)</p>', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)', + webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py new file mode 100644 index 000000000..479430c51 --- /dev/null +++ b/youtube_dl/extractor/douyutv.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import time +from .common import InfoExtractor +from ..utils import (ExtractorError, unescapeHTML) +from ..compat import (compat_str, compat_basestring) + + +class DouyuTVIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'http://www.douyutv.com/iseven', + 'info_dict': { + 'id': '17732', + 'display_id': 'iseven', + 'ext': 'flv', + 'title': 're:^清晨醒脑!T-ara根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': '7师傅',
+            'uploader_id': '431925',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.douyutv.com/85982',
+        'info_dict': {
+            'id': '85982',
+            'display_id': '85982',
+            'ext': 'flv',
+            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'douyu小漠',
+            'uploader_id': '3769985',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        if video_id.isdigit():
+            room_id = video_id
+        else:
+            page = self._download_webpage(url, video_id)
+            room_id = self._html_search_regex(
+                r'"room_id"\s*:\s*(\d+),', page, 'room id')
+
+        prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+            room_id, int(time.time()))
+
+        auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
+        config = self._download_json(
+            'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+            video_id)
+
+        data = config['data']
+
+        error_code = config.get('error', 0)
+        if error_code != 0:
+            error_desc = 'Server reported error %i' % error_code
+            if isinstance(data, (compat_str, compat_basestring)):
+                error_desc += ': ' + data
+            raise ExtractorError(error_desc, expected=True)
+
+        show_status = data.get('show_status')
+        # 1 = live, 2 = offline
+        if show_status == '2':
+            raise ExtractorError(
+                'Live stream is offline', expected=True)
+
+        base_url = data['rtmp_url']
+        live_path = data['rtmp_live']
+
+        title = self._live_title(unescapeHTML(data['room_name']))
+        description = data.get('show_details')
+        thumbnail = data.get('room_src')
+
+        uploader = data.get('nickname')
+        uploader_id = data.get('owner_uid')
+
+        multi_formats = data.get('rtmp_multi_bitrate')
+        if not isinstance(multi_formats, dict):
+            multi_formats = {}
+        multi_formats['live'] = live_path
+
+        formats = [{
+            'url': '%s/%s' % (base_url, format_path),
+            'format_id': format_id,
+            'preference': 1 if format_id == 'live' else 0,
+        } for format_id, format_path in multi_formats.items()]
+        self._sort_formats(formats)
+
+        return {
+            'id': room_id,
+            'display_id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644
index 000000000..e43bc81b2
--- /dev/null
+++ b/youtube_dl/extractor/dumpert.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class DumpertIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',
+        'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+        'info_dict': {
+            'id': '6646981/951bc60f',
+            'ext': 'mp4',
+            'title': 'Ik heb nieuws voor je',
+            'description': 'Niet schrikken hoor',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        files_base64 = self._search_regex(
+            r'data-files="([^"]+)"', webpage, 'data files')
+
+        files = self._parse_json(
base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), + video_id) + + quality = qualities(['flv', 'mobile', 'tablet', '720p']) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'quality': quality(format_id), + } for format_id, video_url in files.items() if format_id != 'still'] + self._sort_formats(formats) + + title = self._html_search_meta( + 'title', webpage) or self._og_search_title(webpage) + description = self._html_search_meta( + 'description', webpage) or self._og_search_description(webpage) + thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats + } diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py new file mode 100644 index 000000000..7173371ee --- /dev/null +++ b/youtube_dl/extractor/eagleplatform.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class EaglePlatformIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + eagleplatform:(?P<custom_host>[^/]+):| + https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id= + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # http://lenta.ru/news/2015/03/06/navalny/ + 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', + 'md5': '0b7994faa2bd5c0f69a3db6db28d078d', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + }, { + # http://muz-tv.ru/play/7129/ + # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true + 'url': 'eagleplatform:media.clipyou.ru:12820', + 'md5': '6c2ebeab03b739597ce8d86339d5a905', + 'info_dict': { + 'id': '12820', + 'ext': 'mp4', + 'title': "'O Sole Mio", + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 216, + 'view_count': int, + }, + }] + + def _handle_error(self, response): + status = int_or_none(response.get('status', 200)) + if status != 200: + raise ExtractorError(' '.join(response['errors']), expected=True) + + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + self._handle_error(response) + return response + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + + player_data = self._download_json( + 'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + + media = player_data['data']['playlist']['viewports'][0]['medialist'][0] + + title = media['title'] + description = media.get('description') + thumbnail = media.get('snapshot') + duration = int_or_none(media.get('duration')) + view_count = int_or_none(media.get('views')) + + age_restriction = media.get('age_restriction') + age_limit = None + if age_restriction: + age_limit = 0 if age_restriction == 'allow_all' else 18 + + m3u8_data = self._download_json( + media['sources']['secure_m3u8']['auto'], + video_id, 'Downloading m3u8 JSON') + + formats = self._extract_m3u8_formats( + m3u8_data['data'][0], video_id, + 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + 
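# Note: media['sources']['secure_m3u8']['auto'] returns JSON whose
+        # data[0] is the real m3u8 playlist URL, hence the extra
+        # _download_json hop before _extract_m3u8_formats above.
+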
return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index fb5dbbe2b..0b61ea0ba 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import json import random -import re from .common import InfoExtractor from ..compat import ( @@ -103,20 +102,23 @@ class EightTracksIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - json_like = self._search_regex( - r"(?s)PAGE.mix = (.*?);\n", webpage, 'trax information') - data = json.loads(json_like) + data = self._parse_json( + self._search_regex( + r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'), + playlist_id) session = str(random.randint(0, 1000000000)) mix_id = data['id'] track_count = data['tracks_count'] duration = data['duration'] avg_song_duration = float(duration) / track_count + # duration is sometimes negative, use predefined avg duration + if avg_song_duration <= 0: + avg_song_duration = 300 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) next_url = first_url entries = [] diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 79e2fbd39..0cbca90b0 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -1,11 +1,17 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ExtractorError class EroProfileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)' - _TEST = { + _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?' 
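+    # _NETRC_MACHINE enables --netrc credential lookup for this site; the new
+    # test/test_netrc.py above requires it on every extractor that defines _login.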
+ _NETRC_MACHINE = 'eroprofile' + _TESTS = [{ 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore', 'md5': 'c26f351332edf23e1ea28ce9ec9de32f', 'info_dict': { @@ -16,13 +22,55 @@ class EroProfileIE(InfoExtractor): 'thumbnail': 're:https?://.*\.jpg', 'age_limit': 18, } - } + }, { + 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', + 'md5': '1baa9602ede46ce904c431f5418d8916', + 'info_dict': { + 'id': '1133519', + 'ext': 'm4v', + 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', + 'thumbnail': 're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'skip': 'Requires login', + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + query = compat_urllib_parse.urlencode({ + 'username': username, + 'password': password, + 'url': 'http://www.eroprofile.com/', + }) + login_url = self._LOGIN_URL + query + login_page = self._download_webpage(login_url, None, False) + + m = re.search(r'Your username or password was incorrect\.', login_page) + if m: + raise ExtractorError( + 'Wrong username and/or password.', expected=True) + + self.report_login() + redirect_url = self._search_regex( + r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') + self._download_webpage(redirect_url, None, False) + + def _real_initialize(self): + self._login() def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + m = re.search(r'You must be logged in to view this video\.', webpage) + if m: + raise ExtractorError( + 'This video requires login. Please specify a username and password and try again.', expected=True) + video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 36ba33128..c826a5404 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -4,11 +4,11 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlparse, + compat_parse_qs, compat_urllib_request, - compat_urllib_parse, ) from ..utils import ( + qualities, str_to_int, ) @@ -17,7 +17,7 @@ class ExtremeTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', 'info_dict': { 'id': '652431', 'ext': 'mp4', @@ -49,19 +49,27 @@ class ExtremeTubeIE(InfoExtractor): r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', webpage, 'view count', fatal=False)) - video_url = compat_urllib_parse.unquote(self._html_search_regex( - r'video_url=(.+?)&', webpage, 'video_url')) - path = compat_urllib_parse_urlparse(video_url).path - format = path.split('/')[5].split('_')[:2] - format = "-".join(format) + flash_vars = compat_parse_qs(self._search_regex( + r'<param[^>]+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + + formats = [] + quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) + for k, vals in flash_vars.items(): + m = re.match(r'quality_(?P<quality>[0-9]+p)$', k) + if m is not None: + formats.append({ + 'format_id': m.group('quality'), + 'quality': quality(m.group('quality')), + 'url': vals[0], + }) + + 
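# Each flashvars key of the form 'quality_<res>p' (e.g. quality_720p) carries
+        # one stream URL; qualities() ranks the listed resolutions so higher ones
+        # are preferred.
+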
self._sort_formats(formats) return { 'id': video_id, 'title': video_title, + 'formats': formats, 'uploader': uploader, 'view_count': view_count, - 'url': video_url, - 'format': format, - 'format_id': format, 'age_limit': 18, } diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py new file mode 100644 index 000000000..2b4691ae8 --- /dev/null +++ b/youtube_dl/extractor/footyroom.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FootyRoomIE(InfoExtractor): + _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', + 'info_dict': { + 'id': 'schalke-04-0-2-real-madrid-2015-02', + 'title': 'Schalke 04 0 – 2 Real Madrid', + }, + 'playlist_count': 3, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + playlist = self._parse_json( + self._search_regex( + r'VideoSelector\.load\((\[.+?\])\);', webpage, 'video selector'), + playlist_id) + + playlist_title = self._og_search_title(webpage) + + entries = [] + for video in playlist: + payload = video.get('payload') + if not payload: + continue + playwire_url = self._search_regex( + r'data-config="([^"]+)"', payload, + 'playwire url', default=None) + if playwire_url: + entries.append(self.url_result(playwire_url, 'Playwire')) + + return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index a49fc1151..dd87257c4 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -50,7 +50,6 @@ class FunnyOrDieIE(InfoExtractor): bitrates.sort() formats = [] - for bitrate in bitrates: for link in links: formats.append({ @@ -59,6 +58,13 @@ class FunnyOrDieIE(InfoExtractor): 'vbr': bitrate, }) + subtitles = {} + for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage): + subtitles[src_lang] = [{ + 'ext': src.split('/')[-1], + 'url': 'http://www.funnyordie.com%s' % src, + }] + post_json = self._search_regex( r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') post = json.loads(post_json) @@ -69,4 +75,5 @@ class FunnyOrDieIE(InfoExtractor): 'description': post.get('description'), 'thumbnail': post.get('picture'), 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py new file mode 100644 index 000000000..ea32b621c --- /dev/null +++ b/youtube_dl/extractor/gazeta.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class GazetaIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' + _TESTS = [{ + 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', + 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', + 'info_dict': { + 'id': '205566', + 'ext': 'mp4', + 'title': '«70–80 процентов гражданских в Донецке на грани голода»', + 'description': 'md5:38617526050bd17b234728e7f9620a71', + 'thumbnail': 're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + display_id = mobj.group('id') + 
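# The article page embeds an EaglePlatform player; its '?p=embed' variant
+        # serves the bare player markup, whose data-id feeds EaglePlatformIE below.
+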
embed_url = '%s?p=embed' % mobj.group('url') + embed_page = self._download_webpage( + embed_url, display_id, 'Downloading embed page') + + video_id = self._search_regex( + r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') + + return self.url_result( + 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f7b467b0a..51796f3a4 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -12,6 +12,7 @@ from ..utils import remove_end class GDCVaultIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)' + _NETRC_MACHINE = 'gdcvault' _TESTS = [ { 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 27e2bc300..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -26,8 +26,10 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_basename, + xpath_text, ) from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE @@ -526,6 +528,17 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Viddler'], }, + # Libsyn embed + { + 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', + 'info_dict': { + 'id': '3377616', + 'ext': 'mp3', + 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': 'md5:601cb790edd05908957dae8aaa866465', + 'upload_date': '20150220', + }, + }, # jwplayer YouTube { 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', @@ -569,6 +582,75 @@ class GenericIE(InfoExtractor): 'title': 'John Carlson Postgame 2/25/15', }, }, + # Eagle.Platform embed (generic URL) + { + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + }, + # ClipYou (Eagle.Platform) embed (custom URL) + { + 'url': 'http://muz-tv.ru/play/7129/', + 'info_dict': { + 'id': '12820', + 'ext': 'mp4', + 'title': "'O Sole Mio", + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 216, + 'view_count': int, + }, + }, + # Pladform embed + { + 'url': 'http://muz-tv.ru/kinozal/view/7400/', + 'info_dict': { + 'id': '100183293', + 'ext': 'mp4', + 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', + 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 694, + 'age_limit': 0, + }, + }, + # 5min embed + { + 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', + 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', + 'info_dict': { + 'id': '518726732', + 'ext': 'mp4', + 'title': 'Facebook Creates "On This Day" | Crunch Report', + }, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # NBC Sports vplayer embed + { 
+ 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', + 'info_dict': { + 'id': 'ln7x1qSThw4k', + 'ext': 'flv', + 'title': "PFT Live: New leader in the 'new-look' defense", + 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + }, + } ] def report_following_redirect(self, new_url): @@ -580,11 +662,24 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - entries = [{ - '_type': 'url', - 'url': e.find('link').text, - 'title': e.find('title').text, - } for e in doc.findall('./channel/item')] + entries = [] + for it in doc.findall('./channel/item'): + next_url = xpath_text(it, 'link', fatal=False) + if not next_url: + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + + if not next_url: + continue + + entries.append({ + '_type': 'url', + 'url': next_url, + 'title': it.find('title').text, + }) return { '_type': 'playlist', @@ -943,6 +1038,19 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for NYTimes player + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + + # Look for Libsyn player + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or @@ -1131,6 +1239,35 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') + # Look for Eagle.Platform embeds + mobj = re.search( + r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'EaglePlatform') + + # Look for ClipYou (uses Eagle.Platform) embeds + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + + # Look for Pladform embeds + mobj = re.search( + r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Pladform') + + # Look for 5min embeds + mobj = re.search( + r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) + if mobj is not None: + return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + + # Look for NBC Sports VPlayer embeds + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -1187,10 +1324,16 @@ class GenericIE(InfoExtractor): # HTML5 video found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) if not found: + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( 
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)', + r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, webpage) + if not found: + # Look also in Refresh HTTP header + refresh_header = head_response.headers.get('Refresh') + if refresh_header: + found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = found.group(1) self.report_following_redirect(new_url) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 29638a194..8a95793ca 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -20,7 +20,7 @@ class GloboIE(InfoExtractor): _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' - _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' + _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' _VIDEOID_REGEXES = [ r'\bdata-video-id="(\d+)"', diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py index 848d17beb..36ad4915c 100644 --- a/youtube_dl/extractor/grooveshark.py +++ b/youtube_dl/extractor/grooveshark.py @@ -140,9 +140,9 @@ class GroovesharkIE(InfoExtractor): if webpage is not None: o = GroovesharkHtmlParser.extract_object_tags(webpage) - return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) + return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'] - return (webpage, None) + return webpage, None def _real_initialize(self): self.ts = int(time.time() * 1000) # timestamp in millis @@ -154,7 +154,7 @@ class GroovesharkIE(InfoExtractor): swf_referer = None if self.do_playerpage_request: (_, player_objs) = self._get_playerpage(url) - if player_objs is not None: + if player_objs: swf_referer = self._build_swf_referer(url, player_objs[0]) self.to_screen('SWF Referer: %s' % swf_referer) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 8094cc2e4..d0720ff56 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -15,10 +14,10 @@ class JeuxVideoIE(InfoExtractor): 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', 'md5': '046e491afb32a8aaac1f44dd4ddd54ee', 'info_dict': { - 'id': '5182', + 'id': '114765', 'ext': 'mp4', - 'title': 'GC 2013 : Tearaway nous présente ses papiers d\'identité', - 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', + 'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité', + 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.', }, } @@ -26,26 +25,29 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group(1) webpage = self._download_webpage(url, title) - xml_link = self._html_search_regex( - r'<param name="flashvars" value="config=(.*?)" />', + title = self._html_search_meta('name', webpage) + config_url = self._html_search_regex( + r'data-src="(/contenu/medias/video.php.*?)"', webpage, 'config URL') + 
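
A note on the generic.py change above: the extractor now funnels both redirect sources through one pattern, the payload of a `<meta http-equiv="refresh">` tag and, as a new fallback, the raw `Refresh` HTTP header. A minimal standalone sketch of that matching; the sample strings are made up:

```python
import re

# Pattern factored out in generic.py: an optional two-digit delay, a
# semicolon, then URL=... with an optional opening quote around the target.
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'

# Payload of a <meta http-equiv="refresh" content="..."> attribute
# (hypothetical sample).
meta_content = "0;url='http://example.com/video.html'"
print(re.search(REDIRECT_REGEX, meta_content).group(1))
# http://example.com/video.html

# The same pattern applied to a raw Refresh HTTP header, the new fallback
# when the page body contains no <meta> tag (hypothetical sample).
refresh_header = '3; url=http://example.com/other.html'
print(re.search(REDIRECT_REGEX, refresh_header).group(1))
# http://example.com/other.html
```

Factoring the pattern into `REDIRECT_REGEX` keeps the two lookups from drifting apart.
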
config_url = 'http://www.jeuxvideo.com' + config_url video_id = self._search_regex( - r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', - xml_link, 'video ID') + r'id=(\d+)', + config_url, 'video ID') - config = self._download_xml( - xml_link, title, 'Downloading XML config') - info_json = config.find('format.json').text - info = json.loads(info_json)['versions'][0] + config = self._download_json( + config_url, title, 'Downloading JSON config') - video_url = 'http://video720.jeuxvideo.com/' + info['file'] + formats = [{ + 'url': source['file'], + 'format_id': source['label'], + 'resolution': source['label'], + } for source in reversed(config['sources'])] return { 'id': video_id, - 'title': config.find('titre_video').text, - 'ext': 'mp4', - 'url': video_url, + 'title': title, + 'formats': formats, 'description': self._og_search_description(webpage), - 'thumbnail': config.find('image').text, + 'thumbnail': config.get('image'), } diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py new file mode 100644 index 000000000..2bb078036 --- /dev/null +++ b/youtube_dl/extractor/kanalplay.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, +) + + +class KanalPlayIE(InfoExtractor): + IE_DESC = 'Kanal 5/9/11 Play' + _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', + 'info_dict': { + 'id': '3270012277', + 'ext': 'flv', + 'title': 'Saknar både dusch och avlopp', + 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', + 'duration': 2636.36, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', + 'only_matching': True, + }, { + 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', + 'only_matching': True, + }] + + def _fix_subtitles(self, subs): + return '\r\n\r\n'.join( + '%s\r\n%s --> %s\r\n%s' + % ( + num, + self._subtitles_timecode(item['startMillis'] / 1000.0), + self._subtitles_timecode(item['endMillis'] / 1000.0), + item['text'], + ) for num, item in enumerate(subs, 1)) + + def _get_subtitles(self, channel_id, video_id): + subs = self._download_json( + 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), + video_id, 'Downloading subtitles JSON', fatal=False) + return {'se': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + channel_id = mobj.group('channel_id') + + video = self._download_json( + 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), + video_id) + + reasons_for_no_streams = video.get('reasonsForNoStreams') + if reasons_for_no_streams: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), + expected=True) + + title = video['title'] + description = video.get('description') + duration = float_or_none(video.get('length'), 1000) + thumbnail = video.get('posterUrl') + + stream_base_url = video['streamBaseUrl'] + + formats = [{ + 'url': stream_base_url, + 'play_path': stream['source'], + 'ext': 'flv', + 'tbr': float_or_none(stream.get('bitrate'), 1000), + 'rtmp_real_time': True, + } for stream in video['streams']] 
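
A note on the KanalPlay extractor above: `_fix_subtitles` converts the site's JSON cue list into SRT text. A self-contained sketch of the same conversion, using a local stand-in for `InfoExtractor._subtitles_timecode` (assumed to produce `HH:MM:SS.mmm`, the format of the `_seconds2str` helper this diff retires in nrk.py further down) and made-up cues:

```python
def subtitles_timecode(seconds):
    # Stand-in for InfoExtractor._subtitles_timecode (assumed format).
    return '%02d:%02d:%02d.%03d' % (
        seconds // 3600, (seconds % 3600) // 60, seconds % 60,
        (seconds % 1) * 1000)

def fix_subtitles(subs):
    # One SRT block per cue: sequence number, time range, text.
    return '\r\n\r\n'.join(
        '%s\r\n%s --> %s\r\n%s' % (
            num,
            subtitles_timecode(item['startMillis'] / 1000.0),
            subtitles_timecode(item['endMillis'] / 1000.0),
            item['text'],
        ) for num, item in enumerate(subs, 1))

cues = [  # made-up cues in the shape the Kanal Play API is assumed to use
    {'startMillis': 0, 'endMillis': 1500, 'text': 'Hej'},
    {'startMillis': 2000, 'endMillis': 4000, 'text': 'Hej då'},
]
print(fix_subtitles(cues))
# 1
# 00:00:00.000 --> 00:00:01.500
# Hej
# ...
```
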
+ self._sort_formats(formats) + + subtitles = {} + if video.get('hasSubtitle'): + subtitles = self.extract_subtitles(channel_id, video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index e46954b47..96f95979a 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor): description = self._og_search_description(webpage, default=None) thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage) duration = int_or_none(flashvars.get('duration')) - width = int_or_none(self._og_search_property('video:width', webpage, 'video width')) - height = int_or_none(self._og_search_property('video:height', webpage, 'video height')) + width = int_or_none(self._og_search_property( + 'video:width', webpage, 'video width', default=None)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, 'video height', default=None)) return { 'id': video_id, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 583ce35b9..1484ac0d2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -7,8 +7,9 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_urllib_parse, + compat_urllib_request, + compat_urlparse, ) from ..utils import ( determine_ext, @@ -39,12 +40,20 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, - 'expected_warnings': [ - 'publish time' - ] + }, { + 'note': 'This video is available only in Mainland China, thus a proxy is needed', + 'url': 'http://www.letv.com/ptv/vplay/1118082.html', + 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'info_dict': { + 'id': '1118082', + 'ext': 'mp4', + 'title': '与龙共舞 完整版', + 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', + }, + 'params': { + 'cn_verification_proxy': 'http://proxy.uku.im:8888' + }, }] - # http://www.letv.com/ptv/vplay/1118082.html - # This video is available only in Mainland China @staticmethod def urshift(val, n): @@ -76,9 +85,16 @@ class LetvIE(InfoExtractor): 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.letv.com' } + play_json_req = compat_urllib_request.Request( + 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) + ) + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) + play_json = self._download_json( - 'http://api.letv.com/mms/out/video/playJson?' 
+ compat_urllib_parse.urlencode(params), - media_id, 'playJson data') + play_json_req, + media_id, 'Downloading playJson data') # Check for errors playstatus = play_json['playstatus'] @@ -114,7 +130,8 @@ class LetvIE(InfoExtractor): url_info_dict = { 'url': media_url, - 'ext': determine_ext(dispatch[format_id][1]) + 'ext': determine_ext(dispatch[format_id][1]), + 'format_id': format_id, } if format_id[-1:] == 'p': @@ -123,7 +140,7 @@ class LetvIE(InfoExtractor): urls.append(url_info_dict) publish_time = parse_iso8601(self._html_search_regex( - r'发布时间 ([^<>]+) ', page, 'publish time', fatal=False), + r'发布时间 ([^<>]+) ', page, 'publish time', default=None), delimiter=' ', timezone=datetime.timedelta(hours=8)) description = self._html_search_meta('description', page, fatal=False) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py new file mode 100644 index 000000000..9ab1416f5 --- /dev/null +++ b/youtube_dl/extractor/libsyn.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class LibsynIE(InfoExtractor): + _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', + 'md5': '443360ee1b58007bc3dcf09b41d093bb', + 'info_dict': { + 'id': '3377616', + 'ext': 'mp3', + 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': 'md5:601cb790edd05908957dae8aaa866465', + 'upload_date': '20150220', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = [{ + 'url': media_url, + } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] + + podcast_title = self._search_regex( + r'<h2>([^<]+)</h2>', webpage, 'title') + episode_title = self._search_regex( + r'<h3>([^<]+)</h3>', webpage, 'title', default=None) + + title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + + description = self._html_search_regex( + r'<div id="info_text_body">(.+?)</div>', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + release_date = unified_strdate(self._search_regex( + r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': release_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 3642089f7..2467f8bdd 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re import json +import itertools from .common import InfoExtractor from ..compat import ( @@ -41,6 +42,13 @@ class LivestreamIE(InfoExtractor): }, 'playlist_mincount': 4, }, { + 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'info_dict': { + 'title': 'Tata Steel Chess', + 'id': '3705884', + }, + 'playlist_mincount': 60, + }, { 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', 'only_matching': True, }] @@ -117,6 +125,30 @@ class LivestreamIE(InfoExtractor): 'view_count': 
video_data.get('views'), } + def _extract_event(self, info): + event_id = compat_str(info['id']) + account = compat_str(info['owner_account_id']) + root_url = ( + 'https://new.livestream.com/api/accounts/{account}/events/{event}/' + 'feed.json'.format(account=account, event=event_id)) + + def _extract_videos(): + last_video = None + for i in itertools.count(1): + if last_video is None: + info_url = root_url + else: + info_url = '{root}?&id={id}&newer=-1&type=video'.format( + root=root_url, id=last_video) + videos_info = self._download_json(info_url, event_id, 'Downloading page {0}'.format(i))['data'] + videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] + if not videos_info: + break + for v in videos_info: + yield self._extract_video_info(v) + last_video = videos_info[-1]['id'] + return self.playlist_result(_extract_videos(), event_id, info['full_name']) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -144,14 +176,13 @@ class LivestreamIE(InfoExtractor): result = result and compat_str(vdata['data']['id']) == vid return result - videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] - if is_relevant(video_data, video_id)] if video_id is None: # This is an event page: - return self.playlist_result( - videos, '%s' % info['id'], info['full_name']) + return self._extract_event(info) else: + videos = [self._extract_video_info(video_data['data']) + for video_data in info['feed']['data'] + if is_relevant(video_data, video_id)] if not videos: raise ExtractorError('Cannot find video %s' % video_id) return videos[0] diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 9c2fbdd96..e3236f7b5 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -52,6 +52,7 @@ class LRTIE(InfoExtractor): 'url': data['streamer'], 'play_path': 'mp4:%s' % data['file'], 'preference': -1, + 'rtmp_real_time': True, }) else: formats.extend( diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5dc22da22..cfd3b14f4 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -15,18 +15,73 @@ from ..utils import ( ) -class LyndaIE(InfoExtractor): +class LyndaBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' + _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' + _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' + _NETRC_MACHINE = 'lynda' + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'username': username, + 'password': password, + 'remember': 'false', + 'stayPut': 'false' + } + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + login_page = self._download_webpage( + request, None, 'Logging in as %s' % username) + + # Not (yet) logged in + m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) + if m is not None: + response = m.group('json') + response_json = json.loads(response) + state = response_json['state'] + + if state == 'notlogged': + raise ExtractorError( + 'Unable to login, incorrect username and/or password', + expected=True) + + # This is when we get popup: + # > You're already logged in to lynda.com on two devices. + # > If you log in here, we'll log you out of another device. + # So, we need to confirm this. 
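
Stepping back to the livestream.py change above: `_extract_event` pages through the event feed with a cursor, requesting batches until an empty one comes back. A reduced sketch of that loop; the page fetcher and its data are invented:

```python
import itertools

def fetch_page(last_id):
    # Invented stand-in for the _download_json call against feed.json;
    # returns two videos on the first request, nothing afterwards.
    pages = {
        None: [{'type': 'video', 'data': {'id': 1}},
               {'type': 'video', 'data': {'id': 2}}],
        2: [],
    }
    return pages[last_id]

def iter_videos():
    last_video = None
    for page_num in itertools.count(1):  # page counter, used upstream only for logging
        batch = [entry['data'] for entry in fetch_page(last_video)
                 if entry['type'] == 'video']
        if not batch:
            break
        for video in batch:
            yield video
        last_video = batch[-1]['id']  # cursor for the next request

print(list(iter_videos()))  # [{'id': 1}, {'id': 2}]
```
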
+ if state == 'conflicted': + confirm_form = { + 'username': '', + 'password': '', + 'resolve': 'true', + 'remember': 'false', + 'stayPut': 'false', + } + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) + login_page = self._download_webpage( + request, None, + 'Confirming log in and log out from another device') + + if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + raise ExtractorError('Unable to log in') + + +class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(\d+)' - _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' + _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)' _NETRC_MACHINE = 'lynda' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' - ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' - _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', @@ -41,23 +96,22 @@ class LyndaIE(InfoExtractor): 'only_matching': True, }] - def _real_initialize(self): - self._login() - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) - page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, - 'Downloading video JSON') + page = self._download_webpage( + 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, + video_id, 'Downloading video JSON') video_json = json.loads(page) if 'Status' in video_json: - raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True) + raise ExtractorError( + 'lynda returned error: %s' % video_json['Message'], expected=True) if video_json['HasAccess'] is False: raise ExtractorError( - 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True) + 'Video %s is only available for members. ' + % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True) video_id = compat_str(video_json['ID']) duration = video_json['DurationInSeconds'] @@ -100,50 +154,9 @@ class LyndaIE(InfoExtractor): 'formats': formats } - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_form = { - 'username': username, - 'password': password, - 'remember': 'false', - 'stayPut': 'false' - } - request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) - login_page = self._download_webpage(request, None, 'Logging in as %s' % username) - - # Not (yet) logged in - m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) - if m is not None: - response = m.group('json') - response_json = json.loads(response) - state = response_json['state'] - - if state == 'notlogged': - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) - - # This is when we get popup: - # > You're already logged in to lynda.com on two devices. - # > If you log in here, we'll log you out of another device. - # So, we need to confirm this. 
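
The login flow consolidated into LyndaBaseIE above has two steps: POST the credentials, then, if the `loginResultJson` blob in the response reports a 'conflicted' session, POST a confirmation form that logs the other device out. A schematic of that state handling; the transport stub and its responses are invented:

```python
import json
import re

def post_login(form):
    # Invented stand-in for the compat_urllib_request round-trip.
    if form.get('resolve') == 'true':
        return 'isLoggedIn: true'
    return "loginResultJson = '{\"state\": \"conflicted\"}';"

login_page = post_login({'username': 'user', 'password': 'pw',
                         'remember': 'false', 'stayPut': 'false'})

m = re.search(r"loginResultJson = '(?P<json>[^']+)';", login_page)
if m is not None:
    state = json.loads(m.group('json'))['state']
    if state == 'notlogged':
        raise Exception('incorrect username and/or password')
    if state == 'conflicted':
        # Confirm: log out whatever other device holds the session.
        login_page = post_login({'username': '', 'password': '',
                                 'resolve': 'true', 'remember': 'false',
                                 'stayPut': 'false'})

assert re.search(r'isLoggedIn: true', login_page)
```
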
- if state == 'conflicted': - confirm_form = { - 'username': '', - 'password': '', - 'resolve': 'true', - 'remember': 'false', - 'stayPut': 'false', - } - request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) - login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device') - - if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: - raise ExtractorError('Unable to log in') - def _fix_subtitles(self, subs): srt = '' + seq_counter = 0 for pos in range(0, len(subs) - 1): seq_current = subs[pos] m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) @@ -155,8 +168,10 @@ class LyndaIE(InfoExtractor): continue appear_time = m_current.group('timecode') disappear_time = m_next.group('timecode') - text = seq_current['Caption'].lstrip() - srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) + text = seq_current['Caption'].strip() + if text: + seq_counter += 1 + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text) if srt: return srt @@ -169,7 +184,7 @@ class LyndaIE(InfoExtractor): return {} -class LyndaCourseIE(InfoExtractor): +class LyndaCourseIE(LyndaBaseIE): IE_NAME = 'lynda:course' IE_DESC = 'lynda.com online courses' @@ -182,35 +197,37 @@ class LyndaCourseIE(InfoExtractor): course_path = mobj.group('coursepath') course_id = mobj.group('courseid') - page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, - course_id, 'Downloading course JSON') + page = self._download_webpage( + 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, + course_id, 'Downloading course JSON') course_json = json.loads(page) if 'Status' in course_json and course_json['Status'] == 'NotFound': - raise ExtractorError('Course %s does not exist' % course_id, expected=True) + raise ExtractorError( + 'Course %s does not exist' % course_id, expected=True) unaccessible_videos = 0 videos = [] - (username, _) = self._get_login_info() # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore for chapter in course_json['Chapters']: for video in chapter['Videos']: - if username is None and video['HasAccess'] is False: + if video['HasAccess'] is False: unaccessible_videos += 1 continue videos.append(video['ID']) if unaccessible_videos > 0: - self._downloader.report_warning('%s videos are only available for members and will not be downloaded. ' - % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT) + self._downloader.report_warning( + '%s videos are only available for members (or paid members) and will not be downloaded. 
' + % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) entries = [ - self.url_result('http://www.lynda.com/%s/%s-4.html' % - (course_path, video_id), - 'Lynda') + self.url_result( + 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), + 'Lynda') for video_id in videos] course_title = course_json['Title'] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 1831c6749..21aea0c55 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..compat import ( @@ -10,7 +11,6 @@ from ..utils import ( ExtractorError, HEADRequest, str_to_int, - parse_iso8601, ) @@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor): 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', - 'upload_date': '20111115', - 'timestamp': 1321359578, 'thumbnail': 're:https?://.*\.jpg', 'view_count': int, 'like_count': int, @@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor): 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', - 'ext': 'm4a', - 'title': 'Electric Relaxation vol. 3', + 'ext': 'mp3', + 'title': 'Caribou 7 inch Vinyl Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', - 'uploader': 'Daniel Drumz', + 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', - 'thumbnail': 're:https?://.*\.jpg', + 'thumbnail': 're:https?://.*/images/', 'view_count': int, 'like_count': int, }, }] - def _get_url(self, track_id, template_url): - server_count = 30 - for i in range(server_count): - url = template_url % i + def _get_url(self, track_id, template_url, server_number): + boundaries = (1, 30) + for nr in server_numbers(server_number, boundaries): + url = template_url % nr try: # We only want to know if the request succeed # don't download the whole file self._request_webpage( HEADRequest(url), track_id, - 'Checking URL %d/%d ...' % (i + 1, server_count + 1)) + 'Checking URL %d/%d ...' 
% (nr, boundaries[-1])) return url except ExtractorError: pass - return None def _real_extract(self, url): @@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') song_url = preview_url.replace('/previews/', '/c/originals/') + server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number')) template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self._get_url(track_id, template_url) + final_song_url = self._get_url(track_id, template_url, server_number) if final_song_url is None: self.to_screen('Trying with m4a extension') template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - final_song_url = self._get_url(track_id, template_url) + final_song_url = self._get_url(track_id, template_url, server_number) if final_song_url is None: raise ExtractorError('Unable to extract track url') PREFIX = ( - r'<span class="play-button[^"]*?"' + r'm-play-on-spacebar[^>]+' r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') title = self._html_search_regex( PREFIX + r'm-title="([^"]+)"', webpage, 'title') @@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor): r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) like_count = str_to_int(self._search_regex( - [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"', - r'/favorites/?">([0-9]+)<'], + r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"', webpage, 'like count', fatal=False)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', r'/listeners/?">([0-9,.]+)</a>'], webpage, 'play count', fatal=False)) - timestamp = parse_iso8601(self._search_regex( - r'<time itemprop="dateCreated" datetime="([^"]+)">', - webpage, 'upload date', default=None)) return { 'id': track_id, @@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': uploader, 'uploader_id': uploader_id, - 'timestamp': timestamp, 'view_count': view_count, 'like_count': like_count, } + + +def server_numbers(first, boundaries): + """ Server numbers to try in descending order of probable availability. + Starting from first (i.e. the number of the server hosting the preview file) + and going further and further up to the higher boundary and down to the + lower one in an alternating fashion. Namely: + + server_numbers(2, (1, 5)) + + # Where the preview server is 2, min number is 1 and max is 5. + # Yields: 2, 3, 1, 4, 5 + + Why not random numbers or increasing sequences? Since from what I've seen, + full length files seem to be hosted on servers whose number is closer to + that of the preview; to be confirmed. 
+ """ + zip_longest = getattr(itertools, 'zip_longest', None) + if zip_longest is None: + # python 2.x + zip_longest = itertools.izip_longest + + if len(boundaries) != 2: + raise ValueError("boundaries should be a two-element tuple") + min, max = boundaries + highs = range(first + 1, max + 1) + lows = range(first - 1, min - 1, -1) + rest = filter( + None, itertools.chain.from_iterable(zip_longest(highs, lows))) + yield first + for n in rest: + yield n diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 1a241aca7..e369551c2 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -10,7 +10,7 @@ from ..utils import ( class MLBIE(InfoExtractor): - _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)' + _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)' _TESTS = [ { 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', @@ -80,6 +80,10 @@ class MLBIE(InfoExtractor): 'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553', 'only_matching': True, }, + { + 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'only_matching': True, + } ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3645d3033..ecd0ac8b1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -14,7 +14,7 @@ from ..utils import ( class NBCIE(InfoExtractor): - _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' _TESTS = [ { @@ -50,6 +50,57 @@ class NBCIE(InfoExtractor): return self.url_result(theplatform_url) +class NBCSportsVPlayerIE(InfoExtractor): + _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + + _TESTS = [{ + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } + }, { + 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + iframe_m = re.search( + r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + if iframe_m: + return iframe_m.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + theplatform_url = self._og_search_video_url(webpage) + return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): + # Does not include https becuase its certificate is invalid + _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + + _TEST = { + 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'info_dict': { + 'id': 'PHJSaFWbrTY9', + 'ext': 'flv', + 'title': 'Tom Izzo, Michigan St. 
has \'so much respect\' for Duke', + 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result( + NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + + class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 4c1890416..ddec7b338 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -22,7 +22,7 @@ class NiconicoIE(InfoExtractor): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' - _TEST = { + _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { @@ -39,9 +39,26 @@ class NiconicoIE(InfoExtractor): 'username': 'ydl.niconico@gmail.com', 'password': 'youtube-dl', }, - } + }, { + 'url': 'http://www.nicovideo.jp/watch/nm14296458', + 'md5': '8db08e0158457cf852a31519fceea5bc', + 'info_dict': { + 'id': 'nm14296458', + 'ext': 'swf', + 'title': '【鏡音リン】Dance on media【オリジナル】take2!', + 'description': 'md5:', + 'uploader': 'りょうた', + 'uploader_id': '18822557', + 'upload_date': '20110429', + 'duration': 209, + }, + 'params': { + 'username': 'ydl.niconico@gmail.com', + 'password': 'youtube-dl', + }, + }] - _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' + _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' # Determine whether the downloader used authentication to download video _AUTHENTICATED = False @@ -76,8 +93,7 @@ class NiconicoIE(InfoExtractor): return True def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) # Get video webpage. 
We are not actually interested in it, but need # the cookies in order to be able to download the info webpage @@ -90,7 +106,7 @@ class NiconicoIE(InfoExtractor): if self._AUTHENTICATED: # Get flv info flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') else: # Get external player info diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 9c01eb0af..5d8448571 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -219,7 +219,8 @@ class NPOLiveIE(NPOBaseIE): if streams: for stream in streams: stream_type = stream.get('type').lower() - if stream_type == 'ss': + # smooth streaming is not supported + if stream_type in ['ss', 'ms']: continue stream_info = self._download_json( 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' @@ -230,7 +231,10 @@ class NPOLiveIE(NPOBaseIE): stream_url = self._download_json( stream_info['stream'], display_id, 'Downloading %s URL' % stream_type, - transform_source=strip_jsonp) + 'Unable to download %s URL' % stream_type, + transform_source=strip_jsonp, fatal=False) + if not stream_url: + continue if stream_type == 'hds': f4m_formats = self._extract_f4m_formats(stream_url, display_id) # f4m downloader downloads only piece of live stream @@ -242,6 +246,7 @@ class NPOLiveIE(NPOBaseIE): else: formats.append({ 'url': stream_url, + 'preference': -10, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 1e4cfa2e7..e91d3a248 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,46 +14,48 @@ from ..utils import ( class NRKIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})' + _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' _TESTS = [ { - 'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/', - 'md5': 'a6eac35052f3b242bb6bb7f43aed5886', + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'bccd850baebefe23b56d708a113229c2', 'info_dict': { 'id': '150533', 'ext': 'flv', 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f' + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 263, } }, { - 'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/', - 'md5': '3471f2a51718195164e88f46bf427668', + 'url': 'http://www.nrk.no/video/PS*154915', + 'md5': '0b1493ba1aae7d9579a5ad5531bc395a', 'info_dict': { 'id': '154915', 'ext': 'flv', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, } }, ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - page = self._download_webpage(url, video_id) - - video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id') + video_id = self._match_id(url) data = self._download_json( - 'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') + 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, + video_id, 'Downloading media JSON') if data['usageRights']['isGeoBlocked']: - raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True) + raise ExtractorError( + 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', + 
expected=True) + + video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' - video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' + duration = parse_duration(data.get('duration')) images = data.get('images') if images: @@ -69,10 +71,51 @@ class NRKIE(InfoExtractor): 'ext': 'flv', 'title': data['title'], 'description': data['description'], + 'duration': duration, 'thumbnail': thumbnail, } +class NRKPlaylistIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'info_dict': { + 'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', + 'title': 'Gjenopplev den historiske solformørkelsen', + 'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', + 'info_dict': { + 'id': 'rivertonprisen-til-karin-fossum-1.12266449', + 'title': 'Rivertonprisen til Karin Fossum', + 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', + }, + 'playlist_count': 5, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('nrk:%s' % video_id, 'NRK') + for video_id in re.findall( + r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"', + webpage) + ] + + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + + class NRKTVIE(InfoExtractor): _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
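
The new NRKPlaylistIE above builds its playlist by scanning article markup for elements that carry a `rich` class and a `data-video-id` attribute; each id is handed back to NRKIE through the `nrk:` prefix this patch adds to that extractor's `_VALID_URL`. A standalone run of the scan over made-up markup:

```python
import re

# Made-up article markup in the shape NRKPlaylistIE scans for:
# only elements with a "rich" class and a data-video-id attribute count.
webpage = '''
<div class="rich video" data-video-id="150533"></div>
<div class="teaser plain"></div>
<div class="content rich" data-video-id="154915"></div>
'''

entries = [
    'nrk:%s' % video_id
    for video_id in re.findall(
        r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
        webpage)
]
print(entries)  # ['nrk:150533', 'nrk:154915']
```

Each `nrk:<id>` entry is then resolvable on its own precisely because NRKIE now accepts the `nrk:` scheme.
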
@@ -149,9 +192,6 @@ class NRKTVIE(InfoExtractor): } ] - def _seconds2str(self, s): - return '%02d:%02d:%02d.%03d' % (s / 3600, (s % 3600) / 60, s % 60, (s % 1) * 1000) - def _debug_print(self, txt): if self._downloader.params.get('verbose', False): self.to_screen('[debug] %s' % txt) @@ -168,8 +208,8 @@ class NRKTVIE(InfoExtractor): for pos, p in enumerate(ps): begin = parse_duration(p.get('begin')) duration = parse_duration(p.get('dur')) - starttime = self._seconds2str(begin) - endtime = self._seconds2str(begin + duration) + starttime = self._subtitles_timecode(begin) + endtime = self._subtitles_timecode(begin + duration) srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text) return {lang: [ {'ext': 'ttml', 'url': url}, diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 56e1cad3b..03f0a4de6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,15 +1,17 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, +) class NYTimesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', 'md5': '18a525a510f942ada2720db5f31644c0', 'info_dict': { @@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor): 'uploader': 'Brett Weiner', 'duration': 419, } - } + }, { + 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON') + 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, + video_id, 'Downloading video JSON') title = video_data['headline'] - description = video_data['summary'] - duration = video_data['duration'] / 1000.0 + description = video_data.get('summary') + duration = float_or_none(video_data.get('duration'), 1000) uploader = video_data['byline'] timestamp = parse_iso8601(video_data['publication_date'][:-8]) @@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor): formats = [ { 'url': video['url'], - 'format_id': video['type'], - 'vcodec': video['video_codec'], - 'width': video['width'], - 'height': video['height'], - 'filesize': get_file_size(video['fileSize']), + 'format_id': video.get('type'), + 'vcodec': video.get('video_codec'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': get_file_size(video.get('fileSize')), } for video in video_data['renditions'] ] self._sort_formats(formats) @@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor): thumbnails = [ { 'url': 'http://www.nytimes.com/%s' % image['url'], - 'resolution': '%dx%d' % (image['width'], image['height']), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), } for image in video_data['images'] ] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 4e293392b..ca1a5bb3c 100644 --- a/youtube_dl/extractor/orf.py +++ 
b/youtube_dl/extractor/orf.py @@ -11,6 +11,11 @@ from ..utils import ( HEADRequest, unified_strdate, ExtractorError, + strip_jsonp, + int_or_none, + float_or_none, + determine_ext, + remove_end, ) @@ -197,3 +202,92 @@ class ORFFM4IE(InfoExtractor): 'description': data['subtitle'], 'entries': entries } + + +class ORFIPTVIE(InfoExtractor): + IE_NAME = 'orf:iptv' + IE_DESC = 'iptv.ORF.at' + _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://iptv.orf.at/stories/2267952', + 'md5': '26ffa4bab6dbce1eee78bbc7021016cd', + 'info_dict': { + 'id': '339775', + 'ext': 'flv', + 'title': 'Kreml-Kritiker Nawalny wieder frei', + 'description': 'md5:6f24e7f546d364dacd0e616a9e409236', + 'duration': 84.729, + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150306', + }, + } + + def _real_extract(self, url): + story_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://iptv.orf.at/stories/%s' % story_id, story_id) + + video_id = self._search_regex( + r'data-video(?:id)?="(\d+)"', webpage, 'video id') + + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['default'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index a20672c0c..46cebc0d7 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url class PhoenixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } - } + _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ + (?: + phoenix/die_sendungen/(?:[^/]+/)? + )? 
+ (?P<id>[0-9]+)''' + _TESTS = [ + { + 'url': 'http://www.phoenix.de/content/884301', + 'md5': 'ed249f045256150c92e72dbb70eadec6', + 'info_dict': { + 'id': '884301', + 'ext': 'mp4', + 'title': 'Michael Krons mit Hans-Werner Sinn', + 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', + 'upload_date': '20141025', + 'uploader': 'Im Dialog', + } + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', + 'only_matching': True, + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', + 'only_matching': True, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 000000000..abde34b94 --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + xpath_text, + qualities, +) + + +class PladformIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?: + out\.pladform\.ru/player| + static\.pladform\.ru/player\.swf + ) + \?.*\bvideoid=| + video\.pladform\.ru/catalog/video/videoid/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + # http://muz-tv.ru/kinozal/view/7400/ + 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', + 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'info_dict': { + 'id': '100183293', + 'ext': 'mp4', + 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', + 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 694, + 'age_limit': 0, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_xml( + 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, + video_id) + + if video.tag == 'error': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, video.text), + expected=True) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = [{ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + } for src in video.findall('./src')] + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', fatal=True) + description = self._search_regex( + r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index 9576aed0e..e766ccca3 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -4,85 +4,72 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( 
- compat_urllib_parse, - compat_urllib_request, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - float_or_none, int_or_none, - str_to_int, + parse_iso8601, ) class PlayFMIE(InfoExtractor): IE_NAME = 'play.fm' - _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' _TEST = { - 'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', 'md5': 'c505f8307825a245d0c7ad1850001f22', 'info_dict': { - 'id': '137220', + 'id': '71276', 'ext': 'mp3', - 'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'uploader': 'Sven Tasnadi', - 'uploader_id': 'sventasnadi', - 'duration': 5627.428, - 'upload_date': '20140712', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', 'view_count': int, 'comment_count': int, - 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - upload_date = mobj.group('upload_date') - - rec_data = compat_urllib_parse.urlencode({'rec_id': video_id}) - req = compat_urllib_request.Request( - 'http://www.play.fm/flexRead/recording', data=rec_data) - req.add_header('Content-Type', 'application/x-www-form-urlencoded') - rec_doc = self._download_xml(req, video_id) + slug = mobj.group('slug') - error_node = rec_doc.find('./error') - if error_node is not None: - raise ExtractorError('An error occured: %s (code %s)' % ( - error_node.text, rec_doc.find('./status').text)) + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - recording = rec_doc.find('./recording') - title = recording.find('./title').text - view_count = str_to_int(recording.find('./stats/playcount').text) - comment_count = str_to_int(recording.find('./stats/comments').text) - duration = float_or_none(recording.find('./duration').text, scale=1000) - thumbnail = recording.find('./image').text + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('message')), + expected=True) - artist = recording.find('./artists/artist') - uploader = artist.find('./name').text - uploader_id = artist.find('./slug').text - - video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % ( - 'http:', recording.find('./url').text, - recording.find('./_class').text, recording.find('./file_id').text, - rec_doc.find('./uuid').text, video_id, - rec_doc.find('./jingle/file_id').text, - 'http%3A%2F%2Fwww.play.fm%2Fplayer', - ) + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] 
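
To summarize the play.fm rewrite above: the slug taken from the page URL addresses the v2 JSON API directly, an embedded `error` object aborts extraction early, and the remaining fields map straight into the info dict returned just below. A compressed sketch against a made-up miniature of the API payload:

```python
recordings = {  # made-up miniature of http://v2api.play.fm/recordings/slug/<slug>
    'id': 71276,
    'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
    'audio': 'http://example.com/audio.mp3',  # hypothetical media URL
    'recordingDuration': 5627,
    'page': {'id': 71170, 'title': 'Dan Drastic'},
    'tags': [{'name': 'Techno'}, {'name': None}],
}

error = recordings.get('error')
if isinstance(error, dict):  # the API reports failures as an embedded object
    raise Exception('play.fm returned error: %s' % error.get('message'))

info = {
    'id': str(recordings['id']),
    'url': recordings['audio'],
    'title': recordings['title'],
    'duration': recordings.get('recordingDuration'),
    'uploader': recordings.get('page', {}).get('title'),
    'uploader_id': str(recordings.get('page', {}).get('id')),
    'categories': [t['name'] for t in recordings.get('tags', []) if t.get('name')],
}
print(info['uploader'], info['categories'])  # Dan Drastic ['Techno']
```
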
return { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', - 'filesize': int_or_none(recording.find('./size').text), + 'url': audio_url, 'title': title, - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, + 'description': description, 'duration': duration, - 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py new file mode 100644 index 000000000..bdc71017b --- /dev/null +++ b/youtube_dl/extractor/playwire.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + float_or_none, + int_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': 're:^http://.*\.png$', + 'duration': 145.94, + }, + }, { + 'url': 'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + f4m = self._download_xml(src, video_id) + base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) + formats = [] + for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): + media_url = media.get('url') + if not media_url: + continue + tbr = int_or_none(media.get('bitrate')) + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + f = { + 'url': '%s/%s' % (base_url, media.attrib['url']), + 'tbr': tbr, + 'width': width, + 'height': height, + } + if not (tbr or width or height): + f['quality'] = 1 if '-hd.' 
in media_url else 0 + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3a27e3789..0c8b731cf 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor): } def _extract_count(self, pattern, webpage, name): - count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) - if count: - count = str_to_int(count) - return count + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): video_id = self._match_id(url) @@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor): if thumbnail: thumbnail = compat_urllib_parse.unquote(thumbnail) - view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') - like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') - dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') + like_count = self._extract_count( + r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count( + r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') comment_count = self._extract_count( - r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py new file mode 100644 index 000000000..01cc3d9ea --- /dev/null +++ b/youtube_dl/extractor/primesharetv.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ExtractorError + + +class PrimeShareTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' + + _TEST = { + 'url': 'http://primeshare.tv/download/238790B611', + 'md5': 'b92d9bf5461137c36228009f31533fbc', + 'info_dict': { + 'id': '238790B611', + 'ext': 'mp4', + 'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>File not exist<' in webpage: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + fields = dict(re.findall(r'''(?x)<input\s+ + type="hidden"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? 
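+            # an optional id attribute may sit between name and value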
+ value="([^"]*)" + ''', webpage)) + + headers = { + 'Referer': url, + 'Content-Type': 'application/x-www-form-urlencoded', + } + + wait_time = int(self._search_regex( + r'var\s+cWaitTime\s*=\s*(\d+)', + webpage, 'wait time', default=7)) + 1 + self._sleep(wait_time, video_id) + + req = compat_urllib_request.Request( + url, compat_urllib_parse.urlencode(fields), headers) + video_page = self._download_webpage( + req, video_id, 'Downloading video page') + + video_url = self._search_regex( + r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", + video_page, 'video url') + + title = self._html_search_regex( + r'<h1>Watch\s*(?: )?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?: )?\s*<strong>', + video_page, 'title') + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': 'mp4', + } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 846b76c81..d6054d717 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,17 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', + 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 'info_dict': { 'id': '66418', 'ext': 'mp4', - "title": "Sucked on a toilet", - "age_limit": 18, + 'title': 'Sucked on a toilet', + 'age_limit': 18, } } @@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): + raise ExtractorError('Video %s has been removed' % video_id, expected=True) + video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index b42442d12..13f071077 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -127,6 +127,47 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) +class RTVEInfantilIE(InfoExtractor): + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '915319587b33720b8e0357caaa6617e6', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'duration': 357.958, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + + webpage = self._download_webpage(url, video_id) + vidplayer_id = self._search_regex( + r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') + + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + + return { + 'id': video_id, + 'ext': 'mp4', + 'title': info['title'], + 'url': video_url, + 'thumbnail': info.get('image'), + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' diff 
--git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 000000000..10251f29e --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,157 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE + +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, + smuggle_url, + std_headers, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' + _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + # We only need to log in once for courses or individual videos + if not self.LOGGED_IN: + self._login() + SafariBaseIE.LOGGED_IN = True + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + raise ExtractorError( + self._ACCOUNT_CREDENTIALS_HINT, + expected=True) + + headers = std_headers + if 'Referer' not in headers: + headers['Referer'] = self._LOGIN_URL + + login_page = self._download_webpage( + self._LOGIN_URL, None, + 'Downloading login form') + + csrf = self._html_search_regex( + r"name='csrfmiddlewaretoken'\s+value='([^']+)'", + login_page, 'csrf token') + + login_form = { + 'csrfmiddlewaretoken': csrf, + 'email': username, + 'password1': password, + 'login': 'Sign In', + 'next': '', + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers) + login_page = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + raise ExtractorError( + 'Login failed; make sure your credentials are correct and try again.', + expected=True) + + self.to_screen('Login successful') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x)https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book + )/ + (?P<course_id>\d+)/ + (?:chapter(?:-content)?/)? 
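+                        # optional 'chapter/' or 'chapter-content/' path segment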
+ (?P<part>part\d+)\.html + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', + 'info_dict': { + 'id': '2842601850001', + 'ext': 'mp4', + 'title': 'Introduction', + }, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + part = mobj.group('part') + + webpage = self._download_webpage( + '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), + part) + + bc_url = BrightcoveIE._extract_brightcove_url(webpage) + if not bc_url: + raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + + return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, 'Safari') + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 9f79ff5c1..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=', + r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index c04791997..11edf616a 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -4,22 +4,87 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .common import compat_str +from ..compat import ( + compat_str, + compat_urllib_request +) +from ..utils import sanitize_url_path_consecutive_slashes class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' 
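    # (?(mytv)|n) is a regex conditional: when the 'mytv' group matched, the id
    # follows the path directly; on plain tv.sohu.com URLs it carries an 'n'
    # prefix, e.g. (URLs taken from the tests below):
    #   http://tv.sohu.com/20130724/n382479172.shtml      -> id '382479172'
    #   http://my.tv.sohu.com/us/232799889/78693464.shtml -> id '78693464'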
- _TEST = { + _TESTS = [{ + 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', + 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', 'title': 'MV:Far East Movement《The Illest》', }, - 'skip': 'Only available from China', - } + 'params': { + 'cn_verification_proxy': 'proxy.uku.im:8888' + } + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'md5': '699060e75cf58858dd47fb9c03c42cfb', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'md5': '9bf34be48f2f4dadcb226c74127e203c', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + }, + 'playlist': [{ + 'md5': 'bdbfb8f39924725e6589c146bc1883ad', + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'md5': '8407e634175fdac706766481b9443450', + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dl testing video', + }, + 'params': { + 'skip_download': True + } + }] def _real_extract(self, url): @@ -29,8 +94,14 @@ class SohuIE(InfoExtractor): else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + req = compat_urllib_request.Request(base_data_url + vid_id) + + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + req.add_header('Ytdl-request-proxy', cn_verification_proxy) + return self._download_json( - base_data_url + vid_id, video_id, + req, video_id, 'Downloading JSON data for %s' % vid_id) mobj = re.match(self._VALID_URL, url) @@ -38,10 +109,8 @@ class SohuIE(InfoExtractor): mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) - raw_title = self._html_search_regex( - r'(?s)<title>(.+?)</title>', - webpage, 'video title') - title = raw_title.partition('-')[0].strip() + + title = self._og_search_title(webpage) vid = self._html_search_regex( r'var vid ?= ?["\'](\d+)["\']', @@ -77,7 +146,9 @@ class SohuIE(InfoExtractor): % (format_id, i + 1, part_count)) part_info = part_str.split('|') - video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + video_url = sanitize_url_path_consecutive_slashes( + '%s%s?key=%s' % (part_info[0], su[i], part_info[3])) formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index c5284fa67..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -180,7 +180,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': key, 'url': url, 'play_path': 'mp3:' + path, - 'ext': ext, + 'ext': 'flv', 'vcodec': 'none', }) @@ -200,8 +200,9 @@ class SoundcloudIE(InfoExtractor): if f['format_id'].startswith('rtmp'): f['protocol'] = 'rtmp' - self._sort_formats(formats) - 
result['formats'] = formats + self._check_formats(formats, track_id) + self._sort_formats(formats) + result['formats'] = formats return result @@ -241,7 +242,7 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -286,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' IE_NAME = 'soundcloud:user' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py new file mode 100644 index 000000000..13101c714 --- /dev/null +++ b/youtube_dl/extractor/ssa.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + parse_duration, +) + + +class SSAIE(InfoExtractor): + _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' + _TEST = { + 'url': 'http://ssa.nls.uk/film/3561', + 'info_dict': { + 'id': '3561', + 'ext': 'flv', + 'title': 'SHETLAND WOOL', + 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', + 'duration': 900, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + streamer = self._search_regex( + r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') + play_path = self._search_regex( + r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + + def search_field(field_name, fatal=False): + return self._search_regex( + r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, + webpage, 'title', fatal=fatal) + + title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') + description = unescapeHTML(search_field('Description')) + duration = parse_duration(search_field('Running time')) + thumbnail = self._search_regex( + r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + + return { + 'id': video_id, + 'url': streamer, + 'play_path': play_path, + 'ext': 'flv', + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 5793dbc10..a46a7ecba 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -53,10 +53,10 @@ class TeamcocoIE(InfoExtractor): embed = self._download_webpage( embed_url, video_id, 'Downloading embed page') - encoded_data = self._search_regex( - r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') + player_data = self._parse_json(self._search_regex( + r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id) data = self._parse_json( - base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) + base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id) formats = [] get_quality = 
qualities(['500k', '480p', '1000k', '720p', '1080p']) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor): error_msg = next( n.attrib['abstract'] for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction') + if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') except StopIteration: pass else: diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 9a53a3c74..e83e31a31 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -16,6 +16,7 @@ class TVPlayIE(InfoExtractor): _VALID_URL = r'''(?x)http://(?:www\.)? (?:tvplay\.lv/parraides| tv3play\.lt/programos| + play\.tv3\.lt/programos| tv3play\.ee/sisu| tv3play\.se/program| tv6play\.se/program| @@ -45,7 +46,7 @@ }, }, { - 'url': 'http://www.tv3play.lt/programos/moterys-meluoja-geriau/409229?autostart=true', + 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true', 'info_dict': { 'id': '409229', 'ext': 'flv', diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py new file mode 100644 index 000000000..d6c0ab184 --- /dev/null +++ b/youtube_dl/extractor/twentytwotracks.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + +# 22Tracks regularly replace the audio tracks that can be streamed on their +# site. The tracks usually expire after 1 month, so we can't add tests. + + +class TwentyTwoTracksIE(InfoExtractor): + _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' + IE_NAME = '22tracks:track' + + _API_BASE = 'http://22tracks.com/api' + + def _extract_info(self, city, genre_name, track_id=None): + item_id = track_id if track_id else genre_name + + cities = self._download_json( + '%s/cities' % self._API_BASE, item_id, + 'Downloading cities info', + 'Unable to download cities info') + city_id = [x['id'] for x in cities if x['slug'] == city][0] + + genres = self._download_json( + '%s/genres/%s' % (self._API_BASE, city_id), item_id, + 'Downloading %s genres info' % city, + 'Unable to download %s genres info' % city) + genre = [x for x in genres if x['slug'] == genre_name][0] + genre_id = genre['id'] + + tracks = self._download_json( + '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, + 'Downloading %s genre tracks info' % genre_name, + 'Unable to download track info') + + return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] + + def _get_track_url(self, filename, track_id): + token = self._download_json( + 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, + track_id, 'Downloading token', 'Unable to download token') + return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) + + def _extract_track_info(self, track_info, track_id): + download_url = self._get_track_url(track_info['filename'], track_id) + title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) + return { + 'id': track_id, + 'url': download_url, + 'ext': 'mp3', + 'title': title, + 'duration': int_or_none(track_info.get('duration')), + 'timestamp': int_or_none(track_info.get('published_at') or
track_info.get('created')) + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + city = mobj.group('city') + genre = mobj.group('genre') + track_id = mobj.group('id') + + track_info = self._extract_info(city, genre, track_id) + return self._extract_track_info(track_info, track_id) + + +class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): + _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' + IE_NAME = '22tracks:genre' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + city = mobj.group('city') + genre = mobj.group('genre') + + genre_title, tracks = self._extract_info(city, genre) + + entries = [ + self._extract_track_info(track_info, track_info['id']) + for track_info in tracks] + + return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4b0ce54df..94bd6345d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -23,6 +23,8 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' _LOGIN_URL = 'https://secure.twitch.tv/user/login' + _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login' + _NETRC_MACHINE = 'twitch' def _handle_error(self, response): if not isinstance(response, dict): @@ -66,14 +68,14 @@ class TwitchBaseIE(InfoExtractor): 'authenticity_token': authenticity_token, 'redirect_on_login': '', 'embed_form': 'false', - 'mp_source_action': '', + 'mp_source_action': 'login-button', 'follow': '', - 'user[login]': username, - 'user[password]': password, + 'login': username, + 'password': password, } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -84,6 +86,14 @@ class TwitchBaseIE(InfoExtractor): raise ExtractorError( 'Unable to login: %s' % m.group('msg').strip(), expected=True) + def _prefer_source(self, formats): + try: + source = next(f for f in formats if f['format_id'] == 'Source') + source['preference'] = 10 + except StopIteration: + pass # No Source stream present + self._sort_formats(formats) + class TwitchItemBaseIE(TwitchBaseIE): def _download_info(self, item, item_id): @@ -139,7 +149,7 @@ class TwitchItemBaseIE(TwitchBaseIE): class TwitchVideoIE(TwitchItemBaseIE): IE_NAME = 'twitch:video' - _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE _ITEM_TYPE = 'video' _ITEM_SHORTCUT = 'a' @@ -155,7 +165,7 @@ class TwitchVideoIE(TwitchItemBaseIE): class TwitchChapterIE(TwitchItemBaseIE): IE_NAME = 'twitch:chapter' - _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE _ITEM_TYPE = 'chapter' _ITEM_SHORTCUT = 'c' @@ -174,7 +184,7 @@ class TwitchChapterIE(TwitchItemBaseIE): class TwitchVodIE(TwitchItemBaseIE): IE_NAME = 'twitch:vod' - _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/[^/]+/v/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE _ITEM_TYPE = 'vod' _ITEM_SHORTCUT = 'v' @@ -208,6 +218,7 @@ class TwitchVodIE(TwitchItemBaseIE): '%s/vod/%s?nauth=%s&nauthsig=%s' % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']), 
item_id, 'mp4') + self._prefer_source(formats) info['formats'] = formats return info @@ -348,21 +359,14 @@ class TwitchStreamIE(TwitchBaseIE): 'p': random.randint(1000000, 10000000), 'player': 'twitchweb', 'segment_preference': '4', - 'sig': access_token['sig'], - 'token': access_token['token'], + 'sig': access_token['sig'].encode('utf-8'), + 'token': access_token['token'].encode('utf-8'), } - formats = self._extract_m3u8_formats( '%s/api/channel/hls/%s.m3u8?%s' - % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')), + % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)), channel_id, 'mp4') - - # prefer the 'source' stream, the others are limited to 30 fps - def _sort_source(f): - if f.get('m3u8_media') is not None and f['m3u8_media'].get('NAME') == 'Source': - return 1 - return 0 - formats = sorted(formats, key=_sort_source) + self._prefer_source(formats) view_count = stream.get('viewers') timestamp = parse_iso8601(stream.get('created_at')) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py new file mode 100644 index 000000000..96c809eaf --- /dev/null +++ b/youtube_dl/extractor/ultimedia.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, + clean_html, +) + + +class UltimediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' + _TESTS = [{ + # news + 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', + 'md5': '276a0e49de58c7e85d32b057837952a2', + 'info_dict': { + 'id': 's8uk0r', + 'ext': 'mp4', + 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', + 'description': 'md5:3e5c8fd65791487333dda5db8aed32af', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20150317', + }, + }, { + # music + 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', + 'md5': '2ea3513813cf230605c7e2ffe7eca61c', + 'info_dict': { + 'id': 'xvpfp8', + 'ext': 'mp4', + 'title': "Two - C'est la vie (Clip)", + 'description': 'Two', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20150224', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + deliver_url = self._search_regex( + r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', + webpage, 'deliver URL') + + deliver_page = self._download_webpage( + deliver_url, video_id, 'Downloading iframe page') + + if '>This video is currently not available' in deliver_page: + raise ExtractorError( + 'Video %s is currently not available' % video_id, expected=True) + + player = self._parse_json( + self._search_regex( + r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), + video_id) + + quality = qualities(['flash', 'html5']) + formats = [] + for mode in player['modes']: + video_url = mode.get('config', {}).get('file') + if not video_url: + continue + if re.match(r'https?://www\.youtube\.com/.+?', video_url): + return self.url_result(video_url, 'Youtube') + formats.append({ + 'url': video_url, + 'format_id': mode.get('type'), + 'quality': quality(mode.get('type')), + }) + self._sort_formats(formats) + + thumbnail = player.get('image') + + title = clean_html(( + self._html_search_regex( + r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', + webpage, 'title', default=None) or + 
self._search_regex( + r"var\s+nameVideo\s*=\s*'([^']+)'", + deliver_page, 'title'))) + + description = clean_html(self._html_search_regex( + r'(?s)<span>Description</span>(.+?)</p>', webpage, + 'description', fatal=False)) + + upload_date = unified_strdate(self._search_regex( + r'Ajouté le\s*<span>([^<]+)', webpage, + 'upload date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py new file mode 100644 index 000000000..9369abaf8 --- /dev/null +++ b/youtube_dl/extractor/varzesh3.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Varzesh3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' + _TEST = { + 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', + 'md5': '2a933874cb7dce4366075281eb49e855', + 'info_dict': { + 'id': '76337', + 'ext': 'mp4', + 'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', + 'description': 'فصل ۲۰۱۵-۲۰۱۴', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r'<source[^>]+src="([^"]+)"', webpage, 'video url') + + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'(?s)<div class="matn">(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + + video_id = self._search_regex( + r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", + webpage, display_id, default=display_id) + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py new file mode 100644 index 000000000..6215f0642 --- /dev/null +++ b/youtube_dl/extractor/vessel.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class VesselIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' + _LOGIN_URL = 'https://www.vessel.com/api/account/login' + _NETRC_MACHINE = 'vessel' + _TEST = { + 'url': 'https://www.vessel.com/videos/HDN7G5UMs', + 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', + 'info_dict': { + 'id': 'HDN7G5UMs', + 'ext': 'mp4', + 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150317', + 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', + 'timestamp': int, + }, + } + + @staticmethod + def make_json_request(url, data): + payload = json.dumps(data).encode('utf-8') + req = compat_urllib_request.Request(url, payload) + req.add_header('Content-Type', 'application/json; charset=utf-8') + return req + + @staticmethod + def find_assets(data, asset_type): 
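+        # generator: yield every entry of data['assets'] whose 'type' matches
+        # the requested asset_type ('video' for streams, 'image' for thumbnails)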
+ for asset in data.get('assets', []): + if asset.get('type') == asset_type: + yield asset + + def _check_access_rights(self, data): + access_info = data.get('__view', {}) + if not access_info.get('allow_access', True): + err_code = access_info.get('error_code') or '' + if err_code == 'ITEM_PAID_ONLY': + raise ExtractorError( + 'This video requires subscription.', expected=True) + else: + raise ExtractorError( + 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + data = { + 'client_id': 'web', + 'type': 'password', + 'user_key': username, + 'password': password, + } + login_request = VesselIE.make_json_request(self._LOGIN_URL, data) + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) + asset_id = data['model']['data']['id'] + + req = VesselIE.make_json_request( + self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) + data = self._download_json(req, video_id) + + self._check_access_rights(data) + + try: + video_asset = next(VesselIE.find_assets(data, 'video')) + except StopIteration: + raise ExtractorError('No video assets found') + + formats = [] + for f in video_asset.get('sources', []): + if f['name'] == 'hls-index': + formats.extend(self._extract_m3u8_formats( + f['location'], video_id, ext='mp4', m3u8_id='m3u8')) + else: + formats.append({ + 'format_id': f['name'], + 'tbr': f.get('bitrate'), + 'height': f.get('height'), + 'width': f.get('width'), + 'url': f['location'], + }) + self._sort_formats(formats) + + thumbnails = [] + for im_asset in VesselIE.find_assets(data, 'image'): + thumbnails.append({ + 'url': im_asset['location'], + 'width': im_asset.get('width', 0), + 'height': im_asset.get('height', 0), + }) + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'thumbnails': thumbnails, + 'description': data.get('short_description'), + 'duration': data.get('duration'), + 'comment_count': data.get('comment_count'), + 'like_count': data.get('like_count'), + 'view_count': data.get('view_count'), + 'timestamp': parse_iso8601(data.get('released_at')), + } diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 273030316..eb309a7cd 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, - remove_start, -) +from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?:www\.)?videomega\.tv/ - (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+) + (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+) ''' _TEST = { - 'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ', + 'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4', 'md5': 'bf5c2f95c4c917536e80936af7bc51e1', 'info_dict': { - 'id': 'QR0HCUHI1661IHUCH0RQ', + 'id': '4GNA688SU99US886ANG4', 'ext': 'mp4', - 'title': 'Big Buck Bunny', + 'title': 'BigBuckBunny_320x180', 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -33,34 +26,24 @@ class 
VideoMegaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id) + iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id req = compat_urllib_request.Request(iframe_url) req.add_header('Referer', url) webpage = self._download_webpage(req, video_id) - try: - escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1] - except IndexError: - raise ExtractorError('Unable to extract escaped data') - - playlist = compat_urllib_parse.unquote(escaped_data) - + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title') + title = re.sub( + r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) - video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') - title = remove_start(self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - }] - self._sort_formats(formats) + r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + video_url = self._search_regex( + r'<source[^>]+?src="([^"]+)"', webpage, 'video URL') return { 'id': video_id, 'title': title, - 'formats': formats, + 'url': video_url, 'thumbnail': thumbnail, 'http_headers': { 'Referer': iframe_url, diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 5c89824c1..bd953fb4c 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -28,12 +26,11 @@ class VidmeIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r'<source src="([^"]+)"', webpage, 'video URL') + video_url = self._html_search_regex( + r'<source src="([^"]+)"', webpage, 'video URL') title = self._og_search_title(webpage) description = self._og_search_description(webpage, default='') @@ -44,13 +41,10 @@ class VidmeIE(InfoExtractor): duration = float_or_none(self._html_search_regex( r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) view_count = str_to_int(self._html_search_regex( - r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) + r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', webpage, 'like count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">', - webpage, 'comment count', fatal=False)) return { 'id': video_id, @@ -64,5 +58,4 @@ class VidmeIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'like_count': like_count, - 'comment_count': comment_count, } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py new file mode 100644 index 000000000..1742e66f4 --- /dev/null +++ b/youtube_dl/extractor/viewster.py @@ -0,0 +1,129 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_request + + +class ViewsterIE(InfoExtractor): + _VALID_URL = 
r'http://(?:www\.)?viewster\.com/movie/(?P<id>\d+-\d+-\d+)' + _TESTS = [{ + # movielink, paymethod=fre + 'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/', + 'playlist': [{ + 'md5': '8f9d94b282d80c42b378dffdbb11caf3', + 'info_dict': { + 'id': '1293-19341-000-movie', + 'ext': 'flv', + 'title': "'Hout' (Wood) - Movie", + }, + }], + 'info_dict': { + 'id': '1293-19341-000', + 'title': "'Hout' (Wood)", + 'description': 'md5:925733185a9242ef96f436937683f33b', + } + }, { + # movielink, paymethod=adv + 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', + 'playlist': [{ + 'md5': '77a005453ca7396cbe3d35c9bea30aef', + 'info_dict': { + 'id': '1140-11855-000-movie', + 'ext': 'flv', + 'title': "THE LISTENING PROJECT - Movie", + }, + }], + 'info_dict': { + 'id': '1140-11855-000', + 'title': "THE LISTENING PROJECT", + 'description': 'md5:714421ae9957e112e672551094bf3b08', + } + }, { + # direct links, no movielink + 'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/', + 'playlist': [{ + 'md5': '0307b7eac6bfb21ab0577a71f6eebd8f', + 'info_dict': { + 'id': '1198-56411-000-trailer', + 'ext': 'mp4', + 'title': "Sinister - Trailer", + }, + }, { + 'md5': '80b9ee3ad69fb368f104cb5d9732ae95', + 'info_dict': { + 'id': '1198-56411-000-behind-scenes', + 'ext': 'mp4', + 'title': "Sinister - Behind Scenes", + }, + }, { + 'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5', + 'info_dict': { + 'id': '1198-56411-000-scene-from-movie', + 'ext': 'mp4', + 'title': "Sinister - Scene from movie", + }, + }], + 'info_dict': { + 'id': '1198-56411-000', + 'title': "Sinister", + 'description': 'md5:014c40b0488848de9683566a42e33372', + } + }] + + _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' + + def _real_extract(self, url): + video_id = self._match_id(url) + + request = compat_urllib_request.Request( + 'http://api.live.viewster.com/api/v1/movie/%s' % video_id) + request.add_header('Accept', self._ACCEPT_HEADER) + + movie = self._download_json( + request, video_id, 'Downloading movie metadata JSON') + + title = movie.get('title') or movie['original_title'] + description = movie.get('synopsis') + thumbnail = movie.get('large_artwork') or movie.get('artwork') + + entries = [] + for clip in movie['play_list']: + entry = None + + # movielink api + link_request = clip.get('link_request') + if link_request: + request = compat_urllib_request.Request( + 'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s¤cy=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s' + % link_request) + request.add_header('Accept', self._ACCEPT_HEADER) + + movie_link = self._download_json( + request, video_id, 'Downloading movie link JSON', fatal=False) + + if movie_link: + formats = self._extract_f4m_formats( + movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id) + self._sort_formats(formats) + entry = { + 'formats': formats, + } + + # direct link + clip_url = clip.get('clip_data', {}).get('url') + if clip_url: + entry = { + 'url': clip_url, + 'ext': 'mp4', + } + + if entry: + entry.update({ + 'id': '%s-%s' % (video_id, clip['canonical_title']), + 'title': '%s - %s' % (title, clip['title']), + }) + entries.append(entry) + + playlist = self.playlist_result(entries, video_id, title, description) + playlist['thumbnail'] = thumbnail + return playlist diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8f540f578..28bcc89cd 100644 --- 
a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import json import re import itertools -import hashlib from .common import InfoExtractor from ..compat import ( @@ -20,6 +19,7 @@ from ..utils import ( RegexNotFoundError, smuggle_url, std_headers, + unified_strdate, unsmuggle_url, urlencode_postdata, ) @@ -38,7 +38,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() login_url = 'https://vimeo.com/log_in' webpage = self._download_webpage(login_url, None, False) - token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') + token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token') data = urlencode_postdata({ 'email': username, 'password': password, @@ -140,6 +140,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:8678b246399b070816b12313e8b4eb5c', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', + 'upload_date': '20130927', 'duration': 187, }, }, @@ -176,17 +177,15 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') - data = compat_urllib_parse.urlencode({ + token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token') + data = urlencode_postdata({ 'password': password, 'token': token, }) - # I didn't manage to use the password with https - if url.startswith('https'): - pass_url = url.replace('https', 'http') - else: - pass_url = url - password_request = compat_urllib_request.Request(pass_url + '/password', data) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') password_request.add_header('Cookie', 'xsrft=%s' % token) return self._download_webpage( @@ -223,12 +222,7 @@ class VimeoIE(VimeoBaseInfoExtractor): video_id = mobj.group('id') orig_url = url if mobj.group('pro') or mobj.group('player'): - url = 'http://player.vimeo.com/video/' + video_id - - password = self._downloader.params.get('videopassword', None) - if password: - headers['Cookie'] = '%s_password=%s' % ( - video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) + url = 'https://player.vimeo.com/video/' + video_id # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) @@ -250,6 +244,16 @@ class VimeoIE(VimeoBaseInfoExtractor): # and latter we extract those that are Vimeo specific. 
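+        # the vimeo.config probe added below catches uploads whose processing
+        # failed on Vimeo's side (seed_status.state == 'failed') and surfaces
+        # Vimeo's own error title instead of a generic extraction failure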
self.report_extraction(video_id) + vimeo_config = self._search_regex( + r'vimeo\.config\s*=\s*({.+?});', webpage, + 'vimeo config', default=None) + if vimeo_config: + seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) + if seed_status.get('state') == 'failed': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, seed_status['title']), + expected=True) + # Extract the config JSON try: try: @@ -323,9 +327,9 @@ class VimeoIE(VimeoBaseInfoExtractor): # Extract upload date video_upload_date = None - mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage) + mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage) if mobj is not None: - video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) + video_upload_date = unified_strdate(mobj.group(1)) try: view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -379,7 +383,7 @@ class VimeoIE(VimeoBaseInfoExtractor): for tt in text_tracks: subtitles[tt['lang']] = [{ 'ext': 'vtt', - 'url': 'http://vimeo.com' + tt['url'], + 'url': 'https://vimeo.com' + tt['url'], }] return { @@ -402,11 +406,11 @@ class VimeoIE(VimeoBaseInfoExtractor): class VimeoChannelIE(InfoExtractor): IE_NAME = 'vimeo:channel' - _VALID_URL = r'https?://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' + _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' _TESTS = [{ - 'url': 'http://vimeo.com/channels/tributes', + 'url': 'https://vimeo.com/channels/tributes', 'info_dict': { 'id': 'tributes', 'title': 'Vimeo Tributes', @@ -435,10 +439,10 @@ class VimeoChannelIE(InfoExtractor): name="([^"]+)"\s+ value="([^"]*)" ''', login_form)) - token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') + token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token') fields['token'] = token fields['password'] = password - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) @@ -465,7 +469,7 @@ class VimeoChannelIE(InfoExtractor): if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break - entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') + entries = [self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') for video_id in video_ids] return {'_type': 'playlist', 'id': list_id, @@ -476,15 +480,15 @@ class VimeoChannelIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) channel_id = mobj.group('id') - return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) + return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id) class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https?://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' _TESTS = [{ - 'url': 'http://vimeo.com/nkistudio/videos', + 'url': 'https://vimeo.com/nkistudio/videos', 'info_dict': { 'title': 'Nki', 'id': 'nkistudio', @@ -495,15 +499,15 @@ class VimeoUserIE(VimeoChannelIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') - return self._extract_videos(name, 
'http://vimeo.com/%s' % name) + return self._extract_videos(name, 'https://vimeo.com/%s' % name) class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https?://vimeo\.com/album/(?P<id>\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TESTS = [{ - 'url': 'http://vimeo.com/album/2632481', + 'url': 'https://vimeo.com/album/2632481', 'info_dict': { 'id': '2632481', 'title': 'Staff Favorites: November 2013', @@ -527,14 +531,14 @@ class VimeoAlbumIE(VimeoChannelIE): def _real_extract(self, url): album_id = self._match_id(url) - return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id) + return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id) class VimeoGroupsIE(VimeoAlbumIE): IE_NAME = 'vimeo:group' - _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)' + _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)' _TESTS = [{ - 'url': 'http://vimeo.com/groups/rolexawards', + 'url': 'https://vimeo.com/groups/rolexawards', 'info_dict': { 'id': 'rolexawards', 'title': 'Rolex Awards for Enterprise', @@ -548,13 +552,13 @@ class VimeoGroupsIE(VimeoAlbumIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') - return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name) + return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name) class VimeoReviewIE(InfoExtractor): IE_NAME = 'vimeo:review' IE_DESC = 'Review pages on vimeo' - _VALID_URL = r'https?://vimeo\.com/[^/]+/review/(?P<id>[^/]+)' + _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)' _TESTS = [{ 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -566,7 +570,7 @@ class VimeoReviewIE(InfoExtractor): } }, { 'note': 'video player needs Referer', - 'url': 'http://vimeo.com/user22258446/review/91613211/13f927e053', + 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', 'md5': '6295fdab8f4bf6a002d058b2c6dce276', 'info_dict': { 'id': '91613211', @@ -588,11 +592,11 @@ class VimeoReviewIE(InfoExtractor): class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): IE_NAME = 'vimeo:watchlater' IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' - _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater' + _VALID_URL = r'https://vimeo\.com/home/watchlater|:vimeowatchlater' _LOGIN_REQUIRED = True _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<' _TESTS = [{ - 'url': 'http://vimeo.com/home/watchlater', + 'url': 'https://vimeo.com/home/watchlater', 'only_matching': True, }] @@ -612,7 +616,7 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' + _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' _TEST = { @@ -640,8 +644,8 @@ class VimeoLikesIE(InfoExtractor): description = self._html_search_meta('description', webpage) def _get_page(idx): - page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( - self.http_scheme(), user_id, idx + 1) + page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( + user_id, idx + 1) webpage = self._download_webpage( page_url, user_id, note='Downloading page %d/%d' % (idx + 1, page_count)) diff --git a/youtube_dl/extractor/vine.py 
b/youtube_dl/extractor/vine.py index 0b58fe0fe..c3187cfeb 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -33,14 +33,13 @@ class VineIE(InfoExtractor): r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data')) formats = [{ - 'url': data['videoLowURL'], - 'ext': 'mp4', - 'format_id': 'low', - }, { - 'url': data['videoUrl'], - 'ext': 'mp4', - 'format_id': 'standard', - }] + 'format_id': '%(format)s-%(rate)s' % f, + 'vcodec': f['format'], + 'quality': f['rate'], + 'url': f['videoUrl'], + } for f in data['videoUrls'] if f.get('rate')] + + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 7dea8c59d..cc384adbf 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -31,7 +31,7 @@ class VKIE(InfoExtractor): 'id': '162222515', 'ext': 'flv', 'title': 'ProtivoGunz - Хуёвая песня', - 'uploader': 're:Noize MC.*', + 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, 'upload_date': '20120212', }, @@ -140,7 +140,7 @@ class VKIE(InfoExtractor): if not video_id: video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id + info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) ERRORS = { @@ -152,7 +152,10 @@ class VKIE(InfoExtractor): 'use --username and --password options to provide account credentials.', r'<!>Unknown error': - 'Video %s does not exist.' + 'Video %s does not exist.', + + r'<!>Видео временно недоступно': + 'Video %s is temporarily unavailable.', } for error_re, error_msg in ERRORS.items(): diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def base64_decode_utf8(data): + return base64.b64decode(data.encode('utf-8')).decode('utf-8') + + @staticmethod + def base64_encode_utf8(data): + return base64.b64encode(data.encode('utf-8')).decode('utf-8') + def _extract_flv_config(self, media_id): - base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') + base64_media_id = self.base64_encode_utf8(media_id) flv_config = self._download_xml( 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, 'flv config') prop_dict = {} for prop in flv_config.findall('./property'): - prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') + prop_id = self.base64_decode_utf8(prop.attrib['id']) # CDATA may be empty in flv config if not prop.text: continue - encoded_content = base64.b64decode(prop.text).decode('utf-8') + encoded_content = self.base64_decode_utf8(prop.text) prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, ) +from .nbc import NBCSportsVPlayerIE + class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 
'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } } ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor): items = json.loads(items_json) video_id = items[0]['id'] return self._get_info(video_id, display_id, webpage) + # Look for NBCSports iframes + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index b294767c5..19f8762ae 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -8,6 +8,7 @@ from ..compat import compat_urlparse from ..utils import ( float_or_none, month_by_abbreviation, + ExtractorError, ) @@ -28,23 +29,45 @@ class YamIE(InfoExtractor): } }, { # An external video hosted on YouTube - 'url': 'http://mymedia.yam.com/m/3598173', - 'md5': '0238ceec479c654e8c2f1223755bf3e9', + 'url': 'http://mymedia.yam.com/m/3599430', + 'md5': '03127cf10d8f35d120a9e8e52e3b17c6', 'info_dict': { - 'id': 'pJ2Deys283c', + 'id': 'CNpEoQlrIgA', 'ext': 'mp4', - 'upload_date': '20150202', + 'upload_date': '20150306', 'uploader': '新莊社大瑜伽社', - 'description': 'md5:f5cc72f0baf259a70fb731654b0d2eff', + 'description': 'md5:11e2e405311633ace874f2e6226c8b17', 'uploader_id': '2323agoy', - 'title': '外婆的澎湖灣KTV-潘安邦', - } + 'title': '20090412陽明山二子坪-1', + }, + 'skip': 'Video does not exist', + }, { + 'url': 'http://mymedia.yam.com/m/3598173', + 'info_dict': { + 'id': '3598173', + 'ext': 'mp4', + }, + 'skip': 'cause Yam system error', + }, { + 'url': 'http://mymedia.yam.com/m/3599437', + 'info_dict': { + 'id': '3599437', + 'ext': 'mp4', + }, + 'skip': 'invalid YouTube URL', }] def _real_extract(self, url): video_id = self._match_id(url) page = self._download_webpage(url, video_id) + # Check for errors + system_msg = self._html_search_regex( + r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message', + default=None) + if system_msg: + raise ExtractorError(system_msg, expected=True) + # Is it hosted externally on YouTube? 
youtube_url = self._html_search_regex( r'<embed src="(http://www.youtube.com/[^"]+)"', diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py new file mode 100644 index 000000000..f4c0f5702 --- /dev/null +++ b/youtube_dl/extractor/yandexmusic.py @@ -0,0 +1,127 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re +import hashlib + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, +) + + +class YandexMusicBaseIE(InfoExtractor): + def _get_track_url(self, storage_dir, track_id): + data = self._download_json( + 'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' + % storage_dir, + track_id, 'Downloading track location JSON') + + key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() + storage = storage_dir.split('.') + + return ('http://%s/get-mp3/%s/%s?track-id=%s&from=service-10-track&similarities-experiment=default' + % (data['host'], key, data['ts'] + data['path'], storage[1])) + + def _get_track_info(self, track): + return { + 'id': track['id'], + 'ext': 'mp3', + 'url': self._get_track_url(track['storageDir'], track['id']), + 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), + 'filesize': int_or_none(track.get('fileSize')), + 'duration': float_or_none(track.get('durationMs'), 1000), + } + + +class YandexMusicTrackIE(YandexMusicBaseIE): + IE_NAME = 'yandexmusic:track' + IE_DESC = 'Яндекс.Музыка - Трек' + _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' + + _TEST = { + 'url': 'http://music.yandex.ru/album/540508/track/4878838', + 'md5': 'f496818aa2f60b6c0062980d2e00dc20', + 'info_dict': { + 'id': '4878838', + 'ext': 'mp3', + 'title': 'Carlo Ambrosio - Gypsy Eyes 1', + 'filesize': 4628061, + 'duration': 193.04, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + album_id, track_id = mobj.group('album_id'), mobj.group('id') + + track = self._download_json( + 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), + track_id, 'Downloading track JSON')['track'] + + return self._get_track_info(track) + + +class YandexMusicAlbumIE(YandexMusicBaseIE): + IE_NAME = 'yandexmusic:album' + IE_DESC = 'Яндекс.Музыка - Альбом' + _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)' + + _TEST = { + 'url': 'http://music.yandex.ru/album/540508', + 'info_dict': { + 'id': '540508', + 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', + }, + 'playlist_count': 50, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + album = self._download_json( + 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, + album_id, 'Downloading album JSON') + + entries = [self._get_track_info(track) for track in album['volumes'][0]] + + title = '%s - %s' % (album['artists'][0]['name'], album['title']) + year = album.get('year') + if year: + title += ' (%s)' % year + + return self.playlist_result(entries, compat_str(album['id']), title) + + +class YandexMusicPlaylistIE(YandexMusicBaseIE): + IE_NAME = 'yandexmusic:playlist' + IE_DESC = 'Яндекс.Музыка - Плейлист' + _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' + + _TEST = { + 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', + 'info_dict': { + 'id': '1245', + 'title': 'Что слушают Enter Shikari', + 'description': 
'md5:3b9f27b0efbe53f2ee1e844d07155cc9', + }, + 'playlist_count': 6, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + playlist = self._parse_json( + self._search_regex( + r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), + playlist_id)['pageData']['playlist'] + + entries = [self._get_track_info(track) for track in playlist['tracks']] + + return self.playlist_result( + entries, compat_str(playlist_id), + playlist['title'], playlist.get('description')) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 107c9ac36..6abe72f73 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -47,11 +47,12 @@ class YouPornIE(InfoExtractor): # Get JSON parameters json_params = self._search_regex( - r'var currentVideo = new Video\((.*)\)[,;]', + [r'var\s+videoJa?son\s*=\s*({.+?});', + r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'], webpage, 'JSON parameters') try: params = json.loads(json_params) - except: + except ValueError: raise ExtractorError('Invalid JSON') self.report_extraction(video_id) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3690f8021..5488101e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - - if playlist_id.startswith('RD') or playlist_id.startswith('UL'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - + def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page @@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, 'Youtube', video_id=video_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + 
+ return self._extract_playlist(playlist_id) + class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' @@ -1532,7 +1535,7 @@ class YoutubeSearchURLIE(InfoExtractor): webpage = self._download_webpage(url, query) result_code = self._search_regex( - r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') + r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') part_codes = re.findall( r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) @@ -1643,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:recommended' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = 'Youtube Watch Later' - _PERSONAL_FEED = True + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + return self._extract_playlist('WL') class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index a2ffe96bc..35c7e5fb3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -195,6 +195,12 @@ def parseOpts(overrideArguments=None): action='store_const', const='::', dest='source_address', help='Make all connections via IPv6 (experimental)', ) + network.add_option( + '--cn-verification-proxy', + dest='cn_verification_proxy', default=None, metavar='URL', + help='Use this proxy to verify the IP address for some Chinese sites. ' + 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. (experimental)' + ) selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -435,8 +441,12 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--external-downloader', dest='external_downloader', metavar='COMMAND', - help='(experimental) Use the specified external downloader. ' + help='Use the specified external downloader. 
' 'Currently supports %s' % ','.join(list_external_downloaders())) + downloader.add_option( + '--external-downloader-args', + dest='external_downloader_args', metavar='ARGS', + help='Give these arguments to the external downloader.') workarounds = optparse.OptionGroup(parser, 'Workarounds') workarounds.add_option( @@ -553,7 +563,7 @@ def parseOpts(overrideArguments=None): action='store_true', dest='verbose', default=False, help='print various debugging information') verbosity.add_option( - '--dump-intermediate-pages', + '--dump-pages', '--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems (very verbose)') verbosity.add_option( @@ -726,6 +736,15 @@ def parseOpts(overrideArguments=None): action='store_true', dest='addmetadata', default=False, help='write metadata to the video file') postproc.add_option( + '--metadata-from-title', + metavar='FORMAT', dest='metafromtitle', + help='parse additional metadata like song title / artist from the video title. ' + 'The format syntax is the same as --output, ' + 'the parsed parameters replace existing values. ' + 'Additional templates: %(album), %(artist). ' + 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' + '"Coldplay - Paradise"') + postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)') @@ -775,6 +794,11 @@ def parseOpts(overrideArguments=None): write_string('[debug] Override config: ' + repr(overrideArguments) + '\n') else: command_line_conf = sys.argv[1:] + # Workaround for Python 2.x, where argv is a byte list + if sys.version_info < (3,): + command_line_conf = [ + a.decode('utf-8', 'replace') for a in command_line_conf] + if '--ignore-config' in command_line_conf: system_conf = [] user_conf = [] diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 708df3dd4..f39acadce 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -15,6 +15,7 @@ from .ffmpeg import ( ) from .xattrpp import XAttrMetadataPP from .execafterdownload import ExecAfterDownloadPP +from .metadatafromtitle import MetadataFromTitlePP def get_postprocessor(key): @@ -34,5 +35,6 @@ __all__ = [ 'FFmpegPostProcessor', 'FFmpegSubtitlesConvertorPP', 'FFmpegVideoConvertorPP', + 'MetadataFromTitlePP', 'XAttrMetadataPP', ] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 30094c2f3..55adf9685 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import io import os import subprocess -import sys import time @@ -269,19 +268,17 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): else: self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path) self.run_ffmpeg(path, new_path, acodec, more_opts) - except: - etype, e, tb = sys.exc_info() - if isinstance(e, AudioConversionError): - msg = 'audio conversion failed: ' + e.msg - else: - msg = 'error running ' + self.basename - raise PostProcessingError(msg) + except AudioConversionError as e: + raise PostProcessingError( + 'audio conversion failed: ' + e.msg) + except Exception: + raise PostProcessingError('error running ' + self.basename) # Try to update the date time for extracted audio file. 
if information.get('filetime') is not None: try: os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) - except: + except Exception: self._downloader.report_warning('Cannot update utime of audio file') information['filepath'] = new_path @@ -545,7 +542,9 @@ class FFmpegMetadataPP(FFmpegPostProcessor): metadata['title'] = info['title'] if info.get('upload_date') is not None: metadata['date'] = info['upload_date'] - if info.get('uploader') is not None: + if info.get('artist') is not None: + metadata['artist'] = info['artist'] + elif info.get('uploader') is not None: metadata['artist'] = info['uploader'] elif info.get('uploader_id') is not None: metadata['artist'] = info['uploader_id'] @@ -554,6 +553,8 @@ class FFmpegMetadataPP(FFmpegPostProcessor): metadata['comment'] = info['description'] if info.get('webpage_url') is not None: metadata['purl'] = info['webpage_url'] + if info.get('album') is not None: + metadata['album'] = info['album'] if not metadata: self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py new file mode 100644 index 000000000..5019433d3 --- /dev/null +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import re + +from .common import PostProcessor +from ..utils import PostProcessingError + + +class MetadataFromTitlePPError(PostProcessingError): + pass + + +class MetadataFromTitlePP(PostProcessor): + def __init__(self, downloader, titleformat): + super(MetadataFromTitlePP, self).__init__(downloader) + self._titleformat = titleformat + self._titleregex = self.format_to_regex(titleformat) + + def format_to_regex(self, fmt): + """ + Converts a string like + '%(title)s - %(artist)s' + to a regex like + '(?P<title>.+)\ \-\ (?P<artist>.+)' + """ + lastpos = 0 + regex = "" + # replace %(..)s with regex group and escape other string parts + for match in re.finditer(r'%\((\w+)\)s', fmt): + regex += re.escape(fmt[lastpos:match.start()]) + regex += r'(?P<' + match.group(1) + '>.+)' + lastpos = match.end() + if lastpos < len(fmt): + regex += re.escape(fmt[lastpos:len(fmt)]) + return regex + + def run(self, info): + title = info['title'] + match = re.match(self._titleregex, title) + if match is None: + raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + for attribute, value in match.groupdict().items(): + value = match.group(attribute) + info[attribute] = value + self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value) + + return True, info diff --git a/youtube_dl/update.py b/youtube_dl/update.py index d8be4049f..de3169eef 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -65,7 +65,7 @@ def update_self(to_screen, verbose): # Check if there is a new version try: newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() - except: + except Exception: if verbose: to_screen(compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') @@ -78,7 +78,7 @@ def update_self(to_screen, verbose): try: versions_info = opener.open(JSON_URL).read().decode('utf-8') versions_info = json.loads(versions_info) - except: + except Exception: if verbose: to_screen(compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. 
Please try again later.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d4938ec36..90e0ed9ab 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -75,7 +75,7 @@ def preferredencoding(): try: pref = locale.getpreferredencoding() 'TEST'.encode(pref) - except: + except Exception: pref = 'UTF-8' return pref @@ -127,7 +127,7 @@ def write_json_file(obj, fn): except OSError: pass os.rename(tf.name, fn) - except: + except Exception: try: os.remove(tf.name) except OSError: @@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode): raise # In case of error, try to remove win32 forbidden chars - alt_filename = os.path.join( - re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) - for path_part in os.path.split(filename) - ) + alt_filename = sanitize_path(filename) if alt_filename == filename: raise else: # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) + stream = open(encodeFilename(alt_filename), open_mode) return (stream, alt_filename) @@ -305,11 +302,37 @@ def sanitize_filename(s, restricted=False, is_id=False): result = result[2:] if result.startswith('-'): result = '_' + result[len('-'):] + result = result.lstrip('.') if not result: result = '_' return result +def sanitize_path(s): + """Sanitizes and normalizes path on Windows""" + if sys.platform != 'win32': + return s + drive, _ = os.path.splitdrive(s) + unc, _ = os.path.splitunc(s) + unc_or_drive = unc or drive + norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep) + if unc_or_drive: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + for path_part in norm_path] + if unc_or_drive: + sanitized_path.insert(0, unc_or_drive + os.path.sep) + return os.path.join(*sanitized_path) + + +def sanitize_url_path_consecutive_slashes(url): + """Collapses consecutive slashes in URLs' path""" + parsed_url = list(compat_urlparse.urlparse(url)) + parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) + return compat_urlparse.urlunparse(parsed_url) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -325,7 +348,7 @@ def _htmlentity_transform(entity): if entity in compat_html_entities.name2codepoint: return compat_chr(compat_html_entities.name2codepoint[entity]) - mobj = re.match(r'#(x?[0-9]+)', entity) + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith('x'): @@ -1767,3 +1790,24 @@ def match_filter_func(filter_str): video_title = info_dict.get('title', info_dict.get('id', 'video')) return '%s does not pass filter %s, skipping ..' 
% (video_title, filter_str) return _match_func + + +class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): + def __init__(self, proxies=None): + # Set default handlers + for type in ('http', 'https'): + setattr(self, '%s_open' % type, + lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: + meth(r, proxy, type)) + return compat_urllib_request.ProxyHandler.__init__(self, proxies) + + def proxy_open(self, req, proxy, type): + req_proxy = req.headers.get('Ytdl-request-proxy') + if req_proxy is not None: + proxy = req_proxy + del req.headers['Ytdl-request-proxy'] + + if proxy == '__noproxy__': + return None # No Proxy + return compat_urllib_request.ProxyHandler.proxy_open( + self, req, proxy, type) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5582348ba..dd93e295a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.28' +__version__ = '2015.03.28'
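
Notes on three of the less obvious additions above. The sketches below are illustrative only, not part of the patch; they use the module paths visible in the diff, and the URLs and proxy address in them are made-up placeholders.

The new MetadataFromTitlePP converts an --output-style template into a regular expression with one named capture group per %(field)s placeholder, matches it against the video title, and copies the captured groups into the info dict (where the extended FFmpegMetadataPP then picks up 'artist' and 'album'). A minimal standalone sketch of the conversion, mirroring format_to_regex above:

    import re

    def format_to_regex(fmt):
        # Turn each %(field)s into a named capture group and
        # re.escape() the literal text in between.
        lastpos, regex = 0, ''
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += '(?P<%s>.+)' % match.group(1)
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    m = re.match(format_to_regex('%(artist)s - %(title)s'),
                 'Coldplay - Paradise')
    assert m.groupdict() == {'artist': 'Coldplay', 'title': 'Paradise'}

On the command line this corresponds to an invocation along the lines of:

    youtube-dl --metadata-from-title "%(artist)s - %(title)s" --add-metadata URL

The new sanitize_path() in utils.py cleans a whole Windows path component by component: the drive or UNC prefix and any '.'/'..' parts are kept, while characters Windows forbids (and a trailing dot) are replaced with '#'. On other platforms it returns its input unchanged:

    from youtube_dl.utils import sanitize_path

    # On Windows:
    sanitize_path('C:\\foo\\bar?baz.')  # -> 'C:\\foo\\bar#baz#'

The new PerRequestProxyHandler in utils.py lets a single opener route individual requests through individual proxies: a request carrying the internal Ytdl-request-proxy header is sent through that proxy, while every other request falls back to the handler's default (the '__noproxy__' sentinel makes proxy_open() return None, so the ordinary HTTP handler takes over and the connection is direct). This is the mechanism behind the new --cn-verification-proxy option. A minimal usage sketch, with a hypothetical local proxy:

    from youtube_dl.compat import compat_urllib_request
    from youtube_dl.utils import PerRequestProxyHandler

    # Empty proxies dict: no default proxy, so requests without the
    # special header are opened directly.
    opener = compat_urllib_request.build_opener(PerRequestProxyHandler({}))

    # This request goes through the per-request proxy; proxy_open()
    # strips the header before the request is sent.
    req = compat_urllib_request.Request('http://example.com/')
    req.add_header('Ytdl-request-proxy', 'http://127.0.0.1:8118')
    opener.open(req)

    # No header here, so this request uses a direct connection.
    opener.open('http://example.com/')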