diff options
110 files changed, 3771 insertions, 879 deletions
| diff --git a/.travis.yml b/.travis.yml index fb34299fc..511bee64c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python  python:    - "2.6"    - "2.7" +  - "3.2"    - "3.3"    - "3.4"  before_install: @@ -113,3 +113,9 @@ Robin de Rooij  Ryan Schmidt  Leslie P. Polzer  Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 351229f21..588b15bde 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,9 @@ If your report is shorter than two lines, it is almost certainly missing some of  For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. 
http://www.youtube.com/ ) is *not* an example URL.  ###  Are you using the latest version? @@ -2,7 +2,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas  clean:  	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe -	find -name "*.pyc" -delete +	find . -name "*.pyc" -delete  PREFIX ?= /usr/local  BINDIR ?= $(PREFIX)/bin @@ -47,211 +47,109 @@ which means you can modify it, redistribute it or use it however you like.  # OPTIONS      -h, --help                       print this help text and exit      --version                        print program version and exit -    -U, --update                     update this program to latest version. Make -                                     sure that you have sufficient permissions -                                     (run with sudo if needed) -    -i, --ignore-errors              continue on download errors, for example to -                                     skip unavailable videos in a playlist -    --abort-on-error                 Abort downloading of further videos (in the -                                     playlist or the command line) if an error -                                     occurs +    -U, --update                     update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed) +    -i, --ignore-errors              continue on download errors, for example to skip unavailable videos in a playlist +    --abort-on-error                 Abort downloading of further videos (in the playlist or the command line) if an error occurs      --dump-user-agent                display the current browser identification -    --list-extractors                List all supported extractors and the URLs -                                     they would handle -    --extractor-descriptions         Output descriptions of all supported -                                     extractors -    --default-search PREFIX          Use this prefix for unqualified URLs. For -                                     example "gvsearch2:" downloads two videos -                                     from google videos for  youtube-dl "large -                                     apple". Use the value "auto" to let -                                     youtube-dl guess ("auto_warning" to emit a -                                     warning when guessing). "error" just throws -                                     an error. The default value "fixup_error" -                                     repairs broken URLs, but emits an error if -                                     this is not possible instead of searching. -    --ignore-config                  Do not read configuration files. When given -                                     in the global configuration file /etc -                                     /youtube-dl.conf: Do not read the user -                                     configuration in ~/.config/youtube- -                                     dl/config (%APPDATA%/youtube-dl/config.txt -                                     on Windows) -    --flat-playlist                  Do not extract the videos of a playlist, -                                     only list them. 
+    --list-extractors                List all supported extractors and the URLs they would handle +    --extractor-descriptions         Output descriptions of all supported extractors +    --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". +                                     Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The +                                     default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. +    --ignore-config                  Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: Do not read the user configuration +                                     in ~/.config/youtube-dl/config (%APPDATA%/youtube-dl/config.txt on Windows) +    --flat-playlist                  Do not extract the videos of a playlist, only list them.      --no-color                       Do not emit color codes in output.  ## Network Options: -    --proxy URL                      Use the specified HTTP/HTTPS proxy. Pass in -                                     an empty string (--proxy "") for direct -                                     connection +    --proxy URL                      Use the specified HTTP/HTTPS proxy. 
Pass in an empty string (--proxy "") for direct connection      --socket-timeout SECONDS         Time to wait before giving up, in seconds -    --source-address IP              Client-side IP address to bind to -                                     (experimental) -    -4, --force-ipv4                 Make all connections via IPv4 -                                     (experimental) -    -6, --force-ipv6                 Make all connections via IPv6 -                                     (experimental) +    --source-address IP              Client-side IP address to bind to (experimental) +    -4, --force-ipv4                 Make all connections via IPv4 (experimental) +    -6, --force-ipv6                 Make all connections via IPv6 (experimental) +    --cn-verification-proxy URL      Use this proxy to verify the IP address for some Chinese sites. The default proxy specified by --proxy (or none, if the options is +                                     not present) is used for the actual downloading. (experimental)  ## Video Selection:      --playlist-start NUMBER          playlist video to start at (default is 1)      --playlist-end NUMBER            playlist video to end at (default is last) -    --playlist-items ITEM_SPEC       playlist video items to download. Specify -                                     indices of the videos in the playlist -                                     seperated by commas like: "--playlist-items -                                     1,2,5,8" if you want to download videos -                                     indexed 1, 2, 5, 8 in the playlist. You can -                                     specify range: "--playlist-items -                                     1-3,7,10-13", it will download the videos -                                     at index 1, 2, 3, 7, 10, 11, 12 and 13. 
-    --match-title REGEX              download only matching titles (regex or -                                     caseless sub-string) -    --reject-title REGEX             skip download for matching titles (regex or -                                     caseless sub-string) +    --playlist-items ITEM_SPEC       playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" +                                     if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will +                                     download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. +    --match-title REGEX              download only matching titles (regex or caseless sub-string) +    --reject-title REGEX             skip download for matching titles (regex or caseless sub-string)      --max-downloads NUMBER           Abort after downloading NUMBER files -    --min-filesize SIZE              Do not download any videos smaller than -                                     SIZE (e.g. 50k or 44.6m) -    --max-filesize SIZE              Do not download any videos larger than SIZE -                                     (e.g. 50k or 44.6m) +    --min-filesize SIZE              Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) +    --max-filesize SIZE              Do not download any videos larger than SIZE (e.g. 50k or 44.6m)      --date DATE                      download only videos uploaded in this date -    --datebefore DATE                download only videos uploaded on or before -                                     this date (i.e. inclusive) -    --dateafter DATE                 download only videos uploaded on or after -                                     this date (i.e. 
inclusive) -    --min-views COUNT                Do not download any videos with less than -                                     COUNT views -    --max-views COUNT                Do not download any videos with more than -                                     COUNT views -    --match-filter FILTER            (Experimental) Generic video filter. -                                     Specify any key (see help for -o for a list -                                     of available keys) to match if the key is -                                     present, !key to check if the key is not -                                     present,key > NUMBER (like "comment_count > -                                     12", also works with >=, <, <=, !=, =) to -                                     compare against a number, and & to require -                                     multiple matches. Values which are not -                                     known are excluded unless you put a -                                     question mark (?) after the operator.For -                                     example, to only match videos that have -                                     been liked more than 100 times and disliked -                                     less than 50 times (or the dislike -                                     functionality is not available at the given -                                     service), but who also have a description, -                                     use  --match-filter "like_count > 100 & +    --datebefore DATE                download only videos uploaded on or before this date (i.e. inclusive) +    --dateafter DATE                 download only videos uploaded on or after this date (i.e. 
inclusive) +    --min-views COUNT                Do not download any videos with less than COUNT views +    --max-views COUNT                Do not download any videos with more than COUNT views +    --match-filter FILTER            (Experimental) Generic video filter. Specify any key (see help for -o for a list of available keys) to match if the key is present, +                                     !key to check if the key is not present,key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against +                                     a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the +                                     operator.For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike +                                     functionality is not available at the given service), but who also have a description, use  --match-filter "like_count > 100 &                                       dislike_count <? 50 & description" . -    --no-playlist                    If the URL refers to a video and a -                                     playlist, download only the video. -    --yes-playlist                   If the URL refers to a video and a -                                     playlist, download the playlist. -    --age-limit YEARS                download only videos suitable for the given -                                     age -    --download-archive FILE          Download only videos not listed in the -                                     archive file. Record the IDs of all -                                     downloaded videos in it. -    --include-ads                    Download advertisements as well -                                     (experimental) +    --no-playlist                    If the URL refers to a video and a playlist, download only the video. 
+    --yes-playlist                   If the URL refers to a video and a playlist, download the playlist. +    --age-limit YEARS                download only videos suitable for the given age +    --download-archive FILE          Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. +    --include-ads                    Download advertisements as well (experimental)  ## Download Options: -    -r, --rate-limit LIMIT           maximum download rate in bytes per second -                                     (e.g. 50K or 4.2M) -    -R, --retries RETRIES            number of retries (default is 10), or -                                     "infinite". -    --buffer-size SIZE               size of download buffer (e.g. 1024 or 16K) -                                     (default is 1024) -    --no-resize-buffer               do not automatically adjust the buffer -                                     size. By default, the buffer size is -                                     automatically resized from an initial value -                                     of SIZE. +    -r, --rate-limit LIMIT           maximum download rate in bytes per second (e.g. 50K or 4.2M) +    -R, --retries RETRIES            number of retries (default is 10), or "infinite". +    --buffer-size SIZE               size of download buffer (e.g. 1024 or 16K) (default is 1024) +    --no-resize-buffer               do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.      --playlist-reverse               Download playlist videos in reverse order -    --xattr-set-filesize             (experimental) set file xattribute -                                     ytdl.filesize with expected filesize -    --hls-prefer-native              (experimental) Use the native HLS -                                     downloader instead of ffmpeg. 
-    --external-downloader COMMAND    (experimental) Use the specified external -                                     downloader. Currently supports -                                     aria2c,curl,wget +    --xattr-set-filesize             (experimental) set file xattribute ytdl.filesize with expected filesize +    --hls-prefer-native              (experimental) Use the native HLS downloader instead of ffmpeg. +    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,wget +    --external-downloader-args ARGS  Give these arguments to the external downloader.  ## Filesystem Options: -    -a, --batch-file FILE            file containing URLs to download ('-' for -                                     stdin) +    -a, --batch-file FILE            file containing URLs to download ('-' for stdin)      --id                             use only video ID in file name -    -o, --output TEMPLATE            output filename template. Use %(title)s to -                                     get the title, %(uploader)s for the -                                     uploader name, %(uploader_id)s for the -                                     uploader nickname if different, -                                     %(autonumber)s to get an automatically -                                     incremented number, %(ext)s for the -                                     filename extension, %(format)s for the -                                     format description (like "22 - 1280x720" or -                                     "HD"), %(format_id)s for the unique id of -                                     the format (like Youtube's itags: "137"), -                                     %(upload_date)s for the upload date -                                     (YYYYMMDD), %(extractor)s for the provider -                                     (youtube, metacafe, etc), %(id)s for the -                                     video id, %(playlist_title)s, -    
                                 %(playlist_id)s, or %(playlist)s (=title if -                                     present, ID otherwise) for the playlist the -                                     video is in, %(playlist_index)s for the -                                     position in the playlist. %(height)s and -                                     %(width)s for the width and height of the -                                     video format. %(resolution)s for a textual -                                     description of the resolution of the video -                                     format. %% for a literal percent. Use - to -                                     output to stdout. Can also be used to -                                     download to a different directory, for -                                     example with -o '/my/downloads/%(uploader)s -                                     /%(title)s-%(id)s.%(ext)s' . -    --autonumber-size NUMBER         Specifies the number of digits in -                                     %(autonumber)s when it is present in output -                                     filename template or --auto-number option -                                     is given -    --restrict-filenames             Restrict filenames to only ASCII -                                     characters, and avoid "&" and spaces in -                                     filenames -    -A, --auto-number                [deprecated; use  -o -                                     "%(autonumber)s-%(title)s.%(ext)s" ] number -                                     downloaded files starting from 00000 -    -t, --title                      [deprecated] use title in file name -                                     (default) +    -o, --output TEMPLATE            output filename template. 
Use %(title)s to get the title, %(uploader)s for the uploader name, %(uploader_id)s for the uploader +                                     nickname if different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(format)s for +                                     the format description (like "22 - 1280x720" or "HD"), %(format_id)s for the unique id of the format (like Youtube's itags: "137"), +                                     %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id, +                                     %(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, +                                     %(playlist_index)s for the position in the playlist. %(height)s and %(width)s for the width and height of the video format. +                                     %(resolution)s for a textual description of the resolution of the video format. %% for a literal percent. Use - to output to stdout. +                                     Can also be used to download to a different directory, for example with -o '/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . +    --autonumber-size NUMBER         Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given +    --restrict-filenames             Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames +    -A, --auto-number                [deprecated; use  -o "%(autonumber)s-%(title)s.%(ext)s" ] number downloaded files starting from 00000 +    -t, --title                      [deprecated] use title in file name (default)      -l, --literal                    [deprecated] alias of --title      -w, --no-overwrites              do not overwrite files -    -c, --continue                   force resume of partially downloaded files. 
-                                     By default, youtube-dl will resume -                                     downloads if possible. -    --no-continue                    do not resume partially downloaded files -                                     (restart from beginning) -    --no-part                        do not use .part files - write directly -                                     into output file -    --no-mtime                       do not use the Last-modified header to set -                                     the file modification time -    --write-description              write video description to a .description -                                     file +    -c, --continue                   force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible. +    --no-continue                    do not resume partially downloaded files (restart from beginning) +    --no-part                        do not use .part files - write directly into output file +    --no-mtime                       do not use the Last-modified header to set the file modification time +    --write-description              write video description to a .description file      --write-info-json                write video metadata to a .info.json file -    --write-annotations              write video annotations to a .annotation -                                     file -    --load-info FILE                 json file containing the video information -                                     (created with the "--write-json" option) -    --cookies FILE                   file to read cookies from and dump cookie -                                     jar in -    --cache-dir DIR                  Location in the filesystem where youtube-dl -                                     can store some downloaded information -                                     permanently. 
By default $XDG_CACHE_HOME -                                     /youtube-dl or ~/.cache/youtube-dl . At the -                                     moment, only YouTube player files (for -                                     videos with obfuscated signatures) are -                                     cached, but that may change. +    --write-annotations              write video annotations to a .annotation file +    --load-info FILE                 json file containing the video information (created with the "--write-json" option) +    --cookies FILE                   file to read cookies from and dump cookie jar in +    --cache-dir DIR                  Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl +                                     or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may +                                     change.      
--no-cache-dir                   Disable filesystem caching      --rm-cache-dir                   Delete all filesystem cache files  ## Thumbnail images:      --write-thumbnail                write thumbnail image to disk      --write-all-thumbnails           write all thumbnail image formats to disk -    --list-thumbnails                Simulate and list all available thumbnail -                                     formats +    --list-thumbnails                Simulate and list all available thumbnail formats  ## Verbosity / Simulation Options:      -q, --quiet                      activates quiet mode      --no-warnings                    Ignore warnings -    -s, --simulate                   do not download the video and do not write -                                     anything to disk +    -s, --simulate                   do not download the video and do not write anything to disk      --skip-download                  do not download the video      -g, --get-url                    simulate, quiet but print URL      -e, --get-title                  simulate, quiet but print title @@ -261,155 +159,87 @@ which means you can modify it, redistribute it or use it however you like.      --get-duration                   simulate, quiet but print video length      --get-filename                   simulate, quiet but print output filename      --get-format                     simulate, quiet but print output format -    -j, --dump-json                  simulate, quiet but print JSON information. -                                     See --output for a description of available -                                     keys. -    -J, --dump-single-json           simulate, quiet but print JSON information -                                     for each command-line argument. If the URL -                                     refers to a playlist, dump the whole -                                     playlist information in a single line. 
-    --print-json                     Be quiet and print the video information as -                                     JSON (video is still being downloaded). +    -j, --dump-json                  simulate, quiet but print JSON information. See --output for a description of available keys. +    -J, --dump-single-json           simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist +                                     information in a single line. +    --print-json                     Be quiet and print the video information as JSON (video is still being downloaded).      --newline                        output progress bar as new lines      --no-progress                    do not print progress bar      --console-title                  display progress in console titlebar      -v, --verbose                    print various debugging information -    --dump-intermediate-pages        print downloaded pages to debug problems -                                     (very verbose) -    --write-pages                    Write downloaded intermediary pages to -                                     files in the current directory to debug -                                     problems +    --dump-pages                     print downloaded pages to debug problems (very verbose) +    --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems      --print-traffic                  Display sent and read HTTP traffic -    -C, --call-home                  Contact the youtube-dl server for -                                     debugging. -    --no-call-home                   Do NOT contact the youtube-dl server for -                                     debugging. +    -C, --call-home                  Contact the youtube-dl server for debugging. +    --no-call-home                   Do NOT contact the youtube-dl server for debugging.  
## Workarounds:      --encoding ENCODING              Force the specified encoding (experimental)      --no-check-certificate           Suppress HTTPS certificate validation. -    --prefer-insecure                Use an unencrypted connection to retrieve -                                     information about the video. (Currently -                                     supported only for YouTube) +    --prefer-insecure                Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)      --user-agent UA                  specify a custom user agent -    --referer URL                    specify a custom referer, use if the video -                                     access is restricted to one domain -    --add-header FIELD:VALUE         specify a custom HTTP header and its value, -                                     separated by a colon ':'. You can use this -                                     option multiple times -    --bidi-workaround                Work around terminals that lack -                                     bidirectional text support. Requires bidiv -                                     or fribidi executable in PATH -    --sleep-interval SECONDS         Number of seconds to sleep before each -                                     download. +    --referer URL                    specify a custom referer, use if the video access is restricted to one domain +    --add-header FIELD:VALUE         specify a custom HTTP header and its value, separated by a colon ':'. You can use this option multiple times +    --bidi-workaround                Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH +    --sleep-interval SECONDS         Number of seconds to sleep before each download.  
## Video Format Options: -    -f, --format FORMAT              video format code, specify the order of -                                     preference using slashes, as in -f 22/17/18 -                                     .  Instead of format codes, you can select -                                     by extension for the extensions aac, m4a, -                                     mp3, mp4, ogg, wav, webm. You can also use -                                     the special names "best", "bestvideo", -                                     "bestaudio", "worst".  You can filter the -                                     video results by putting a condition in -                                     brackets, as in -f "best[height=720]" (or -                                     -f "[filesize>10M]").  This works for -                                     filesize, height, width, tbr, abr, vbr, -                                     asr, and fps and the comparisons <, <=, >, -                                     >=, =, != and for ext, acodec, vcodec, -                                     container, and protocol and the comparisons -                                     =, != . Formats for which the value is not -                                     known are excluded unless you put a -                                     question mark (?) after the operator. You -                                     can combine format filters, so  -f "[height -                                     <=? 720][tbr>500]" selects up to 720p -                                     videos (or videos where the height is not -                                     known) with a bitrate of at least 500 -                                     KBit/s. By default, youtube-dl will pick -                                     the best quality. Use commas to download -                                     multiple audio formats, such as -f -                                     136/137/mp4/bestvideo,140/m4a/bestaudio. 
-                                     You can merge the video and audio of two -                                     formats into a single file using -f <video- -                                     format>+<audio-format> (requires ffmpeg or -                                     avconv), for example -f +    -f, --format FORMAT              video format code, specify the order of preference using slashes, as in -f 22/17/18 .  Instead of format codes, you can select by +                                     extension for the extensions aac, m4a, mp3, mp4, ogg, wav, webm. You can also use the special names "best", "bestvideo", "bestaudio", +                                     "worst".  You can filter the video results by putting a condition in brackets, as in -f "best[height=720]" (or -f "[filesize>10M]"). +                                     This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, +                                     vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a +                                     question mark (?) after the operator. You can combine format filters, so  -f "[height <=? 720][tbr>500]" selects up to 720p videos +                                     (or videos where the height is not known) with a bitrate of at least 500 KBit/s. By default, youtube-dl will pick the best quality. +                                     Use commas to download multiple audio formats, such as -f  136/137/mp4/bestvideo,140/m4a/bestaudio. You can merge the video and audio +                                     of two formats into a single file using -f <video-format>+<audio-format> (requires ffmpeg or avconv), for example -f                                       bestvideo+bestaudio.      
--all-formats                    download all available video formats -    --prefer-free-formats            prefer free video formats unless a specific -                                     one is requested +    --prefer-free-formats            prefer free video formats unless a specific one is requested      --max-quality FORMAT             highest quality format to download      -F, --list-formats               list all available formats -    --youtube-skip-dash-manifest     Do not download the DASH manifest on -                                     YouTube videos -    --merge-output-format FORMAT     If a merge is required (e.g. -                                     bestvideo+bestaudio), output to given -                                     container format. One of mkv, mp4, ogg, -                                     webm, flv.Ignored if no merge is required +    --youtube-skip-dash-manifest     Do not download the DASH manifest on YouTube videos +    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. 
One of mkv, mp4, ogg, webm, flv.Ignored if no +                                     merge is required  ## Subtitle Options:      --write-sub                      write subtitle file -    --write-auto-sub                 write automatic subtitle file (youtube -                                     only) -    --all-subs                       downloads all the available subtitles of -                                     the video +    --write-auto-sub                 write automatic subtitle file (youtube only) +    --all-subs                       downloads all the available subtitles of the video      --list-subs                      lists all available subtitles for the video -    --sub-format FORMAT              subtitle format, accepts formats -                                     preference, for example: "ass/srt/best" -    --sub-lang LANGS                 languages of the subtitles to download -                                     (optional) separated by commas, use IETF -                                     language tags like 'en,pt' +    --sub-format FORMAT              subtitle format, accepts formats preference, for example: "ass/srt/best" +    --sub-lang LANGS                 languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt'  ## Authentication Options:      -u, --username USERNAME          login with this account ID -    -p, --password PASSWORD          account password. If this option is left -                                     out, youtube-dl will ask interactively. +    -p, --password PASSWORD          account password. If this option is left out, youtube-dl will ask interactively.      
-2, --twofactor TWOFACTOR        two-factor auth code      -n, --netrc                      use .netrc authentication data      --video-password PASSWORD        video password (vimeo, smotri)  ## Post-processing Options: -    -x, --extract-audio              convert video files to audio-only files -                                     (requires ffmpeg or avconv and ffprobe or -                                     avprobe) -    --audio-format FORMAT            "best", "aac", "vorbis", "mp3", "m4a", -                                     "opus", or "wav"; "best" by default -    --audio-quality QUALITY          ffmpeg/avconv audio quality specification, -                                     insert a value between 0 (better) and 9 -                                     (worse) for VBR or a specific bitrate like -                                     128K (default 5) -    --recode-video FORMAT            Encode the video to another format if -                                     necessary (currently supported: -                                     mp4|flv|ogg|webm|mkv) -    -k, --keep-video                 keeps the video file on disk after the -                                     post-processing; the video is erased by -                                     default -    --no-post-overwrites             do not overwrite post-processed files; the -                                     post-processed files are overwritten by -                                     default -    --embed-subs                     embed subtitles in the video (only for mp4 -                                     videos) +    -x, --extract-audio              convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) +    --audio-format FORMAT            "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default +    --audio-quality QUALITY          ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a 
specific bitrate like 128K +                                     (default 5) +    --recode-video FORMAT            Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv) +    -k, --keep-video                 keeps the video file on disk after the post-processing; the video is erased by default +    --no-post-overwrites             do not overwrite post-processed files; the post-processed files are overwritten by default +    --embed-subs                     embed subtitles in the video (only for mp4 videos)      --embed-thumbnail                embed thumbnail in the audio as cover art      --add-metadata                   write metadata to the video file -    --xattrs                         write metadata to the video file's xattrs -                                     (using dublin core and xdg standards) -    --fixup POLICY                   Automatically correct known faults of the -                                     file. One of never (do nothing), warn (only -                                     emit a warning), detect_or_warn(the -                                     default; fix file if we can, warn -                                     otherwise) -    --prefer-avconv                  Prefer avconv over ffmpeg for running the -                                     postprocessors (default) -    --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the -                                     postprocessors -    --ffmpeg-location PATH           Location of the ffmpeg/avconv binary; -                                     either the path to the binary or its -                                     containing directory. -    --exec CMD                       Execute a command on the file after -                                     downloading, similar to find's -exec -                                     syntax. 
Example: --exec 'adb push {} -                                     /sdcard/Music/ && rm {}' -    --convert-subtitles FORMAT       Convert the subtitles to other format -                                     (currently supported: srt|ass|vtt) +    --metadata-from-title FORMAT     parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed +                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - +                                     %(title)s" matches a title like "Coldplay - Paradise" +    --xattrs                         write metadata to the video file's xattrs (using dublin core and xdg standards) +    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; +                                     fix file if we can, warn otherwise) +    --prefer-avconv                  Prefer avconv over ffmpeg for running the postprocessors (default) +    --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the postprocessors +    --ffmpeg-location PATH           Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. +    --exec CMD                       Execute a command on the file after downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm +                                     {}' +    --convert-subtitles FORMAT       Convert the subtitles to other format (currently supported: srt|ass|vtt)  # CONFIGURATION @@ -529,6 +359,10 @@ YouTube requires an additional signature since September 2012 which is not suppo  In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. 
See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. +### HTTP Error 429: Too Many Requests or 402: Payment Required + +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address. +  ### SyntaxError: Non-ASCII character ###  The error @@ -573,6 +407,18 @@ A note on the service that they don't host the infringing content, but just link  Support requests for services that **do** purchase the rights to distribute their content are perfectly fine though. If in doubt, you can simply include a source that mentions the legitimate purchase of content. +### How can I speed up work on my issue? + +(Also known as: Help, my important issue not being solved!) The youtube-dl core developer team is quite small. While we do our best to solve as many issues as possible, sometimes that can take quite a while. To speed up your issue, here's what you can do: + +First of all, please do report the issue [at our issue tracker](https://yt-dl.org/bugs). That allows us to coordinate all efforts by users and developers, and serves as a unified point. Unfortunately, the youtube-dl project has grown too large to use personal email as an effective communication channel. + +Please read the [bug reporting instructions](#bugs) below. A lot of bugs lack all the necessary information. If you can, offer proxy, VPN, or shell access to the youtube-dl developers. If you are able to, test the issue from multiple computers in multiple countries to exclude local censorship or misconfiguration issues. + +If nobody is interested in solving your issue, you are welcome to take matters into your own hands and submit a pull request (or coerce/pay somebody else to do so). 
+ +Feel free to bump the issue from time to time by writing a small comment ("Issue is still present in youtube-dl version ...from France, but fixed from Belgium"), but please not more than once a month. Please do not declare your issue as `important` or `urgent`. +  ### How can I detect whether a given URL is supported by youtube-dl?  For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. @@ -672,6 +518,7 @@ youtube-dl makes the best effort to be a good command-line program, and thus sho  From a Python program, you can embed youtube-dl in a more powerful fashion, like this:  ```python +from __future__ import unicode_literals  import youtube_dl  ydl_opts = {} @@ -684,6 +531,7 @@ Most likely, you'll want to use various options. For a list of what can be done,  Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file:  ```python +from __future__ import unicode_literals  import youtube_dl @@ -741,7 +589,9 @@ If your report is shorter than two lines, it is almost certainly missing some of  For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. 
+If your server has multiple IPs or you suspect censorship, adding --call-home may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.  ###  Are you using the latest version? diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 6a5bd9eda..7a219ebe9 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -28,7 +28,7 @@ for test in get_testcases():      if METHOD == 'EURISTIC':          try:              webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() -        except: +        except Exception:              print('\nFail: {0}'.format(test['name']))              continue diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..2e389fc8e --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): +    return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): +    cmd = ['openssl', 
'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] +    prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) +    out, _ = prog.communicate(secret_msg) +    return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text 16') +print(repr(r)) + +password = key + 16 * [0] +new_key = aes_encrypt(password, key_expansion(password)) * (32 // 16) +r = openssl_encode('aes-256-ctr', new_key, iv) +print('aes_decrypt_text 32') +print(repr(r)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 062cb3d62..fd59cc2be 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -2,6 +2,8 @@   - **1tv**: Первый канал   - **1up.com**   - **220.ro** + - **22tracks:genre** + - **22tracks:track**   - **24video**   - **3sat**   - **4tube** @@ -47,6 +49,7 @@   - **Bandcamp**   - **Bandcamp:album**   - **bbc.co.uk**: BBC iPlayer + - **BeatportPro**   - **Beeg**   - **BehindKink**   - **Bet** @@ -111,12 +114,14 @@   - **Discovery**   - **divxstage**: DivxStage   - **Dotsub** + - **DouyuTV**   - **DRBonanza**   - **Dropbox**   - **DrTuber**   - **DRTV**   - **Dump**   - **dvtv**: http://video.aktualne.cz/ + - **EaglePlatform**   - **EbaumsWorld**   - **EchoMsk**   - **eHow** @@ -144,6 +149,7 @@   - **Firstpost**   - **Flickr**   - **Folketinget**: Folketinget (ft.dk; Danish parliament) + - **FootyRoom**   - **Foxgay**   - **FoxNews**   - **france2.fr:generation-quoi** @@ -161,6 +167,7 @@   - **GameSpot**   - **GameStar**   - **Gametrailers** + - **Gazeta**   - **GDCVault**   - **generic**: Generic downloader that works on some sites   - **GiantBomb** @@ -211,6 +218,7 @@   - **jpopsuki.tv**   - **Jukebox**   - **Kaltura** + - **KanalPlay**: Kanal 5/9/11 Play   - **Kankan**   - **Karaoketv**   - **keek** @@ -225,6 
+233,7 @@   - **Letv**   - **LetvPlaylist**   - **LetvTv** + - **Libsyn**   - **lifenews**: LIFE | NEWS   - **LiveLeak**   - **livestream** @@ -304,6 +313,7 @@   - **npo.nl:radio**   - **npo.nl:radio:fragment**   - **NRK** + - **NRKPlaylist**   - **NRKTV**   - **ntv.ru**   - **Nuvid** @@ -315,6 +325,7 @@   - **Ooyala**   - **OpenFilm**   - **orf:fm4**: radio FM4 + - **orf:iptv**: iptv.ORF.at   - **orf:oe1**: Radio Österreich 1   - **orf:tvthek**: ORF TVthek   - **parliamentlive.tv**: UK parliament videos @@ -322,10 +333,12 @@   - **PBS**   - **Phoenix**   - **Photobucket** + - **Pladform**   - **PlanetaPlay**   - **play.fm**   - **played.to**   - **Playvid** + - **Playwire**   - **plus.google**: Google Plus   - **pluzz.francetv.fr**   - **podomatic** @@ -334,6 +347,7 @@   - **PornHubPlaylist**   - **Pornotube**   - **PornoXO** + - **PrimeShareTV**   - **PromptFile**   - **prosiebensat1**: ProSiebenSat.1 Digital   - **Puls4** @@ -359,6 +373,7 @@   - **RTP**   - **RTS**: RTS.ch   - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil   - **rtve.es:live**: RTVE.es live streams   - **RUHD**   - **rutube**: Rutube videos @@ -367,6 +382,8 @@   - **rutube:movie**: Rutube movies   - **rutube:person**: Rutube person videos   - **RUTV**: RUTV.RU + - **safari**: safaribooksonline.com online video + - **safari:course**: safaribooksonline.com online courses   - **Sandia**: Sandia National Laboratories   - **Sapo**: SAPO Vídeos   - **savefrom.net** @@ -409,6 +426,7 @@   - **SportBox**   - **SportDeutschland**   - **SRMediathek**: Saarländischer Rundfunk + - **SSA**   - **stanfordoc**: Stanford Open ClassRoom   - **Steam**   - **streamcloud.eu** @@ -478,6 +496,7 @@   - **Ubu**   - **udemy**   - **udemy:course** + - **Ultimedia**   - **Unistra**   - **Urort**: NRK P3 Urørt   - **ustream** @@ -485,6 +504,7 @@   - **Vbox7**   - **VeeHD**   - **Veoh** + - **Vessel**   - **Vesti**: Вести.Ru   - **Vevo**   - **VGTV** @@ -505,6 +525,7 @@   - **Vidzi**   - 
**vier**   - **vier:videos** + - **Viewster**   - **viki**   - **vimeo**   - **vimeo:album** @@ -551,6 +572,9 @@   - **XXXYMovies**   - **Yahoo**: Yahoo screen and movies   - **Yam** + - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист + - **yandexmusic:track**: Яндекс.Музыка - Трек   - **YesJapan**   - **Ynet**   - **YouJizz** @@ -569,7 +593,7 @@   - **youtube:show**: YouTube.com (multi-season) shows   - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)   - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)   - **Zapiks**   - **ZDF**   - **ZDFChannel** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 055e42555..652519831 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -14,6 +14,9 @@ from test.helper import FakeYDL, assertRegexpMatches  from youtube_dl import YoutubeDL  from youtube_dl.extractor import YoutubeIE  from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import match_filter_func + +TEST_URL = 'http://localhost/sample.mp4'  class YDL(FakeYDL): @@ -46,8 +49,8 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL()          ydl.params['prefer_free_formats'] = True          formats = [ -            {'ext': 'webm', 'height': 460, 'url': 'x'}, -            {'ext': 'mp4', 'height': 460, 'url': 'y'}, +            {'ext': 'webm', 'height': 460, 'url': TEST_URL}, +            {'ext': 'mp4', 'height': 460, 'url': TEST_URL},          ]          info_dict = _make_result(formats)          yie = YoutubeIE(ydl) @@ -60,8 +63,8 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL()          ydl.params['prefer_free_formats'] = True          formats = [ 
-            {'ext': 'webm', 'height': 720, 'url': 'a'}, -            {'ext': 'mp4', 'height': 1080, 'url': 'b'}, +            {'ext': 'webm', 'height': 720, 'url': TEST_URL}, +            {'ext': 'mp4', 'height': 1080, 'url': TEST_URL},          ]          info_dict['formats'] = formats          yie = YoutubeIE(ydl) @@ -74,9 +77,9 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL()          ydl.params['prefer_free_formats'] = False          formats = [ -            {'ext': 'webm', 'height': 720, 'url': '_'}, -            {'ext': 'mp4', 'height': 720, 'url': '_'}, -            {'ext': 'flv', 'height': 720, 'url': '_'}, +            {'ext': 'webm', 'height': 720, 'url': TEST_URL}, +            {'ext': 'mp4', 'height': 720, 'url': TEST_URL}, +            {'ext': 'flv', 'height': 720, 'url': TEST_URL},          ]          info_dict['formats'] = formats          yie = YoutubeIE(ydl) @@ -88,8 +91,8 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL()          ydl.params['prefer_free_formats'] = False          formats = [ -            {'ext': 'flv', 'height': 720, 'url': '_'}, -            {'ext': 'webm', 'height': 720, 'url': '_'}, +            {'ext': 'flv', 'height': 720, 'url': TEST_URL}, +            {'ext': 'webm', 'height': 720, 'url': TEST_URL},          ]          info_dict['formats'] = formats          yie = YoutubeIE(ydl) @@ -133,10 +136,10 @@ class TestFormatSelection(unittest.TestCase):      def test_format_selection(self):          formats = [ -            {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, -            {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, -            {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, -            {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, +            {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, +            {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, +            
{'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, +            {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL},          ]          info_dict = _make_result(formats) @@ -167,10 +170,10 @@ class TestFormatSelection(unittest.TestCase):      def test_format_selection_audio(self):          formats = [ -            {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, -            {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, -            {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, -            {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, +            {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, +            {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, +            {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': TEST_URL}, +            {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': TEST_URL},          ]          info_dict = _make_result(formats) @@ -185,8 +188,8 @@ class TestFormatSelection(unittest.TestCase):          self.assertEqual(downloaded['format_id'], 'audio-low')          formats = [ -            {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, -            {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, +            {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, +            {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': TEST_URL},          ]          info_dict = _make_result(formats) @@ -228,9 +231,9 @@ class TestFormatSelection(unittest.TestCase):      def test_format_selection_video(self):          formats = [ -            {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, -            {'format_id': 
'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, -            {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, +            {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': TEST_URL}, +            {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': TEST_URL}, +            {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': TEST_URL},          ]          info_dict = _make_result(formats) @@ -337,6 +340,8 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], 'G') + +class TestYoutubeDL(unittest.TestCase):      def test_subtitles(self):          def s_formats(lang, autocaption=False):              return [{ @@ -459,6 +464,73 @@ class TestFormatSelection(unittest.TestCase):          self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)          os.unlink(audiofile) +    def test_match_filter(self): +        class FilterYDL(YDL): +            def __init__(self, *args, **kwargs): +                super(FilterYDL, self).__init__(*args, **kwargs) +                self.params['simulate'] = True + +            def process_info(self, info_dict): +                super(YDL, self).process_info(info_dict) + +            def _match_entry(self, info_dict, incomplete): +                res = super(FilterYDL, self)._match_entry(info_dict, incomplete) +                if res is None: +                    self.downloaded_info_dicts.append(info_dict) +                return res + +        first = { +            'id': '1', +            'url': TEST_URL, +            'title': 'one', +            'extractor': 'TEST', +            'duration': 30, +            'filesize': 10 * 1024, +        } +        second = { +            'id': '2', +            'url': TEST_URL, +            'title': 'two', +            'extractor': 'TEST', +            
'duration': 10, +            'description': 'foo', +            'filesize': 5 * 1024, +        } +        videos = [first, second] + +        def get_videos(filter_=None): +            ydl = FilterYDL({'match_filter': filter_}) +            for v in videos: +                ydl.process_ie_result(v, download=True) +            return [v['id'] for v in ydl.downloaded_info_dicts] + +        res = get_videos() +        self.assertEqual(res, ['1', '2']) + +        def f(v): +            if v['id'] == '1': +                return None +            else: +                return 'Video id is not 1' +        res = get_videos(f) +        self.assertEqual(res, ['1']) + +        f = match_filter_func('duration < 30') +        res = get_videos(f) +        self.assertEqual(res, ['2']) + +        f = match_filter_func('description = foo') +        res = get_videos(f) +        self.assertEqual(res, ['2']) + +        f = match_filter_func('description =? foo') +        res = get_videos(f) +        self.assertEqual(res, ['1', '2']) + +        f = match_filter_func('filesize > 5KiB') +        res = get_videos(f) +        self.assertEqual(res, ['1']) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..4dc7de7b5 --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + +class TestAES(unittest.TestCase): +    def setUp(self): +        self.key = self.iv = [0x20, 0x15] + 14 * [0] +        self.secret_msg = b'Secret message goes here' + +    def 
test_encrypt(self): +        msg = b'message' +        key = list(range(16)) +        encrypted = aes_encrypt(bytes_to_intlist(msg), key) +        decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) +        self.assertEqual(decrypted, msg) + +    def test_cbc_decrypt(self): +        data = bytes_to_intlist( +            b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" +        ) +        decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) +        self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + +    def test_decrypt_text(self): +        password = intlist_to_bytes(self.key).decode('utf-8') +        encrypted = base64.b64encode( +            intlist_to_bytes(self.iv[:8]) + +            b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' +        ) +        decrypted = (aes_decrypt_text(encrypted, password, 16)) +        self.assertEqual(decrypted, self.secret_msg) + +        password = intlist_to_bytes(self.key).decode('utf-8') +        encrypted = base64.b64encode( +            intlist_to_bytes(self.iv[:8]) + +            b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' +        ) +        decrypted = (aes_decrypt_text(encrypted, password, 32)) +        self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index e66264b4b..a9db42b30 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -59,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])      def test_youtube_feeds(self): -        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later']) +        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])          self.assertMatch('https://www.youtube.com/feed/subscriptions', 
['youtube:subscriptions'])          self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])          self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) @@ -104,11 +104,11 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertMatch(':tds', ['ComedyCentralShows'])      def test_vimeo_matching(self): -        self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) -        self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel']) -        self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo']) -        self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) -        self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user']) +        self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) +        self.assertMatch('https://vimeo.com/channels/31259', ['vimeo:channel']) +        self.assertMatch('https://vimeo.com/channels/31259/53576664', ['vimeo']) +        self.assertMatch('https://vimeo.com/user7108434', ['vimeo:user']) +        self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user'])          self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review'])      # https://github.com/rg3/youtube-dl/issues/1930 diff --git a/test/test_execution.py b/test/test_execution.py index 60df187de..f31e51558 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,6 @@  #!/usr/bin/env python +# coding: utf-8 +  from __future__ import unicode_literals  import unittest @@ -27,5 +29,12 @@ class TestExecution(unittest.TestCase):      def test_main_exec(self):          subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) +    def test_cmdline_umlauts(self): +        p = subprocess.Popen( +            [sys.executable, 'youtube_dl/__main__.py', 'ä', '--version'], +            cwd=rootDir, stdout=_DEV_NULL, 
stderr=subprocess.PIPE) +        _, stderr = p.communicate() +        self.assertFalse(stderr) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_http.py b/test/test_http.py index bd4d46fef..f2e305b6f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,7 @@ import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server +from youtube_dl.compat import compat_http_server, compat_urllib_request  import ssl  import threading @@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase):          r = ydl.extract_info('https://localhost:%d/video.html' % self.port)          self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + +def _build_proxy_handler(name): +    class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): +        proxy_name = name + +        def log_message(self, format, *args): +            pass + +        def do_GET(self): +            self.send_response(200) +            self.send_header('Content-Type', 'text/plain; charset=utf-8') +            self.end_headers() +            self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) +    return HTTPTestRequestHandler + + +class TestProxy(unittest.TestCase): +    def setUp(self): +        self.proxy = compat_http_server.HTTPServer( +            ('localhost', 0), _build_proxy_handler('normal')) +        self.port = self.proxy.socket.getsockname()[1] +        self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) +        self.proxy_thread.daemon = True +        self.proxy_thread.start() + +        self.cn_proxy = compat_http_server.HTTPServer( +            ('localhost', 0), _build_proxy_handler('cn')) +        self.cn_port = self.cn_proxy.socket.getsockname()[1] +        self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) +        self.cn_proxy_thread.daemon = 
True +        self.cn_proxy_thread.start() + +    def test_proxy(self): +        cn_proxy = 'localhost:{0}'.format(self.cn_port) +        ydl = YoutubeDL({ +            'proxy': 'localhost:{0}'.format(self.port), +            'cn_verification_proxy': cn_proxy, +        }) +        url = 'http://foo.com/bar' +        response = ydl.urlopen(url).read().decode('utf-8') +        self.assertEqual(response, 'normal: {0}'.format(url)) + +        req = compat_urllib_request.Request(url) +        req.add_header('Ytdl-request-proxy', cn_proxy) +        response = ydl.urlopen(req).read().decode('utf-8') +        self.assertEqual(response, 'cn: {0}'.format(url)) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_netrc.py b/test/test_netrc.py new file mode 100644 index 000000000..7cf3a6a2e --- /dev/null +++ b/test/test_netrc.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import ( +    gen_extractors, +) + + +class TestNetRc(unittest.TestCase): +    def test_netrc_present(self): +        for ie in gen_extractors(): +            if not hasattr(ie, '_login'): +                continue +            self.assertTrue( +                hasattr(ie, '_NETRC_MACHINE'), +                'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME) + + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py new file mode 100644 index 000000000..addb69d6f --- /dev/null +++ b/test/test_postprocessors.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.postprocessor import MetadataFromTitlePP + + +class 
TestMetadataFromTitle(unittest.TestCase): +    def test_format_to_regex(self): +        pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') +        self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)') diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 3f2d8a2ba..891ee620b 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -26,6 +26,7 @@ from youtube_dl.extractor import (      VikiIE,      ThePlatformIE,      RTVEALaCartaIE, +    FunnyOrDieIE,  ) @@ -320,5 +321,17 @@ class TestRtveSubtitles(BaseTestSubtitles):          self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') +class TestFunnyOrDieSubtitles(BaseTestSubtitles): +    url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' +    IE = FunnyOrDieIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') + +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py index 7f816698e..6c1b7ec91 100644 --- a/test/test_unicode_literals.py +++ b/test/test_unicode_literals.py @@ -17,13 +17,22 @@ IGNORED_FILES = [      'buildserver.py',  ] +IGNORED_DIRS = [ +    '.git', +    '.tox', +]  from test.helper import assertRegexpMatches  class TestUnicodeLiterals(unittest.TestCase):      def test_all_files(self): -        for dirpath, _, filenames in os.walk(rootDir): +        for dirpath, dirnames, filenames in os.walk(rootDir): +            for ignore_dir in IGNORED_DIRS: +                if ignore_dir in dirnames: +                    # If we remove the directory from dirnames os.walk won't +                    # recurse into it +                    dirnames.remove(ignore_dir)              for 
basename in filenames:                  if not basename.endswith('.py'):                      continue diff --git a/test/test_utils.py b/test/test_utils.py index 3fba8ae11..abaf1ab73 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -24,6 +24,7 @@ from youtube_dl.utils import (      encodeFilename,      escape_rfc3986,      escape_url, +    ExtractorError,      find_xpath_attr,      fix_xml_ampersands,      InAdvancePagedList, @@ -38,6 +39,8 @@ from youtube_dl.utils import (      parse_iso8601,      read_batch_urls,      sanitize_filename, +    sanitize_path, +    sanitize_url_path_consecutive_slashes,      shell_quote,      smuggle_url,      str_to_int, @@ -52,6 +55,7 @@ from youtube_dl.utils import (      urlencode_postdata,      version_tuple,      xpath_with_ns, +    xpath_text,      render_table,      match_str,  ) @@ -85,8 +89,11 @@ class TestUtil(unittest.TestCase):          self.assertEqual(              sanitize_filename('New World record at 0:12:34'),              'New World record at 0_12_34') +          self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')          self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') +        self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf') +        self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')          forbidden = '"\0\\/'          for fc in forbidden: @@ -128,6 +135,62 @@ class TestUtil(unittest.TestCase):          self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')          self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') +    def test_sanitize_path(self): +        if sys.platform != 'win32': +            return + +        self.assertEqual(sanitize_path('abc'), 'abc') +        self.assertEqual(sanitize_path('abc/def'), 'abc\\def') +        self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') +        self.assertEqual(sanitize_path('abc|def'), 'abc#def') +        
self.assertEqual(sanitize_path('<>:"|?*'), '#######') +        self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def') +        self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def') + +        self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc') +        self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc') + +        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') +        self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc') +        self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f') +        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + +        self.assertEqual( +            sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'), +            'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s') + +        self.assertEqual( +            sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'), +            'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! 
(1987 Yamaha 700)-20141116.mp4.part') +        self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#') +        self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def') +        self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#') + +        self.assertEqual(sanitize_path('../abc'), '..\\abc') +        self.assertEqual(sanitize_path('../../abc'), '..\\..\\abc') +        self.assertEqual(sanitize_path('./abc'), 'abc') +        self.assertEqual(sanitize_path('./../abc'), '..\\abc') + +    def test_sanitize_url_path_consecutive_slashes(self): +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname//'), +            'http://hostname/') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'), +            'http://hostname/foo/bar/filename.html') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/'), +            'http://hostname/') +        self.assertEqual( +            sanitize_url_path_consecutive_slashes('http://hostname/abc//'), +            'http://hostname/abc/') +      def test_ordered_set(self):          self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])          self.assertEqual(orderedSet([]), []) @@ -137,6 +200,8 @@ class TestUtil(unittest.TestCase):      def test_unescape_html(self):          self.assertEqual(unescapeHTML('%20;'), '%20;') +        self.assertEqual(unescapeHTML('/'), '/') +        self.assertEqual(unescapeHTML('/'), '/')          self.assertEqual(              unescapeHTML('é'), 'é') @@ -189,6 
+254,17 @@ class TestUtil(unittest.TestCase):          self.assertEqual(find('media:song/media:author').text, 'The Author')          self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') +    def test_xpath_text(self): +        testxml = '''<root> +            <div> +                <p>Foo</p> +            </div> +        </root>''' +        doc = xml.etree.ElementTree.fromstring(testxml) +        self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') +        self.assertTrue(xpath_text(doc, 'div/bar') is None) +        self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) +      def test_smuggle_url(self):          data = {"ö": "ö", "abc": [3]}          url = 'https://foo.bar/baz?x=y#a' @@ -1,8 +1,11 @@  [tox] -envlist = py26,py27,py33 +envlist = py26,py27,py33,py34  [testenv]  deps =     nose     coverage -commands = nosetests --verbose {posargs:test}  # --with-coverage --cover-package=youtube_dl --cover-html +defaultargs = test --exclude test_download.py --exclude test_age_restriction.py +    --exclude test_subtitles.py --exclude test_write_annotations.py +    --exclude test_youtube_lists.py +commands = nosetests --verbose {posargs:{[testenv]defaultargs}}  # --with-coverage --cover-package=youtube_dl --cover-html                                                 # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d7c6db0ff..4fa2223ad 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -4,8 +4,10 @@  from __future__ import absolute_import, unicode_literals  import collections +import contextlib  import datetime  import errno +import fileinput  import io  import itertools  import json @@ -52,12 +54,14 @@ from .utils import (      MaxDownloadsReached,      PagedList,      parse_filesize, +    PerRequestProxyHandler,      PostProcessingError,      platform_name,      preferredencoding,      render_table,      SameFileError,      
sanitize_filename, +    sanitize_path,      std_headers,      subtitles_filename,      takewhile_inclusive, @@ -181,6 +185,8 @@ class YoutubeDL(object):      prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.                         At the moment, this is only supported by YouTube.      proxy:             URL of the proxy server to use +    cn_verification_proxy:  URL of the proxy to use for IP address verification +                       on Chinese sites. (Experimental)      socket_timeout:    Time to wait for unresponsive hosts, in seconds      bidi_workaround:   Work around buggy terminals without bidirectional text                         support, using fridibi @@ -247,10 +253,10 @@ class YoutubeDL(object):      hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.      The following parameters are not used by YoutubeDL itself, they are used by -    the FileDownloader: +    the downloader (see youtube_dl/downloader/common.py):      nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,      noresizebuffer, retries, continuedl, noprogress, consoletitle, -    xattr_set_filesize. +    xattr_set_filesize, external_downloader_args.      The following options are used by the post processors:      prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available, @@ -317,8 +323,10 @@ class YoutubeDL(object):                  'Set the LC_ALL environment variable to fix this.')              self.params['restrictfilenames'] = True -        if '%(stitle)s' in self.params.get('outtmpl', ''): -            self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') +        if isinstance(params.get('outtmpl'), bytes): +            self.report_warning( +                'Parameter outtmpl is bytes, but should be a unicode string. 
' +                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')          self._setup_opener() @@ -557,7 +565,7 @@ class YoutubeDL(object):                                   if v is not None)              template_dict = collections.defaultdict(lambda: 'NA', template_dict) -            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) +            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))              tmpl = compat_expanduser(outtmpl)              filename = tmpl % template_dict              # Temporary fix for #4787 @@ -624,7 +632,7 @@ class YoutubeDL(object):          Returns a list with a dictionary for each video we find.          If 'download', also downloads the videos.          extra_info is a dict containing the extra values to add to each result -         ''' +        '''          if ie_key:              ies = [self.get_info_extractor(ie_key)] @@ -1080,8 +1088,7 @@ class YoutubeDL(object):          if req_format is None:              req_format = 'best'          formats_to_download = [] -        # The -1 is for supporting YoutubeIE -        if req_format in ('-1', 'all'): +        if req_format == 'all':              formats_to_download = formats          else:              for rfstr in req_format.split(','): @@ -1208,9 +1215,6 @@ class YoutubeDL(object):          if len(info_dict['title']) > 200:              info_dict['title'] = info_dict['title'][:197] + '...' 
-        # Keep for backwards compatibility -        info_dict['stitle'] = info_dict['title'] -          if 'format' not in info_dict:              info_dict['format'] = info_dict['ext'] @@ -1256,7 +1260,7 @@ class YoutubeDL(object):              return          try: -            dn = os.path.dirname(encodeFilename(filename)) +            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))              if dn and not os.path.exists(dn):                  os.makedirs(dn)          except (OSError, IOError) as err: @@ -1452,8 +1456,11 @@ class YoutubeDL(object):          return self._download_retcode      def download_with_info_file(self, info_filename): -        with io.open(info_filename, 'r', encoding='utf-8') as f: -            info = json.load(f) +        with contextlib.closing(fileinput.FileInput( +                [info_filename], mode='r', +                openhook=fileinput.hook_encoded('utf-8'))) as f: +            # FileInput doesn't have a read method, we can't call json.load +            info = json.loads('\n'.join(f))          try:              self.process_ie_result(info, download=True)          except DownloadError: @@ -1694,10 +1701,10 @@ class YoutubeDL(object):              out = out.decode().strip()              if re.match('[0-9a-f]+', out):                  self._write_string('[debug] Git HEAD: ' + out + '\n') -        except: +        except Exception:              try:                  sys.exc_clear() -            except: +            except Exception:                  pass          self._write_string('[debug] Python version %s - %s\n' % (              platform.python_version(), platform_name())) @@ -1757,13 +1764,20 @@ class YoutubeDL(object):              # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)              if 'http' in proxies and 'https' not in proxies:                  proxies['https'] = proxies['http'] -        proxy_handler = compat_urllib_request.ProxyHandler(proxies) +        
proxy_handler = PerRequestProxyHandler(proxies)          debuglevel = 1 if self.params.get('debug_printtraffic') else 0          https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) +        # The ssl context is only available in python 2.7.9 and 3.x +        if hasattr(https_handler, '_context'): +            if len(https_handler._context.get_ca_certs()) == 0: +                self.report_warning( +                    'No ssl certificates were loaded, urls that use https ' +                    'won\'t work')          ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)          opener = compat_urllib_request.build_opener( -            https_handler, proxy_handler, cookie_processor, ydlh) +            proxy_handler, https_handler, cookie_processor, ydlh) +          # Delete the default user-agent header, which would otherwise apply in          # cases where our custom HTTP handler doesn't come into play          # (See https://github.com/rg3/youtube-dl/issues/1309 for details) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 49f382695..852b2fc3d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -9,6 +9,7 @@ import codecs  import io  import os  import random +import shlex  import sys @@ -212,6 +213,11 @@ def _real_main(argv=None):      # PostProcessors      postprocessors = []      # Add the metadata pp first, the other pps will copy it +    if opts.metafromtitle: +        postprocessors.append({ +            'key': 'MetadataFromTitle', +            'titleformat': opts.metafromtitle +        })      if opts.addmetadata:          postprocessors.append({'key': 'FFmpegMetadata'})      if opts.extractaudio: @@ -255,6 +261,9 @@ def _real_main(argv=None):              xattr  # Confuse flake8          except ImportError:              parser.error('setting filesize xattr requested but python-xattr is not available') +    external_downloader_args = None +    if opts.external_downloader_args: +        
external_downloader_args = shlex.split(opts.external_downloader_args)      match_filter = (          None if opts.match_filter is None          else match_filter_func(opts.match_filter)) @@ -359,6 +368,8 @@ def _real_main(argv=None):          'no_color': opts.no_color,          'ffmpeg_location': opts.ffmpeg_location,          'hls_prefer_native': opts.hls_prefer_native, +        'external_downloader_args': external_downloader_args, +        'cn_verification_proxy': opts.cn_verification_proxy,      }      with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b2bf149ef..973bcd320 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -389,7 +389,7 @@ else:                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)              out, err = sp.communicate()              lines, columns = map(int, out.split()) -        except: +        except Exception:              pass          return _terminal_size(columns, lines) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 3ae90021a..a0fc5ead0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -42,6 +42,8 @@ class FileDownloader(object):      max_filesize:       Skip files larger than this size      xattr_set_filesize: Set ytdl.filesize user xattribute with expected size.                          (experimenatal) +    external_downloader_args:  A list of additional command-line arguments for the +                        external downloader.      Subclasses of this one must re-define the real_download method.      
""" @@ -202,7 +204,7 @@ class FileDownloader(object):              return          try:              os.utime(filename, (time.time(), filetime)) -        except: +        except Exception:              pass          return filetime @@ -316,7 +318,7 @@ class FileDownloader(object):          )          continuedl_and_exists = ( -            self.params.get('continuedl', False) and +            self.params.get('continuedl', True) and              os.path.isfile(encodeFilename(filename)) and              not self.params.get('nopart', False)          ) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 51c41c704..1673b2382 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -51,6 +51,13 @@ class ExternalFD(FileDownloader):              return []          return [command_option, source_address] +    def _configuration_args(self, default=[]): +        ex_args = self.params.get('external_downloader_args') +        if ex_args is None: +            return default +        assert isinstance(ex_args, list) +        return ex_args +      def _call_downloader(self, tmpfilename, info_dict):          """ Either overwrite this or implement _make_cmd """          cmd = self._make_cmd(tmpfilename, info_dict) @@ -79,6 +86,7 @@ class CurlFD(ExternalFD):          for key, val in info_dict['http_headers'].items():              cmd += ['--header', '%s: %s' % (key, val)]          cmd += self._source_address('--interface') +        cmd += self._configuration_args()          cmd += ['--', info_dict['url']]          return cmd @@ -89,15 +97,16 @@ class WgetFD(ExternalFD):          for key, val in info_dict['http_headers'].items():              cmd += ['--header', '%s: %s' % (key, val)]          cmd += self._source_address('--bind-address') +        cmd += self._configuration_args()          cmd += ['--', info_dict['url']]          return cmd  class Aria2cFD(ExternalFD):      def _make_cmd(self, tmpfilename, 
info_dict): -        cmd = [ -            self.exe, '-c', -            '--min-split-size', '1M', '--max-connection-per-server', '4'] +        cmd = [self.exe, '-c'] +        cmd += self._configuration_args([ +            '--min-split-size', '1M', '--max-connection-per-server', '4'])          dn = os.path.dirname(tmpfilename)          if dn:              cmd += ['--dir', dn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 3dc796faa..4ab000d67 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -281,7 +281,7 @@ class F4mFD(FileDownloader):              boot_info = self._get_bootstrap_from_url(bootstrap_url)          else:              bootstrap_url = None -            bootstrap = base64.b64decode(node.text) +            bootstrap = base64.b64decode(node.text.encode('ascii'))              boot_info = read_bootstrap_info(bootstrap)          return (boot_info, bootstrap_url) @@ -308,7 +308,7 @@ class F4mFD(FileDownloader):          live = boot_info['live']          metadata_node = media.find(_add_ns('metadata'))          if metadata_node is not None: -            metadata = base64.b64decode(metadata_node.text) +            metadata = base64.b64decode(metadata_node.text.encode('ascii'))          else:              metadata = None diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 2e3dac825..d136bebd1 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -49,7 +49,7 @@ class HttpFD(FileDownloader):          open_mode = 'wb'          if resume_len != 0: -            if self.params.get('continuedl', False): +            if self.params.get('continuedl', True):                  self.report_resuming_byte(resume_len)                  request.add_header('Range', 'bytes=%d-' % resume_len)                  open_mode = 'ab' @@ -92,6 +92,8 @@ class HttpFD(FileDownloader):                              self._hook_progress({                                  'filename': 
filename,                                  'status': 'finished', +                                'downloaded_bytes': resume_len, +                                'total_bytes': resume_len,                              })                              return True                          else: @@ -218,12 +220,6 @@ class HttpFD(FileDownloader):          if tmpfilename != '-':              stream.close() -        self._hook_progress({ -            'downloaded_bytes': byte_counter, -            'total_bytes': data_len, -            'tmpfilename': tmpfilename, -            'status': 'error', -        })          if data_len is not None and byte_counter != data_len:              raise ContentTooShortError(byte_counter, int(data_len))          self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 89e98ae61..ddf5724ae 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -105,7 +105,7 @@ class RtmpFD(FileDownloader):          protocol = info_dict.get('rtmp_protocol', None)          real_time = info_dict.get('rtmp_real_time', False)          no_resume = info_dict.get('no_resume', False) -        continue_dl = info_dict.get('continuedl', False) +        continue_dl = info_dict.get('continuedl', True)          self.report_destination(filename)          tmpfilename = self.temp_name(filename) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 370154773..0b9736f2d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,6 +37,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE  from .bbccouk import BBCCoUkIE  from .beeg import BeegIE  from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE  from .bet import BetIE  from .bild import BildIE  from .bilibili import BiliBiliIE @@ -105,17 +106,21 @@ from .dbtv import DBTVIE  from .dctp import DctpTvIE  from .deezer import DeezerPlaylistIE  from .dfb 
import DFBIE +from .dhm import DHMIE  from .dotsub import DotsubIE +from .douyutv import DouyuTVIE  from .dreisat import DreiSatIE  from .drbonanza import DRBonanzaIE  from .drtuber import DrTuberIE  from .drtv import DRTVIE  from .dvtv import DVTVIE  from .dump import DumpIE +from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .divxstage import DivxStageIE  from .dropbox import DropboxIE +from .eagleplatform import EaglePlatformIE  from .ebaumsworld import EbaumsWorldIE  from .echomsk import EchoMskIE  from .ehow import EHowIE @@ -150,6 +155,7 @@ from .fktv import (  )  from .flickr import FlickrIE  from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE  from .fourtube import FourTubeIE  from .foxgay import FoxgayIE  from .foxnews import FoxNewsIE @@ -174,6 +180,7 @@ from .gameone import (  from .gamespot import GameSpotIE  from .gamestar import GameStarIE  from .gametrailers import GametrailersIE +from .gazeta import GazetaIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE  from .giantbomb import GiantBombIE @@ -228,6 +235,7 @@ from .jove import JoveIE  from .jukebox import JukeboxIE  from .jpopsukitv import JpopsukiIE  from .kaltura import KalturaIE +from .kanalplay import KanalPlayIE  from .kankan import KankanIE  from .karaoketv import KaraoketvIE  from .keezmovies import KeezMoviesIE @@ -244,6 +252,7 @@ from .letv import (      LetvTvIE,      LetvPlaylistIE  ) +from .libsyn import LibsynIE  from .lifenews import LifeNewsIE  from .liveleak import LiveLeakIE  from .livestream import ( @@ -303,6 +312,8 @@ from .nba import NBAIE  from .nbc import (      NBCIE,      NBCNewsIE, +    NBCSportsIE, +    NBCSportsVPlayerIE,  )  from .ndr import NDRIE  from .ndtv import NDTVIE @@ -341,6 +352,7 @@ from .npo import (  )  from .nrk import (      NRKIE, +    NRKPlaylistIE,      NRKTVIE,  )  from .ntvde import NTVDeIE @@ -355,6 +367,7 @@ from .orf import (      ORFTVthekIE,      
ORFOE1IE,      ORFFM4IE, +    ORFIPTVIE,  )  from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE @@ -362,9 +375,11 @@ from .pbs import PBSIE  from .phoenix import PhoenixIE  from .photobucket import PhotobucketIE  from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE  from .played import PlayedIE  from .playfm import PlayFMIE  from .playvid import PlayvidIE +from .playwire import PlaywireIE  from .podomatic import PodomaticIE  from .pornhd import PornHdIE  from .pornhub import ( @@ -373,6 +388,7 @@ from .pornhub import (  )  from .pornotube import PornotubeIE  from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE  from .promptfile import PromptFileIE  from .prosiebensat1 import ProSiebenSat1IE  from .puls4 import Puls4IE @@ -398,7 +414,7 @@ from .rtlnow import RTLnowIE  from .rtl2 import RTL2IE  from .rtp import RTPIE  from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE  from .ruhd import RUHDIE  from .rutube import (      RutubeIE, @@ -409,6 +425,10 @@ from .rutube import (  )  from .rutv import RUTVIE  from .sandia import SandiaIE +from .safari import ( +    SafariIE, +    SafariCourseIE, +)  from .sapo import SapoIE  from .savefrom import SaveFromIE  from .sbs import SBSIE @@ -456,6 +476,7 @@ from .sport5 import Sport5IE  from .sportbox import SportBoxIE  from .sportdeutschland import SportDeutschlandIE  from .srmediathek import SRMediathekIE +from .ssa import SSAIE  from .stanfordoc import StanfordOpenClassroomIE  from .steam import SteamIE  from .streamcloud import StreamcloudIE @@ -514,6 +535,10 @@ from .tvp import TvpIE, TvpSeriesIE  from .tvplay import TVPlayIE  from .tweakers import TweakersIE  from .twentyfourvideo import TwentyFourVideoIE +from .twentytwotracks import ( +    TwentyTwoTracksIE, +    TwentyTwoTracksGenreIE +)  from .twitch import (      TwitchVideoIE,      TwitchChapterIE, @@ -528,12 +553,15 @@ 
from .udemy import (      UdemyIE,      UdemyCourseIE  ) +from .ultimedia import UltimediaIE  from .unistra import UnistraIE  from .urort import UrortIE  from .ustream import UstreamIE, UstreamChannelIE +from .varzesh3 import Varzesh3IE  from .vbox7 import Vbox7IE  from .veehd import VeeHDIE  from .veoh import VeohIE +from .vessel import VesselIE  from .vesti import VestiIE  from .vevo import VevoIE  from .vgtv import VGTVIE @@ -551,6 +579,7 @@ from .videoweed import VideoWeedIE  from .vidme import VidmeIE  from .vidzi import VidziIE  from .vier import VierIE, VierVideosIE +from .viewster import ViewsterIE  from .vimeo import (      VimeoIE,      VimeoAlbumIE, @@ -607,6 +636,11 @@ from .yahoo import (      YahooSearchIE,  )  from .yam import YamIE +from .yandexmusic import ( +    YandexMusicTrackIE, +    YandexMusicAlbumIE, +    YandexMusicPlaylistIE, +)  from .yesjapan import YesJapanIE  from .ynet import YnetIE  from .youjizz import YouJizzIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 34b8b0115..39335b827 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -2,13 +2,12 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import (      ExtractorError, -    xpath_text,      float_or_none, +    xpath_text,  ) @@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):              'title': 'American Dad - Putting Francine Out of Business',              'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'          
}, +    }, { +        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', +        'playlist': [ +            { +                'md5': '3e346a2ab0087d687a05e1e7f3b3e529', +                'info_dict': { +                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', +                    'ext': 'flv', +                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +                }, +            } +        ], +        'info_dict': { +            'id': 'sY3cMUR_TbuE4YmdjzbIcQ', +            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', +            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', +        },      }]      @staticmethod @@ -80,6 +97,7 @@ class AdultSwimIE(InfoExtractor):              for video in collection.get('videos'):                  if video.get('slug') == slug:                      return collection, video +        return None, None      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -90,28 +108,30 @@ class AdultSwimIE(InfoExtractor):          webpage = self._download_webpage(url, episode_path)          # Extract the value of `bootstrappedData` from the Javascript in the page. 
-        bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) - -        try: -            bootstrappedData = json.loads(bootstrappedDataJS) -        except ValueError as ve: -            errmsg = '%s: Failed to parse JSON ' % episode_path -            raise ExtractorError(errmsg, cause=ve) +        bootstrapped_data = self._parse_json(self._search_regex( +            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)          # Downloading videos from a /videos/playlist/ URL needs to be handled differently.          # NOTE: We are only downloading one video (the current one) not the playlist          if is_playlist: -            collections = bootstrappedData['playlists']['collections'] +            collections = bootstrapped_data['playlists']['collections']              collection = self.find_collection_by_linkURL(collections, show_path)              video_info = self.find_video_info(collection, episode_path)              show_title = video_info['showTitle']              segment_ids = [video_info['videoPlaybackID']]          else: -            collections = bootstrappedData['show']['collections'] +            collections = bootstrapped_data['show']['collections']              collection, video_info = self.find_collection_containing_video(collections, episode_path) -            show = bootstrappedData['show'] +            # Video wasn't found in the collections, let's try `slugged_video`. 
+            if video_info is None: +                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: +                    video_info = bootstrapped_data['slugged_video'] +                else: +                    raise ExtractorError('Unable to find video info') + +            show = bootstrapped_data['show']              show_title = show['title']              segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py index 2b257ede7..e15c015fb 100644 --- a/youtube_dl/extractor/aftenposten.py +++ b/youtube_dl/extractor/aftenposten.py @@ -14,10 +14,10 @@ from ..utils import (  class AftenpostenIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/([^/]+/)*(?P<id>[^/]+)-\d+\.html' +    _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'      _TEST = { -        'url': 'http://www.aftenposten.no/webtv/serier-og-programmer/sweatshopenglish/TRAILER-SWEATSHOP---I-cant-take-any-more-7800835.html?paging=§ion=webtv_serierogprogrammer_sweatshop_sweatshopenglish', +        'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',          'md5': 'fd828cd29774a729bf4d4425fe192972',          'info_dict': {              'id': '21039', @@ -30,12 +30,7 @@ class AftenpostenIE(InfoExtractor):      }      def _real_extract(self, url): -        display_id = self._match_id(url) - -        webpage = self._download_webpage(url, display_id) - -        video_id = self._html_search_regex( -            r'data-xs-id="(\d+)"', webpage, 'video id') +        video_id = self._match_id(url)          data = self._download_xml(              'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 783b53e23..6a35ea463 100644 --- a/youtube_dl/extractor/ard.py +++ 
b/youtube_dl/extractor/ard.py @@ -50,6 +50,9 @@ class ARDMediathekIE(InfoExtractor):          if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:              raise ExtractorError('Video %s is no longer available' % video_id, expected=True) +        if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage: +            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) +          if re.search(r'[\?&]rss($|[=&])', url):              doc = parse_xml(webpage)              if doc.tag == 'rss': diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 929dd3cc5..8273bd6c9 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -146,6 +146,7 @@ class ArteTVPlus7IE(InfoExtractor):              formats.append(format) +        self._check_formats(formats, video_id)          self._sort_formats(formats)          info_dict['formats'] = formats diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 7669e0e3d..29f8795d3 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -19,6 +19,7 @@ from ..utils import (  class AtresPlayerIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' +    _NETRC_MACHINE = 'atresplayer'      _TESTS = [          {              'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py new file mode 100644 index 000000000..3c7775d3e --- /dev/null +++ b/youtube_dl/extractor/beatportpro.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import 
re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class BeatportProIE(InfoExtractor): +    _VALID_URL = r'https?://pro\.beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', +        'md5': 'b3c34d8639a2f6a7f734382358478887', +        'info_dict': { +            'id': '5379371', +            'display_id': 'synesthesia-original-mix', +            'ext': 'mp4', +            'title': 'Froxic - Synesthesia (Original Mix)', +        }, +    }, { +        'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896', +        'md5': 'e44c3025dfa38c6577fbaeb43da43514', +        'info_dict': { +            'id': '3756896', +            'display_id': 'love-and-war-original-mix', +            'ext': 'mp3', +            'title': 'Wolfgang Gartner - Love & War (Original Mix)', +        }, +    }, { +        'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', +        'md5': 'a1fd8e8046de3950fd039304c186c05f', +        'info_dict': { +            'id': '4991738', +            'display_id': 'birds-original-mix', +            'ext': 'mp4', +            'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", +        } +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        track_id = mobj.group('id') +        display_id = mobj.group('display_id') + +        webpage = self._download_webpage(url, display_id) + +        playables = self._parse_json( +            self._search_regex( +                r'window\.Playables\s*=\s*({.+?});', webpage, +                'playables info', flags=re.DOTALL), +            track_id) + +        track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + +        title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] +        if track['mix']: +            title 
+= ' (' + track['mix'] + ')' + +        formats = [] +        for ext, info in track['preview'].items(): +            if not info['url']: +                continue +            fmt = { +                'url': info['url'], +                'ext': ext, +                'format_id': ext, +                'vcodec': 'none', +            } +            if ext == 'mp3': +                fmt['preference'] = 0 +                fmt['acodec'] = 'mp3' +                fmt['abr'] = 96 +                fmt['asr'] = 44100 +            elif ext == 'mp4': +                fmt['preference'] = 1 +                fmt['acodec'] = 'aac' +                fmt['abr'] = 96 +                fmt['asr'] = 44100 +            formats.append(fmt) +        self._sort_formats(formats) + +        images = [] +        for name, info in track['images'].items(): +            image_url = info.get('url') +            if name == 'dynamic' or not image_url: +                continue +            image = { +                'id': name, +                'url': image_url, +                'height': int_or_none(info.get('height')), +                'width': int_or_none(info.get('width')), +            } +            images.append(image) + +        return { +            'id': compat_str(track.get('id')) or track_id, +            'display_id': track.get('slug') or display_id, +            'title': title, +            'formats': formats, +            'thumbnails': images, +        } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 4bcc897c9..809287d14 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -41,7 +41,7 @@ class BreakIE(InfoExtractor):              'tbr': media['bitRate'],              'width': media['width'],              'height': media['height'], -        } for media in info['media']] +        } for media in info['media'] if media.get('mediaPurpose') == 'play']          if not formats:              formats.append({ diff --git 
a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index abf8cc280..0fa720ee8 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -105,6 +105,7 @@ class CloudyIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          file_key = self._search_regex( -            r'filekey\s*=\s*"([^"]+)"', webpage, 'file_key') +            [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], +            webpage, 'file_key')          return self._extract_video(video_host, video_id, file_key) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 90ea07438..0a77e951c 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import (  class CNNIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ -        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' +        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln|ktvk)(?:-ap)?|(?=&)))'''      _TESTS = [{          'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -45,6 +45,9 @@ class CNNIE(InfoExtractor):              'description': 'md5:e7223a503315c9f150acac52e76de086',              'upload_date': '20141222',          } +    }, { +        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7977fa8d0..e5245ec3f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -767,6 +767,10 @@ class InfoExtractor(object):                  formats)      def _is_valid_url(self, url, video_id, item='video'): +        url = self._proto_relative_url(url, scheme='http:') +        # For now assume non HTTP(S) URLs always valid +        if not (url.startswith('http://') 
or url.startswith('https://')): +            return True          try:              self._request_webpage(url, video_id, 'Checking %s URL' % item)              return True @@ -835,7 +839,7 @@ class InfoExtractor(object):                                m3u8_id=None):          formats = [{ -            'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])), +            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),              'url': m3u8_url,              'ext': ext,              'protocol': 'm3u8', @@ -879,8 +883,13 @@ class InfoExtractor(object):                      formats.append({'url': format_url(line)})                      continue                  tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) +                format_id = [] +                if m3u8_id: +                    format_id.append(m3u8_id) +                last_media_name = last_media.get('NAME') if last_media else None +                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))                  f = { -                    'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])), +                    'format_id': '-'.join(format_id),                      'url': format_url(line.strip()),                      'tbr': tbr,                      'ext': ext, @@ -1053,6 +1062,9 @@ class InfoExtractor(object):      def _get_automatic_captions(self, *args, **kwargs):          raise NotImplementedError("This method must be implemented by subclasses") +    def _subtitles_timecode(self, seconds): +        return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f1da7d09b..6ded723c9 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -23,12 +23,12 @@ from ..utils 
import (  )  from ..aes import (      aes_cbc_decrypt, -    inc,  )  class CrunchyrollIE(InfoExtractor):      _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' +    _NETRC_MACHINE = 'crunchyroll'      _TESTS = [{          'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',          'info_dict': { @@ -101,13 +101,6 @@ class CrunchyrollIE(InfoExtractor):          key = obfuscate_key(id) -        class Counter: -            __value = iv - -            def next_value(self): -                temp = self.__value -                self.__value = inc(self.__value) -                return temp          decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))          return zlib.decompress(decrypted_data) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 42b20a46d..47d58330b 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):      def _build_request(url):          """Build a request with the family filter disabled"""          request = compat_urllib_request.Request(url) -        request.add_header('Cookie', 'family_filter=off') -        request.add_header('Cookie', 'ff=off') +        request.add_header('Cookie', 'family_filter=off; ff=off')          return request @@ -46,13 +45,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):      _TESTS = [          { -            'url': 'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', -            'md5': '392c4b85a60a90dc4792da41ce3144eb', +            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', +            'md5': '2137c41a8e78554bb09225b8eb322406',              'info_dict': { -                'id': 'x33vw9', 
+                'id': 'x2iuewm',                  'ext': 'mp4', -                'uploader': 'Amphora Alex and Van .', -                'title': 'Tutoriel de Youtubeur"DL DES VIDEO DE YOUTUBE"', +                'uploader': 'IGN', +                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',              }          },          # Vevo video @@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):              video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)          embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id -        embed_page = self._download_webpage(embed_url, video_id, -                                            'Downloading embed page') +        embed_request = self._build_request(embed_url) +        embed_page = self._download_webpage( +            embed_request, video_id, 'Downloading embed page')          info = self._search_regex(r'var info = ({.*?}),$', embed_page,                                    'video info', flags=re.MULTILINE)          info = json.loads(info) diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py new file mode 100644 index 000000000..3ed1f1663 --- /dev/null +++ b/youtube_dl/extractor/dhm.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    xpath_text, +    parse_duration, +) + + +class DHMIE(InfoExtractor): +    IE_DESC = 'Filmarchiv - Deutsches Historisches Museum' +    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' + +    _TESTS = [{ +        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/', +        'md5': '11c475f670209bf6acca0b2b7ef51827', +        'info_dict': { +            'id': 'the-marshallplan-at-work-in-west-germany', +            'ext': 'flv', +            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE', +            'description': 
'md5:1fabd480c153f97b07add61c44407c82', +            'duration': 660, +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +    }, { +        'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/', +        'md5': '09890226332476a3e3f6f2cb74734aa5', +        'info_dict': { +            'id': 'rolle-1', +            'ext': 'flv', +            'title': 'ROLLE 1', +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        playlist_url = self._search_regex( +            r"file\s*:\s*'([^']+)'", webpage, 'playlist url') + +        playlist = self._download_xml(playlist_url, video_id) + +        track = playlist.find( +            './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track') + +        video_url = xpath_text( +            track, './{http://xspf.org/ns/0/}location', +            'video url', fatal=True) +        thumbnail = xpath_text( +            track, './{http://xspf.org/ns/0/}image', +            'thumbnail') + +        title = self._search_regex( +            [r'dc:title="([^"]+)"', r'<title> »([^<]+)</title>'], +            webpage, 'title').strip() +        description = self._html_search_regex( +            r'<p><strong>Description:</strong>(.+?)</p>', +            webpage, 'description', default=None) +        duration = parse_duration(self._search_regex( +            r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)', +            webpage, 'duration', default=None)) + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py new file mode 100644 index 000000000..479430c51 --- /dev/null +++ 
b/youtube_dl/extractor/douyutv.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import time +from .common import InfoExtractor +from ..utils import (ExtractorError, unescapeHTML) +from ..compat import (compat_str, compat_basestring) + + +class DouyuTVIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' +    _TESTS = [{ +        'url': 'http://www.douyutv.com/iseven', +        'info_dict': { +            'id': '17732', +            'display_id': 'iseven', +            'ext': 'flv', +            'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:c93d6692dde6fe33809a46edcbecca44', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': '7师傅', +            'uploader_id': '431925', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        } +    }, { +        'url': 'http://www.douyutv.com/85982', +        'info_dict': { +            'id': '85982', +            'display_id': '85982', +            'ext': 'flv', +            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:746a2f7a253966a06755a912f0acc0d2', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': 'douyu小漠', +            'uploader_id': '3769985', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        } +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        if video_id.isdigit(): +            room_id = video_id +        else: +            page = self._download_webpage(url, video_id) +            room_id = self._html_search_regex( +                r'"room_id"\s*:\s*(\d+),', page, 'room id') + +        prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( +            room_id, int(time.time())) + +        auth = 
hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() +        config = self._download_json( +            'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), +            video_id) + +        data = config['data'] + +        error_code = config.get('error', 0) +        if error_code is not 0: +            error_desc = 'Server reported error %i' % error_code +            if isinstance(data, (compat_str, compat_basestring)): +                error_desc += ': ' + data +            raise ExtractorError(error_desc, expected=True) + +        show_status = data.get('show_status') +        # 1 = live, 2 = offline +        if show_status == '2': +            raise ExtractorError( +                'Live stream is offline', expected=True) + +        base_url = data['rtmp_url'] +        live_path = data['rtmp_live'] + +        title = self._live_title(unescapeHTML(data['room_name'])) +        description = data.get('show_details') +        thumbnail = data.get('room_src') + +        uploader = data.get('nickname') +        uploader_id = data.get('owner_uid') + +        multi_formats = data.get('rtmp_multi_bitrate') +        if not isinstance(multi_formats, dict): +            multi_formats = {} +        multi_formats['live'] = live_path + +        formats = [{ +            'url': '%s/%s' % (base_url, format_path), +            'format_id': format_id, +            'preference': 1 if format_id == 'live' else 0, +        } for format_id, format_path in multi_formats.items()] +        self._sort_formats(formats) + +        return { +            'id': room_id, +            'display_id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'formats': formats, +            'is_live': True, +        } diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py new file mode 100644 index 
000000000..e43bc81b2 --- /dev/null +++ b/youtube_dl/extractor/dumpert.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..utils import qualities + + +class DumpertIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)' +    _TEST = { +        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', +        'md5': '1b9318d7d5054e7dcb9dc7654f21d643', +        'info_dict': { +            'id': '6646981/951bc60f', +            'ext': 'mp4', +            'title': 'Ik heb nieuws voor je', +            'description': 'Niet schrikken hoor', +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        files_base64 = self._search_regex( +            r'data-files="([^"]+)"', webpage, 'data files') + +        files = self._parse_json( +            base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), +            video_id) + +        quality = qualities(['flv', 'mobile', 'tablet', '720p']) + +        formats = [{ +            'url': video_url, +            'format_id': format_id, +            'quality': quality(format_id), +        } for format_id, video_url in files.items() if format_id != 'still'] +        self._sort_formats(formats) + +        title = self._html_search_meta( +            'title', webpage) or self._og_search_title(webpage) +        description = self._html_search_meta( +            'description', webpage) or self._og_search_description(webpage) +        thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats +        } diff --git 
a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py new file mode 100644 index 000000000..7173371ee --- /dev/null +++ b/youtube_dl/extractor/eagleplatform.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +) + + +class EaglePlatformIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    (?: +                        eagleplatform:(?P<custom_host>[^/]+):| +                        https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id= +                    ) +                    (?P<id>\d+) +                ''' +    _TESTS = [{ +        # http://lenta.ru/news/2015/03/06/navalny/ +        'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', +        'md5': '0b7994faa2bd5c0f69a3db6db28d078d', +        'info_dict': { +            'id': '227304', +            'ext': 'mp4', +            'title': 'Навальный вышел на свободу', +            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 87, +            'view_count': int, +            'age_limit': 0, +        }, +    }, { +        # http://muz-tv.ru/play/7129/ +        # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true +        'url': 'eagleplatform:media.clipyou.ru:12820', +        'md5': '6c2ebeab03b739597ce8d86339d5a905', +        'info_dict': { +            'id': '12820', +            'ext': 'mp4', +            'title': "'O Sole Mio", +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 216, +            'view_count': int, +        }, +    }] + +    def _handle_error(self, response): +        status = int_or_none(response.get('status', 200)) +        if status != 200: +            raise ExtractorError(' 
'.join(response['errors']), expected=True) + +    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): +        response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) +        self._handle_error(response) +        return response + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + +        player_data = self._download_json( +            'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + +        media = player_data['data']['playlist']['viewports'][0]['medialist'][0] + +        title = media['title'] +        description = media.get('description') +        thumbnail = media.get('snapshot') +        duration = int_or_none(media.get('duration')) +        view_count = int_or_none(media.get('views')) + +        age_restriction = media.get('age_restriction') +        age_limit = None +        if age_restriction: +            age_limit = 0 if age_restriction == 'allow_all' else 18 + +        m3u8_data = self._download_json( +            media['sources']['secure_m3u8']['auto'], +            video_id, 'Downloading m3u8 JSON') + +        formats = self._extract_m3u8_formats( +            m3u8_data['data'][0], video_id, +            'mp4', entry_protocol='m3u8_native') +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'view_count': view_count, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index fb5dbbe2b..0b61ea0ba 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals  import 
json  import random -import re  from .common import InfoExtractor  from ..compat import ( @@ -103,20 +102,23 @@ class EightTracksIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        playlist_id = mobj.group('id') +        playlist_id = self._match_id(url)          webpage = self._download_webpage(url, playlist_id) -        json_like = self._search_regex( -            r"(?s)PAGE.mix = (.*?);\n", webpage, 'trax information') -        data = json.loads(json_like) +        data = self._parse_json( +            self._search_regex( +                r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'), +            playlist_id)          session = str(random.randint(0, 1000000000))          mix_id = data['id']          track_count = data['tracks_count']          duration = data['duration']          avg_song_duration = float(duration) / track_count +        # duration is sometimes negative, use predefined avg duration +        if avg_song_duration <= 0: +            avg_song_duration = 300          first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)          next_url = first_url          entries = [] diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 79e2fbd39..0cbca90b0 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -1,11 +1,17 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ExtractorError  class EroProfileIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)' -    _TEST = { +    _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?' 
+    _NETRC_MACHINE = 'eroprofile' +    _TESTS = [{          'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',          'md5': 'c26f351332edf23e1ea28ce9ec9de32f',          'info_dict': { @@ -16,13 +22,55 @@ class EroProfileIE(InfoExtractor):              'thumbnail': 're:https?://.*\.jpg',              'age_limit': 18,          } -    } +    }, { +        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', +        'md5': '1baa9602ede46ce904c431f5418d8916', +        'info_dict': { +            'id': '1133519', +            'ext': 'm4v', +            'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', +            'thumbnail': 're:https?://.*\.jpg', +            'age_limit': 18, +        }, +        'skip': 'Requires login', +    }] + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        query = compat_urllib_parse.urlencode({ +            'username': username, +            'password': password, +            'url': 'http://www.eroprofile.com/', +        }) +        login_url = self._LOGIN_URL + query +        login_page = self._download_webpage(login_url, None, False) + +        m = re.search(r'Your username or password was incorrect\.', login_page) +        if m: +            raise ExtractorError( +                'Wrong username and/or password.', expected=True) + +        self.report_login() +        redirect_url = self._search_regex( +            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') +        self._download_webpage(redirect_url, None, False) + +    def _real_initialize(self): +        self._login()      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) +        m = re.search(r'You must be logged in to view this video\.', webpage) +        if m: +         
   raise ExtractorError( +                'This video requires login. Please specify a username and password and try again.', expected=True) +          video_id = self._search_regex(              [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],              webpage, 'video id', default=None) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 36ba33128..c826a5404 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -4,11 +4,11 @@ import re  from .common import InfoExtractor  from ..compat import ( -    compat_urllib_parse_urlparse, +    compat_parse_qs,      compat_urllib_request, -    compat_urllib_parse,  )  from ..utils import ( +    qualities,      str_to_int,  ) @@ -17,7 +17,7 @@ class ExtremeTubeIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)'      _TESTS = [{          'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', -        'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', +        'md5': '344d0c6d50e2f16b06e49ca011d8ac69',          'info_dict': {              'id': '652431',              'ext': 'mp4', @@ -49,19 +49,27 @@ class ExtremeTubeIE(InfoExtractor):              r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>',              webpage, 'view count', fatal=False)) -        video_url = compat_urllib_parse.unquote(self._html_search_regex( -            r'video_url=(.+?)&', webpage, 'video_url')) -        path = compat_urllib_parse_urlparse(video_url).path -        format = path.split('/')[5].split('_')[:2] -        format = "-".join(format) +        flash_vars = compat_parse_qs(self._search_regex( +            r'<param[^>]+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + +        formats = [] +        quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) +        for k, vals in flash_vars.items(): +        
    m = re.match(r'quality_(?P<quality>[0-9]+p)$', k) +            if m is not None: +                formats.append({ +                    'format_id': m.group('quality'), +                    'quality': quality(m.group('quality')), +                    'url': vals[0], +                }) + +        self._sort_formats(formats)          return {              'id': video_id,              'title': video_title, +            'formats': formats,              'uploader': uploader,              'view_count': view_count, -            'url': video_url, -            'format': format, -            'format_id': format,              'age_limit': 18,          } diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py new file mode 100644 index 000000000..2b4691ae8 --- /dev/null +++ b/youtube_dl/extractor/footyroom.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FootyRoomIE(InfoExtractor): +    _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)' +    _TEST = { +        'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', +        'info_dict': { +            'id': 'schalke-04-0-2-real-madrid-2015-02', +            'title': 'Schalke 04 0 – 2 Real Madrid', +        }, +        'playlist_count': 3, +    } + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        playlist = self._parse_json( +            self._search_regex( +                r'VideoSelector\.load\((\[.+?\])\);', webpage, 'video selector'), +            playlist_id) + +        playlist_title = self._og_search_title(webpage) + +        entries = [] +        for video in playlist: +            payload = video.get('payload') +            if not payload: +                continue +            playwire_url = self._search_regex( +                r'data-config="([^"]+)"', payload, +                'playwire url', 
default=None) +            if playwire_url: +                entries.append(self.url_result(playwire_url, 'Playwire')) + +        return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index a49fc1151..dd87257c4 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -50,7 +50,6 @@ class FunnyOrDieIE(InfoExtractor):          bitrates.sort()          formats = [] -          for bitrate in bitrates:              for link in links:                  formats.append({ @@ -59,6 +58,13 @@ class FunnyOrDieIE(InfoExtractor):                      'vbr': bitrate,                  }) +        subtitles = {} +        for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage): +            subtitles[src_lang] = [{ +                'ext': src.split('/')[-1], +                'url': 'http://www.funnyordie.com%s' % src, +            }] +          post_json = self._search_regex(              r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')          post = json.loads(post_json) @@ -69,4 +75,5 @@ class FunnyOrDieIE(InfoExtractor):              'description': post.get('description'),              'thumbnail': post.get('picture'),              'formats': formats, +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py new file mode 100644 index 000000000..ea32b621c --- /dev/null +++ b/youtube_dl/extractor/gazeta.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class GazetaIE(InfoExtractor): +    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' +    _TESTS = [{ +        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', +        'md5': 
'd49c9bdc6e5a7888f27475dc215ee789', +        'info_dict': { +            'id': '205566', +            'ext': 'mp4', +            'title': '«70–80 процентов гражданских в Донецке на грани голода»', +            'description': 'md5:38617526050bd17b234728e7f9620a71', +            'thumbnail': 're:^https?://.*\.jpg', +        }, +    }, { +        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        display_id = mobj.group('id') +        embed_url = '%s?p=embed' % mobj.group('url') +        embed_page = self._download_webpage( +            embed_url, display_id, 'Downloading embed page') + +        video_id = self._search_regex( +            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') + +        return self.url_result( +            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f7b467b0a..51796f3a4 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -12,6 +12,7 @@ from ..utils import remove_end  class GDCVaultIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)' +    _NETRC_MACHINE = 'gdcvault'      _TESTS = [          {              'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 27e2bc300..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -26,8 +26,10 @@ from ..utils import (      unsmuggle_url,      UnsupportedError,      url_basename, +    xpath_text,  )  from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE  from .ooyala import OoyalaIE  from .rutv 
import RUTVIE  from .smotri import SmotriIE @@ -526,6 +528,17 @@ class GenericIE(InfoExtractor):              },              'add_ie': ['Viddler'],          }, +        # Libsyn embed +        { +            'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', +            'info_dict': { +                'id': '3377616', +                'ext': 'mp3', +                'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", +                'description': 'md5:601cb790edd05908957dae8aaa866465', +                'upload_date': '20150220', +            }, +        },          # jwplayer YouTube          {              'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', @@ -569,6 +582,75 @@ class GenericIE(InfoExtractor):                  'title': 'John Carlson Postgame 2/25/15',              },          }, +        # Eagle.Platform embed (generic URL) +        { +            'url': 'http://lenta.ru/news/2015/03/06/navalny/', +            'info_dict': { +                'id': '227304', +                'ext': 'mp4', +                'title': 'Навальный вышел на свободу', +                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 87, +                'view_count': int, +                'age_limit': 0, +            }, +        }, +        # ClipYou (Eagle.Platform) embed (custom URL) +        { +            'url': 'http://muz-tv.ru/play/7129/', +            'info_dict': { +                'id': '12820', +                'ext': 'mp4', +                'title': "'O Sole Mio", +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 216, +                'view_count': int, +            }, +        }, +        # Pladform embed +        { +            'url': 'http://muz-tv.ru/kinozal/view/7400/', +            'info_dict': { +           
     'id': '100183293', +                'ext': 'mp4', +                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +                'thumbnail': 're:^https?://.*\.jpg$', +                'duration': 694, +                'age_limit': 0, +            }, +        }, +        # 5min embed +        { +            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', +            'md5': '4c6f127a30736b59b3e2c19234ee2bf7', +            'info_dict': { +                'id': '518726732', +                'ext': 'mp4', +                'title': 'Facebook Creates "On This Day" | Crunch Report', +            }, +        }, +        # RSS feed with enclosure +        { +            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', +            'info_dict': { +                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +                'ext': 'm4v', +                'upload_date': '20150228', +                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', +            } +        }, +        # NBC Sports vplayer embed +        { +            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', +            'info_dict': { +                'id': 'ln7x1qSThw4k', +                'ext': 'flv', +                'title': "PFT Live: New leader in the 'new-look' defense", +                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', +            }, +        }      ]      def report_following_redirect(self, new_url): @@ -580,11 +662,24 @@ class GenericIE(InfoExtractor):          playlist_desc_el = doc.find('./channel/description')          playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text -        entries = [{ -            '_type': 'url', -            'url': e.find('link').text, - 
           'title': e.find('title').text, -        } for e in doc.findall('./channel/item')] +        entries = [] +        for it in doc.findall('./channel/item'): +            next_url = xpath_text(it, 'link', fatal=False) +            if not next_url: +                enclosure_nodes = it.findall('./enclosure') +                for e in enclosure_nodes: +                    next_url = e.attrib.get('url') +                    if next_url: +                        break + +            if not next_url: +                continue + +            entries.append({ +                '_type': 'url', +                'url': next_url, +                'title': it.find('title').text, +            })          return {              '_type': 'playlist', @@ -943,6 +1038,19 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result(mobj.group('url')) +        # Look for NYTimes player +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', +            webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url')) + +        # Look for Libsyn player +        mobj = re.search( +            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url')) +          # Look for Ooyala videos          mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or @@ -1131,6 +1239,35 @@ class GenericIE(InfoExtractor):          if mobj is not None:              return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') +        # Look for Eagle.Platform embeds +        mobj = re.search( +            
r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'EaglePlatform') + +        # Look for ClipYou (uses Eagle.Platform) embeds +        mobj = re.search( +            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) +        if mobj is not None: +            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + +        # Look for Pladform embeds +        mobj = re.search( +            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) +        if mobj is not None: +            return self.url_result(mobj.group('url'), 'Pladform') + +        # Look for 5min embeds +        mobj = re.search( +            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) +        if mobj is not None: +            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + +        # Look for NBC Sports VPlayer embeds +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') +          def check_video(vurl):              if YoutubeIE.suitable(vurl):                  return True @@ -1187,10 +1324,16 @@ class GenericIE(InfoExtractor):              # HTML5 video              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)          if not found: +            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'              found = re.search(                  r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' -                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)', +                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,                  webpage) +            if not found: 
+                # Look also in Refresh HTTP header +                refresh_header = head_response.headers.get('Refresh') +                if refresh_header: +                    found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = found.group(1)                  self.report_following_redirect(new_url) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 29638a194..8a95793ca 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -20,7 +20,7 @@ class GloboIE(InfoExtractor):      _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)'      _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' -    _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s' +    _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s'      _VIDEOID_REGEXES = [          r'\bdata-video-id="(\d+)"', diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py index 848d17beb..36ad4915c 100644 --- a/youtube_dl/extractor/grooveshark.py +++ b/youtube_dl/extractor/grooveshark.py @@ -140,9 +140,9 @@ class GroovesharkIE(InfoExtractor):          if webpage is not None:              o = GroovesharkHtmlParser.extract_object_tags(webpage) -            return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']) +            return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'] -        return (webpage, None) +        return webpage, None      def _real_initialize(self):          self.ts = int(time.time() * 1000)  # timestamp in millis @@ -154,7 +154,7 @@ class GroovesharkIE(InfoExtractor):          swf_referer = None          if self.do_playerpage_request:              (_, player_objs) = self._get_playerpage(url) -            if player_objs is not None: +            if player_objs:                  swf_referer = 
self._build_swf_referer(url, player_objs[0])                  self.to_screen('SWF Referer: %s' % swf_referer) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 8094cc2e4..d0720ff56 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals -import json  import re  from .common import InfoExtractor @@ -15,10 +14,10 @@ class JeuxVideoIE(InfoExtractor):          'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',          'md5': '046e491afb32a8aaac1f44dd4ddd54ee',          'info_dict': { -            'id': '5182', +            'id': '114765',              'ext': 'mp4', -            'title': 'GC 2013 : Tearaway nous présente ses papiers d\'identité', -            'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', +            'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité', +            'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',          },      } @@ -26,26 +25,29 @@ class JeuxVideoIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          title = mobj.group(1)          webpage = self._download_webpage(url, title) -        xml_link = self._html_search_regex( -            r'<param name="flashvars" value="config=(.*?)" />', +        title = self._html_search_meta('name', webpage) +        config_url = self._html_search_regex( +            r'data-src="(/contenu/medias/video.php.*?)"',              webpage, 'config URL') +        config_url = 'http://www.jeuxvideo.com' + config_url          video_id = self._search_regex( -            
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', -            xml_link, 'video ID') +            r'id=(\d+)', +            config_url, 'video ID') -        config = self._download_xml( -            xml_link, title, 'Downloading XML config') -        info_json = config.find('format.json').text -        info = json.loads(info_json)['versions'][0] +        config = self._download_json( +            config_url, title, 'Downloading JSON config') -        video_url = 'http://video720.jeuxvideo.com/' + info['file'] +        formats = [{ +            'url': source['file'], +            'format_id': source['label'], +            'resolution': source['label'], +        } for source in reversed(config['sources'])]          return {              'id': video_id, -            'title': config.find('titre_video').text, -            'ext': 'mp4', -            'url': video_url, +            'title': title, +            'formats': formats,              'description': self._og_search_description(webpage), -            'thumbnail': config.find('image').text, +            'thumbnail': config.get('image'),          } diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py new file mode 100644 index 000000000..2bb078036 --- /dev/null +++ b/youtube_dl/extractor/kanalplay.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    float_or_none, +) + + +class KanalPlayIE(InfoExtractor): +    IE_DESC = 'Kanal 5/9/11 Play' +    _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', +        'info_dict': { +            'id': '3270012277', +            'ext': 'flv', +            'title': 'Saknar både dusch och avlopp', +            'description': 
'md5:6023a95832a06059832ae93bc3c7efb7', +            'duration': 2636.36, +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        } +    }, { +        'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', +        'only_matching': True, +    }, { +        'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', +        'only_matching': True, +    }] + +    def _fix_subtitles(self, subs): +        return '\r\n\r\n'.join( +            '%s\r\n%s --> %s\r\n%s' +            % ( +                num, +                self._subtitles_timecode(item['startMillis'] / 1000.0), +                self._subtitles_timecode(item['endMillis'] / 1000.0), +                item['text'], +            ) for num, item in enumerate(subs, 1)) + +    def _get_subtitles(self, channel_id, video_id): +        subs = self._download_json( +            'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), +            video_id, 'Downloading subtitles JSON', fatal=False) +        return {'se': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        channel_id = mobj.group('channel_id') + +        video = self._download_json( +            'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), +            video_id) + +        reasons_for_no_streams = video.get('reasonsForNoStreams') +        if reasons_for_no_streams: +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), +                expected=True) + +        title = video['title'] +        description = video.get('description') +        duration = float_or_none(video.get('length'), 1000) +        thumbnail = video.get('posterUrl') + +        stream_base_url = video['streamBaseUrl'] + +      
  formats = [{ +            'url': stream_base_url, +            'play_path': stream['source'], +            'ext': 'flv', +            'tbr': float_or_none(stream.get('bitrate'), 1000), +            'rtmp_real_time': True, +        } for stream in video['streams']] +        self._sort_formats(formats) + +        subtitles = {} +        if video.get('hasSubtitle'): +            subtitles = self.extract_subtitles(channel_id, video_id) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'formats': formats, +            'subtitles': subtitles, +        } diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index e46954b47..96f95979a 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor):          description = self._og_search_description(webpage, default=None)          thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)          duration = int_or_none(flashvars.get('duration')) -        width = int_or_none(self._og_search_property('video:width', webpage, 'video width')) -        height = int_or_none(self._og_search_property('video:height', webpage, 'video height')) +        width = int_or_none(self._og_search_property( +            'video:width', webpage, 'video width', default=None)) +        height = int_or_none(self._og_search_property( +            'video:height', webpage, 'video height', default=None))          return {              'id': video_id, diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 583ce35b9..1484ac0d2 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -7,8 +7,9 @@ import time  from .common import InfoExtractor  from ..compat import ( -    compat_urlparse,      compat_urllib_parse, +    compat_urllib_request, +    
compat_urlparse,  )  from ..utils import (      determine_ext, @@ -39,12 +40,20 @@ class LetvIE(InfoExtractor):              'title': '美人天下01',              'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',          }, -        'expected_warnings': [ -            'publish time' -        ] +    }, { +        'note': 'This video is available only in Mainland China, thus a proxy is needed', +        'url': 'http://www.letv.com/ptv/vplay/1118082.html', +        'md5': 'f80936fbe20fb2f58648e81386ff7927', +        'info_dict': { +            'id': '1118082', +            'ext': 'mp4', +            'title': '与龙共舞 完整版', +            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', +        }, +        'params': { +            'cn_verification_proxy': 'http://proxy.uku.im:8888' +        },      }] -    # http://www.letv.com/ptv/vplay/1118082.html -    # This video is available only in Mainland China      @staticmethod      def urshift(val, n): @@ -76,9 +85,16 @@ class LetvIE(InfoExtractor):              'tkey': self.calc_time_key(int(time.time())),              'domain': 'www.letv.com'          } +        play_json_req = compat_urllib_request.Request( +            'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) +        ) +        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') +        if cn_verification_proxy: +            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) +          play_json = self._download_json( -            'http://api.letv.com/mms/out/video/playJson?' 
+ compat_urllib_parse.urlencode(params), -            media_id, 'playJson data') +            play_json_req, +            media_id, 'Downloading playJson data')          # Check for errors          playstatus = play_json['playstatus'] @@ -114,7 +130,8 @@ class LetvIE(InfoExtractor):                  url_info_dict = {                      'url': media_url, -                    'ext': determine_ext(dispatch[format_id][1]) +                    'ext': determine_ext(dispatch[format_id][1]), +                    'format_id': format_id,                  }                  if format_id[-1:] == 'p': @@ -123,7 +140,7 @@ class LetvIE(InfoExtractor):                  urls.append(url_info_dict)          publish_time = parse_iso8601(self._html_search_regex( -            r'发布时间 ([^<>]+) ', page, 'publish time', fatal=False), +            r'发布时间 ([^<>]+) ', page, 'publish time', default=None),              delimiter=' ', timezone=datetime.timedelta(hours=8))          description = self._html_search_meta('description', page, fatal=False) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py new file mode 100644 index 000000000..9ab1416f5 --- /dev/null +++ b/youtube_dl/extractor/libsyn.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class LibsynIE(InfoExtractor): +    _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)' + +    _TEST = { +        'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', +        'md5': '443360ee1b58007bc3dcf09b41d093bb', +        'info_dict': { +            'id': '3377616', +            'ext': 'mp3', +            'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", +            'description': 'md5:601cb790edd05908957dae8aaa866465', +            'upload_date': '20150220', +        }, +    } + +    def 
_real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        formats = [{ +            'url': media_url, +        } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] + +        podcast_title = self._search_regex( +            r'<h2>([^<]+)</h2>', webpage, 'title') +        episode_title = self._search_regex( +            r'<h3>([^<]+)</h3>', webpage, 'title', default=None) + +        title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + +        description = self._html_search_regex( +            r'<div id="info_text_body">(.+?)</div>', webpage, +            'description', fatal=False) + +        thumbnail = self._search_regex( +            r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"', +            webpage, 'thumbnail', fatal=False) + +        release_date = unified_strdate(self._search_regex( +            r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'upload_date': release_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 3642089f7..2467f8bdd 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals  import re  import json +import itertools  from .common import InfoExtractor  from ..compat import ( @@ -41,6 +42,13 @@ class LivestreamIE(InfoExtractor):          },          'playlist_mincount': 4,      }, { +        'url': 'http://new.livestream.com/chess24/tatasteelchess', +        'info_dict': { +            'title': 'Tata Steel Chess', +            'id': '3705884', +        }, +        'playlist_mincount': 60, +    }, {          
'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640',          'only_matching': True,      }] @@ -117,6 +125,30 @@ class LivestreamIE(InfoExtractor):              'view_count': video_data.get('views'),          } +    def _extract_event(self, info): +        event_id = compat_str(info['id']) +        account = compat_str(info['owner_account_id']) +        root_url = ( +            'https://new.livestream.com/api/accounts/{account}/events/{event}/' +            'feed.json'.format(account=account, event=event_id)) + +        def _extract_videos(): +            last_video = None +            for i in itertools.count(1): +                if last_video is None: +                    info_url = root_url +                else: +                    info_url = '{root}?&id={id}&newer=-1&type=video'.format( +                        root=root_url, id=last_video) +                videos_info = self._download_json(info_url, event_id, 'Downloading page {0}'.format(i))['data'] +                videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] +                if not videos_info: +                    break +                for v in videos_info: +                    yield self._extract_video_info(v) +                last_video = videos_info[-1]['id'] +        return self.playlist_result(_extract_videos(), event_id, info['full_name']) +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') @@ -144,14 +176,13 @@ class LivestreamIE(InfoExtractor):                  result = result and compat_str(vdata['data']['id']) == vid              return result -        videos = [self._extract_video_info(video_data['data']) -                  for video_data in info['feed']['data'] -                  if is_relevant(video_data, video_id)]          if video_id is None:              # This is an event page: -            return 
self.playlist_result( -                videos, '%s' % info['id'], info['full_name']) +            return self._extract_event(info)          else: +            videos = [self._extract_video_info(video_data['data']) +                      for video_data in info['feed']['data'] +                      if is_relevant(video_data, video_id)]              if not videos:                  raise ExtractorError('Cannot find video %s' % video_id)              return videos[0] diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 9c2fbdd96..e3236f7b5 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -52,6 +52,7 @@ class LRTIE(InfoExtractor):                      'url': data['streamer'],                      'play_path': 'mp4:%s' % data['file'],                      'preference': -1, +                    'rtmp_real_time': True,                  })              else:                  formats.extend( diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5dc22da22..cfd3b14f4 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -15,18 +15,73 @@ from ..utils import (  ) -class LyndaIE(InfoExtractor): +class LyndaBaseIE(InfoExtractor): +    _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' +    _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' +    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
+    _NETRC_MACHINE = 'lynda' + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return + +        login_form = { +            'username': username, +            'password': password, +            'remember': 'false', +            'stayPut': 'false' +        } +        request = compat_urllib_request.Request( +            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) +        login_page = self._download_webpage( +            request, None, 'Logging in as %s' % username) + +        # Not (yet) logged in +        m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) +        if m is not None: +            response = m.group('json') +            response_json = json.loads(response) +            state = response_json['state'] + +            if state == 'notlogged': +                raise ExtractorError( +                    'Unable to login, incorrect username and/or password', +                    expected=True) + +            # This is when we get popup: +            # > You're already logged in to lynda.com on two devices. +            # > If you log in here, we'll log you out of another device. +            # So, we need to confirm this. 
+            if state == 'conflicted': +                confirm_form = { +                    'username': '', +                    'password': '', +                    'resolve': 'true', +                    'remember': 'false', +                    'stayPut': 'false', +                } +                request = compat_urllib_request.Request( +                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) +                login_page = self._download_webpage( +                    request, None, +                    'Confirming log in and log out from another device') + +        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: +            raise ExtractorError('Unable to log in') + + +class LyndaIE(LyndaBaseIE):      IE_NAME = 'lynda'      IE_DESC = 'lynda.com videos' -    _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(\d+)' -    _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' +    _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)'      _NETRC_MACHINE = 'lynda' -    _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'      _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' -    ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
-      _TESTS = [{          'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',          'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', @@ -41,23 +96,22 @@ class LyndaIE(InfoExtractor):          'only_matching': True,      }] -    def _real_initialize(self): -        self._login() -      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) +        video_id = self._match_id(url) -        page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, -                                      'Downloading video JSON') +        page = self._download_webpage( +            'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, +            video_id, 'Downloading video JSON')          video_json = json.loads(page)          if 'Status' in video_json: -            raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True) +            raise ExtractorError( +                'lynda returned error: %s' % video_json['Message'], expected=True)          if video_json['HasAccess'] is False:              raise ExtractorError( -                'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True) +                'Video %s is only available for members. 
' +                % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True)          video_id = compat_str(video_json['ID'])          duration = video_json['DurationInSeconds'] @@ -100,50 +154,9 @@ class LyndaIE(InfoExtractor):              'formats': formats          } -    def _login(self): -        (username, password) = self._get_login_info() -        if username is None: -            return - -        login_form = { -            'username': username, -            'password': password, -            'remember': 'false', -            'stayPut': 'false' -        } -        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) -        login_page = self._download_webpage(request, None, 'Logging in as %s' % username) - -        # Not (yet) logged in -        m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) -        if m is not None: -            response = m.group('json') -            response_json = json.loads(response) -            state = response_json['state'] - -            if state == 'notlogged': -                raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) - -            # This is when we get popup: -            # > You're already logged in to lynda.com on two devices. -            # > If you log in here, we'll log you out of another device. -            # So, we need to confirm this. 
-            if state == 'conflicted': -                confirm_form = { -                    'username': '', -                    'password': '', -                    'resolve': 'true', -                    'remember': 'false', -                    'stayPut': 'false', -                } -                request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) -                login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device') - -        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: -            raise ExtractorError('Unable to log in') -      def _fix_subtitles(self, subs):          srt = '' +        seq_counter = 0          for pos in range(0, len(subs) - 1):              seq_current = subs[pos]              m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) @@ -155,8 +168,10 @@ class LyndaIE(InfoExtractor):                  continue              appear_time = m_current.group('timecode')              disappear_time = m_next.group('timecode') -            text = seq_current['Caption'].lstrip() -            srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) +            text = seq_current['Caption'].strip() +            if text: +                seq_counter += 1 +                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)          if srt:              return srt @@ -169,7 +184,7 @@ class LyndaIE(InfoExtractor):              return {} -class LyndaCourseIE(InfoExtractor): +class LyndaCourseIE(LyndaBaseIE):      IE_NAME = 'lynda:course'      IE_DESC = 'lynda.com online courses' @@ -182,35 +197,37 @@ class LyndaCourseIE(InfoExtractor):          course_path = mobj.group('coursepath')          course_id = mobj.group('courseid') -        page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, -                                
      course_id, 'Downloading course JSON') +        page = self._download_webpage( +            'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, +            course_id, 'Downloading course JSON')          course_json = json.loads(page)          if 'Status' in course_json and course_json['Status'] == 'NotFound': -            raise ExtractorError('Course %s does not exist' % course_id, expected=True) +            raise ExtractorError( +                'Course %s does not exist' % course_id, expected=True)          unaccessible_videos = 0          videos = [] -        (username, _) = self._get_login_info()          # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided          # by single video API anymore          for chapter in course_json['Chapters']:              for video in chapter['Videos']: -                if username is None and video['HasAccess'] is False: +                if video['HasAccess'] is False:                      unaccessible_videos += 1                      continue                  videos.append(video['ID'])          if unaccessible_videos > 0: -            self._downloader.report_warning('%s videos are only available for members and will not be downloaded. ' -                                            % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT) +            self._downloader.report_warning( +                '%s videos are only available for members (or paid members) and will not be downloaded. 
' +                % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)          entries = [ -            self.url_result('http://www.lynda.com/%s/%s-4.html' % -                            (course_path, video_id), -                            'Lynda') +            self.url_result( +                'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), +                'Lynda')              for video_id in videos]          course_title = course_json['Title'] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 1831c6749..21aea0c55 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import re +import itertools  from .common import InfoExtractor  from ..compat import ( @@ -10,7 +11,6 @@ from ..utils import (      ExtractorError,      HEADRequest,      str_to_int, -    parse_iso8601,  ) @@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor):              'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',              'uploader': 'Daniel Holbach',              'uploader_id': 'dholbach', -            'upload_date': '20111115', -            'timestamp': 1321359578,              'thumbnail': 're:https?://.*\.jpg',              'view_count': int,              'like_count': int, @@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor):          'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',          'info_dict': {              'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', -            'ext': 'm4a', -            'title': 'Electric Relaxation vol. 
3', +            'ext': 'mp3', +            'title': 'Caribou 7 inch Vinyl Mix & Chat',              'description': 'md5:2b8aec6adce69f9d41724647c65875e8', -            'uploader': 'Daniel Drumz', +            'uploader': 'Gilles Peterson Worldwide',              'uploader_id': 'gillespeterson', -            'thumbnail': 're:https?://.*\.jpg', +            'thumbnail': 're:https?://.*/images/',              'view_count': int,              'like_count': int,          },      }] -    def _get_url(self, track_id, template_url): -        server_count = 30 -        for i in range(server_count): -            url = template_url % i +    def _get_url(self, track_id, template_url, server_number): +        boundaries = (1, 30) +        for nr in server_numbers(server_number, boundaries): +            url = template_url % nr              try:                  # We only want to know if the request succeed                  # don't download the whole file                  self._request_webpage(                      HEADRequest(url), track_id, -                    'Checking URL %d/%d ...' % (i + 1, server_count + 1)) +                    'Checking URL %d/%d ...' 
% (nr, boundaries[-1]))                  return url              except ExtractorError:                  pass -          return None      def _real_extract(self, url): @@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor):          preview_url = self._search_regex(              r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')          song_url = preview_url.replace('/previews/', '/c/originals/') +        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))          template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) -        final_song_url = self._get_url(track_id, template_url) +        final_song_url = self._get_url(track_id, template_url, server_number)          if final_song_url is None:              self.to_screen('Trying with m4a extension')              template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') -            final_song_url = self._get_url(track_id, template_url) +            final_song_url = self._get_url(track_id, template_url, server_number)          if final_song_url is None:              raise ExtractorError('Unable to extract track url')          PREFIX = ( -            r'<span class="play-button[^"]*?"' +            r'm-play-on-spacebar[^>]+'              r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')          title = self._html_search_regex(              PREFIX + r'm-title="([^"]+)"', webpage, 'title') @@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor):              r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)          description = self._og_search_description(webpage)          like_count = str_to_int(self._search_regex( -            [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"', -             r'/favorites/?">([0-9]+)<'], +            r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',              webpage, 'like count', fatal=False))          view_count = str_to_int(self._search_regex(              [r'<meta 
itemprop="interactionCount" content="UserPlays:([0-9]+)"',               r'/listeners/?">([0-9,.]+)</a>'],              webpage, 'play count', fatal=False)) -        timestamp = parse_iso8601(self._search_regex( -            r'<time itemprop="dateCreated" datetime="([^"]+)">', -            webpage, 'upload date', default=None))          return {              'id': track_id, @@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor):              'thumbnail': thumbnail,              'uploader': uploader,              'uploader_id': uploader_id, -            'timestamp': timestamp,              'view_count': view_count,              'like_count': like_count,          } + + +def server_numbers(first, boundaries): +    """ Server numbers to try in descending order of probable availability. +    Starting from first (i.e. the number of the server hosting the preview file) +    and going further and further up to the higher boundary and down to the +    lower one in an alternating fashion. Namely: + +        server_numbers(2, (1, 5)) + +        # Where the preview server is 2, min number is 1 and max is 5. +        # Yields: 2, 3, 1, 4, 5 + +    Why not random numbers or increasing sequences? Since from what I've seen, +    full length files seem to be hosted on servers whose number is closer to +    that of the preview; to be confirmed. 
+    """ +    zip_longest = getattr(itertools, 'zip_longest', None) +    if zip_longest is None: +        # python 2.x +        zip_longest = itertools.izip_longest + +    if len(boundaries) != 2: +        raise ValueError("boundaries should be a two-element tuple") +    min, max = boundaries +    highs = range(first + 1, max + 1) +    lows = range(first - 1, min - 1, -1) +    rest = filter( +        None, itertools.chain.from_iterable(zip_longest(highs, lows))) +    yield first +    for n in rest: +        yield n diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 1a241aca7..e369551c2 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -10,7 +10,7 @@ from ..utils import (  class MLBIE(InfoExtractor): -    _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)' +    _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'      _TESTS = [          {              'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', @@ -80,6 +80,10 @@ class MLBIE(InfoExtractor):              'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553',              'only_matching': True,          }, +        { +            'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', +            'only_matching': True, +        }      ]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3645d3033..ecd0ac8b1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -14,7 +14,7 @@ from ..utils import (  
class NBCIE(InfoExtractor): -    _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' +    _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'      _TESTS = [          { @@ -50,6 +50,57 @@ class NBCIE(InfoExtractor):          return self.url_result(theplatform_url) +class NBCSportsVPlayerIE(InfoExtractor): +    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + +    _TESTS = [{ +        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', +        'info_dict': { +            'id': '9CsDKds0kvHI', +            'ext': 'flv', +            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', +            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', +        } +    }, { +        'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', +        'only_matching': True, +    }] + +    @staticmethod +    def _extract_url(webpage): +        iframe_m = re.search( +            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage) +        if iframe_m: +            return iframe_m.group('url') + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        theplatform_url = self._og_search_video_url(webpage) +        return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): +    # Does not include https because its certificate is invalid +    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + +    _TEST = { +        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', +        'info_dict': { +            'id': 'PHJSaFWbrTY9', +            'ext': 'flv', +            'title': 'Tom Izzo, Michigan St. 
has \'so much respect\' for Duke', +            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        return self.url_result( +            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + +  class NBCNewsIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/          (?:video/.+?/(?P<id>\d+)| diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 4c1890416..ddec7b338 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -22,7 +22,7 @@ class NiconicoIE(InfoExtractor):      IE_NAME = 'niconico'      IE_DESC = 'ニコニコ動画' -    _TEST = { +    _TESTS = [{          'url': 'http://www.nicovideo.jp/watch/sm22312215',          'md5': 'd1a75c0823e2f629128c43e1212760f9',          'info_dict': { @@ -39,9 +39,26 @@ class NiconicoIE(InfoExtractor):              'username': 'ydl.niconico@gmail.com',              'password': 'youtube-dl',          }, -    } +    }, { +        'url': 'http://www.nicovideo.jp/watch/nm14296458', +        'md5': '8db08e0158457cf852a31519fceea5bc', +        'info_dict': { +            'id': 'nm14296458', +            'ext': 'swf', +            'title': '【鏡音リン】Dance on media【オリジナル】take2!', +            'description': 'md5:', +            'uploader': 'りょうた', +            'uploader_id': '18822557', +            'upload_date': '20110429', +            'duration': 209, +        }, +        'params': { +            'username': 'ydl.niconico@gmail.com', +            'password': 'youtube-dl', +        }, +    }] -    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)' +    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'      _NETRC_MACHINE = 'niconico'      # Determine whether the downloader used authentication to download 
video      _AUTHENTICATED = False @@ -76,8 +93,7 @@ class NiconicoIE(InfoExtractor):          return True      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) +        video_id = self._match_id(url)          # Get video webpage. We are not actually interested in it, but need          # the cookies in order to be able to download the info webpage @@ -90,7 +106,7 @@ class NiconicoIE(InfoExtractor):          if self._AUTHENTICATED:              # Get flv info              flv_info_webpage = self._download_webpage( -                'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, +                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',                  video_id, 'Downloading flv info')          else:              # Get external player info diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 9c01eb0af..5d8448571 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -219,7 +219,8 @@ class NPOLiveIE(NPOBaseIE):          if streams:              for stream in streams:                  stream_type = stream.get('type').lower() -                if stream_type == 'ss': +                # smooth streaming is not supported +                if stream_type in ['ss', 'ms']:                      continue                  stream_info = self._download_json(                      'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' @@ -230,7 +231,10 @@ class NPOLiveIE(NPOBaseIE):                  stream_url = self._download_json(                      stream_info['stream'], display_id,                      'Downloading %s URL' % stream_type, -                    transform_source=strip_jsonp) +                    'Unable to download %s URL' % stream_type, +                    transform_source=strip_jsonp, fatal=False) +                if not stream_url: +                    continue                  if stream_type == 'hds':                      
f4m_formats = self._extract_f4m_formats(stream_url, display_id)                      # f4m downloader downloads only piece of live stream @@ -242,6 +246,7 @@ class NPOLiveIE(NPOBaseIE):                  else:                      formats.append({                          'url': stream_url, +                        'preference': -10,                      })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 1e4cfa2e7..e91d3a248 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,46 +14,48 @@ from ..utils import (  class NRKIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})' +    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'      _TESTS = [          { -            'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/', -            'md5': 'a6eac35052f3b242bb6bb7f43aed5886', +            'url': 'http://www.nrk.no/video/PS*150533', +            'md5': 'bccd850baebefe23b56d708a113229c2',              'info_dict': {                  'id': '150533',                  'ext': 'flv',                  'title': 'Dompap og andre fugler i Piip-Show', -                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f' +                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', +                'duration': 263,              }          },          { -            'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/', -            'md5': '3471f2a51718195164e88f46bf427668', +            'url': 'http://www.nrk.no/video/PS*154915', +            'md5': '0b1493ba1aae7d9579a5ad5531bc395a',              'info_dict': {                  'id': '154915',                  'ext': 'flv',                  'title': 'Slik høres internett ut når du er blind',                  'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', +                
'duration': 20,              }          },      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - -        page = self._download_webpage(url, video_id) - -        video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id') +        video_id = self._match_id(url)          data = self._download_json( -            'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') +            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, +            video_id, 'Downloading media JSON')          if data['usageRights']['isGeoBlocked']: -            raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True) +            raise ExtractorError( +                'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', +                expected=True) + +        video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' -        video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' +        duration = parse_duration(data.get('duration'))          images = data.get('images')          if images: @@ -69,10 +71,51 @@ class NRKIE(InfoExtractor):              'ext': 'flv',              'title': data['title'],              'description': data['description'], +            'duration': duration,              'thumbnail': thumbnail,          } +class NRKPlaylistIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + +    _TESTS = [{ +        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', +        'info_dict': { +            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763', +            'title': 'Gjenopplev den historiske solformørkelsen', +            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed', +        }, +        'playlist_count': 2, +    }, { +    
    'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449', +        'info_dict': { +            'id': 'rivertonprisen-til-karin-fossum-1.12266449', +            'title': 'Rivertonprisen til Karin Fossum', +            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', +        }, +        'playlist_count': 5, +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        entries = [ +            self.url_result('nrk:%s' % video_id, 'NRK') +            for video_id in re.findall( +                r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"', +                webpage) +        ] + +        playlist_title = self._og_search_title(webpage) +        playlist_description = self._og_search_description(webpage) + +        return self.playlist_result( +            entries, playlist_id, playlist_title, playlist_description) + +  class NRKTVIE(InfoExtractor):      _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
@@ -149,9 +192,6 @@ class NRKTVIE(InfoExtractor):          }      ] -    def _seconds2str(self, s): -        return '%02d:%02d:%02d.%03d' % (s / 3600, (s % 3600) / 60, s % 60, (s % 1) * 1000) -      def _debug_print(self, txt):          if self._downloader.params.get('verbose', False):              self.to_screen('[debug] %s' % txt) @@ -168,8 +208,8 @@ class NRKTVIE(InfoExtractor):          for pos, p in enumerate(ps):              begin = parse_duration(p.get('begin'))              duration = parse_duration(p.get('dur')) -            starttime = self._seconds2str(begin) -            endtime = self._seconds2str(begin + duration) +            starttime = self._subtitles_timecode(begin) +            endtime = self._subtitles_timecode(begin + duration)              srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text)          return {lang: [              {'ext': 'ttml', 'url': url}, diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 56e1cad3b..03f0a4de6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,15 +1,17 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( +    float_or_none, +    int_or_none, +    parse_iso8601, +)  class NYTimesIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)' +    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',          'md5': '18a525a510f942ada2720db5f31644c0',          'info_dict': { @@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor):              'uploader': 'Brett Weiner',              'duration': 419,          } -   
 } +    }, { +        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', +        'only_matching': True, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          video_data = self._download_json( -            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON') +            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, +            video_id, 'Downloading video JSON')          title = video_data['headline'] -        description = video_data['summary'] -        duration = video_data['duration'] / 1000.0 +        description = video_data.get('summary') +        duration = float_or_none(video_data.get('duration'), 1000)          uploader = video_data['byline']          timestamp = parse_iso8601(video_data['publication_date'][:-8]) @@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor):          formats = [              {                  'url': video['url'], -                'format_id': video['type'], -                'vcodec': video['video_codec'], -                'width': video['width'], -                'height': video['height'], -                'filesize': get_file_size(video['fileSize']), +                'format_id': video.get('type'), +                'vcodec': video.get('video_codec'), +                'width': int_or_none(video.get('width')), +                'height': int_or_none(video.get('height')), +                'filesize': get_file_size(video.get('fileSize')),              } for video in video_data['renditions']          ]          self._sort_formats(formats) @@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor):          thumbnails = [              {                  'url': 'http://www.nytimes.com/%s' % image['url'], -                'resolution': '%dx%d' % (image['width'], image['height']), +                'width': 
int_or_none(image.get('width')), +                'height': int_or_none(image.get('height')),              } for image in video_data['images']          ] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 4e293392b..ca1a5bb3c 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -11,6 +11,11 @@ from ..utils import (      HEADRequest,      unified_strdate,      ExtractorError, +    strip_jsonp, +    int_or_none, +    float_or_none, +    determine_ext, +    remove_end,  ) @@ -197,3 +202,92 @@ class ORFFM4IE(InfoExtractor):              'description': data['subtitle'],              'entries': entries          } + + +class ORFIPTVIE(InfoExtractor): +    IE_NAME = 'orf:iptv' +    IE_DESC = 'iptv.ORF.at' +    _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://iptv.orf.at/stories/2267952', +        'md5': '26ffa4bab6dbce1eee78bbc7021016cd', +        'info_dict': { +            'id': '339775', +            'ext': 'flv', +            'title': 'Kreml-Kritiker Nawalny wieder frei', +            'description': 'md5:6f24e7f546d364dacd0e616a9e409236', +            'duration': 84.729, +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20150306', +        }, +    } + +    def _real_extract(self, url): +        story_id = self._match_id(url) + +        webpage = self._download_webpage( +            'http://iptv.orf.at/stories/%s' % story_id, story_id) + +        video_id = self._search_regex( +            r'data-video(?:id)?="(\d+)"', webpage, 'video id') + +        data = self._download_json( +            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, +            video_id)[0] + +        duration = float_or_none(data['duration'], 1000) + +        video = data['sources']['default'] +        load_balancer_url = video['loadBalancerUrl'] +        abr = int_or_none(video.get('audioBitrate')) +        vbr = 
int_or_none(video.get('bitrate')) +        fps = int_or_none(video.get('videoFps')) +        width = int_or_none(video.get('videoWidth')) +        height = int_or_none(video.get('videoHeight')) +        thumbnail = video.get('preview') + +        rendition = self._download_json( +            load_balancer_url, video_id, transform_source=strip_jsonp) + +        f = { +            'abr': abr, +            'vbr': vbr, +            'fps': fps, +            'width': width, +            'height': height, +        } + +        formats = [] +        for format_id, format_url in rendition['redirect'].items(): +            if format_id == 'rtmp': +                ff = f.copy() +                ff.update({ +                    'url': format_url, +                    'format_id': format_id, +                }) +                formats.append(ff) +            elif determine_ext(format_url) == 'f4m': +                formats.extend(self._extract_f4m_formats( +                    format_url, video_id, f4m_id=format_id)) +            elif determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', m3u8_id=format_id)) +            else: +                continue +        self._sort_formats(formats) + +        title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') +        description = self._og_search_description(webpage) +        upload_date = unified_strdate(self._html_search_meta( +            'dc.date', webpage, 'upload date')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index a20672c0c..46cebc0d7 100644 --- a/youtube_dl/extractor/phoenix.py +++ 
b/youtube_dl/extractor/phoenix.py @@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url  class PhoenixIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.phoenix.de/content/884301', -        'md5': 'ed249f045256150c92e72dbb70eadec6', -        'info_dict': { -            'id': '884301', -            'ext': 'mp4', -            'title': 'Michael Krons mit Hans-Werner Sinn', -            'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', -            'upload_date': '20141025', -            'uploader': 'Im Dialog', -        } -    } +    _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ +        (?: +            phoenix/die_sendungen/(?:[^/]+/)? +        )? +        (?P<id>[0-9]+)''' +    _TESTS = [ +        { +            'url': 'http://www.phoenix.de/content/884301', +            'md5': 'ed249f045256150c92e72dbb70eadec6', +            'info_dict': { +                'id': '884301', +                'ext': 'mp4', +                'title': 'Michael Krons mit Hans-Werner Sinn', +                'description': 'Im Dialog - Sa. 
25.10.14, 00.00 - 00.35 Uhr', +                'upload_date': '20141025', +                'uploader': 'Im Dialog', +            } +        }, +        { +            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', +            'only_matching': True, +        }, +        { +            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', +            'only_matching': True, +        }, +    ]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py new file mode 100644 index 000000000..abde34b94 --- /dev/null +++ b/youtube_dl/extractor/pladform.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    int_or_none, +    xpath_text, +    qualities, +) + + +class PladformIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?: +                                out\.pladform\.ru/player| +                                static\.pladform\.ru/player\.swf +                            ) +                            \?.*\bvideoid=| +                            video\.pladform\.ru/catalog/video/videoid/ +                        ) +                        (?P<id>\d+) +                    ''' +    _TESTS = [{ +        # http://muz-tv.ru/kinozal/view/7400/ +        'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', +        'md5': '61f37b575dd27f1bb2e1854777fe31f4', +        'info_dict': { +            'id': '100183293', +            'ext': 'mp4', +            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', +            'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 694, +            
'age_limit': 0, +        }, +    }, { +        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', +        'only_matching': True, +    }, { +        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        video = self._download_xml( +            'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, +            video_id) + +        if video.tag == 'error': +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, video.text), +                expected=True) + +        quality = qualities(('ld', 'sd', 'hd')) + +        formats = [{ +            'url': src.text, +            'format_id': src.get('quality'), +            'quality': quality(src.get('quality')), +        } for src in video.findall('./src')] +        self._sort_formats(formats) + +        webpage = self._download_webpage( +            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, +            video_id) + +        title = self._og_search_title(webpage, fatal=False) or xpath_text( +            video, './/title', 'title', fatal=True) +        description = self._search_regex( +            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) or xpath_text( +            video, './/cover', 'cover') + +        duration = int_or_none(xpath_text(video, './/time', 'duration')) +        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'age_limit': age_limit, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index 
9576aed0e..e766ccca3 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -4,85 +4,72 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -) +from ..compat import compat_str  from ..utils import (      ExtractorError, -    float_or_none,      int_or_none, -    str_to_int, +    parse_iso8601,  )  class PlayFMIE(InfoExtractor):      IE_NAME = 'play.fm' -    _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])' +    _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])'      _TEST = { -        'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', +        'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12',          'md5': 'c505f8307825a245d0c7ad1850001f22',          'info_dict': { -            'id': '137220', +            'id': '71276',              'ext': 'mp3', -            'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', -            'uploader': 'Sven Tasnadi', -            'uploader_id': 'sventasnadi', -            'duration': 5627.428, -            'upload_date': '20140712', +            'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', +            'description': '', +            'duration': 5627, +            'timestamp': 1406033781, +            'upload_date': '20140722', +            'uploader': 'Dan Drastic', +            'uploader_id': '71170',              'view_count': int,              'comment_count': int, -            'thumbnail': 're:^https?://.*\.jpg$',          },      }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') -        upload_date = mobj.group('upload_date') - -        rec_data = 
compat_urllib_parse.urlencode({'rec_id': video_id}) -        req = compat_urllib_request.Request( -            'http://www.play.fm/flexRead/recording', data=rec_data) -        req.add_header('Content-Type', 'application/x-www-form-urlencoded') -        rec_doc = self._download_xml(req, video_id) +        slug = mobj.group('slug') -        error_node = rec_doc.find('./error') -        if error_node is not None: -            raise ExtractorError('An error occured: %s (code %s)' % ( -                error_node.text, rec_doc.find('./status').text)) +        recordings = self._download_json( +            'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) -        recording = rec_doc.find('./recording') -        title = recording.find('./title').text -        view_count = str_to_int(recording.find('./stats/playcount').text) -        comment_count = str_to_int(recording.find('./stats/comments').text) -        duration = float_or_none(recording.find('./duration').text, scale=1000) -        thumbnail = recording.find('./image').text +        error = recordings.get('error') +        if isinstance(error, dict): +            raise ExtractorError( +                '%s returned error: %s' % (self.IE_NAME, error.get('message')), +                expected=True) -        artist = recording.find('./artists/artist') -        uploader = artist.find('./name').text -        uploader_id = artist.find('./slug').text - -        video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % ( -            'http:', recording.find('./url').text, -            recording.find('./_class').text, recording.find('./file_id').text, -            rec_doc.find('./uuid').text, video_id, -            rec_doc.find('./jingle/file_id').text, -            'http%3A%2F%2Fwww.play.fm%2Fplayer', -        ) +        audio_url = recordings['audio'] +        video_id = compat_str(recordings.get('id') or video_id) +        title = recordings['title'] +        description = recordings.get('description') 
+        duration = int_or_none(recordings.get('recordingDuration')) +        timestamp = parse_iso8601(recordings.get('created_at')) +        uploader = recordings.get('page', {}).get('title') +        uploader_id = compat_str(recordings.get('page', {}).get('id')) +        view_count = int_or_none(recordings.get('playCount')) +        comment_count = int_or_none(recordings.get('commentCount')) +        categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')]          return {              'id': video_id, -            'url': video_url, -            'ext': 'mp3', -            'filesize': int_or_none(recording.find('./size').text), +            'url': audio_url,              'title': title, -            'upload_date': upload_date, -            'view_count': view_count, -            'comment_count': comment_count, +            'description': description,              'duration': duration, -            'thumbnail': thumbnail, +            'timestamp': timestamp,              'uploader': uploader,              'uploader_id': uploader_id, +            'view_count': view_count, +            'comment_count': comment_count, +            'categories': categories,          } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py new file mode 100644 index 000000000..bdc71017b --- /dev/null +++ b/youtube_dl/extractor/playwire.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    xpath_text, +    float_or_none, +    int_or_none, +) + + +class PlaywireIE(InfoExtractor): +    _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', +        'md5': 'e6398701e3595888125729eaa2329ed9', +        'info_dict': { +            'id': '3353705', +            'ext': 'mp4', +            
'title': 'S04_RM_UCL_Rus', +            'thumbnail': 're:^http://.*\.png$', +            'duration': 145.94, +        }, +    }, { +        'url': 'http://cdn.playwire.com/11625/embed/85228.html', +        'only_matching': True, +    }, { +        'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', +        'only_matching': True, +    }, { +        'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + +        player = self._download_json( +            'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), +            video_id) + +        title = player['settings']['title'] +        duration = float_or_none(player.get('duration'), 1000) + +        content = player['content'] +        thumbnail = content.get('poster') +        src = content['media']['f4m'] + +        f4m = self._download_xml(src, video_id) +        base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True) +        formats = [] +        for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'): +            media_url = media.get('url') +            if not media_url: +                continue +            tbr = int_or_none(media.get('bitrate')) +            width = int_or_none(media.get('width')) +            height = int_or_none(media.get('height')) +            f = { +                'url': '%s/%s' % (base_url, media.attrib['url']), +                'tbr': tbr, +                'width': width, +                'height': height, +            } +            if not (tbr or width or height): +                f['quality'] = 1 if '-hd.' 
in media_url else 0 +            formats.append(f) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': title, +            'thumbnail': thumbnail, +            'duration': duration, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3a27e3789..0c8b731cf 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor):      }      def _extract_count(self, pattern, webpage, name): -        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) -        if count: -            count = str_to_int(count) -        return count +        return str_to_int(self._search_regex( +            pattern, webpage, '%s count' % name, fatal=False))      def _real_extract(self, url):          video_id = self._match_id(url) @@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor):          if thumbnail:              thumbnail = compat_urllib_parse.unquote(thumbnail) -        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') -        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') -        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') +        view_count = self._extract_count( +            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') +        like_count = self._extract_count( +            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') +        dislike_count = self._extract_count( +            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')          comment_count = self._extract_count( -            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') +            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')          
video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))          if webpage.find('"encrypted":true') != -1: diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py new file mode 100644 index 000000000..01cc3d9ea --- /dev/null +++ b/youtube_dl/extractor/primesharetv.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import ExtractorError + + +class PrimeShareTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' + +    _TEST = { +        'url': 'http://primeshare.tv/download/238790B611', +        'md5': 'b92d9bf5461137c36228009f31533fbc', +        'info_dict': { +            'id': '238790B611', +            'ext': 'mp4', +            'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        if '>File not exist<' in webpage: +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) + +        fields = dict(re.findall(r'''(?x)<input\s+ +            type="hidden"\s+ +            name="([^"]+)"\s+ +            (?:id="[^"]+"\s+)? 
+            value="([^"]*)" +            ''', webpage)) + +        headers = { +            'Referer': url, +            'Content-Type': 'application/x-www-form-urlencoded', +        } + +        wait_time = int(self._search_regex( +            r'var\s+cWaitTime\s*=\s*(\d+)', +            webpage, 'wait time', default=7)) + 1 +        self._sleep(wait_time, video_id) + +        req = compat_urllib_request.Request( +            url, compat_urllib_parse.urlencode(fields), headers) +        video_page = self._download_webpage( +            req, video_id, 'Downloading video page') + +        video_url = self._search_regex( +            r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", +            video_page, 'video url') + +        title = self._html_search_regex( +            r'<h1>Watch\s*(?: )?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?: )?\s*<strong>', +            video_page, 'title') + +        return { +            'id': video_id, +            'url': video_url, +            'title': title, +            'ext': 'mp4', +        } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 846b76c81..d6054d717 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,17 +1,19 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..utils import ExtractorError  class RedTubeIE(InfoExtractor):      _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.redtube.com/66418', +        'md5': '7b8c22b5e7098a3e1c09709df1126d2d',          'info_dict': {              'id': '66418',              'ext': 'mp4', -            "title": "Sucked on a toilet", -            "age_limit": 18, +            'title': 'Sucked on a toilet', +            'age_limit': 18,          }      } @@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        if any(s in 
webpage for s in ['video-deleted-info', '>This video has been removed']): +            raise ExtractorError('Video %s has been removed' % video_id, expected=True) +          video_url = self._html_search_regex(              r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')          video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index b42442d12..13f071077 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -127,6 +127,47 @@ class RTVEALaCartaIE(InfoExtractor):              for s in subs) +class RTVEInfantilIE(InfoExtractor): +    IE_NAME = 'rtve.es:infantil' +    IE_DESC = 'RTVE infantil' +    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' + +    _TESTS = [{ +        'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', +        'md5': '915319587b33720b8e0357caaa6617e6', +        'info_dict': { +            'id': '3040283', +            'ext': 'mp4', +            'title': 'Maneras de vivir', +            'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', +            'duration': 357.958, +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        info = self._download_json( +            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, +            video_id)['page']['items'][0] + +        webpage = self._download_webpage(url, video_id) +        vidplayer_id = self._search_regex( +            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') + +        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id +        png = self._download_webpage(png_url, video_id, 'Downloading url information') +        video_url = _decrypt_url(png) + +        return { +            'id': video_id, +            'ext': 'mp4', +            'title': 
info['title'], +            'url': video_url, +            'thumbnail': info.get('image'), +            'duration': float_or_none(info.get('duration'), scale=1000), +        } + +  class RTVELiveIE(InfoExtractor):      IE_NAME = 'rtve.es:live'      IE_DESC = 'RTVE.es live streams' diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 000000000..10251f29e --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,157 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE + +from ..compat import ( +    compat_urllib_parse, +    compat_urllib_request, +) +from ..utils import ( +    ExtractorError, +    smuggle_url, +    std_headers, +) + + +class SafariBaseIE(InfoExtractor): +    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' +    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' +    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' +    _NETRC_MACHINE = 'safari' + +    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' +    _API_FORMAT = 'json' + +    LOGGED_IN = False + +    def _real_initialize(self): +        # We only need to log in once for courses or individual videos +        if not self.LOGGED_IN: +            self._login() +            SafariBaseIE.LOGGED_IN = True + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            raise ExtractorError( +                self._ACCOUNT_CREDENTIALS_HINT, +                expected=True) + +        headers = std_headers +        if 'Referer' not in headers: +            headers['Referer'] = self._LOGIN_URL + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, +            'Downloading login form') + +        csrf = self._html_search_regex( +            
r"name='csrfmiddlewaretoken'\s+value='([^']+)'", +            login_page, 'csrf token') + +        login_form = { +            'csrfmiddlewaretoken': csrf, +            'email': username, +            'password1': password, +            'login': 'Sign In', +            'next': '', +        } + +        request = compat_urllib_request.Request( +            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers) +        login_page = self._download_webpage( +            request, None, 'Logging in as %s' % username) + +        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: +            raise ExtractorError( +                'Login failed; make sure your credentials are correct and try again.', +                expected=True) + +        self.to_screen('Login successful') + + +class SafariIE(SafariBaseIE): +    IE_NAME = 'safari' +    IE_DESC = 'safaribooksonline.com online video' +    _VALID_URL = r'''(?x)https?:// +                            (?:www\.)?safaribooksonline\.com/ +                                (?: +                                    library/view/[^/]+| +                                    api/v1/book +                                )/ +                                (?P<course_id>\d+)/ +                                    (?:chapter(?:-content)?/)? 
+                                (?P<part>part\d+)\.html +    ''' + +    _TESTS = [{ +        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', +        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', +        'info_dict': { +            'id': '2842601850001', +            'ext': 'mp4', +            'title': 'Introduction', +        }, +        'skip': 'Requires safaribooksonline account credentials', +    }, { +        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        course_id = mobj.group('course_id') +        part = mobj.group('part') + +        webpage = self._download_webpage( +            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), +            part) + +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if not bc_url: +            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + +        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') + + +class SafariCourseIE(SafariBaseIE): +    IE_NAME = 'safari:course' +    IE_DESC = 'safaribooksonline.com online courses' + +    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)' + +    _TESTS = [{ +        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', +        'info_dict': { +            'id': '9780133392838', +            'title': 'Hadoop Fundamentals LiveLessons', +        }, +        'playlist_count': 22, +        'skip': 'Requires safaribooksonline account credentials', +    }, { +        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +  
      course_id = self._match_id(url) + +        course_json = self._download_json( +            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), +            course_id, 'Downloading course JSON') + +        if 'chapters' not in course_json: +            raise ExtractorError( +                'No chapters found for course %s' % course_id, expected=True) + +        entries = [ +            self.url_result(chapter, 'Safari') +            for chapter in course_json['chapters']] + +        course_title = course_json['title'] + +        return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 9f79ff5c1..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor):          page_title = mobj.group('title')          webpage = self._download_webpage(url, page_title)          slideshare_obj = self._search_regex( -            r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=', +            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',              webpage, 'slideshare object')          info = json.loads(slideshare_obj)          if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index c04791997..11edf616a 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -4,22 +4,87 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .common import compat_str +from ..compat import ( +    compat_str, +    compat_urllib_request +) +from ..utils import sanitize_url_path_consecutive_slashes  class SohuIE(InfoExtractor):      _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' 
-    _TEST = { +    _TESTS = [{ +        'note': 'This video is available only in Mainland China',          'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', -        'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7', +        'md5': '29175c8cadd8b5cc4055001e85d6b372',          'info_dict': {              'id': '382479172',              'ext': 'mp4',              'title': 'MV:Far East Movement《The Illest》',          }, -        'skip': 'Only available from China', -    } +        'params': { +            'cn_verification_proxy': 'proxy.uku.im:8888' +        } +    }, { +        'url': 'http://tv.sohu.com/20150305/n409385080.shtml', +        'md5': '699060e75cf58858dd47fb9c03c42cfb', +        'info_dict': { +            'id': '409385080', +            'ext': 'mp4', +            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', +        } +    }, { +        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', +        'md5': '9bf34be48f2f4dadcb226c74127e203c', +        'info_dict': { +            'id': '78693464', +            'ext': 'mp4', +            'title': '【爱范品】第31期:MWC见不到的奇葩手机', +        } +    }, { +        'note': 'Multipart video', +        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', +        'info_dict': { +            'id': '78910339', +        }, +        'playlist': [{ +            'md5': 'bdbfb8f39924725e6589c146bc1883ad', +            'info_dict': { +                'id': '78910339_part1', +                'ext': 'mp4', +                'duration': 294, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }, { +            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', +            'info_dict': { +                'id': '78910339_part2', +                'ext': 'mp4', +                'duration': 300, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }, { +            'md5': '8407e634175fdac706766481b9443450', +            'info_dict': { +                'id': '78910339_part3', +                
'ext': 'mp4', +                'duration': 150, +                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', +            } +        }] +    }, { +        'note': 'Video with title containing dash', +        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', +        'info_dict': { +            'id': '78932792', +            'ext': 'mp4', +            'title': 'youtube-dl testing video', +        }, +        'params': { +            'skip_download': True +        } +    }]      def _real_extract(self, url): @@ -29,8 +94,14 @@ class SohuIE(InfoExtractor):              else:                  base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' +            req = compat_urllib_request.Request(base_data_url + vid_id) + +            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') +            if cn_verification_proxy: +                req.add_header('Ytdl-request-proxy', cn_verification_proxy) +              return self._download_json( -                base_data_url + vid_id, video_id, +                req, video_id,                  'Downloading JSON data for %s' % vid_id)          mobj = re.match(self._VALID_URL, url) @@ -38,10 +109,8 @@ class SohuIE(InfoExtractor):          mytv = mobj.group('mytv') is not None          webpage = self._download_webpage(url, video_id) -        raw_title = self._html_search_regex( -            r'(?s)<title>(.+?)</title>', -            webpage, 'video title') -        title = raw_title.partition('-')[0].strip() + +        title = self._og_search_title(webpage)          vid = self._html_search_regex(              r'var vid ?= ?["\'](\d+)["\']', @@ -77,7 +146,9 @@ class SohuIE(InfoExtractor):                      % (format_id, i + 1, part_count))                  part_info = part_str.split('|') -                video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + +                video_url = sanitize_url_path_consecutive_slashes( +                    '%s%s?key=%s' % (part_info[0], 
su[i], part_info[3]))                  formats.append({                      'url': video_url, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index c5284fa67..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -180,7 +180,7 @@ class SoundcloudIE(InfoExtractor):                      'format_id': key,                      'url': url,                      'play_path': 'mp3:' + path, -                    'ext': ext, +                    'ext': 'flv',                      'vcodec': 'none',                  }) @@ -200,8 +200,9 @@ class SoundcloudIE(InfoExtractor):                  if f['format_id'].startswith('rtmp'):                      f['protocol'] = 'rtmp' -            self._sort_formats(formats) -            result['formats'] = formats +        self._check_formats(formats, track_id) +        self._sort_formats(formats) +        result['formats'] = formats          return result @@ -241,7 +242,7 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE): -    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'      
IE_NAME = 'soundcloud:set'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -286,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE):  class SoundcloudUserIE(SoundcloudIE): -    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'      IE_NAME = 'soundcloud:user'      _TESTS = [{          'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py new file mode 100644 index 000000000..13101c714 --- /dev/null +++ b/youtube_dl/extractor/ssa.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    unescapeHTML, +    parse_duration, +) + + +class SSAIE(InfoExtractor): +    _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' +    _TEST = { +        'url': 'http://ssa.nls.uk/film/3561', +        'info_dict': { +            'id': '3561', +            'ext': 'flv', +            'title': 'SHETLAND WOOL', +            'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', +            'duration': 900, +            'thumbnail': 're:^https?://.*\.jpg$', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        streamer = self._search_regex( +            r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') +        play_path = self._search_regex( +            r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + +        def search_field(field_name, fatal=False): +            return self._search_regex( +                r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, + 
               webpage, 'title', fatal=fatal) + +        title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') +        description = unescapeHTML(search_field('Description')) +        duration = parse_duration(search_field('Running time')) +        thumbnail = self._search_regex( +            r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + +        return { +            'id': video_id, +            'url': streamer, +            'play_path': play_path, +            'ext': 'flv', +            'title': title, +            'description': description, +            'duration': duration, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 5793dbc10..a46a7ecba 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -53,10 +53,10 @@ class TeamcocoIE(InfoExtractor):          embed = self._download_webpage(              embed_url, video_id, 'Downloading embed page') -        encoded_data = self._search_regex( -            r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') +        player_data = self._parse_json(self._search_regex( +            r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id)          data = self._parse_json( -            base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) +            base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)          formats = []          get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor):              error_msg = next(                  n.attrib['abstract']                  for n in 
meta.findall(_x('.//smil:ref')) -                if n.attrib.get('title') == 'Geographic Restriction') +                if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')          except StopIteration:              pass          else: diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 9a53a3c74..e83e31a31 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -16,6 +16,7 @@ class TVPlayIE(InfoExtractor):      _VALID_URL = r'''(?x)http://(?:www\.)?          (?:tvplay\.lv/parraides|             tv3play\.lt/programos| +           play\.tv3\.lt/programos|             tv3play\.ee/sisu|             tv3play\.se/program|             tv6play\.se/program| @@ -45,7 +46,7 @@ class TVPlayIE(InfoExtractor):              },          },          { -            'url': 'http://www.tv3play.lt/programos/moterys-meluoja-geriau/409229?autostart=true', +            'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true',              'info_dict': {                  'id': '409229',                  'ext': 'flv', diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py new file mode 100644 index 000000000..d6c0ab184 --- /dev/null +++ b/youtube_dl/extractor/twentytwotracks.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + +# 22Tracks regularly replace the audio tracks that can be streamed on their +# site. The tracks usually expire after 1 months, so we can't add tests. 
+ + +class TwentyTwoTracksIE(InfoExtractor): +    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' +    IE_NAME = '22tracks:track' + +    _API_BASE = 'http://22tracks.com/api' + +    def _extract_info(self, city, genre_name, track_id=None): +        item_id = track_id if track_id else genre_name + +        cities = self._download_json( +            '%s/cities' % self._API_BASE, item_id, +            'Downloading cities info', +            'Unable to download cities info') +        city_id = [x['id'] for x in cities if x['slug'] == city][0] + +        genres = self._download_json( +            '%s/genres/%s' % (self._API_BASE, city_id), item_id, +            'Downloading %s genres info' % city, +            'Unable to download %s genres info' % city) +        genre = [x for x in genres if x['slug'] == genre_name][0] +        genre_id = genre['id'] + +        tracks = self._download_json( +            '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, +            'Downloading %s genre tracks info' % genre_name, +            'Unable to download track info') + +        return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] + +    def _get_track_url(self, filename, track_id): +        token = self._download_json( +            'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, +            track_id, 'Downloading token', 'Unable to download token') +        return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) + +    def _extract_track_info(self, track_info, track_id): +        download_url = self._get_track_url(track_info['filename'], track_id) +        title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) +        return { +            'id': track_id, +            'url': download_url, +            'ext': 'mp3', +            'title': title, +            'duration': int_or_none(track_info.get('duration')), +   
         'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) +        } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        city = mobj.group('city') +        genre = mobj.group('genre') +        track_id = mobj.group('id') + +        track_info = self._extract_info(city, genre, track_id) +        return self._extract_track_info(track_info, track_id) + + +class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): +    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' +    IE_NAME = '22tracks:genre' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        city = mobj.group('city') +        genre = mobj.group('genre') + +        genre_title, tracks = self._extract_info(city, genre) + +        entries = [ +            self._extract_track_info(track_info, track_info['id']) +            for track_info in tracks] + +        return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4b0ce54df..94bd6345d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -23,6 +23,8 @@ class TwitchBaseIE(InfoExtractor):      _API_BASE = 'https://api.twitch.tv'      _USHER_BASE = 'http://usher.twitch.tv'      _LOGIN_URL = 'https://secure.twitch.tv/user/login' +    _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login' +    _NETRC_MACHINE = 'twitch'      def _handle_error(self, response):          if not isinstance(response, dict): @@ -66,14 +68,14 @@ class TwitchBaseIE(InfoExtractor):              'authenticity_token': authenticity_token,              'redirect_on_login': '',              'embed_form': 'false', -            'mp_source_action': '', +            'mp_source_action': 'login-button',              'follow': '', -            'user[login]': username, -            'user[password]': password, +            'login': username, +            
'password': password,          }          request = compat_urllib_request.Request( -            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) +            self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))          request.add_header('Referer', self._LOGIN_URL)          response = self._download_webpage(              request, None, 'Logging in as %s' % username) @@ -84,6 +86,14 @@ class TwitchBaseIE(InfoExtractor):              raise ExtractorError(                  'Unable to login: %s' % m.group('msg').strip(), expected=True) +    def _prefer_source(self, formats): +        try: +            source = next(f for f in formats if f['format_id'] == 'Source') +            source['preference'] = 10 +        except StopIteration: +            pass  # No Source stream present +        self._sort_formats(formats) +  class TwitchItemBaseIE(TwitchBaseIE):      def _download_info(self, item, item_id): @@ -139,7 +149,7 @@ class TwitchItemBaseIE(TwitchBaseIE):  class TwitchVideoIE(TwitchItemBaseIE):      IE_NAME = 'twitch:video' -    _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE +    _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE      _ITEM_TYPE = 'video'      _ITEM_SHORTCUT = 'a' @@ -155,7 +165,7 @@ class TwitchVideoIE(TwitchItemBaseIE):  class TwitchChapterIE(TwitchItemBaseIE):      IE_NAME = 'twitch:chapter' -    _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE +    _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE      _ITEM_TYPE = 'chapter'      _ITEM_SHORTCUT = 'c' @@ -174,7 +184,7 @@ class TwitchChapterIE(TwitchItemBaseIE):  class TwitchVodIE(TwitchItemBaseIE):      IE_NAME = 'twitch:vod' -    _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE +    _VALID_URL = r'%s/[^/]+/v/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE      _ITEM_TYPE = 'vod'      _ITEM_SHORTCUT = 'v' @@ -208,6 +218,7 @@ class 
TwitchVodIE(TwitchItemBaseIE):              '%s/vod/%s?nauth=%s&nauthsig=%s'              % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),              item_id, 'mp4') +        self._prefer_source(formats)          info['formats'] = formats          return info @@ -348,21 +359,14 @@ class TwitchStreamIE(TwitchBaseIE):              'p': random.randint(1000000, 10000000),              'player': 'twitchweb',              'segment_preference': '4', -            'sig': access_token['sig'], -            'token': access_token['token'], +            'sig': access_token['sig'].encode('utf-8'), +            'token': access_token['token'].encode('utf-8'),          } -          formats = self._extract_m3u8_formats(              '%s/api/channel/hls/%s.m3u8?%s' -            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')), +            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),              channel_id, 'mp4') - -        # prefer the 'source' stream, the others are limited to 30 fps -        def _sort_source(f): -            if f.get('m3u8_media') is not None and f['m3u8_media'].get('NAME') == 'Source': -                return 1 -            return 0 -        formats = sorted(formats, key=_sort_source) +        self._prefer_source(formats)          view_count = stream.get('viewers')          timestamp = parse_iso8601(stream.get('created_at')) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py new file mode 100644 index 000000000..96c809eaf --- /dev/null +++ b/youtube_dl/extractor/ultimedia.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    qualities, +    unified_strdate, +    clean_html, +) + + +class UltimediaIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' +   
 _TESTS = [{ +        # news +        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', +        'md5': '276a0e49de58c7e85d32b057837952a2', +        'info_dict': { +            'id': 's8uk0r', +            'ext': 'mp4', +            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', +            'description': 'md5:3e5c8fd65791487333dda5db8aed32af', +            'thumbnail': 're:^https?://.*\.jpg', +            'upload_date': '20150317', +        }, +    }, { +        # music +        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', +        'md5': '2ea3513813cf230605c7e2ffe7eca61c', +        'info_dict': { +            'id': 'xvpfp8', +            'ext': 'mp4', +            'title': "Two - C'est la vie (Clip)", +            'description': 'Two', +            'thumbnail': 're:^https?://.*\.jpg', +            'upload_date': '20150224', +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        deliver_url = self._search_regex( +            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', +            webpage, 'deliver URL') + +        deliver_page = self._download_webpage( +            deliver_url, video_id, 'Downloading iframe page') + +        if '>This video is currently not available' in deliver_page: +            raise ExtractorError( +                'Video %s is currently not available' % video_id, expected=True) + +        player = self._parse_json( +            self._search_regex( +                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), +            video_id) + +        quality = qualities(['flash', 'html5']) +        formats = [] +        for mode in player['modes']: +            video_url = mode.get('config', {}).get('file') +            if not video_url: +                continue +            if 
re.match(r'https?://www\.youtube\.com/.+?', video_url): +                return self.url_result(video_url, 'Youtube') +            formats.append({ +                'url': video_url, +                'format_id': mode.get('type'), +                'quality': quality(mode.get('type')), +            }) +        self._sort_formats(formats) + +        thumbnail = player.get('image') + +        title = clean_html(( +            self._html_search_regex( +                r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', +                webpage, 'title', default=None) or +            self._search_regex( +                r"var\s+nameVideo\s*=\s*'([^']+)'", +                deliver_page, 'title'))) + +        description = clean_html(self._html_search_regex( +            r'(?s)<span>Description</span>(.+?)</p>', webpage, +            'description', fatal=False)) + +        upload_date = unified_strdate(self._search_regex( +            r'Ajouté le\s*<span>([^<]+)', webpage, +            'upload date', fatal=False)) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py new file mode 100644 index 000000000..9369abaf8 --- /dev/null +++ b/youtube_dl/extractor/varzesh3.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Varzesh3IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' 
+    _TEST = { +        'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', +        'md5': '2a933874cb7dce4366075281eb49e855', +        'info_dict': { +            'id': '76337', +            'ext': 'mp4', +            'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', +            'description': 'فصل ۲۰۱۵-۲۰۱۴', +            'thumbnail': 're:^https?://.*\.jpg$', +        } +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        video_url = self._search_regex( +            r'<source[^>]+src="([^"]+)"', webpage, 'video url') + +        title = self._og_search_title(webpage) +        description = self._html_search_regex( +            r'(?s)<div class="matn">(.+?)</div>', +            webpage, 'description', fatal=False) +        thumbnail = self._og_search_thumbnail(webpage) + +        video_id = self._search_regex( +            r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", +            webpage, display_id, default=display_id) + +        return { +            'url': video_url, +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +        } diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py new file mode 100644 index 000000000..6215f0642 --- /dev/null +++ b/youtube_dl/extractor/vessel.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( +    ExtractorError, +    parse_iso8601, +) + + +class VesselIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' 
+    _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' +    _LOGIN_URL = 'https://www.vessel.com/api/account/login' +    _NETRC_MACHINE = 'vessel' +    _TEST = { +        'url': 'https://www.vessel.com/videos/HDN7G5UMs', +        'md5': '455cdf8beb71c6dd797fd2f3818d05c4', +        'info_dict': { +            'id': 'HDN7G5UMs', +            'ext': 'mp4', +            'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', +            'thumbnail': 're:^https?://.*\.jpg$', +            'upload_date': '20150317', +            'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', +            'timestamp': int, +        }, +    } + +    @staticmethod +    def make_json_request(url, data): +        payload = json.dumps(data).encode('utf-8') +        req = compat_urllib_request.Request(url, payload) +        req.add_header('Content-Type', 'application/json; charset=utf-8') +        return req + +    @staticmethod +    def find_assets(data, asset_type): +        for asset in data.get('assets', []): +            if asset.get('type') == asset_type: +                yield asset + +    def _check_access_rights(self, data): +        access_info = data.get('__view', {}) +        if not access_info.get('allow_access', True): +            err_code = access_info.get('error_code') or '' +            if err_code == 'ITEM_PAID_ONLY': +                raise ExtractorError( +                    'This video requires subscription.', expected=True) +            else: +                raise ExtractorError( +                    'Access to this content is restricted. 
(%s said: %s)' % (self.IE_NAME, err_code), expected=True) + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            return +        self.report_login() +        data = { +            'client_id': 'web', +            'type': 'password', +            'user_key': username, +            'password': password, +        } +        login_request = VesselIE.make_json_request(self._LOGIN_URL, data) +        self._download_webpage(login_request, None, False, 'Wrong login info') + +    def _real_initialize(self): +        self._login() + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        data = self._parse_json(self._search_regex( +            r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) +        asset_id = data['model']['data']['id'] + +        req = VesselIE.make_json_request( +            self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) +        data = self._download_json(req, video_id) + +        self._check_access_rights(data) + +        try: +            video_asset = next(VesselIE.find_assets(data, 'video')) +        except StopIteration: +            raise ExtractorError('No video assets found') + +        formats = [] +        for f in video_asset.get('sources', []): +            if f['name'] == 'hls-index': +                formats.extend(self._extract_m3u8_formats( +                    f['location'], video_id, ext='mp4', m3u8_id='m3u8')) +            else: +                formats.append({ +                    'format_id': f['name'], +                    'tbr': f.get('bitrate'), +                    'height': f.get('height'), +                    'width': f.get('width'), +                    'url': f['location'], +                }) +        self._sort_formats(formats) + +        thumbnails = [] +        for im_asset in VesselIE.find_assets(data, 'image'): +            thumbnails.append({ +  
              'url': im_asset['location'], +                'width': im_asset.get('width', 0), +                'height': im_asset.get('height', 0), +            }) + +        return { +            'id': video_id, +            'title': data['title'], +            'formats': formats, +            'thumbnails': thumbnails, +            'description': data.get('short_description'), +            'duration': data.get('duration'), +            'comment_count': data.get('comment_count'), +            'like_count': data.get('like_count'), +            'view_count': data.get('view_count'), +            'timestamp': parse_iso8601(data.get('released_at')), +        } diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 273030316..eb309a7cd 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse, -    compat_urllib_request, -) -from ..utils import ( -    ExtractorError, -    remove_start, -) +from ..compat import compat_urllib_request  class VideoMegaIE(InfoExtractor):      _VALID_URL = r'''(?x)https?://          (?:www\.)?videomega\.tv/ -        (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+) +        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)          '''      _TEST = { -        'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ', +        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',          'md5': 'bf5c2f95c4c917536e80936af7bc51e1',          'info_dict': { -            'id': 'QR0HCUHI1661IHUCH0RQ', +            'id': '4GNA688SU99US886ANG4',              'ext': 'mp4', -            'title': 'Big Buck Bunny', +            'title': 'BigBuckBunny_320x180',              'thumbnail': 're:^https?://.*\.jpg$',          }      } @@ -33,34 +26,24 @@ class VideoMegaIE(InfoExtractor):      def _real_extract(self, url):          video_id = 
self._match_id(url) -        iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id) +        iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id          req = compat_urllib_request.Request(iframe_url)          req.add_header('Referer', url)          webpage = self._download_webpage(req, video_id) -        try: -            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1] -        except IndexError: -            raise ExtractorError('Unable to extract escaped data') - -        playlist = compat_urllib_parse.unquote(escaped_data) - +        title = self._html_search_regex( +            r'<title>(.*?)</title>', webpage, 'title') +        title = re.sub( +            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)          thumbnail = self._search_regex( -            r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) -        video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') -        title = remove_start(self._html_search_regex( -            r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ') - -        formats = [{ -            'format_id': 'sd', -            'url': video_url, -        }] -        self._sort_formats(formats) +            r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) +        video_url = self._search_regex( +            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')          return {              'id': video_id,              'title': title, -            'formats': formats, +            'url': video_url,              'thumbnail': thumbnail,              'http_headers': {                  'Referer': iframe_url, diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 5c89824c1..bd953fb4c 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,7 +1,5 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import (      
int_or_none, @@ -28,12 +26,11 @@ class VidmeIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        video_url = self._html_search_regex(r'<source src="([^"]+)"', webpage, 'video URL') +        video_url = self._html_search_regex( +            r'<source src="([^"]+)"', webpage, 'video URL')          title = self._og_search_title(webpage)          description = self._og_search_description(webpage, default='') @@ -44,13 +41,10 @@ class VidmeIE(InfoExtractor):          duration = float_or_none(self._html_search_regex(              r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))          view_count = str_to_int(self._html_search_regex( -            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) +            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))          like_count = str_to_int(self._html_search_regex(              r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',              webpage, 'like count', fatal=False)) -        comment_count = str_to_int(self._html_search_regex( -            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">', -            webpage, 'comment count', fatal=False))          return {              'id': video_id, @@ -64,5 +58,4 @@ class VidmeIE(InfoExtractor):              'duration': duration,              'view_count': view_count,              'like_count': like_count, -            'comment_count': comment_count,          } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py new file mode 100644 index 000000000..1742e66f4 --- /dev/null +++ b/youtube_dl/extractor/viewster.py @@ -0,0 +1,129 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat 
import compat_urllib_request + + +class ViewsterIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P<id>\d+-\d+-\d+)' +    _TESTS = [{ +        # movielink, paymethod=fre +        'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/', +        'playlist': [{ +            'md5': '8f9d94b282d80c42b378dffdbb11caf3', +            'info_dict': { +                'id': '1293-19341-000-movie', +                'ext': 'flv', +                'title': "'Hout' (Wood) - Movie", +            }, +        }], +        'info_dict': { +            'id': '1293-19341-000', +            'title': "'Hout' (Wood)", +            'description': 'md5:925733185a9242ef96f436937683f33b', +        } +    }, { +        # movielink, paymethod=adv +        'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', +        'playlist': [{ +            'md5': '77a005453ca7396cbe3d35c9bea30aef', +            'info_dict': { +                'id': '1140-11855-000-movie', +                'ext': 'flv', +                'title': "THE LISTENING PROJECT - Movie", +            }, +        }], +        'info_dict': { +            'id': '1140-11855-000', +            'title': "THE LISTENING PROJECT", +            'description': 'md5:714421ae9957e112e672551094bf3b08', +        } +    }, { +        # direct links, no movielink +        'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/', +        'playlist': [{ +            'md5': '0307b7eac6bfb21ab0577a71f6eebd8f', +            'info_dict': { +                'id': '1198-56411-000-trailer', +                'ext': 'mp4', +                'title': "Sinister - Trailer", +            }, +        }, { +            'md5': '80b9ee3ad69fb368f104cb5d9732ae95', +            'info_dict': { +                'id': '1198-56411-000-behind-scenes', +                'ext': 'mp4', +                'title': "Sinister - Behind Scenes", +            }, +        }, { +            'md5': 
'3b3ea897ecaa91fca57a8a94ac1b15c5', +            'info_dict': { +                'id': '1198-56411-000-scene-from-movie', +                'ext': 'mp4', +                'title': "Sinister - Scene from movie", +            }, +        }], +        'info_dict': { +            'id': '1198-56411-000', +            'title': "Sinister", +            'description': 'md5:014c40b0488848de9683566a42e33372', +        } +    }] + +    _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        request = compat_urllib_request.Request( +            'http://api.live.viewster.com/api/v1/movie/%s' % video_id) +        request.add_header('Accept', self._ACCEPT_HEADER) + +        movie = self._download_json( +            request, video_id, 'Downloading movie metadata JSON') + +        title = movie.get('title') or movie['original_title'] +        description = movie.get('synopsis') +        thumbnail = movie.get('large_artwork') or movie.get('artwork') + +        entries = [] +        for clip in movie['play_list']: +            entry = None + +            # movielink api +            link_request = clip.get('link_request') +            if link_request: +                request = compat_urllib_request.Request( +                    'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s&currency=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s' +                    % link_request) +                request.add_header('Accept', self._ACCEPT_HEADER) + +                movie_link = self._download_json( +                    request, video_id, 'Downloading movie link JSON', fatal=False) + +                if movie_link: +                    formats = self._extract_f4m_formats( +                        movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id) +         
           self._sort_formats(formats) +                    entry = { +                        'formats': formats, +                    } + +            # direct link +            clip_url = clip.get('clip_data', {}).get('url') +            if clip_url: +                entry = { +                    'url': clip_url, +                    'ext': 'mp4', +                } + +            if entry: +                entry.update({ +                    'id': '%s-%s' % (video_id, clip['canonical_title']), +                    'title': '%s - %s' % (title, clip['title']), +                }) +                entries.append(entry) + +        playlist = self.playlist_result(entries, video_id, title, description) +        playlist['thumbnail'] = thumbnail +        return playlist diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8f540f578..28bcc89cd 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals  import json  import re  import itertools -import hashlib  from .common import InfoExtractor  from ..compat import ( @@ -20,6 +19,7 @@ from ..utils import (      RegexNotFoundError,      smuggle_url,      std_headers, +    unified_strdate,      unsmuggle_url,      urlencode_postdata,  ) @@ -38,7 +38,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):          self.report_login()          login_url = 'https://vimeo.com/log_in'          webpage = self._download_webpage(login_url, None, False) -        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') +        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')          data = urlencode_postdata({              'email': username,              'password': password, @@ -140,6 +140,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'description': 'md5:8678b246399b070816b12313e8b4eb5c',                  'uploader_id': 'atencio',                  'uploader': 'Peter Atencio', +  
              'upload_date': '20130927',                  'duration': 187,              },          }, @@ -176,17 +177,15 @@ class VimeoIE(VimeoBaseInfoExtractor):          password = self._downloader.params.get('videopassword', None)          if password is None:              raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) -        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') -        data = compat_urllib_parse.urlencode({ +        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token') +        data = urlencode_postdata({              'password': password,              'token': token,          }) -        # I didn't manage to use the password with https -        if url.startswith('https'): -            pass_url = url.replace('https', 'http') -        else: -            pass_url = url -        password_request = compat_urllib_request.Request(pass_url + '/password', data) +        if url.startswith('http://'): +            # vimeo only supports https now, but the user can give an http url +            url = url.replace('http://', 'https://') +        password_request = compat_urllib_request.Request(url + '/password', data)          password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')          password_request.add_header('Cookie', 'xsrft=%s' % token)          return self._download_webpage( @@ -223,12 +222,7 @@ class VimeoIE(VimeoBaseInfoExtractor):          video_id = mobj.group('id')          orig_url = url          if mobj.group('pro') or mobj.group('player'): -            url = 'http://player.vimeo.com/video/' + video_id - -        password = self._downloader.params.get('videopassword', None) -        if password: -            headers['Cookie'] = '%s_password=%s' % ( -                video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) +            url = 'https://player.vimeo.com/video/' + video_id          # Retrieve video 
webpage to extract further information          request = compat_urllib_request.Request(url, None, headers) @@ -250,6 +244,16 @@ class VimeoIE(VimeoBaseInfoExtractor):          # and latter we extract those that are Vimeo specific.          self.report_extraction(video_id) +        vimeo_config = self._search_regex( +            r'vimeo\.config\s*=\s*({.+?});', webpage, +            'vimeo config', default=None) +        if vimeo_config: +            seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) +            if seed_status.get('state') == 'failed': +                raise ExtractorError( +                    '%s returned error: %s' % (self.IE_NAME, seed_status['title']), +                    expected=True) +          # Extract the config JSON          try:              try: @@ -323,9 +327,9 @@ class VimeoIE(VimeoBaseInfoExtractor):          # Extract upload date          video_upload_date = None -        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage) +        mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage)          if mobj is not None: -            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) +            video_upload_date = unified_strdate(mobj.group(1))          try:              view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -379,7 +383,7 @@ class VimeoIE(VimeoBaseInfoExtractor):              for tt in text_tracks:                  subtitles[tt['lang']] = [{                      'ext': 'vtt', -                    'url': 'http://vimeo.com' + tt['url'], +                    'url': 'https://vimeo.com' + tt['url'],                  }]          return { @@ -402,11 +406,11 @@ class VimeoIE(VimeoBaseInfoExtractor):  class VimeoChannelIE(InfoExtractor):      IE_NAME = 'vimeo:channel' -    _VALID_URL = r'https?://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' +    _VALID_URL = 
r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'      _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'      _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'      _TESTS = [{ -        'url': 'http://vimeo.com/channels/tributes', +        'url': 'https://vimeo.com/channels/tributes',          'info_dict': {              'id': 'tributes',              'title': 'Vimeo Tributes', @@ -435,10 +439,10 @@ class VimeoChannelIE(InfoExtractor):              name="([^"]+)"\s+              value="([^"]*)"              ''', login_form)) -        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') +        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')          fields['token'] = token          fields['password'] = password -        post = compat_urllib_parse.urlencode(fields) +        post = urlencode_postdata(fields)          password_path = self._search_regex(              r'action="([^"]+)"', login_form, 'password URL')          password_url = compat_urlparse.urljoin(page_url, password_path) @@ -465,7 +469,7 @@ class VimeoChannelIE(InfoExtractor):              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:                  break -        entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') +        entries = [self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')                     for video_id in video_ids]          return {'_type': 'playlist',                  'id': list_id, @@ -476,15 +480,15 @@ class VimeoChannelIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          channel_id = mobj.group('id') -        return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) +        return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id)  class VimeoUserIE(VimeoChannelIE):      IE_NAME = 'vimeo:user' -    _VALID_URL = 
r'https?://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)' +    _VALID_URL = r'https://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'      _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'      _TESTS = [{ -        'url': 'http://vimeo.com/nkistudio/videos', +        'url': 'https://vimeo.com/nkistudio/videos',          'info_dict': {              'title': 'Nki',              'id': 'nkistudio', @@ -495,15 +499,15 @@ class VimeoUserIE(VimeoChannelIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          name = mobj.group('name') -        return self._extract_videos(name, 'http://vimeo.com/%s' % name) +        return self._extract_videos(name, 'https://vimeo.com/%s' % name)  class VimeoAlbumIE(VimeoChannelIE):      IE_NAME = 'vimeo:album' -    _VALID_URL = r'https?://vimeo\.com/album/(?P<id>\d+)' +    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'      _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'      _TESTS = [{ -        'url': 'http://vimeo.com/album/2632481', +        'url': 'https://vimeo.com/album/2632481',          'info_dict': {              'id': '2632481',              'title': 'Staff Favorites: November 2013', @@ -527,14 +531,14 @@ class VimeoAlbumIE(VimeoChannelIE):      def _real_extract(self, url):          album_id = self._match_id(url) -        return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id) +        return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id)  class VimeoGroupsIE(VimeoAlbumIE):      IE_NAME = 'vimeo:group' -    _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)' +    _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)'      _TESTS = [{ -        'url': 'http://vimeo.com/groups/rolexawards', +        'url': 'https://vimeo.com/groups/rolexawards',          'info_dict': {              'id': 'rolexawards',              'title': 'Rolex Awards for Enterprise', @@ -548,13 +552,13 @@ 
class VimeoGroupsIE(VimeoAlbumIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          name = mobj.group('name') -        return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name) +        return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)  class VimeoReviewIE(InfoExtractor):      IE_NAME = 'vimeo:review'      IE_DESC = 'Review pages on vimeo' -    _VALID_URL = r'https?://vimeo\.com/[^/]+/review/(?P<id>[^/]+)' +    _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'      _TESTS = [{          'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',          'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -566,7 +570,7 @@ class VimeoReviewIE(InfoExtractor):          }      }, {          'note': 'video player needs Referer', -        'url': 'http://vimeo.com/user22258446/review/91613211/13f927e053', +        'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',          'md5': '6295fdab8f4bf6a002d058b2c6dce276',          'info_dict': {              'id': '91613211', @@ -588,11 +592,11 @@ class VimeoReviewIE(InfoExtractor):  class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):      IE_NAME = 'vimeo:watchlater'      IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' -    _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater' +    _VALID_URL = r'https://vimeo\.com/home/watchlater|:vimeowatchlater'      _LOGIN_REQUIRED = True      _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'      _TESTS = [{ -        'url': 'http://vimeo.com/home/watchlater', +        'url': 'https://vimeo.com/home/watchlater',          'only_matching': True,      }] @@ -612,7 +616,7 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):  class VimeoLikesIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' +    _VALID_URL = 
r'https://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'      IE_NAME = 'vimeo:likes'      IE_DESC = 'Vimeo user likes'      _TEST = { @@ -640,8 +644,8 @@ class VimeoLikesIE(InfoExtractor):          description = self._html_search_meta('description', webpage)          def _get_page(idx): -            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( -                self.http_scheme(), user_id, idx + 1) +            page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % ( +                user_id, idx + 1)              webpage = self._download_webpage(                  page_url, user_id,                  note='Downloading page %d/%d' % (idx + 1, page_count)) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0b58fe0fe..c3187cfeb 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -33,14 +33,13 @@ class VineIE(InfoExtractor):              r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))          formats = [{ -            'url': data['videoLowURL'], -            'ext': 'mp4', -            'format_id': 'low', -        }, { -            'url': data['videoUrl'], -            'ext': 'mp4', -            'format_id': 'standard', -        }] +            'format_id': '%(format)s-%(rate)s' % f, +            'vcodec': f['format'], +            'quality': f['rate'], +            'url': f['videoUrl'], +        } for f in data['videoUrls'] if f.get('rate')] + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 7dea8c59d..cc384adbf 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -31,7 +31,7 @@ class VKIE(InfoExtractor):                  'id': '162222515',                  'ext': 'flv',                  'title': 'ProtivoGunz - Хуёвая песня', -                'uploader': 're:Noize MC.*', +                'uploader': 're:(?:Noize 
MC|Alexander Ilyashenko).*',                  'duration': 195,                  'upload_date': '20120212',              }, @@ -140,7 +140,7 @@ class VKIE(InfoExtractor):          if not video_id:              video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) -        info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id +        info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id          info_page = self._download_webpage(info_url, video_id)          ERRORS = { @@ -152,7 +152,10 @@ class VKIE(InfoExtractor):              'use --username and --password options to provide account credentials.',              r'<!>Unknown error': -            'Video %s does not exist.' +            'Video %s does not exist.', + +            r'<!>Видео временно недоступно': +            'Video %s is temporarily unavailable.',          }          for error_re, error_msg in ERRORS.items(): diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def base64_decode_utf8(data): +        return base64.b64decode(data.encode('utf-8')).decode('utf-8') + +    @staticmethod +    def base64_encode_utf8(data): +        return base64.b64encode(data.encode('utf-8')).decode('utf-8') +      def _extract_flv_config(self, media_id): -        base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') +        base64_media_id = self.base64_encode_utf8(media_id)          flv_config = self._download_xml(              'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,              'flv config')          prop_dict = {}          for prop in flv_config.findall('./property'): -            prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') +            prop_id = 
self.base64_decode_utf8(prop.attrib['id'])              # CDATA may be empty in flv config              if not prop.text:                  continue -            encoded_content = base64.b64decode(prop.text).decode('utf-8') +            encoded_content = self.base64_decode_utf8(prop.text)              prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)          return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import (      int_or_none,  ) +from .nbc import NBCSportsVPlayerIE +  class YahooIE(InfoExtractor):      IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor):          }, {              'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',              'only_matching': True, +        }, { +            'note': 'NBC Sports embeds', +            'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', +            'info_dict': { +                'id': '9CsDKds0kvHI', +                'ext': 'flv', +                'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', +                'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', +            }          }      ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor):                  items = json.loads(items_json)                  video_id = items[0]['id']                  return self._get_info(video_id, display_id, webpage) +        # Look for NBCSports iframes +        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) +        if nbc_sports_url: +            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')          items_json = self._search_regex(              r'mediaItems: ({.*?})$', webpage, 
'items', flags=re.MULTILINE, diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index b294767c5..19f8762ae 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -8,6 +8,7 @@ from ..compat import compat_urlparse  from ..utils import (      float_or_none,      month_by_abbreviation, +    ExtractorError,  ) @@ -28,23 +29,45 @@ class YamIE(InfoExtractor):          }      }, {          # An external video hosted on YouTube -        'url': 'http://mymedia.yam.com/m/3598173', -        'md5': '0238ceec479c654e8c2f1223755bf3e9', +        'url': 'http://mymedia.yam.com/m/3599430', +        'md5': '03127cf10d8f35d120a9e8e52e3b17c6',          'info_dict': { -            'id': 'pJ2Deys283c', +            'id': 'CNpEoQlrIgA',              'ext': 'mp4', -            'upload_date': '20150202', +            'upload_date': '20150306',              'uploader': '新莊社大瑜伽社', -            'description': 'md5:f5cc72f0baf259a70fb731654b0d2eff', +            'description': 'md5:11e2e405311633ace874f2e6226c8b17',              'uploader_id': '2323agoy', -            'title': '外婆的澎湖灣KTV-潘安邦', -        } +            'title': '20090412陽明山二子坪-1', +        }, +        'skip': 'Video does not exist', +    }, { +        'url': 'http://mymedia.yam.com/m/3598173', +        'info_dict': { +            'id': '3598173', +            'ext': 'mp4', +        }, +        'skip': 'cause Yam system error', +    }, { +        'url': 'http://mymedia.yam.com/m/3599437', +        'info_dict': { +            'id': '3599437', +            'ext': 'mp4', +        }, +        'skip': 'invalid YouTube URL',      }]      def _real_extract(self, url):          video_id = self._match_id(url)          page = self._download_webpage(url, video_id) +        # Check for errors +        system_msg = self._html_search_regex( +            r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message', +            default=None) +        if system_msg: +            raise 
ExtractorError(system_msg, expected=True) +          # Is it hosted externally on YouTube?          youtube_url = self._html_search_regex(              r'<embed src="(http://www.youtube.com/[^"]+)"', diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py new file mode 100644 index 000000000..f4c0f5702 --- /dev/null +++ b/youtube_dl/extractor/yandexmusic.py @@ -0,0 +1,127 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re +import hashlib + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    float_or_none, +) + + +class YandexMusicBaseIE(InfoExtractor): +    def _get_track_url(self, storage_dir, track_id): +        data = self._download_json( +            'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' +            % storage_dir, +            track_id, 'Downloading track location JSON') + +        key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() +        storage = storage_dir.split('.') + +        return ('http://%s/get-mp3/%s/%s?track-id=%s&from=service-10-track&similarities-experiment=default' +                % (data['host'], key, data['ts'] + data['path'], storage[1])) + +    def _get_track_info(self, track): +        return { +            'id': track['id'], +            'ext': 'mp3', +            'url': self._get_track_url(track['storageDir'], track['id']), +            'title': '%s - %s' % (track['artists'][0]['name'], track['title']), +            'filesize': int_or_none(track.get('fileSize')), +            'duration': float_or_none(track.get('durationMs'), 1000), +        } + + +class YandexMusicTrackIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:track' +    IE_DESC = 'Яндекс.Музыка - Трек' +    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' + +    _TEST = { +        'url': 
'http://music.yandex.ru/album/540508/track/4878838', +        'md5': 'f496818aa2f60b6c0062980d2e00dc20', +        'info_dict': { +            'id': '4878838', +            'ext': 'mp3', +            'title': 'Carlo Ambrosio - Gypsy Eyes 1', +            'filesize': 4628061, +            'duration': 193.04, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        album_id, track_id = mobj.group('album_id'), mobj.group('id') + +        track = self._download_json( +            'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), +            track_id, 'Downloading track JSON')['track'] + +        return self._get_track_info(track) + + +class YandexMusicAlbumIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:album' +    IE_DESC = 'Яндекс.Музыка - Альбом' +    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)' + +    _TEST = { +        'url': 'http://music.yandex.ru/album/540508', +        'info_dict': { +            'id': '540508', +            'title': 'Carlo Ambrosio - Gypsy Soul (2009)', +        }, +        'playlist_count': 50, +    } + +    def _real_extract(self, url): +        album_id = self._match_id(url) + +        album = self._download_json( +            'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, +            album_id, 'Downloading album JSON') + +        entries = [self._get_track_info(track) for track in album['volumes'][0]] + +        title = '%s - %s' % (album['artists'][0]['name'], album['title']) +        year = album.get('year') +        if year: +            title += ' (%s)' % year + +        return self.playlist_result(entries, compat_str(album['id']), title) + + +class YandexMusicPlaylistIE(YandexMusicBaseIE): +    IE_NAME = 'yandexmusic:playlist' +    IE_DESC = 'Яндекс.Музыка - Плейлист' +    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' + +    _TEST = { +        'url': 
'http://music.yandex.ru/users/music.partners/playlists/1245', +        'info_dict': { +            'id': '1245', +            'title': 'Что слушают Enter Shikari', +            'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', +        }, +        'playlist_count': 6, +    } + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) + +        webpage = self._download_webpage(url, playlist_id) + +        playlist = self._parse_json( +            self._search_regex( +                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), +            playlist_id)['pageData']['playlist'] + +        entries = [self._get_track_info(track) for track in playlist['tracks']] + +        return self.playlist_result( +            entries, compat_str(playlist_id), +            playlist['title'], playlist.get('description')) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 107c9ac36..6abe72f73 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -47,11 +47,12 @@ class YouPornIE(InfoExtractor):          # Get JSON parameters          json_params = self._search_regex( -            r'var currentVideo = new Video\((.*)\)[,;]', +            [r'var\s+videoJa?son\s*=\s*({.+?});', +             r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'],              webpage, 'JSON parameters')          try:              params = json.loads(json_params) -        except: +        except ValueError:              raise ExtractorError('Invalid JSON')          self.report_extraction(video_id) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3690f8021..5488101e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          return self.playlist_result(url_results, playlist_id, title) -    def _real_extract(self, url): -        # Extract playlist id -        mobj = 
re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) -        playlist_id = mobj.group(1) or mobj.group(2) - -        # Check if it's a video-specific URL -        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) -        if 'v' in query_dict: -            video_id = query_dict['v'][0] -            if self._downloader.params.get('noplaylist'): -                self.to_screen('Downloading just video %s because of --no-playlist' % video_id) -                return self.url_result(video_id, 'Youtube', video_id=video_id) -            else: -                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - -        if playlist_id.startswith('RD') or playlist_id.startswith('UL'): -            # Mixes require a custom extraction process -            return self._extract_mix(playlist_id) - +    def _extract_playlist(self, playlist_id):          url = self._TEMPLATE_URL % playlist_id          page = self._download_webpage(url, playlist_id)          more_widget_html = content_html = page @@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          url_results = self._ids_to_results(ids)          return self.playlist_result(url_results, playlist_id, playlist_title) +    def _real_extract(self, url): +        # Extract playlist id +        mobj = re.match(self._VALID_URL, url) +        if mobj is None: +            raise ExtractorError('Invalid URL: %s' % url) +        playlist_id = mobj.group(1) or mobj.group(2) + +        # Check if it's a video-specific URL +        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +        if 'v' in query_dict: +            video_id = query_dict['v'][0] +            if self._downloader.params.get('noplaylist'): +                self.to_screen('Downloading just video %s because of --no-playlist' % video_id) +                return 
self.url_result(video_id, 'Youtube', video_id=video_id) +            else: +                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + +        if playlist_id.startswith('RD') or playlist_id.startswith('UL'): +            # Mixes require a custom extraction process +            return self._extract_mix(playlist_id) + +        return self._extract_playlist(playlist_id) +  class YoutubeChannelIE(InfoExtractor):      IE_DESC = 'YouTube.com channels' @@ -1532,7 +1535,7 @@ class YoutubeSearchURLIE(InfoExtractor):          webpage = self._download_webpage(url, query)          result_code = self._search_regex( -            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML') +            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')          part_codes = re.findall(              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) @@ -1643,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): +    IE_NAME = 'youtube:recommended'      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'      _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'      
_FEED_NAME = 'recommended'      _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): +    IE_NAME = 'youtube:watchlater'      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' -    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' -    _FEED_NAME = 'watch_later' -    _PLAYLIST_TITLE = 'Youtube Watch Later' -    _PERSONAL_FEED = True +    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + +    _TESTS = []  # override PlaylistIE tests + +    def _real_extract(self, url): +        return self._extract_playlist('WL')  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): +    IE_NAME = 'youtube:history'      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'      _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'      _FEED_NAME = 'history' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index a2ffe96bc..35c7e5fb3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -195,6 +195,12 @@ def parseOpts(overrideArguments=None):          action='store_const', const='::', dest='source_address',          help='Make all connections via IPv6 (experimental)',      ) +    network.add_option( +        '--cn-verification-proxy', +        dest='cn_verification_proxy', default=None, metavar='URL', +        help='Use this proxy to verify the IP address for some Chinese sites. ' +        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. 
(experimental)' +    )      selection = optparse.OptionGroup(parser, 'Video Selection')      selection.add_option( @@ -435,8 +441,12 @@ def parseOpts(overrideArguments=None):      downloader.add_option(          '--external-downloader',          dest='external_downloader', metavar='COMMAND', -        help='(experimental) Use the specified external downloader. ' +        help='Use the specified external downloader. '               'Currently supports %s' % ','.join(list_external_downloaders())) +    downloader.add_option( +        '--external-downloader-args', +        dest='external_downloader_args', metavar='ARGS', +        help='Give these arguments to the external downloader.')      workarounds = optparse.OptionGroup(parser, 'Workarounds')      workarounds.add_option( @@ -553,7 +563,7 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='verbose', default=False,          help='print various debugging information')      verbosity.add_option( -        '--dump-intermediate-pages', +        '--dump-pages', '--dump-intermediate-pages',          action='store_true', dest='dump_intermediate_pages', default=False,          help='print downloaded pages to debug problems (very verbose)')      verbosity.add_option( @@ -726,6 +736,15 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='addmetadata', default=False,          help='write metadata to the video file')      postproc.add_option( +        '--metadata-from-title', +        metavar='FORMAT', dest='metafromtitle', +        help='parse additional metadata like song title / artist from the video title. ' +             'The format syntax is the same as --output, ' +             'the parsed parameters replace existing values. ' +             'Additional templates: %(album), %(artist). 
' +             'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' +             '"Coldplay - Paradise"') +    postproc.add_option(          '--xattrs',          action='store_true', dest='xattrs', default=False,          help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)') @@ -775,6 +794,11 @@ def parseOpts(overrideArguments=None):              write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')      else:          command_line_conf = sys.argv[1:] +        # Workaround for Python 2.x, where argv is a byte list +        if sys.version_info < (3,): +            command_line_conf = [ +                a.decode('utf-8', 'replace') for a in command_line_conf] +          if '--ignore-config' in command_line_conf:              system_conf = []              user_conf = [] diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 708df3dd4..f39acadce 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -15,6 +15,7 @@ from .ffmpeg import (  )  from .xattrpp import XAttrMetadataPP  from .execafterdownload import ExecAfterDownloadPP +from .metadatafromtitle import MetadataFromTitlePP  def get_postprocessor(key): @@ -34,5 +35,6 @@ __all__ = [      'FFmpegPostProcessor',      'FFmpegSubtitlesConvertorPP',      'FFmpegVideoConvertorPP', +    'MetadataFromTitlePP',      'XAttrMetadataPP',  ] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 30094c2f3..55adf9685 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals  import io  import os  import subprocess -import sys  import time @@ -269,19 +268,17 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):              else:                  self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)                  
self.run_ffmpeg(path, new_path, acodec, more_opts) -        except: -            etype, e, tb = sys.exc_info() -            if isinstance(e, AudioConversionError): -                msg = 'audio conversion failed: ' + e.msg -            else: -                msg = 'error running ' + self.basename -            raise PostProcessingError(msg) +        except AudioConversionError as e: +            raise PostProcessingError( +                'audio conversion failed: ' + e.msg) +        except Exception: +            raise PostProcessingError('error running ' + self.basename)          # Try to update the date time for extracted audio file.          if information.get('filetime') is not None:              try:                  os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) -            except: +            except Exception:                  self._downloader.report_warning('Cannot update utime of audio file')          information['filepath'] = new_path @@ -545,7 +542,9 @@ class FFmpegMetadataPP(FFmpegPostProcessor):              metadata['title'] = info['title']          if info.get('upload_date') is not None:              metadata['date'] = info['upload_date'] -        if info.get('uploader') is not None: +        if info.get('artist') is not None: +            metadata['artist'] = info['artist'] +        elif info.get('uploader') is not None:              metadata['artist'] = info['uploader']          elif info.get('uploader_id') is not None:              metadata['artist'] = info['uploader_id'] @@ -554,6 +553,8 @@ class FFmpegMetadataPP(FFmpegPostProcessor):              metadata['comment'] = info['description']          if info.get('webpage_url') is not None:              metadata['purl'] = info['webpage_url'] +        if info.get('album') is not None: +            metadata['album'] = info['album']          if not metadata:              self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') diff --git 
a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py new file mode 100644 index 000000000..5019433d3 --- /dev/null +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import re + +from .common import PostProcessor +from ..utils import PostProcessingError + + +class MetadataFromTitlePPError(PostProcessingError): +    pass + + +class MetadataFromTitlePP(PostProcessor): +    def __init__(self, downloader, titleformat): +        super(MetadataFromTitlePP, self).__init__(downloader) +        self._titleformat = titleformat +        self._titleregex = self.format_to_regex(titleformat) + +    def format_to_regex(self, fmt): +        """ +        Converts a string like +           '%(title)s - %(artist)s' +        to a regex like +           '(?P<title>.+)\ \-\ (?P<artist>.+)' +        """ +        lastpos = 0 +        regex = "" +        # replace %(..)s with regex group and escape other string parts +        for match in re.finditer(r'%\((\w+)\)s', fmt): +            regex += re.escape(fmt[lastpos:match.start()]) +            regex += r'(?P<' + match.group(1) + '>.+)' +            lastpos = match.end() +        if lastpos < len(fmt): +            regex += re.escape(fmt[lastpos:len(fmt)]) +        return regex + +    def run(self, info): +        title = info['title'] +        match = re.match(self._titleregex, title) +        if match is None: +            raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) +        for attribute, value in match.groupdict().items(): +            value = match.group(attribute) +            info[attribute] = value +            self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value) + +        return True, info diff --git a/youtube_dl/update.py b/youtube_dl/update.py index d8be4049f..de3169eef 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -65,7 +65,7 @@ 
def update_self(to_screen, verbose):      # Check if there is a new version      try:          newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() -    except: +    except Exception:          if verbose:              to_screen(compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t find the current version. Please try again later.') @@ -78,7 +78,7 @@ def update_self(to_screen, verbose):      try:          versions_info = opener.open(JSON_URL).read().decode('utf-8')          versions_info = json.loads(versions_info) -    except: +    except Exception:          if verbose:              to_screen(compat_str(traceback.format_exc()))          to_screen('ERROR: can\'t obtain versions info. Please try again later.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d4938ec36..90e0ed9ab 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -75,7 +75,7 @@ def preferredencoding():      try:          pref = locale.getpreferredencoding()          'TEST'.encode(pref) -    except: +    except Exception:          pref = 'UTF-8'      return pref @@ -127,7 +127,7 @@ def write_json_file(obj, fn):              except OSError:                  pass          os.rename(tf.name, fn) -    except: +    except Exception:          try:              os.remove(tf.name)          except OSError: @@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):              raise          # In case of error, try to remove win32 forbidden chars -        alt_filename = os.path.join( -            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) -            for path_part in os.path.split(filename) -        ) +        alt_filename = sanitize_path(filename)          if alt_filename == filename:              raise          else:              # An exception here should be caught in the caller -            stream = open(encodeFilename(filename), open_mode) +            stream = open(encodeFilename(alt_filename), open_mode)              return (stream, 
alt_filename) @@ -305,11 +302,37 @@ def sanitize_filename(s, restricted=False, is_id=False):              result = result[2:]          if result.startswith('-'):              result = '_' + result[len('-'):] +        result = result.lstrip('.')          if not result:              result = '_'      return result +def sanitize_path(s): +    """Sanitizes and normalizes path on Windows""" +    if sys.platform != 'win32': +        return s +    drive, _ = os.path.splitdrive(s) +    unc, _ = os.path.splitunc(s) +    unc_or_drive = unc or drive +    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep) +    if unc_or_drive: +        norm_path.pop(0) +    sanitized_path = [ +        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) +        for path_part in norm_path] +    if unc_or_drive: +        sanitized_path.insert(0, unc_or_drive + os.path.sep) +    return os.path.join(*sanitized_path) + + +def sanitize_url_path_consecutive_slashes(url): +    """Collapses consecutive slashes in URLs' path""" +    parsed_url = list(compat_urlparse.urlparse(url)) +    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2]) +    return compat_urlparse.urlunparse(parsed_url) + +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] @@ -325,7 +348,7 @@ def _htmlentity_transform(entity):      if entity in compat_html_entities.name2codepoint:          return compat_chr(compat_html_entities.name2codepoint[entity]) -    mobj = re.match(r'#(x?[0-9]+)', entity) +    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)      if mobj is not None:          numstr = mobj.group(1)          if numstr.startswith('x'): @@ -1767,3 +1790,24 @@ def match_filter_func(filter_str):              video_title = info_dict.get('title', info_dict.get('id', 'video'))              return '%s does not pass filter %s, skipping ..' 
% (video_title, filter_str)      return _match_func + + +class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): +    def __init__(self, proxies=None): +        # Set default handlers +        for type in ('http', 'https'): +            setattr(self, '%s_open' % type, +                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: +                        meth(r, proxy, type)) +        return compat_urllib_request.ProxyHandler.__init__(self, proxies) + +    def proxy_open(self, req, proxy, type): +        req_proxy = req.headers.get('Ytdl-request-proxy') +        if req_proxy is not None: +            proxy = req_proxy +            del req.headers['Ytdl-request-proxy'] + +        if proxy == '__noproxy__': +            return None  # No Proxy +        return compat_urllib_request.ProxyHandler.proxy_open( +            self, req, proxy, type) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5582348ba..dd93e295a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.02.28' +__version__ = '2015.03.28' | 
