diff options
99 files changed, 2121 insertions, 689 deletions
| diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 70f6b51ed..fc221594f 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@  --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.15** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.05.01**  ### Before submitting an *issue* make sure you have:  - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>  [debug] User config: []  [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']  [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.04.15 +[debug] youtube-dl version 2017.05.01  [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2  [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4  [debug] Proxy map: {} diff --git a/.gitignore b/.gitignore index 9ce4b5e2d..a5b585f43 100644 --- a/.gitignore +++ b/.gitignore @@ -35,8 +35,8 @@ updates_key.pem  *.mkv  *.swf  *.part +*.ytdl  *.swp -test/testdata  test/local_parameters.json  .tox  youtube-dl.zsh @@ -211,3 +211,4 @@ Juanjo Benages  Xiao Di 
Guan  Thomas Winant  Daniel Twardowski +Jeremie Jarosh @@ -1,3 +1,129 @@ +version <unreleased> + +Extractors ++ [cda] Support birthday verification (#12789) +* [leeco] Fix extraction (#12974) + + +version 2017.05.01 + +Core ++ [extractor/common] Extract view count from JSON-LD +* [utils] Improve unified_timestamp ++ [utils] Add video/mp2t to mimetype2ext +* [downloader/external] Properly handle live stream downloading cancellation +  (#8932) ++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906) + +Extractors +* [infoq] Make audio format extraction non fatal (#12938) +* [brightcove] Allow whitespace around attribute names in embedded code ++ [zaq1] Add support for zaq1.pl (#12693) ++ [xvideos] Extract duration (#12828) +* [vevo] Fix extraction (#12879) ++ [noovo] Add support for noovo.ca (#12792) ++ [washingtonpost] Add support for embeds (#12699) +* [yandexmusic:playlist] Fix extraction for python 3 (#12888) +* [anvato] Improve extraction (#12913) +    * Promote to regular shortcut based extractor +    * Add mcp to access key mapping table +    * Add support for embeds extraction +    * Add support for anvato embeds in generic extractor +* [xtube] Fix extraction for older FLV videos (#12734) +* [tvplayer] Fix extraction (#12908) + + +version 2017.04.28 + +Core ++ [adobepass] Use geo verification headers for all requests +- [downloader/fragment] Remove assert for resume_len when no fragments +  downloaded ++ [extractor/common] Add manifest_url for explicit group rendition formats +* [extractor/common] Fix manifest_url for m3u8 formats +- [extractor/common] Don't list master m3u8 playlists in format list (#12832) + +Extractors +* [aenetworks] Fix extraction for shows with single season ++ [go] Add support for Disney, DisneyJunior and DisneyXD show pages +* [youtube] Recognize new locale-based player URLs (#12885) ++ [streamable] Add support for new embedded URL schema (#12844) +* [arte:+7] Relax URL regular expression (#12837) + + +version 
2017.04.26 + +Core +* Introduce --keep-fragments for keeping fragments of fragmented download +  on disk after download is finished +* [YoutubeDL] Fix output template for missing timestamp (#12796) +* [socks] Handle cases where credentials are required but missing +* [extractor/common] Improve HLS extraction (#12211) +    * Extract m3u8 parsing to separate method +    * Improve rendition groups extraction +    * Build stream name according to stream GROUP-ID +    * Ignore reference to AUDIO group without URI when stream has no CODECS +    * Use float for scaled tbr in _parse_m3u8_formats +* [utils] Add support for TTML styles in dfxp2srt +* [downloader/hls] No need to download keys for fragments that have been +  already downloaded +* [downloader/fragment] Improve fragment downloading +    * Resume immediately +    * Don't concatenate fragments and decrypt them on every resume +    * Optimize disk storage usage, don't store intermediate fragments on disk +    * Store bookkeeping download state file ++ [extractor/common] Add support for multiple getters in try_get ++ [extractor/common] Add support for video of WebPage context in _json_ld +  (#12778) ++ [extractor/common] Relax JWPlayer regular expression and remove +  duplicate URLs (#12768) + +Extractors +* [iqiyi] Fix extraction of Yule videos +* [vidio] Improve extraction and sort formats ++ [brightcove] Match only video elements with data-video-id attribute +* [iqiyi] Fix playlist detection (#12504) +- [azubu] Remove extractor (#12813) +* [porn91] Fix extraction (#12814) +* [vidzi] Fix extraction (#12793) ++ [amp] Extract error message (#12795) ++ [xfileshare] Add support for gorillavid.com and daclips.com (#12776) +* [instagram] Fix extraction (#12777) ++ [generic] Support Brightcove videos in <iframe> (#12482) ++ [brightcove] Support URLs with bcpid instead of playerID (#12482) +* [brightcove] Fix _extract_url (#12782) ++ [odnoklassniki] Extract HLS formats + + +version 2017.04.17 + +Extractors +* [limelight] 
Improve extraction LimelightEmbeddedPlayerFlash media embeds and +  add support for channel and channelList embeds +* [generic] Extract multiple Limelight embeds (#12761) ++ [itv] Extract series metadata +* [itv] Fix RTMP formats downloading (#12759) +* [itv] Use native HLS downloader by default ++ [go90] Extract subtitles (#12752) ++ [go90] Extract series metadata (#12752) + + +version 2017.04.16 + +Core +* [YoutubeDL] Apply expand_path after output template substitution ++ [YoutubeDL] Propagate overridden meta fields to extraction results of type +  url (#11163) + +Extractors ++ [generic] Extract RSS entries as url_transparent (#11163) ++ [streamango] Add support for streamango.com (#12643) ++ [wsj:article] Add support for articles (#12558) +* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds' +  URLs (#9163, #12005, #12178, #12480) ++ [udemy] Add support for react rendition (#12744) + +  version 2017.04.15  Extractors @@ -1,7 +1,7 @@  all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites  clean: -	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe +	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe  	find . -name "*.pyc" -delete  	find . 
-name "*.class" -delete @@ -187,6 +187,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo                                       and ISM)      --abort-on-unavailable-fragment  Abort downloading when some fragment is not                                       available +    --keep-fragments                 Keep downloaded fragments on disk after +                                     downloading is finished; fragments are +                                     erased by default      --buffer-size SIZE               Size of download buffer (e.g. 1024 or 16K)                                       (default is 1024)      --no-resize-buffer               Do not automatically adjust the buffer diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b29b50c8d..e3c038c48 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -45,6 +45,7 @@   - **anderetijden**: npo.nl and ntr.nl   - **AnimeOnDemand**   - **anitube.se** + - **Anvato**   - **AnySex**   - **Aparat**   - **AppleConnect** @@ -81,8 +82,6 @@   - **AZMedien**: AZ Medien videos   - **AZMedienPlaylist**: AZ Medien playlists   - **AZMedienShowPlaylist**: AZ Medien show playlists - - **Azubu** - - **AzubuLive**   - **BaiduVideo**: 百度视频   - **bambuser**   - **bambuser:channel** @@ -531,6 +530,7 @@   - **NJPWWorld**: 新日本プロレスワールド   - **NobelPrize**   - **Noco** + - **Noovo**   - **Normalboots**   - **NosVideo**   - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz @@ -745,6 +745,7 @@   - **Steam**   - **Stitcher**   - **Streamable** + - **Streamango**   - **streamcloud.eu**   - **StreamCZ**   - **StreetVoice** @@ -966,6 +967,7 @@   - **wrzuta.pl**   - **wrzuta.pl:playlist**   - **WSJ**: Wall Street Journal + - **WSJArticle**   - **XBef**   - **XboxClips**   - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo @@ 
-1013,6 +1015,7 @@   - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)   - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)   - **Zapiks** + - **Zaq1**   - **ZDF**   - **ZDFChannel**   - **zingmp3**: mp3.zing.vn diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 881197afb..6f52e11f7 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -3,12 +3,13 @@  from __future__ import unicode_literals  # Allow direct execution +import io  import os  import sys  import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict +from test.helper import FakeYDL, expect_dict, expect_value  from youtube_dl.extractor.common import InfoExtractor  from youtube_dl.extractor import YoutubeIE, get_info_extractor  from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -175,6 +176,318 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/                  }]              }) +    def test_parse_m3u8_formats(self): +        _TEST_CASES = [ +            ( +                # https://github.com/rg3/youtube-dl/issues/11507 +                # http://pluzz.francetv.fr/videos/le_ministere.html +                'pluzz_francetv_11507', +                'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                [{ +                    'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0', +                    'manifest_url': 
'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                    'ext': 'mp4', +                    'format_id': '180', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.66.30', +                    'tbr': 180, +                    'width': 256, +                    'height': 144, +                }, { +                    'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0', +                    'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                    'ext': 'mp4', +                    'format_id': '303', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.66.30', +                    'tbr': 303, +                    'width': 320, +                    'height': 180, +                }, { +                    'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0', +                    'manifest_url': 
'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                    'ext': 'mp4', +                    'format_id': '575', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.66.30', +                    'tbr': 575, +                    'width': 512, +                    'height': 288, +                }, { +                    'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0', +                    'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                    'ext': 'mp4', +                    'format_id': '831', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.77.30', +                    'tbr': 831, +                    'width': 704, +                    'height': 396, +                }, { +                    'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0', +                    'manifest_url': 
'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', +                    'ext': 'mp4', +                    'protocol': 'm3u8', +                    'format_id': '1467', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.77.30', +                    'tbr': 1467, +                    'width': 1024, +                    'height': 576, +                }] +            ), +            ( +                # https://github.com/rg3/youtube-dl/issues/11995 +                # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor +                'teamcoco_11995', +                'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                [{ +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': 'audio-0-Default', +                    'protocol': 'm3u8', +                    'vcodec': 'none', +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': 'audio-1-Default', +                    'protocol': 'm3u8', +                    'vcodec': 'none', +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', +                    'manifest_url': 
'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': '71', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.5', +                    'vcodec': 'none', +                    'tbr': 71, +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': '413', +                    'protocol': 'm3u8', +                    'acodec': 'none', +                    'vcodec': 'avc1.42001e', +                    'tbr': 413, +                    'width': 400, +                    'height': 224, +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': '522', +                    'protocol': 'm3u8', +                    'acodec': 'none', +                    'vcodec': 'avc1.42001e', +                    'tbr': 522, +                    'width': 400, +                    'height': 224, +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-1m_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': '1205', +                    'protocol': 'm3u8', +                    'acodec': 'none', +                    'vcodec': 'avc1.4d001e', +                    'tbr': 1205, +                    'width': 640, +                
    'height': 360, +                }, { +                    'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-2m_v4.m3u8', +                    'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', +                    'ext': 'mp4', +                    'format_id': '2374', +                    'protocol': 'm3u8', +                    'acodec': 'none', +                    'vcodec': 'avc1.4d001f', +                    'tbr': 2374, +                    'width': 1024, +                    'height': 576, +                }] +            ), +            ( +                # https://github.com/rg3/youtube-dl/issues/12211 +                # http://video.toggle.sg/en/series/whoopie-s-world/ep3/478601 +                'toggle_mobile_12211', +                'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                [{ +                    'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': 'audio-English', +                    'protocol': 'm3u8', +                    'language': 'eng', +                    'vcodec': 'none', +                }, { +                    'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': 'audio-Undefined', +       
             'protocol': 'm3u8', +                    'language': 'und', +                    'vcodec': 'none', +                }, { +                    'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': '155', +                    'protocol': 'm3u8', +                    'tbr': 155.648, +                    'width': 320, +                    'height': 180, +                }, { +                    'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': '502', +                    'protocol': 'm3u8', +                    'tbr': 502.784, +                    'width': 480, +                    'height': 270, +                }, { +                    'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': '827', +                    'protocol': 'm3u8', +                    'tbr': 827.392, +                    'width': 640, +                    'height': 360, +                }, { +                    'url': 
'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8', +                    'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', +                    'ext': 'mp4', +                    'format_id': '1396', +                    'protocol': 'm3u8', +                    'tbr': 1396.736, +                    'width': 854, +                    'height': 480, +                }] +            ), +            ( +                # http://www.twitch.tv/riotgames/v/6528877 +                'twitch_vod', +                'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                [{ +                    'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'Audio Only', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'none', +                    'tbr': 182.725, +                }, { +                    'url': 
'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'Mobile', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.42C00D', +                    'tbr': 280.474, +                    'width': 400, +                    'height': 226, +                }, { +                    'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'Low', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.42C01E', +                    'tbr': 628.347, +                    'width': 640, +                    'height': 360, +                }, { +                    'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 
'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'Medium', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.42C01E', +                    'tbr': 893.387, +                    'width': 852, +                    'height': 480, +                }, { +                    'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'High', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.42C01F', +                    'tbr': 1603.789, +                    'width': 1280, +                    'height': 720, +                }, { +                    'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8', +                    'manifest_url': 
'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', +                    'ext': 'mp4', +                    'format_id': 'Source', +                    'protocol': 'm3u8', +                    'acodec': 'mp4a.40.2', +                    'vcodec': 'avc1.100.31', +                    'tbr': 3214.134, +                    'width': 1280, +                    'height': 720, +                }] +            ), +            ( +                # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 +                # EXT-X-STREAM-INF tag with NAME attribute that is not defined +                # in HLS specification +                'vidio', +                'https://www.vidio.com/videos/165683/playlist.m3u8', +                [{ +                    'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8', +                    'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', +                    'ext': 'mp4', +                    'format_id': '270p 3G', +                    'protocol': 'm3u8', +                    'tbr': 300, +                    'width': 480, +                    'height': 270, +                }, { +                    'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8', +                    'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', +                    'ext': 'mp4', +                    'format_id': '360p SD', +                    'protocol': 'm3u8', +                    'tbr': 600, +                    'width': 640, +                    'height': 360, +                }, { +                    'url': 
'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8', +                    'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', +                    'ext': 'mp4', +                    'format_id': '720p HD', +                    'protocol': 'm3u8', +                    'tbr': 1200, +                    'width': 1280, +                    'height': 720, +                }] +            ) +        ] + +        for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: +            with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, +                         mode='r', encoding='utf-8') as f: +                formats = self.ie._parse_m3u8_formats( +                    f.read(), m3u8_url, ext='mp4') +                self.ie._sort_formats(formats) +                expect_value(self, formats, expected_formats, None) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8491a88bd..75945e38f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -755,6 +755,7 @@ class TestYoutubeDL(unittest.TestCase):                      '_type': 'url_transparent',                      'url': 'foo2:',                      'ie_key': 'Foo2', +                    'title': 'foo1 title'                  }          class Foo2IE(InfoExtractor): @@ -771,7 +772,7 @@ class TestYoutubeDL(unittest.TestCase):              _VALID_URL = r'foo3:'              def _real_extract(self, url): -                return _make_result([{'url': TEST_URL}]) +                return _make_result([{'url': TEST_URL}], title='foo3 title')          ydl.add_info_extractor(Foo1IE(ydl))          ydl.add_info_extractor(Foo2IE(ydl)) @@ -779,6 +780,7 @@ class TestYoutubeDL(unittest.TestCase):          ydl.extract_info('foo1:')          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['url'], TEST_URL) +        self.assertEqual(downloaded['title'], 'foo1 title')  if 
__name__ == '__main__': diff --git a/test/test_download.py b/test/test_download.py index 0e9f293b5..209f5f6d6 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -225,7 +225,7 @@ def generator(test_case, tname):                                  format_bytes(got_fsize)))                      if 'md5' in tc:                          md5_for_file = _file_md5(tc_filename) -                        self.assertEqual(md5_for_file, tc['md5']) +                        self.assertEqual(tc['md5'], md5_for_file)                  # Finally, check test cases' data again but this time against                  # extracted data from info JSON file written during processing                  info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json' diff --git a/test/test_utils.py b/test/test_utils.py index aa4569b81..f31559e71 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -44,6 +44,7 @@ from youtube_dl.utils import (      limit_length,      mimetype2ext,      month_by_name, +    multipart_encode,      ohdave_rsa_encrypt,      OnDemandPagedList,      orderedSet, @@ -338,6 +339,7 @@ class TestUtil(unittest.TestCase):          self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)          self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500)          self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) +        self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361)      def test_determine_ext(self):          self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -619,6 +621,16 @@ class TestUtil(unittest.TestCase):              'http://example.com/path', {'test': '第二行тест'})),              query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) +    def test_multipart_encode(self): +        self.assertEqual( +            multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0], +            
b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n') +        self.assertEqual( +            multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0], +            b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n') +        self.assertRaises( +            ValueError, multipart_encode, {b'field': b'value'}, boundary='value') +      def test_dict_get(self):          FALSE_VALUES = {              'none': None, @@ -899,6 +911,7 @@ class TestUtil(unittest.TestCase):      def test_clean_html(self):          self.assertEqual(clean_html('a:\nb'), 'a: b')          self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"') +        self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')      def test_intlist_to_bytes(self):          self.assertEqual( @@ -1069,6 +1082,47 @@ The first line  '''          self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) +        dfxp_data_with_style = '''<?xml version="1.0" encoding="utf-8"?> +<tt xmlns="http://www.w3.org/2006/10/ttaf1" xmlns:ttp="http://www.w3.org/2006/10/ttaf1#parameter" ttp:timeBase="media" xmlns:tts="http://www.w3.org/2006/10/ttaf1#style" xml:lang="en" xmlns:ttm="http://www.w3.org/2006/10/ttaf1#metadata"> +  <head> +    <styling> +      <style id="s2" style="s0" tts:color="cyan" tts:fontWeight="bold" /> +      <style id="s1" style="s0" tts:color="yellow" tts:fontStyle="italic" /> +      <style id="s3" style="s0" tts:color="lime" tts:textDecoration="underline" /> +      <style id="s0" tts:backgroundColor="black" tts:fontStyle="normal" tts:fontSize="16" tts:fontFamily="sansSerif" tts:color="white" /> +    </styling> +  </head> +  <body tts:textAlign="center" style="s0"> +    <div> +      <p begin="00:00:02.08" id="p0" end="00:00:05.84">default style<span tts:color="red">custom style</span></p> +      <p style="s2" begin="00:00:02.08" id="p0" end="00:00:05.84"><span 
tts:color="lime">part 1<br /></span><span tts:color="cyan">part 2</span></p> +      <p style="s3" begin="00:00:05.84" id="p1" end="00:00:09.56">line 3<br />part 3</p> +      <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p> +    </div> +  </body> +</tt>''' +        srt_data = '''1 +00:00:02,080 --> 00:00:05,839 +<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font> + +2 +00:00:02,080 --> 00:00:05,839 +<b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1 +</font>part 2</font></b> + +3 +00:00:05,839 --> 00:00:09,560 +<u><font color="lime">line 3 +part 3</font></u> + +4 +00:00:09,560 --> 00:00:12,359 +<i><u><font color="yellow"><font color="lime">inner + </font>style</font></u></i> + +''' +        self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) +      def test_cli_option(self):          self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])          self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) diff --git a/test/testdata/m3u8/pluzz_francetv_11507.m3u8 b/test/testdata/m3u8/pluzz_francetv_11507.m3u8 new file mode 100644 index 000000000..0809f5aa0 --- /dev/null +++ b/test/testdata/m3u8/pluzz_francetv_11507.m3u8 @@ -0,0 +1,14 @@ +#EXTM3U +    
#EXT-X-VERSION:5 +    
#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="subs",NAME="Francais",DEFAULT=NO,FORCED=NO,URI="http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8",LANGUAGE="fra" +    
#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",LANGUAGE="fra",NAME="Francais",DEFAULT=YES, AUTOSELECT=YES +#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=180000,RESOLUTION=256x144,CODECS="avc1.66.30, mp4a.40.2" +http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0 +#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=303000,RESOLUTION=320x180,CODECS="avc1.66.30, mp4a.40.2" +http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0 +#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=575000,RESOLUTION=512x288,CODECS="avc1.66.30, mp4a.40.2" +http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0 +#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=831000,RESOLUTION=704x396,CODECS="avc1.77.30, mp4a.40.2" +http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0 +#EXT-X-STREAM-INF:SUBTITLES="subs",AUDIO="aac",PROGRAM-ID=1,BANDWIDTH=1467000,RESOLUTION=1024x576,CODECS="avc1.77.30, mp4a.40.2" +http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0 diff --git a/test/testdata/m3u8/teamcoco_11995.m3u8 b/test/testdata/m3u8/teamcoco_11995.m3u8 new file mode 100644 index 000000000..a6e421697 --- /dev/null +++ b/test/testdata/m3u8/teamcoco_11995.m3u8 @@ -0,0 +1,16 @@ +#EXTM3U 
+#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-0",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8" +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio-1",NAME="Default",AUTOSELECT=YES,DEFAULT=YES,URI="hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8" +#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=37862000,CODECS="avc1.4d001f",URI="hls/CONAN_020217_Highlight_show-2m_iframe.m3u8" +#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=18750000,CODECS="avc1.4d001e",URI="hls/CONAN_020217_Highlight_show-1m_iframe.m3u8" +#EXT-X-I-FRAME-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=6535000,CODECS="avc1.42001e",URI="hls/CONAN_020217_Highlight_show-400k_iframe.m3u8" +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2374000,RESOLUTION=1024x576,CODECS="avc1.4d001f,mp4a.40.2",AUDIO="audio-0" +hls/CONAN_020217_Highlight_show-2m_v4.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1205000,RESOLUTION=640x360,CODECS="avc1.4d001e,mp4a.40.2",AUDIO="audio-0" +hls/CONAN_020217_Highlight_show-1m_v4.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=522000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.2",AUDIO="audio-0" +hls/CONAN_020217_Highlight_show-400k_v4.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=413000,RESOLUTION=400x224,CODECS="avc1.42001e,mp4a.40.5",AUDIO="audio-1" +hls/CONAN_020217_Highlight_show-400k_v4.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=71000,CODECS="mp4a.40.5",AUDIO="audio-1" +hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8 diff --git a/test/testdata/m3u8/toggle_mobile_12211.m3u8 b/test/testdata/m3u8/toggle_mobile_12211.m3u8 new file mode 100644 index 000000000..69604e683 --- /dev/null +++ b/test/testdata/m3u8/toggle_mobile_12211.m3u8 @@ -0,0 +1,13 @@ +#EXTM3U +#EXT-X-VERSION:4 +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="eng",NAME="English",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8" 
+#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",LANGUAGE="und",NAME="Undefined",URI="http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8" + +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=155648,RESOLUTION=320x180,AUDIO="audio" +http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=502784,RESOLUTION=480x270,AUDIO="audio" +http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=827392,RESOLUTION=640x360,AUDIO="audio" +http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8 +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1396736,RESOLUTION=854x480,AUDIO="audio" +http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8 diff --git a/test/testdata/m3u8/twitch_vod.m3u8 b/test/testdata/m3u8/twitch_vod.m3u8 new file mode 100644 index 000000000..7617277ca --- /dev/null +++ b/test/testdata/m3u8/twitch_vod.m3u8 @@ -0,0 +1,20 @@ +#EXTM3U +#EXT-X-TWITCH-INFO:ORIGIN="s3",CLUSTER="edgecast_vod",REGION="EU",MANIFEST-CLUSTER="edgecast_vod",USER-IP="109.171.17.81" +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="chunked",NAME="Source",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=3214134,CODECS="avc1.100.31,mp4a.40.2",RESOLUTION="1280x720",VIDEO="chunked" +https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8 +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="high",NAME="High",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1603789,CODECS="avc1.42C01F,mp4a.40.2",RESOLUTION="1280x720",VIDEO="high" 
+https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8 +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="medium",NAME="Medium",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=893387,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="852x480",VIDEO="medium" +https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8 +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="low",NAME="Low",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=628347,CODECS="avc1.42C01E,mp4a.40.2",RESOLUTION="640x360",VIDEO="low" +https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8 +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="mobile",NAME="Mobile",AUTOSELECT=YES,DEFAULT=YES +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=280474,CODECS="avc1.42C00D,mp4a.40.2",RESOLUTION="400x226",VIDEO="mobile" +https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8 +#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="audio_only",NAME="Audio Only",AUTOSELECT=NO,DEFAULT=NO +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=182725,CODECS="mp4a.40.2",VIDEO="audio_only" +https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8 diff --git a/test/testdata/m3u8/vidio.m3u8 b/test/testdata/m3u8/vidio.m3u8 new file mode 100644 index 000000000..89c244469 --- /dev/null +++ b/test/testdata/m3u8/vidio.m3u8 @@ -0,0 +1,10 @@ +#EXTM3U + +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=300000,RESOLUTION=480x270,NAME="270p 3G" +https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8 + +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=600000,RESOLUTION=640x360,NAME="360p SD" +https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8 + +#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1200000,RESOLUTION=1280x720,NAME="720p HD" 
+https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8 diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7953670a7..eb465c425 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -640,7 +640,7 @@ class YoutubeDL(object):              NUMERIC_FIELDS = set((                  'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', -                'upload_year', 'upload_month', 'upload_day', +                'timestamp', 'upload_year', 'upload_month', 'upload_day',                  'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',                  'average_rating', 'comment_count', 'age_limit',                  'start_time', 'end_time', @@ -672,8 +672,7 @@ class YoutubeDL(object):                          FORMAT_RE.format(numeric_field),                          r'%({0})s'.format(numeric_field), outtmpl) -            tmpl = expand_path(outtmpl) -            filename = tmpl % template_dict +            filename = expand_path(outtmpl % template_dict)              # Temporary fix for #4787              # 'Treat' all problem characters by passing filename through preferredencoding              # to workaround encoding issues with subprocess on python2 @ Windows @@ -851,7 +850,14 @@ class YoutubeDL(object):              new_result = info.copy()              new_result.update(force_properties) -            assert new_result.get('_type') != 'url_transparent' +            # Extracted info may not be a video result (i.e. +            # info.get('_type', 'video') != video) but rather an url or +            # url_transparent. In such cases outer metadata (from ie_result) +            # should be propagated to inner one (info). For this to happen +            # _type of info should be overridden with url_transparent. This +            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163. 
+            if new_result.get('_type') == 'url': +                new_result['_type'] = 'url_transparent'              return self.process_ie_result(                  new_result, download=download, extra_info=extra_info) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f15606568..c4589411e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -343,6 +343,7 @@ def _real_main(argv=None):          'retries': opts.retries,          'fragment_retries': opts.fragment_retries,          'skip_unavailable_fragments': opts.skip_unavailable_fragments, +        'keep_fragments': opts.keep_fragments,          'buffersize': opts.buffersize,          'noresizebuffer': opts.noresizebuffer,          'continuedl': opts.continue_dl, diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2c4470a95..5d6621147 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -187,6 +187,9 @@ class FileDownloader(object):              return filename[:-len('.part')]          return filename +    def ytdl_filename(self, filename): +        return filename + '.ytdl' +      def try_rename(self, old_filename, new_filename):          try:              if old_filename == new_filename: @@ -327,21 +330,22 @@ class FileDownloader(object):              os.path.exists(encodeFilename(filename))          ) -        continuedl_and_exists = ( -            self.params.get('continuedl', True) and -            os.path.isfile(encodeFilename(filename)) and -            not self.params.get('nopart', False) -        ) - -        # Check file already present -        if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): -            self.report_file_already_downloaded(filename) -            self._hook_progress({ -                'filename': filename, -                'status': 'finished', -                'total_bytes': os.path.getsize(encodeFilename(filename)), -            }) -            return True +     
   if not hasattr(filename, 'write'): +            continuedl_and_exists = ( +                self.params.get('continuedl', True) and +                os.path.isfile(encodeFilename(filename)) and +                not self.params.get('nopart', False) +            ) + +            # Check file already present +            if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): +                self.report_file_already_downloaded(filename) +                self._hook_progress({ +                    'filename': filename, +                    'status': 'finished', +                    'total_bytes': os.path.getsize(encodeFilename(filename)), +                }) +                return True          min_sleep_interval = self.params.get('sleep_interval')          if min_sleep_interval: diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index e2ddc369e..7491fdad8 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,13 +1,7 @@  from __future__ import unicode_literals -import os -  from .fragment import FragmentFD  from ..compat import compat_urllib_error -from ..utils import ( -    sanitize_open, -    encodeFilename, -)  class DashSegmentsFD(FragmentFD): @@ -28,31 +22,24 @@ class DashSegmentsFD(FragmentFD):          self._prepare_and_start_frag_download(ctx) -        segments_filenames = [] -          fragment_retries = self.params.get('fragment_retries', 0)          skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) -        def process_segment(segment, tmp_filename, num): -            segment_url = segment['url'] -            segment_name = 'Frag%d' % num -            target_filename = '%s-%s' % (tmp_filename, segment_name) +        frag_index = 0 +        for i, segment in enumerate(segments): +            frag_index += 1 +            if frag_index <= ctx['fragment_index']: +                continue              # In DASH, the first segment contains necessary headers 
to              # generate a valid MP4 file, so always abort for the first segment -            fatal = num == 0 or not skip_unavailable_fragments +            fatal = i == 0 or not skip_unavailable_fragments              count = 0              while count <= fragment_retries:                  try: -                    success = ctx['dl'].download(target_filename, { -                        'url': segment_url, -                        'http_headers': info_dict.get('http_headers'), -                    }) +                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)                      if not success:                          return False -                    down, target_sanitized = sanitize_open(target_filename, 'rb') -                    ctx['dest_stream'].write(down.read()) -                    down.close() -                    segments_filenames.append(target_sanitized) +                    self._append_fragment(ctx, frag_content)                      break                  except compat_urllib_error.HTTPError as err:                      # YouTube may often return 404 HTTP error for a fragment causing the @@ -63,22 +50,14 @@ class DashSegmentsFD(FragmentFD):                      # HTTP error.                      
count += 1                      if count <= fragment_retries: -                        self.report_retry_fragment(err, segment_name, count, fragment_retries) +                        self.report_retry_fragment(err, frag_index, count, fragment_retries)              if count > fragment_retries:                  if not fatal: -                    self.report_skip_fragment(segment_name) -                    return True +                    self.report_skip_fragment(frag_index) +                    continue                  self.report_error('giving up after %s fragment retries' % fragment_retries)                  return False -            return True - -        for i, segment in enumerate(segments): -            if not process_segment(segment, ctx['tmpfilename'], i): -                return False          self._finish_frag_download(ctx) -        for segment_file in segments_filenames: -            os.remove(encodeFilename(segment_file)) -          return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index e13cf547d..e78169a0d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):          self.report_destination(filename)          tmpfilename = self.temp_name(filename) -        retval = self._call_downloader(tmpfilename, info_dict) +        try: +            retval = self._call_downloader(tmpfilename, info_dict) +        except KeyboardInterrupt: +            if not info_dict.get('is_live'): +                raise +            # Live stream downloading cancellation should be considered as +            # correct and expected termination thus all postprocessing +            # should take place +            retval = 0 +            self.to_screen('[%s] Interrupted by user' % self.get_basename()) +          if retval == 0:              fsize = os.path.getsize(encodeFilename(tmpfilename))              self.to_screen('\r[%s] Downloaded %s bytes' % 
(self.get_basename(), fsize)) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 688e086eb..c8fde9a89 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -3,7 +3,6 @@ from __future__ import division, unicode_literals  import base64  import io  import itertools -import os  import time  from .fragment import FragmentFD @@ -16,9 +15,7 @@ from ..compat import (      compat_struct_unpack,  )  from ..utils import ( -    encodeFilename,      fix_xml_ampersands, -    sanitize_open,      xpath_text,  ) @@ -366,17 +363,21 @@ class F4mFD(FragmentFD):          dest_stream = ctx['dest_stream'] -        write_flv_header(dest_stream) -        if not live: -            write_metadata_tag(dest_stream, metadata) +        if ctx['complete_frags_downloaded_bytes'] == 0: +            write_flv_header(dest_stream) +            if not live: +                write_metadata_tag(dest_stream, metadata)          base_url_parsed = compat_urllib_parse_urlparse(base_url)          self._start_frag_download(ctx) -        frags_filenames = [] +        frag_index = 0          while fragments_list:              seg_i, frag_i = fragments_list.pop(0) +            frag_index += 1 +            if frag_index <= ctx['fragment_index']: +                continue              name = 'Seg%d-Frag%d' % (seg_i, frag_i)              query = []              if base_url_parsed.query: @@ -386,17 +387,10 @@ class F4mFD(FragmentFD):              if info_dict.get('extra_param_to_segment_url'):                  query.append(info_dict['extra_param_to_segment_url'])              url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) -            frag_filename = '%s-%s' % (ctx['tmpfilename'], name)              try: -                success = ctx['dl'].download(frag_filename, { -                    'url': url_parsed.geturl(), -                    'http_headers': info_dict.get('http_headers'), -                }) +                
success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)                  if not success:                      return False -                (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') -                down_data = down.read() -                down.close()                  reader = FlvReader(down_data)                  while True:                      try: @@ -411,12 +405,8 @@ class F4mFD(FragmentFD):                              break                          raise                      if box_type == b'mdat': -                        dest_stream.write(box_data) +                        self._append_fragment(ctx, box_data)                          break -                if live: -                    os.remove(encodeFilename(frag_sanitized)) -                else: -                    frags_filenames.append(frag_sanitized)              except (compat_urllib_error.HTTPError, ) as err:                  if live and (err.code == 404 or err.code == 410):                      # We didn't keep up with the live window. 
Continue @@ -436,7 +426,4 @@ class F4mFD(FragmentFD):          self._finish_frag_download(ctx) -        for frag_file in frags_filenames: -            os.remove(encodeFilename(frag_file)) -          return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 56f975266..bccc8ecc1 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -2,6 +2,7 @@ from __future__ import division, unicode_literals  import os  import time +import json  from .common import FileDownloader  from .http import HttpFD @@ -28,15 +29,37 @@ class FragmentFD(FileDownloader):                          and hlsnative only)      skip_unavailable_fragments:                          Skip unavailable fragments (DASH and hlsnative only) +    keep_fragments:     Keep downloaded fragments on disk after downloading is +                        finished + +    For each incomplete fragment download youtube-dl keeps on disk a special +    bookkeeping file with download state and metadata (in future such files will +    be used for any incomplete download handled by youtube-dl). This file is +    used to properly handle resuming, check download file consistency and detect +    potential errors. The file has a .ytdl extension and represents a standard +    JSON file of the following format: + +    extractor: +        Dictionary of extractor related data. TBD. + +    downloader: +        Dictionary of downloader related data. May contain following data: +            current_fragment: +                Dictionary with current (being downloaded) fragment data: +                index:  0-based index of current fragment among all fragments +            fragment_count: +                Total count of fragments + +    This feature is experimental and file format may change in future.      
""" -    def report_retry_fragment(self, err, fragment_name, count, retries): +    def report_retry_fragment(self, err, frag_index, count, retries):          self.to_screen( -            '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' -            % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries))) +            '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...' +            % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) -    def report_skip_fragment(self, fragment_name): -        self.to_screen('[download] Skipping fragment %s...' % fragment_name) +    def report_skip_fragment(self, frag_index): +        self.to_screen('[download] Skipping fragment %d...' % frag_index)      def _prepare_url(self, info_dict, url):          headers = info_dict.get('http_headers') @@ -46,6 +69,51 @@ class FragmentFD(FileDownloader):          self._prepare_frag_download(ctx)          self._start_frag_download(ctx) +    @staticmethod +    def __do_ytdl_file(ctx): +        return not ctx['live'] and not ctx['tmpfilename'] == '-' + +    def _read_ytdl_file(self, ctx): +        stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') +        ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] +        stream.close() + +    def _write_ytdl_file(self, ctx): +        frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') +        downloader = { +            'current_fragment': { +                'index': ctx['fragment_index'], +            }, +        } +        if ctx.get('fragment_count') is not None: +            downloader['fragment_count'] = ctx['fragment_count'] +        frag_index_stream.write(json.dumps({'downloader': downloader})) +        frag_index_stream.close() + +    def _download_fragment(self, ctx, frag_url, info_dict, headers=None): +        fragment_filename = '%s-Frag%d' 
% (ctx['tmpfilename'], ctx['fragment_index']) +        success = ctx['dl'].download(fragment_filename, { +            'url': frag_url, +            'http_headers': headers or info_dict.get('http_headers'), +        }) +        if not success: +            return False, None +        down, frag_sanitized = sanitize_open(fragment_filename, 'rb') +        ctx['fragment_filename_sanitized'] = frag_sanitized +        frag_content = down.read() +        down.close() +        return True, frag_content + +    def _append_fragment(self, ctx, frag_content): +        try: +            ctx['dest_stream'].write(frag_content) +        finally: +            if self.__do_ytdl_file(ctx): +                self._write_ytdl_file(ctx) +            if not self.params.get('keep_fragments', False): +                os.remove(ctx['fragment_filename_sanitized']) +            del ctx['fragment_filename_sanitized'] +      def _prepare_frag_download(self, ctx):          if 'live' not in ctx:              ctx['live'] = False @@ -66,11 +134,36 @@ class FragmentFD(FileDownloader):              }          )          tmpfilename = self.temp_name(ctx['filename']) -        dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb') +        open_mode = 'wb' +        resume_len = 0 + +        # Establish possible resume length +        if os.path.isfile(encodeFilename(tmpfilename)): +            open_mode = 'ab' +            resume_len = os.path.getsize(encodeFilename(tmpfilename)) + +        # Should be initialized before ytdl file check +        ctx.update({ +            'tmpfilename': tmpfilename, +            'fragment_index': 0, +        }) + +        if self.__do_ytdl_file(ctx): +            if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): +                self._read_ytdl_file(ctx) +            else: +                self._write_ytdl_file(ctx) +            if ctx['fragment_index'] > 0: +                assert resume_len > 0 + +        dest_stream, tmpfilename = 
sanitize_open(tmpfilename, open_mode) +          ctx.update({              'dl': dl,              'dest_stream': dest_stream,              'tmpfilename': tmpfilename, +            # Total complete fragments downloaded so far in bytes +            'complete_frags_downloaded_bytes': resume_len,          })      def _start_frag_download(self, ctx): @@ -79,9 +172,9 @@ class FragmentFD(FileDownloader):          # hook          state = {              'status': 'downloading', -            'downloaded_bytes': 0, -            'frag_index': 0, -            'frag_count': total_frags, +            'downloaded_bytes': ctx['complete_frags_downloaded_bytes'], +            'fragment_index': ctx['fragment_index'], +            'fragment_count': total_frags,              'filename': ctx['filename'],              'tmpfilename': ctx['tmpfilename'],          } @@ -89,8 +182,6 @@ class FragmentFD(FileDownloader):          start = time.time()          ctx.update({              'started': start, -            # Total complete fragments downloaded so far in bytes -            'complete_frags_downloaded_bytes': 0,              # Amount of fragment's bytes downloaded by the time of the previous              # frag progress hook invocation              'prev_frag_downloaded_bytes': 0, @@ -106,11 +197,12 @@ class FragmentFD(FileDownloader):              if not ctx['live']:                  estimated_size = (                      (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / -                    (state['frag_index'] + 1) * total_frags) +                    (state['fragment_index'] + 1) * total_frags)                  state['total_bytes_estimate'] = estimated_size              if s['status'] == 'finished': -                state['frag_index'] += 1 +                state['fragment_index'] += 1 +                ctx['fragment_index'] = state['fragment_index']                  state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']                  
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']                  ctx['prev_frag_downloaded_bytes'] = 0 @@ -132,6 +224,10 @@ class FragmentFD(FileDownloader):      def _finish_frag_download(self, ctx):          ctx['dest_stream'].close() +        if self.__do_ytdl_file(ctx): +            ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) +            if os.path.isfile(ytdl_filename): +                os.remove(ytdl_filename)          elapsed = time.time() - ctx['started']          self.try_rename(ctx['tmpfilename'], ctx['filename'])          fsize = os.path.getsize(encodeFilename(ctx['filename'])) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index d0a5f7ba4..0e29c8a2a 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -import os.path  import re  import binascii  try: @@ -18,8 +17,6 @@ from ..compat import (      compat_struct_pack,  )  from ..utils import ( -    encodeFilename, -    sanitize_open,      parse_m3u8_attributes,      update_url_query,  ) @@ -103,17 +100,18 @@ class HlsFD(FragmentFD):          media_sequence = 0          decrypt_info = {'METHOD': 'NONE'}          byte_range = {} -        frags_filenames = [] +        frag_index = 0          for line in s.splitlines():              line = line.strip()              if line:                  if not line.startswith('#'): +                    frag_index += 1 +                    if frag_index <= ctx['fragment_index']: +                        continue                      frag_url = (                          line                          if re.match(r'^https?://', line)                          else compat_urlparse.urljoin(man_url, line)) -                    frag_name = 'Frag%d' % i -                    frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)                      if extra_query:                          frag_url = update_url_query(frag_url, 
extra_query)                      count = 0 @@ -122,15 +120,10 @@ class HlsFD(FragmentFD):                          headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])                      while count <= fragment_retries:                          try: -                            success = ctx['dl'].download(frag_filename, { -                                'url': frag_url, -                                'http_headers': headers, -                            }) +                            success, frag_content = self._download_fragment( +                                ctx, frag_url, info_dict, headers)                              if not success:                                  return False -                            down, frag_sanitized = sanitize_open(frag_filename, 'rb') -                            frag_content = down.read() -                            down.close()                              break                          except compat_urllib_error.HTTPError as err:                              # Unavailable (possibly temporary) fragments may be served. @@ -139,28 +132,29 @@ class HlsFD(FragmentFD):                              # https://github.com/rg3/youtube-dl/issues/10448).                              
count += 1                              if count <= fragment_retries: -                                self.report_retry_fragment(err, frag_name, count, fragment_retries) +                                self.report_retry_fragment(err, frag_index, count, fragment_retries)                      if count > fragment_retries:                          if skip_unavailable_fragments:                              i += 1                              media_sequence += 1 -                            self.report_skip_fragment(frag_name) +                            self.report_skip_fragment(frag_index)                              continue                          self.report_error(                              'giving up after %s fragment retries' % fragment_retries)                          return False                      if decrypt_info['METHOD'] == 'AES-128':                          iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) +                        decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read()                          frag_content = AES.new(                              decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) -                    ctx['dest_stream'].write(frag_content) -                    frags_filenames.append(frag_sanitized) +                    self._append_fragment(ctx, frag_content)                      # We only download the first fragment during the test                      if test:                          break                      i += 1                      media_sequence += 1                  elif line.startswith('#EXT-X-KEY'): +                    decrypt_url = decrypt_info.get('URI')                      decrypt_info = parse_m3u8_attributes(line[11:])                      if decrypt_info['METHOD'] == 'AES-128':                          if 'IV' in decrypt_info: @@ -170,7 +164,8 @@ class HlsFD(FragmentFD):                                  man_url, 
decrypt_info['URI'])                          if extra_query:                              decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) -                        decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() +                        if decrypt_url != decrypt_info['URI']: +                            decrypt_info['KEY'] = None                  elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):                      media_sequence = int(line[22:])                  elif line.startswith('#EXT-X-BYTERANGE'): @@ -183,7 +178,4 @@ class HlsFD(FragmentFD):          self._finish_frag_download(ctx) -        for frag_file in frags_filenames: -            os.remove(encodeFilename(frag_file)) -          return True diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 63a636cb7..5f6f9faef 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -1,6 +1,5 @@  from __future__ import unicode_literals -import os  import time  import struct  import binascii @@ -8,10 +7,6 @@ import io  from .fragment import FragmentFD  from ..compat import compat_urllib_error -from ..utils import ( -    sanitize_open, -    encodeFilename, -)  u8 = struct.Struct(b'>B') @@ -225,50 +220,39 @@ class IsmFD(FragmentFD):          self._prepare_and_start_frag_download(ctx) -        segments_filenames = [] -          fragment_retries = self.params.get('fragment_retries', 0)          skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)          track_written = False +        frag_index = 0          for i, segment in enumerate(segments): -            segment_url = segment['url'] -            segment_name = 'Frag%d' % i -            target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name) +            frag_index += 1 +            if frag_index <= ctx['fragment_index']: +                continue              count = 0              while count <= fragment_retries:                  try: -      
              success = ctx['dl'].download(target_filename, { -                        'url': segment_url, -                        'http_headers': info_dict.get('http_headers'), -                    }) +                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)                      if not success:                          return False -                    down, target_sanitized = sanitize_open(target_filename, 'rb') -                    down_data = down.read()                      if not track_written: -                        tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd']) +                        tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])                          info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]                          write_piff_header(ctx['dest_stream'], info_dict['_download_params'])                          track_written = True -                    ctx['dest_stream'].write(down_data) -                    down.close() -                    segments_filenames.append(target_sanitized) +                    self._append_fragment(ctx, frag_content)                      break                  except compat_urllib_error.HTTPError as err:                      count += 1                      if count <= fragment_retries: -                        self.report_retry_fragment(err, segment_name, count, fragment_retries) +                        self.report_retry_fragment(err, frag_index, count, fragment_retries)              if count > fragment_retries:                  if skip_unavailable_fragments: -                    self.report_skip_fragment(segment_name) +                    self.report_skip_fragment(frag_index)                      continue                  self.report_error('giving up after %s fragment retries' % fragment_retries)                  return False          self._finish_frag_download(ctx) -        for segment_file in 
segments_filenames: -            os.remove(encodeFilename(segment_file)) -          return True diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 100cf997f..7da96c65c 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -1308,6 +1308,12 @@ class AdobePassIE(InfoExtractor):      _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'      _MVPD_CACHE = 'ap-mvpd' +    def _download_webpage_handle(self, *args, **kwargs): +        headers = kwargs.get('headers', {}) +        headers.update(self.geo_verification_headers()) +        kwargs['headers'] = headers +        return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) +      @staticmethod      def _get_mvpd_resource(provider_id, title, guid, rating):          channel = etree.Element('channel') diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index c01c67303..2dcdba9d2 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):                  for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):                      entries.append(self.url_result(                          compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) -                return self.playlist_result( -                    entries, self._html_search_meta('aetn:SeriesId', webpage), -                    self._html_search_meta('aetn:SeriesTitle', webpage)) -            elif url_parts_len == 2: +                if entries: +                    return self.playlist_result( +                        entries, self._html_search_meta('aetn:SeriesId', webpage), +                        self._html_search_meta('aetn:SeriesTitle', webpage)) +                else: +                    # single season +                    url_parts_len = 2 +            if 
url_parts_len == 2:                  entries = []                  for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):                      episode_attributes = extract_attributes(episode_item) @@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):                          url, episode_attributes['data-canonical'])                      entries.append(self.url_result(                          episode_url, 'AENetworks', -                        episode_attributes['data-videoid'])) +                        episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))                  return self.playlist_result(                      entries, self._html_search_meta('aetn:SeasonId', webpage)) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 78d29c861..c8cb91dcb 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor):                      file_url, video_id, 'mp4', entry_protocol='m3u8_native',                      m3u8_id='hls',                      note='Downloading part %d m3u8 information' % file_num) -                title = title if one else '%s (part %d)' % (title, file_num)                  file_info = common_entry.copy()                  file_info.update({                      'id': format_id, -                    'title': title, +                    'title': title if one else '%s (part %d)' % (title, file_num),                      'upload_date': upload_date,                      'duration': file_duration,                      'formats': formats, diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index e8e40126b..fde1a8ff7 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -7,15 +7,19 @@ from ..utils import (      parse_iso8601,      mimetype2ext,      determine_ext, +    ExtractorError,  )  class 
AMPIE(InfoExtractor):      # parse Akamai Adaptive Media Player feed      def _extract_feed_info(self, url): -        item = self._download_json( +        feed = self._download_json(              url, None, 'Downloading Akamai AMP feed', -            'Unable to download Akamai AMP feed')['channel']['item'] +            'Unable to download Akamai AMP feed') +        item = feed.get('channel', {}).get('item') +        if not item: +            raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))          video_id = item['guid'] @@ -30,9 +34,12 @@ class AMPIE(InfoExtractor):              if isinstance(media_thumbnail, dict):                  media_thumbnail = [media_thumbnail]              for thumbnail_data in media_thumbnail: -                thumbnail = thumbnail_data['@attributes'] +                thumbnail = thumbnail_data.get('@attributes', {}) +                thumbnail_url = thumbnail.get('url') +                if not thumbnail_url: +                    continue                  thumbnails.append({ -                    'url': self._proto_relative_url(thumbnail['url'], 'http:'), +                    'url': self._proto_relative_url(thumbnail_url, 'http:'),                      'width': int_or_none(thumbnail.get('width')),                      'height': int_or_none(thumbnail.get('height')),                  }) @@ -43,9 +50,14 @@ class AMPIE(InfoExtractor):              if isinstance(media_subtitle, dict):                  media_subtitle = [media_subtitle]              for subtitle_data in media_subtitle: -                subtitle = subtitle_data['@attributes'] -                lang = subtitle.get('lang') or 'en' -                subtitles[lang] = [{'url': subtitle['href']}] +                subtitle = subtitle_data.get('@attributes', {}) +                subtitle_href = subtitle.get('href') +                if not subtitle_href: +                    continue +                subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ +        
            'url': subtitle_href, +                    'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href), +                })          formats = []          media_content = get_media_node('content') diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 623f44dce..8023da702 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -5,6 +5,7 @@ import base64  import hashlib  import json  import random +import re  import time  from .common import InfoExtractor @@ -16,6 +17,7 @@ from ..utils import (      intlist_to_bytes,      int_or_none,      strip_jsonp, +    unescapeHTML,  ) @@ -26,6 +28,8 @@ def md5_text(s):  class AnvatoIE(InfoExtractor): +    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' +      # Copied from anvplayer.min.js      _ANVACK_TABLE = {          'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):          'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'      } +    _MCP_TO_ACCESS_KEY_TABLE = { +        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', +        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', +        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', +        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', +        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', +        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', +        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', +        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', +        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', +        
'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', +        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', +        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' +    } + +    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'      _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'      def __init__(self, *args, **kwargs): @@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):              }              if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): -                # Not using _extract_m3u8_formats here as individual media -                # playlists are also included in published_urls. -                if tbr is None: -                    formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) -                    continue -                else: +                if tbr is not None:                      a_format.update({                          'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),                          'ext': 'mp4', @@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):              'subtitles': subtitles,          } +    @staticmethod +    def _extract_urls(ie, webpage, video_id): +        entries = [] +        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): +            anvplayer_data = ie._parse_json( +                mobj.group('anvp'), video_id, transform_source=unescapeHTML, +                fatal=False) +            if not anvplayer_data: +                continue +            video = anvplayer_data.get('video') +            if not isinstance(video, compat_str) or not video.isdigit(): +                continue +            access_key = anvplayer_data.get('accessKey') +            if not access_key: +                mcp = anvplayer_data.get('mcp') +                if mcp: +                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( +                        
mcp.lower()) +            if not access_key: +                continue +            entries.append(ie.url_result( +                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), +                video_id=video)) +        return entries +      def _extract_anvato_videos(self, webpage, video_id): -        anvplayer_data = self._parse_json(self._html_search_regex( -            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, -            'Anvato player data'), video_id) +        anvplayer_data = self._parse_json( +            self._html_search_regex( +                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), +            video_id)          return self._get_anvato_videos(              anvplayer_data['accessKey'], anvplayer_data['video']) + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        access_key, video_id = mobj.group('access_key_or_mcp', 'id') +        if access_key not in self._ANVACK_TABLE: +            access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] +        return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index ea7a70393..a84b8b1eb 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):      _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'      _TEST = {          'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', -        'md5': '10d0f2799111df4cb1c924520ca78f98', +        'md5': 'e7c38568a01ea45402570e6029206723',          'info_dict': {              'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',              'ext': 'm4v',              'title': 'Energy',              'uploader': 'Drake', -            'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', +       
     'thumbnail': r're:^https?://.*\.jpg$',              'upload_date': '20150710',              'timestamp': 1436545535,          }, diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a6801f3d4..b45b431e1 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):      }, {          'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',          'info_dict': { -            'id': 'blackthorn', +            'id': '4489', +            'title': 'Blackthorn',          },          'playlist_mincount': 2,          'expected_warnings': ['Unable to download JSON metadata'], @@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):              'title': 'Most Popular',              'id': 'mostpopular',          }, -        'playlist_mincount': 80, +        'playlist_mincount': 30,      }, {          'url': 'http://trailers.apple.com/#section=moviestudios',          'info_dict': { diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index e21045bed..3c7d7250b 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):          }      }, {          'url': 'https://archive.org/details/Cops1922', -        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', +        'md5': '0869000b4ce265e8ca62738b336b268a',          'info_dict': {              'id': 'Cops1922',              'ext': 'mp4',              'title': 'Buster Keaton\'s "Cops" (1922)', -            'description': 'md5:b4544662605877edd99df22f9620d858', +            'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',          }      }, {          'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 69a23e88c..56baef29d 100644 --- 
a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor):  class ArteTVPlus7IE(ArteTVBaseIE):      IE_NAME = 'arte.tv:+7' -    _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' +    _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'      _TESTS = [{          'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', @@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE):      }, {          'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',          'only_matching': True, +    }, { +        'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn', +        'only_matching': True,      }]      @classmethod diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 99af6dc5a..01fa308ff 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):          },          {              'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', -            'md5': '0d0e918533bbd4b263f2de4d197d4aac', +            'md5': '6e52cbb513c405e403dbacb7aacf8747',              'info_dict': {                  'id': 'capitulo-112-david-bustamante',                  'ext': 'flv', diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 8fc5f65c6..e48bb8972 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):              'title': '3/09/2016 Czaban Hour 3',              'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans',              'duration': 2245.72, -            'uploader': 'Steve Czaban', 
+            'uploader': 'SB Nation A.M.',              'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',          }      }, { diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py deleted file mode 100644 index 3ba2f00d3..000000000 --- a/youtube_dl/extractor/azubu.py +++ /dev/null @@ -1,140 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( -    ExtractorError, -    float_or_none, -    sanitized_Request, -) - - -class AzubuIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)' -    _TESTS = [ -        { -            'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1', -            'md5': 'a88b42fcf844f29ad6035054bd9ecaf4', -            'info_dict': { -                'id': '15575', -                'ext': 'mp4', -                'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1', -                'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01', -                'thumbnail': r're:^https?://.*\.jpe?g', -                'timestamp': 1417523507.334, -                'upload_date': '20141202', -                'duration': 9988.7, -                'uploader': 'GSL', -                'uploader_id': 414310, -                'view_count': int, -            }, -        }, -        { -            'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-', -            'md5': 'b72a871fe1d9f70bd7673769cdb3b925', -            'info_dict': { -                'id': '9344', -                'ext': 'mp4', -                'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"', -                'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af', -                'thumbnail': r're:^https?://.*\.jpe?g', -                'timestamp': 1410530893.320, -           
     'upload_date': '20140912', -                'duration': 172.385, -                'uploader': 'FnaticTV', -                'uploader_id': 272749, -                'view_count': int, -            }, -            'skip': 'Channel offline', -        }, -    ] - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        data = self._download_json( -            'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] - -        title = data['title'].strip() -        description = data.get('description') -        thumbnail = data.get('thumbnail') -        view_count = data.get('view_count') -        user = data.get('user', {}) -        uploader = user.get('username') -        uploader_id = user.get('id') - -        stream_params = json.loads(data['stream_params']) - -        timestamp = float_or_none(stream_params.get('creationDate'), 1000) -        duration = float_or_none(stream_params.get('length'), 1000) - -        renditions = stream_params.get('renditions') or [] -        video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') -        if video: -            renditions.append(video) - -        if not renditions and not user.get('channel', {}).get('is_live', True): -            raise ExtractorError('%s said: channel is offline.' 
% self.IE_NAME, expected=True) - -        formats = [{ -            'url': fmt['url'], -            'width': fmt['frameWidth'], -            'height': fmt['frameHeight'], -            'vbr': float_or_none(fmt['encodingRate'], 1000), -            'filesize': fmt['size'], -            'vcodec': fmt['videoCodec'], -            'container': fmt['videoContainer'], -        } for fmt in renditions if fmt['url']] -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'timestamp': timestamp, -            'duration': duration, -            'uploader': uploader, -            'uploader_id': uploader_id, -            'view_count': view_count, -            'formats': formats, -        } - - -class AzubuLiveIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$' - -    _TESTS = [{ -        'url': 'http://www.azubu.tv/MarsTVMDLen', -        'only_matching': True, -    }, { -        'url': 'http://azubu.uol.com.br/adolfz', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        user = self._match_id(url) - -        info = self._download_json( -            'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user), -            user)['data'] -        if info['type'] != 'STREAM': -            raise ExtractorError('{0} is not streaming live'.format(user), expected=True) - -        req = sanitized_Request( -            'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id']) -        req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV') -        bc_info = self._download_json(req, user) -        m3u8_url = next(source['src'] for source in bc_info['sources'] if 
source['container'] == 'M2TS') -        formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') -        self._sort_formats(formats) - -        return { -            'id': info['id'], -            'title': self._live_title(info['title']), -            'uploader_id': user, -            'formats': formats, -            'is_live': True, -            'thumbnail': bc_info['poster'], -        } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 056e06376..df2972f26 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }, {          'url': 'http://benprunty.bandcamp.com/track/lanius-battle', -        'md5': '73d0b3171568232574e45652f8720b5c', +        'md5': '0369ace6b939f0927e62c67a1a8d9fa7',          'info_dict': {              'id': '2650410135', -            'ext': 'mp3', -            'title': 'Lanius (Battle)', -            'uploader': 'Ben Prunty Music', +            'ext': 'aiff', +            'title': 'Ben Prunty - Lanius (Battle)', +            'uploader': 'Ben Prunty',          },      }] diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b0b7914d8..d5c5822f2 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'      _TEST = {          'url': 'http://beeg.com/5416503', -        'md5': '46c384def73b33dbc581262e5ee67cef', +        'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',          'info_dict': {              'id': '5416503',              'ext': 'mp4', diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 7a8e1f60b..e829974ff 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -35,7 +35,7 @@ class 
BleacherReportIE(InfoExtractor):              'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',              'timestamp': 1446839961,              'uploader': 'Sean Fay', -            'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', +            'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',              'uploader_id': 6466954,              'upload_date': '20151011',          }, @@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):      _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'      _TESTS = [{          'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', -        'md5': '8c2c12e3af7805152675446c905d159b', +        'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',          'info_dict': {              'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', -            'ext': 'mp4', +            'ext': 'flv',              'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',              'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',          }, -        'params': { -            # m3u8 download -            'skip_download': True, -        },      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index ff0aa11b1..2c32b6ae2 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -77,7 +77,7 @@ class BRIE(InfoExtractor):                  'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',                  'duration': 893,                  'uploader': 'Eva Maria Steimle', -                'upload_date': '20140117', +                'upload_date': '20170208',              }          },      ] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 124497e95..3f017a2b1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -131,6 +131,12 @@ class BrightcoveLegacyIE(InfoExtractor):  
            },              'playlist_mincount': 10,          }, +        { +            # playerID inferred from bcpid +            # from http://www.un.org/chinese/News/story.asp?NewsID=27724 +            'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', +            'only_matching': True,  # Tested in GenericIE +        }      ]      FLV_VCODECS = {          1: 'SORENSON', @@ -266,9 +272,13 @@ class BrightcoveLegacyIE(InfoExtractor):          if matches:              return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) -        return list(filter(None, [ -            cls._build_brighcove_url_from_js(custom_bc) -            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) +        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) +        if matches: +            return list(filter(None, [ +                cls._build_brighcove_url_from_js(custom_bc) +                for custom_bc in matches])) +        return [src for _, src in re.findall( +            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]      def _real_extract(self, url):          url, smuggled_data = unsmuggle_url(url, {}) @@ -285,6 +295,10 @@ class BrightcoveLegacyIE(InfoExtractor):          if videoPlayer:              # We set the original url as the default 'Referer' header              referer = smuggled_data.get('Referer', url) +            if 'playerID' not in query: +                mobj = re.search(r'/bcpid(\d+)', url) +                if mobj is not None: +                    query['playerID'] = [mobj.group(1)]              return self._get_video_info(                  videoPlayer[0], query, referer=referer)          elif 'playerKey' in query: @@ -484,8 +498,8 @@ class BrightcoveNewIE(InfoExtractor):      }]      @staticmethod -    def _extract_url(webpage): -        urls 
= BrightcoveNewIE._extract_urls(webpage) +    def _extract_url(ie, webpage): +        urls = BrightcoveNewIE._extract_urls(ie, webpage)          return urls[0] if urls else None      @staticmethod @@ -508,7 +522,7 @@ class BrightcoveNewIE(InfoExtractor):          # [2] looks like:          for video, script_tag, account_id, player_id, embed in re.findall(                  r'''(?isx) -                    (<video\s+[^>]+>) +                    (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)                      (?:.*?                          (<script[^>]+                              src=["\'](?:https?:)?//players\.brightcove\.net/ diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index f1f128c45..acd87e371 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):          'md5': '060158428b650f896c542dfbb3d6487f',          'info_dict': {              'id': '12163', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Terrasses du Numérique',              'duration': 122,          }, -        'params': { -            'skip_download': True,  # Requires rtmpdump -        }      }, {          'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',          'only_matching': True, diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index cf678e7f8..87ad14e91 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):          'info_dict': {              'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',              'id': 'dog-indoor-exercise-winter-1.3928238', +            'description': 'md5:c18552e41726ee95bd75210d1ca9194c',          },          'playlist_mincount': 6,      }] @@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):              'uploader': 'CBCC-NEW',          },      }, 
{ -        # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url          'url': 'http://www.cbc.ca/player/play/2164402062', -        'md5': '17a61eb813539abea40618d6323a7f82', +        'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',          'info_dict': {              'id': '2164402062', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Cancer survivor four times over',              'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',              'timestamp': 1320410746, diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 8d5f11dd1..7d78e3aae 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):              'title': 'A Very Blue Anniversary',              'description': 'CBS2’s Cindy Hsu has more.',              'thumbnail': 're:^https?://.*', -            'timestamp': 1479962220, -            'upload_date': '20161124', +            'timestamp': int, +            'upload_date': r're:^\d{8}$',              'uploader': 'CBS',              'subtitles': {                  'en': 'mincount:5', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1ee35b501..78b7a923c 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -9,7 +9,10 @@ from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    multipart_encode,      parse_duration, +    random_birthday, +    urljoin,  ) @@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):              'description': 'md5:269ccd135d550da90d1662651fcb9772',              'thumbnail': r're:^https?://.*\.jpg$',              'average_rating': float, -            'duration': 39 +            'duration': 39, +            'age_limit': 0,          }      }, {          'url': 'http://www.cda.pl/video/57413289', @@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):              'uploader': 
'crash404',              'view_count': int,              'average_rating': float, -            'duration': 137 +            'duration': 137, +            'age_limit': 0,          }      }, { +        # Age-restricted +        'url': 'http://www.cda.pl/video/1273454c4', +        'info_dict': { +            'id': '1273454c4', +            'ext': 'mp4', +            'title': 'Bronson (2008) napisy HD 1080p', +            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', +            'height': 1080, +            'uploader': 'boniek61', +            'thumbnail': r're:^https?://.*\.jpg$', +            'duration': 5554, +            'age_limit': 18, +            'view_count': int, +            'average_rating': float, +        }, +    }, {          'url': 'http://ebd.cda.pl/0x0/5749950c',          'only_matching': True,      }] +    def _download_age_confirm_page(self, url, video_id, *args, **kwargs): +        form_data = random_birthday('rok', 'miesiac', 'dzien') +        form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) +        data, content_type = multipart_encode(form_data) +        return self._download_webpage( +            urljoin(url, '/a/validatebirth'), video_id, *args, +            data=data, headers={ +                'Referer': url, +                'Content-Type': content_type, +            }, **kwargs) +      def _real_extract(self, url):          video_id = self._match_id(url)          self._set_cookie('cda.pl', 'cda.player', 'html5') @@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):          if 'Ten film jest dostępny dla użytkowników premium' in webpage:              raise ExtractorError('This video is only available for premium users.', expected=True) +        need_confirm_age = False +        if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")', +                                   webpage, 'birthday validate form', default=None): +            webpage = self._download_age_confirm_page( +                url, 
video_id, note='Confirming age') +            need_confirm_age = True +          formats = []          uploader = self._search_regex(r'''(?x) @@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):              'thumbnail': self._og_search_thumbnail(webpage),              'formats': formats,              'duration': None, +            'age_limit': 18 if need_confirm_age else 0,          }          def extract_format(page, version): @@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):          for href, resolution in re.findall(                  r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',                  webpage): -            webpage = self._download_webpage( +            if need_confirm_age: +                handler = self._download_age_confirm_page +            else: +                handler = self._download_webpage + +            webpage = handler(                  self._BASE_URL + href, video_id,                  'Downloading %s version information' % resolution, fatal=False)              if not webpage: @@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):                  # invalid version is requested.                  
self.report_warning('Unable to download %s version information' % resolution)                  continue +              extract_format(webpage, resolution)          self._sort_formats(formats) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index bb52e0c6f..0920f6219 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', -        'md5': '720563e467b86374c194bdead08d207d', +        'md5': 'b9a5dc46294154c1193e2d10e0c95693',          'info_dict': {              'id': '4343170',              'ext': 'mp4', diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index 18c734766..6a41db87c 100644 --- a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',                  'description': '', -                'thumbnail': r're:^https?://.*\.jpg$', +                'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',                  'duration': 7713.088,                  'timestamp': 1413309600,                  'upload_date': '20141014', @@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):                  'ext': 'wmv',                  'title': '64ste Vakantiecursus: Afvalwater',                  'description': 'md5:7fd774865cc69d972f542b157c328305', +                'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',                  'duration': 10853,                  'timestamp': 1326446400,                  'upload_date': '20120113', diff --git a/youtube_dl/extractor/common.py 
b/youtube_dl/extractor/common.py index e54adc9f0..76b5378e9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -245,6 +245,10 @@ class InfoExtractor(object):                      specified in the URL.      end_time:       Time in seconds where the reproduction should end, as                      specified in the URL. +    chapters:       A list of dictionaries, with the following entries: +                        * "start_time" - The start time of the chapter in seconds +                        * "end_time" - The end time of the chapter in seconds +                        * "title" (optional, string)      The following fields should only be used when the video belongs to some logical      chapter or section: @@ -976,6 +980,23 @@ class InfoExtractor(object):              return info          if isinstance(json_ld, dict):              json_ld = [json_ld] + +        def extract_video_object(e): +            assert e['@type'] == 'VideoObject' +            info.update({ +                'url': e.get('contentUrl'), +                'title': unescapeHTML(e.get('name')), +                'description': unescapeHTML(e.get('description')), +                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), +                'duration': parse_duration(e.get('duration')), +                'timestamp': unified_timestamp(e.get('uploadDate')), +                'filesize': float_or_none(e.get('contentSize')), +                'tbr': int_or_none(e.get('bitrate')), +                'width': int_or_none(e.get('width')), +                'height': int_or_none(e.get('height')), +                'view_count': int_or_none(e.get('interactionCount')), +            }) +          for e in json_ld:              if e.get('@context') == 'http://schema.org':                  item_type = e.get('@type') @@ -1000,18 +1021,11 @@ class InfoExtractor(object):                          'description': unescapeHTML(e.get('articleBody')),                      })    
              elif item_type == 'VideoObject': -                    info.update({ -                        'url': e.get('contentUrl'), -                        'title': unescapeHTML(e.get('name')), -                        'description': unescapeHTML(e.get('description')), -                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), -                        'duration': parse_duration(e.get('duration')), -                        'timestamp': unified_timestamp(e.get('uploadDate')), -                        'filesize': float_or_none(e.get('contentSize')), -                        'tbr': int_or_none(e.get('bitrate')), -                        'width': int_or_none(e.get('width')), -                        'height': int_or_none(e.get('height')), -                    }) +                    extract_video_object(e) +                elif item_type == 'WebPage': +                    video = e.get('video') +                    if isinstance(video, dict) and video.get('@type') == 'VideoObject': +                        extract_video_object(video)                  break          return dict((k, v) for k, v in info.items() if v is not None) @@ -1303,40 +1317,50 @@ class InfoExtractor(object):                                entry_protocol='m3u8', preference=None,                                m3u8_id=None, note=None, errnote=None,                                fatal=True, live=False): -          res = self._download_webpage_handle(              m3u8_url, video_id,              note=note or 'Downloading m3u8 information',              errnote=errnote or 'Failed to download m3u8 information',              fatal=fatal) +          if res is False:              return [] +          m3u8_doc, urlh = res          m3u8_url = urlh.geturl() +        return self._parse_m3u8_formats( +            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, +            preference=preference, m3u8_id=m3u8_id, live=live) + +    def _parse_m3u8_formats(self, m3u8_doc, 
m3u8_url, ext=None, +                            entry_protocol='m3u8', preference=None, +                            m3u8_id=None, live=False):          if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access              return [] -        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] +        formats = []          format_url = lambda u: (              u              if re.match(r'^https?://', u)              else compat_urlparse.urljoin(m3u8_url, u)) -        # We should try extracting formats only from master playlists [1], i.e. -        # playlists that describe available qualities. On the other hand media -        # playlists [2] should be returned as is since they contain just the media -        # without qualities renditions. +        # References: +        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 +        # 2. https://github.com/rg3/youtube-dl/issues/12211 + +        # We should try extracting formats only from master playlists [1, 4.3.4], +        # i.e. playlists that describe available qualities. On the other hand +        # media playlists [1, 4.3.3] should be returned as is since they contain +        # just the media without qualities renditions.          # Fortunately, master playlist can be easily distinguished from media -        # playlist based on particular tags availability. As of [1, 2] master -        # playlist tags MUST NOT appear in a media playist and vice versa. -        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist -        # and MUST NOT appear in master playlist thus we can clearly detect media -        # playlist with this criterion. -        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 -        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 -        # 3. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 +        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] +        # master playlist tags MUST NOT appear in a media playist and vice versa. +        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every +        # media playlist and MUST NOT appear in master playlist thus we can +        # clearly detect media playlist with this criterion. +          if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is              return [{                  'url': m3u8_url, @@ -1345,52 +1369,72 @@ class InfoExtractor(object):                  'protocol': entry_protocol,                  'preference': preference,              }] -        audio_in_video_stream = {} -        last_info = {} -        last_media = {} + +        groups = {} +        last_stream_inf = {} + +        def extract_media(x_media_line): +            media = parse_m3u8_attributes(x_media_line) +            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED +            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') +            if not (media_type and group_id and name): +                return +            groups.setdefault(group_id, []).append(media) +            if media_type not in ('VIDEO', 'AUDIO'): +                return +            media_url = media.get('URI') +            if media_url: +                format_id = [] +                for v in (group_id, name): +                    if v: +                        format_id.append(v) +                f = { +                    'format_id': '-'.join(format_id), +                    'url': format_url(media_url), +                    'manifest_url': m3u8_url, +                    'language': media.get('LANGUAGE'), +                    'ext': ext, +                    'protocol': entry_protocol, +                    'preference': preference, +                } +            
    if media_type == 'AUDIO': +                    f['vcodec'] = 'none' +                formats.append(f) + +        def build_stream_name(): +            # Despite specification does not mention NAME attribute for +            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] +            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) +            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 +            stream_name = last_stream_inf.get('NAME') +            if stream_name: +                return stream_name +            # If there is no NAME in EXT-X-STREAM-INF it will be obtained +            # from corresponding rendition group +            stream_group_id = last_stream_inf.get('VIDEO') +            if not stream_group_id: +                return +            stream_group = groups.get(stream_group_id) +            if not stream_group: +                return stream_group_id +            rendition = stream_group[0] +            return rendition.get('NAME') or stream_group_id +          for line in m3u8_doc.splitlines():              if line.startswith('#EXT-X-STREAM-INF:'): -                last_info = parse_m3u8_attributes(line) +                last_stream_inf = parse_m3u8_attributes(line)              elif line.startswith('#EXT-X-MEDIA:'): -                media = parse_m3u8_attributes(line) -                media_type = media.get('TYPE') -                if media_type in ('VIDEO', 'AUDIO'): -                    group_id = media.get('GROUP-ID') -                    media_url = media.get('URI') -                    if media_url: -                        format_id = [] -                        for v in (group_id, media.get('NAME')): -                            if v: -                                format_id.append(v) -                        f = { -                            'format_id': '-'.join(format_id), -                            'url': format_url(media_url), -                            
'language': media.get('LANGUAGE'), -                            'ext': ext, -                            'protocol': entry_protocol, -                            'preference': preference, -                        } -                        if media_type == 'AUDIO': -                            f['vcodec'] = 'none' -                            if group_id and not audio_in_video_stream.get(group_id): -                                audio_in_video_stream[group_id] = False -                        formats.append(f) -                    else: -                        # When there is no URI in EXT-X-MEDIA let this tag's -                        # data be used by regular URI lines below -                        last_media = media -                        if media_type == 'AUDIO' and group_id: -                            audio_in_video_stream[group_id] = True +                extract_media(line)              elif line.startswith('#') or not line.strip():                  continue              else: -                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) +                tbr = float_or_none( +                    last_stream_inf.get('AVERAGE-BANDWIDTH') or +                    last_stream_inf.get('BANDWIDTH'), scale=1000)                  format_id = []                  if m3u8_id:                      format_id.append(m3u8_id) -                # Despite specification does not mention NAME attribute for -                # EXT-X-STREAM-INF it still sometimes may be present -                stream_name = last_info.get('NAME') or last_media.get('NAME') +                stream_name = build_stream_name()                  # Bandwidth of live streams may differ over time thus making                  # format_id unpredictable. So it's better to keep provided                  # format_id intact. 
@@ -1400,14 +1444,14 @@ class InfoExtractor(object):                  f = {                      'format_id': '-'.join(format_id),                      'url': manifest_url, -                    'manifest_url': manifest_url, +                    'manifest_url': m3u8_url,                      'tbr': tbr,                      'ext': ext, -                    'fps': float_or_none(last_info.get('FRAME-RATE')), +                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),                      'protocol': entry_protocol,                      'preference': preference,                  } -                resolution = last_info.get('RESOLUTION') +                resolution = last_stream_inf.get('RESOLUTION')                  if resolution:                      mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)                      if mobj: @@ -1423,13 +1467,26 @@ class InfoExtractor(object):                          'vbr': vbr,                          'abr': abr,                      }) -                f.update(parse_codecs(last_info.get('CODECS'))) -                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none': -                    # TODO: update acodec for audio only formats with the same GROUP-ID -                    f['acodec'] = 'none' +                codecs = parse_codecs(last_stream_inf.get('CODECS')) +                f.update(codecs) +                audio_group_id = last_stream_inf.get('AUDIO') +                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which +                # references a rendition group MUST have a CODECS attribute. +                # However, this is not always respected, for example, [2] +                # contains EXT-X-STREAM-INF tag which references AUDIO +                # rendition group but does not have CODECS and despite +                # referencing audio group an audio group, it represents +                # a complete (with audio and video) format. 
So, for such cases +                # we will ignore references to rendition groups and treat them +                # as complete formats. +                if audio_group_id and codecs and f.get('vcodec') != 'none': +                    audio_group = groups.get(audio_group_id) +                    if audio_group and audio_group[0].get('URI'): +                        # TODO: update acodec for audio only formats with +                        # the same GROUP-ID +                        f['acodec'] = 'none'                  formats.append(f) -                last_info = {} -                last_media = {} +                last_stream_inf = {}          return formats      @staticmethod @@ -1803,7 +1860,7 @@ class InfoExtractor(object):                              'ext': mimetype2ext(mime_type),                              'width': int_or_none(representation_attrib.get('width')),                              'height': int_or_none(representation_attrib.get('height')), -                            'tbr': int_or_none(bandwidth, 1000), +                            'tbr': float_or_none(bandwidth, 1000),                              'asr': int_or_none(representation_attrib.get('audioSamplingRate')),                              'fps': int_or_none(representation_attrib.get('frameRate')),                              'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, @@ -2182,7 +2239,7 @@ class InfoExtractor(object):      def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):          mobj = re.search( -            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)', +            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',              webpage)          if mobj:              try: @@ -2258,11 +2315,17 @@ class InfoExtractor(object):      def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,                                 
 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): +        urls = []          formats = []          for source in jwplayer_sources_data: -            source_url = self._proto_relative_url(source['file']) +            source_url = self._proto_relative_url(source.get('file')) +            if not source_url: +                continue              if base_url:                  source_url = compat_urlparse.urljoin(base_url, source_url) +            if source_url in urls: +                continue +            urls.append(source_url)              source_type = source.get('type') or ''              ext = mimetype2ext(source_type) or determine_ext(source_url)              if source_type == 'hls' or ext == 'm3u8': diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py index 5fa1f006b..6ea03e65c 100644 --- a/youtube_dl/extractor/coub.py +++ b/youtube_dl/extractor/coub.py @@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):              'duration': 4.6,              'timestamp': 1428527772,              'upload_date': '20150408', -            'uploader': 'Артём Лоскутников', +            'uploader': 'Artyom Loskutnikov',              'uploader_id': 'artyom.loskutnikov',              'view_count': int,              'like_count': int,              'repost_count': int, -            'comment_count': int,              'age_limit': 0,          },      }, { @@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):          view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))          like_count = int_or_none(coub.get('likes_count'))          repost_count = int_or_none(coub.get('recoubs_count')) -        comment_count = int_or_none(coub.get('comments_count'))          age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))          if age_restricted is not None: @@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):              'view_count': view_count,              'like_count': like_count,              
'repost_count': repost_count, -            'comment_count': comment_count,              'age_limit': age_limit,              'formats': formats,          } diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2ed8b30bb..2ffa4a7f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):          'info_dict': {              'id': '727589',              'ext': 'mp4', -            'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", +            'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",              'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': 'Kadokawa Pictures Inc.', @@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):              'series': "KONOSUBA -God's blessing on this wonderful world!",              'season': "KONOSUBA -God's blessing on this wonderful world! 
2",              'season_number': 2, -            'episode': 'Give Me Deliverance from this Judicial Injustice!', +            'episode': 'Give Me Deliverance From This Judicial Injustice!',              'episode_number': 1,          },          'params': { diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 246efde43..441114d19 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -51,6 +51,24 @@ class DailymotionIE(DailymotionBaseInfoExtractor):      _TESTS = [          { +            'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', +            'md5': '074b95bdee76b9e3654137aee9c79dfe', +            'info_dict': { +                'id': 'x5kesuj', +                'ext': 'mp4', +                'title': 'Office Christmas Party Review –  Jason Bateman, Olivia Munn, T.J. Miller', +                'description': 'Office Christmas Party Review -  Jason Bateman, Olivia Munn, T.J. 
Miller', +                'thumbnail': r're:^https?:.*\.(?:jpg|png)$', +                'duration': 187, +                'timestamp': 1493651285, +                'upload_date': '20170501', +                'uploader': 'Deadline', +                'uploader_id': 'x1xm8ri', +                'age_limit': 0, +                'view_count': int, +            }, +        }, +        {              'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',              'md5': '2137c41a8e78554bb09225b8eb322406',              'info_dict': { @@ -66,7 +84,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):                  'uploader_id': 'xijv66',                  'age_limit': 0,                  'view_count': int, -            } +            }, +            'skip': 'video gone',          },          # Vevo video          { diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index bdfe638b4..5c9c0ecdc 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):          'info_dict': {              'id': '2015-0703-001',              'ext': 'mp4', -            'title': 'Daily Show', +            'title': 'Daily Show for July 03, 2015', +            'description': 'md5:80eb927244d6749900de6072c7cc2c86',          },      }, {          'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index 1f75352ca..148605c0b 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):              'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',              'duration': 290,              'timestamp': 1476767794.2809999, -            'upload_date': '20160525', +            'upload_date': 
'20161018',              'uploader': 'parthivi001',              'uploader_id': 'user52596202',              'view_count': int, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 82d8a042f..d22133d24 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -20,7 +20,7 @@ class DouyuTVIE(InfoExtractor):              'id': '17732',              'display_id': 'iseven',              'ext': 'flv', -            'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'description': r're:.*m7show@163\.com.*',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': '7师傅', @@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor):              'id': '17732',              'display_id': '17732',              'ext': 'flv', -            'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'title': 're:^清晨醒脑!根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'description': r're:.*m7show@163\.com.*',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': '7师傅', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1671090f4..c0020dd7d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE  from .amcnetworks import AMCNetworksIE  from .animeondemand import AnimeOnDemandIE  from .anitube import AnitubeIE +from .anvato import AnvatoIE  from .anysex import AnySexIE  from .aol import AolIE  from .allocine import AllocineIE @@ -87,7 +88,6 @@ from .azmedien import (      AZMedienPlaylistIE,      AZMedienShowPlaylistIE,  ) -from .azubu import AzubuIE, AzubuLiveIE  from .baidu import BaiduVideoIE  from .bambuser import BambuserIE, BambuserChannelIE  from .bandcamp import BandcampIE, BandcampAlbumIE @@ -663,6 +663,7 @@ from .nintendo import NintendoIE  from .njpwworld import NJPWWorldIE  from .nobelprize import NobelPrizeIE  from .noco import NocoIE +from .noovo import NoovoIE  from .normalboots import NormalbootsIE  from .nosvideo import NosVideoIE  from .nova import NovaIE @@ -939,6 +940,7 @@ from .srmediathek import SRMediathekIE  from .stanfordoc import StanfordOpenClassroomIE  from .steam import SteamIE  from .streamable import StreamableIE +from .streamango import StreamangoIE  from .streamcloud import StreamcloudIE  from .streamcz import StreamCZIE  from .streetvoice import StreetVoiceIE @@ -1233,7 +1235,10 @@ from .wrzuta import (      WrzutaIE,      WrzutaPlaylistIE,  ) -from .wsj import WSJIE +from .wsj import ( +    WSJIE, +    WSJArticleIE, +)  from .xbef import XBefIE  from .xboxclips import XboxClipsIE  from .xfileshare import XFileShareIE @@ -1295,5 +1300,6 @@ from .youtube import (      YoutubeWatchLaterIE,  )  from .zapiks import ZapiksIE +from .zaq1 import Zaq1IE  from .zdf import ZDFIE, ZDFChannelIE  from 
.zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index a3bb98377..985542727 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'      _TEST = { -        'url': 'http://www.foxsports.com/video?vid=432609859715', +        'url': 'http://www.foxsports.com/tennessee/video/432609859715',          'md5': 'b49050e955bebe32c301972e4012ac17',          'info_dict': { -            'id': 'i0qKWsk3qJaM', +            'id': 'bwduI3X_TgUB',              'ext': 'mp4',              'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',              'description': 'Courtney Lee talks about Memphis being focused.', @@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          config = self._parse_json( -            self._search_regex( -                r"data-player-config='([^']+)'", webpage, 'data player config'), +            self._html_search_regex( +                r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", +                webpage, 'data player config'),              video_id)          return self.url_result(smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 81c0ce9a3..49409369c 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor):              m3u8_url, video_id, 'mp4', 'm3u8_native',              m3u8_id='hls', fatal=False)          source_formats = list(filter( -            lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', -            m3u8_formats)) +            lambda f: f.get('vcodec') != 'none', m3u8_formats))          bitrates = [int(bitrate) for bitrate 
in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]          bitrates.sort() diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 682c49e79..00d311158 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -78,8 +78,7 @@ class GameSpotIE(OnceIE):                      if m3u8_formats:                          self._sort_formats(m3u8_formats)                          m3u8_formats = list(filter( -                            lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', -                            m3u8_formats)) +                            lambda f: f.get('vcodec') != 'none', m3u8_formats))                      if len(qualities) == len(m3u8_formats):                          for q, m3u8_format in zip(qualities, m3u8_formats):                              f = m3u8_format.copy() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bc7c21f7a..b06f43446 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -85,6 +85,9 @@ from .ustream import UstreamIE  from .openload import OpenloadIE  from .videopress import VideoPressIE  from .rutube import RutubeIE +from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE  class GenericIE(InfoExtractor): @@ -430,6 +433,22 @@ class GenericIE(InfoExtractor):              },          },          { +            # Brightcove video in <iframe> +            'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', +            'md5': '36d74ef5e37c8b4a2ce92880d208b968', +            'info_dict': { +                'id': '5360463607001', +                'ext': 'mp4', +                'title': '叙利亚失明儿童在废墟上演唱《心跳》  呼吁获得正常童年生活', +                'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', +                
'uploader': 'United Nations', +                'uploader_id': '1362235914001', +                'timestamp': 1489593889, +                'upload_date': '20170315', +            }, +            'add_ie': ['BrightcoveLegacy'], +        }, +        {              # Brightcove with alternative playerID key              'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',              'info_dict': { @@ -1410,6 +1429,22 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              },          }, +        { +            # Brightcove embed with whitespace around attribute names +            'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', +            'info_dict': { +                'id': '3167554373001', +                'ext': 'mp4', +                'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", +                'description': 'md5:57bacb0e0f29349de4972bfda3191713', +                'uploader_id': '1079349493', +                'upload_date': '20140207', +                'timestamp': 1391810548, +            }, +            'params': { +                'skip_download': True, +            }, +        },          # Another form of arte.tv embed          {              'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1651,6 +1686,38 @@ class GenericIE(InfoExtractor):              },              'add_ie': [SenateISVPIE.ie_key()],          }, +        { +            # Limelight embeds (1 channel embed + 4 media embeds) +            'url': 'http://www.sedona.com/FacilitatorTraining2017', +            'info_dict': { +                'id': 'FacilitatorTraining2017', +                'title': 'Facilitator Training 2017', +            }, +            'playlist_mincount': 5, +        }, +        { +            'url': 
'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', +            'info_dict': { +                'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', +                'title': 'Standoff with Walnut Creek murder suspect ends', +                'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', +            }, +            'playlist_mincount': 4, +        }, +        { +            # WashingtonPost embed +            'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', +            'info_dict': { +                'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', +                'ext': 'mp4', +                'title': "No one has seen the drama series based on Trump's life \u2014 until now", +                'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', +                'timestamp': 1455216756, +                'uploader': 'The Washington Post', +                'upload_date': '20160211', +            }, +            'add_ie': [WashingtonPostIE.ie_key()], +        },          # {          #     # TODO: find another test          #     # http://schema.org/VideoObject @@ -1693,7 +1760,7 @@ class GenericIE(InfoExtractor):                  continue              entries.append({ -                '_type': 'url', +                '_type': 'url_transparent',                  'url': next_url,                  'title': it.find('title').text,              }) @@ -2483,6 +2550,11 @@ class GenericIE(InfoExtractor):              return self.url_result(piksel_url, PikselIE.ie_key())          # Look for Limelight embeds +        limelight_urls = LimelightBaseIE._extract_urls(webpage, url) +        if limelight_urls: +            return self.playlist_result( +                limelight_urls, video_id, video_title, video_description) +          mobj = 
re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)          if mobj:              lm = { @@ -2506,6 +2578,12 @@ class GenericIE(InfoExtractor):                  'limelight:media:%s' % mobj.group('id'),                  {'source_url': url}), 'LimelightMedia', mobj.group('id')) +        # Look for Anvato embeds +        anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) +        if anvato_urls: +            return self.playlist_result( +                anvato_urls, video_id, video_title, video_description) +          # Look for AdobeTVVideo embeds          mobj = re.search(              r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', @@ -2623,6 +2701,12 @@ class GenericIE(InfoExtractor):              return self.playlist_from_matches(                  rutube_urls, ie=RutubeIE.ie_key()) +        # Look for WashingtonPost embeds +        wapo_urls = WashingtonPostIE._extract_urls(webpage) +        if wapo_urls: +            return self.playlist_from_matches( +                wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) +          # Looking for http://schema.org/VideoObject          json_ld = self._search_json_ld(              webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 4c9be47b4..9c7b1bd37 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,22 +36,26 @@ class GoIE(AdobePassIE):              'requestor_id': 'DisneyXD',          }      } -    _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) +    _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())      _TESTS = [{ -        'url': 
'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', +        'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',          'info_dict': { -            'id': '0_g86w5onx', +            'id': 'VDKA3807643',              'ext': 'mp4', -            'title': 'Sneak Peek: Language Arts', -            'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', +            'title': 'The Traitor in the White House', +            'description': 'md5:05b009d2d145a1e85d25111bd37222e8',          },          'params': {              # m3u8 download              'skip_download': True,          },      }, { -        'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', -        'only_matching': True, +        'url': 'http://watchdisneyxd.go.com/doraemon', +        'info_dict': { +            'title': 'Doraemon', +            'id': 'SH55574025', +        }, +        'playlist_mincount': 51,      }, {          'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',          'only_matching': True, @@ -60,19 +64,36 @@ class GoIE(AdobePassIE):          'only_matching': True,      }] +    def _extract_videos(self, brand, video_id='-1', show_id='-1'): +        display_id = video_id if video_id != '-1' else show_id +        return self._download_json( +            'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), +            display_id)['video'] +      def _real_extract(self, url):          sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() +        site_info = self._SITE_INFO[sub_domain] +        brand = site_info['brand']          if not video_id:              webpage = self._download_webpage(url, display_id)              video_id = self._search_regex(                  # There may be inner quotes, e.g. 
data-video-id="'VDKA3609139'"                  # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood -                r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') -        site_info = self._SITE_INFO[sub_domain] -        brand = site_info['brand'] -        video_data = self._download_json( -            'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), -            video_id)['video'][0] +                r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) +            if not video_id: +                # show extraction works for Disney, DisneyJunior and DisneyXD +                # ABC and Freeform has different layout +                show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') +                videos = self._extract_videos(brand, show_id=show_id) +                show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) +                entries = [] +                for video in videos: +                    entries.append(self.url_result( +                        video['url'], 'Go', video.get('id'), video.get('title'))) +                entries.reverse() +                return self.playlist_result(entries, show_id, show_title) +        video_data = self._extract_videos(brand, video_id)[0] +        video_id = video_data['id']          title = video_data['title']          formats = [] @@ -105,7 +126,7 @@ class GoIE(AdobePassIE):                      self._initialize_geo_bypass(['US'])                  entitlement = self._download_json(                      'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', -                    video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers()) +                    video_id, data=urlencode_postdata(data))                  errors = entitlement.get('errors', 
{}).get('errors', [])                  if errors:                      for error in errors: diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 3550eca7c..9b2e1c164 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -5,6 +5,7 @@ import re  from .common import InfoExtractor  from ..utils import ( +    determine_ext,      int_or_none,      parse_iso8601,  ) @@ -18,7 +19,7 @@ class Go90IE(InfoExtractor):          'info_dict': {              'id': '84BUqjLpf9D',              'ext': 'mp4', -            'title': 'Inside The Utah Coalition Against Pornography Convention', +            'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention',              'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.',              'timestamp': 1491868800,              'upload_date': '20170411', @@ -32,11 +33,28 @@ class Go90IE(InfoExtractor):              video_id, headers={                  'Content-Type': 'application/json; charset=utf-8',              }, data=b'{"client":"web","device_type":"pc"}') -        title = video_data['title']          main_video_asset = video_data['main_video_asset'] +        episode_number = int_or_none(video_data.get('episode_number')) +        series = None +        season = None +        season_id = None +        season_number = None +        for metadata in video_data.get('__children', {}).get('Item', {}).values(): +            if metadata.get('type') == 'show': +                series = metadata.get('title') +            elif metadata.get('type') == 'season': +                season = metadata.get('title') +                season_id = metadata.get('id') +                season_number = int_or_none(metadata.get('season_number')) + +        title = episode = video_data.get('title') or series +        if series and series != title: +            title = '%s - %s' % (series, 
title) +          thumbnails = []          formats = [] +        subtitles = {}          for asset in video_data.get('assets'):              if asset.get('id') == main_video_asset:                  for source in asset.get('sources', []): @@ -70,6 +88,15 @@ class Go90IE(InfoExtractor):                              'height': int_or_none(source.get('height')),                              'tbr': int_or_none(source.get('bitrate')),                          }) + +                for caption in asset.get('caption_metadata', []): +                    caption_url = caption.get('source_url') +                    if not caption_url: +                        continue +                    subtitles.setdefault(caption.get('language', 'en'), []).append({ +                        'url': caption_url, +                        'ext': determine_ext(caption_url, 'vtt'), +                    })              elif asset.get('type') == 'image':                  asset_location = asset.get('location')                  if not asset_location: @@ -89,4 +116,11 @@ class Go90IE(InfoExtractor):              'description': video_data.get('short_description'),              'like_count': int_or_none(video_data.get('like_count')),              'timestamp': parse_iso8601(video_data.get('released_at')), +            'series': series, +            'episode': episode, +            'season': season, +            'season_id': season_id, +            'season_number': season_number, +            'episode_number': episode_number, +            'subtitles': subtitles,          } diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9fb71e8ef..fe425e786 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):      def _extract_http_audio(self, webpage, video_id):          fields = self._hidden_inputs(webpage) -        http_audio_url = fields['filename'] -        if http_audio_url is None: +        http_audio_url = 
fields.get('filename') +        if not http_audio_url:              return []          cookies_header = {'Cookie': self._extract_cookies(webpage)} diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c1921cbcf..4667335e0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          (video_url, description, thumbnail, timestamp, uploader, -         uploader_id, like_count, comment_count, height, width) = [None] * 10 +         uploader_id, like_count, comment_count, comments, height, +         width) = [None] * 11          shared_data = self._parse_json(              self._search_regex( @@ -121,7 +122,10 @@ class InstagramIE(InfoExtractor):              video_id, fatal=False)          if shared_data:              media = try_get( -                shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict) +                shared_data, +                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], +                 lambda x: x['entry_data']['PostPage'][0]['media']), +                dict)              if media:                  video_url = media.get('video_url')                  height = int_or_none(media.get('dimensions', {}).get('height')) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2af6a6db4..fdfa7de9e 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor):          'only_matching': True,      }, {          'url': 'http://yule.iqiyi.com/pcb.html', -        'only_matching': True, +        'info_dict': { +            'id': '4a0af228fddb55ec96398a364248ed7f', +            'ext': 'mp4', +            'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰', +        },      }, {          # VIP-only video. 
The first 2 parts (6 minutes) are available without login          # MD5 sums omitted as values are different on Travis CI and my machine @@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor):              url, 'temp_id', note='download video page')          # There's no simple way to determine whether an URL is a playlist or not -        # So detect it -        playlist_result = self._extract_playlist(webpage) -        if playlist_result: -            return playlist_result - +        # Sometimes there are playlist links in individual videos, so treat it +        # as a single video first          tvid = self._search_regex( -            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') +            r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None) +        if tvid is None: +            playlist_result = self._extract_playlist(webpage) +            if playlist_result: +                return playlist_result +            raise ExtractorError('Can\'t find any video') +          video_id = self._search_regex( -            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') +            r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')          formats = []          for _ in range(5): @@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor):          self._sort_formats(formats)          title = (get_element_by_id('widget-videotitle', webpage) or -                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) +                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or +                 self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title'))          return {              'id': video_id, diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 021c6b278..f3156804d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -116,13 +116,25 
@@ class ITVIE(InfoExtractor):              if not play_path:                  continue              tbr = int_or_none(media_file.get('bitrate'), 1000) -            formats.append({ +            f = {                  'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), -                'url': rtmp_url,                  'play_path': play_path, +                # Providing this swfVfy allows to avoid truncated downloads +                'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', +                'page_url': url,                  'tbr': tbr,                  'ext': 'flv', -            }) +            } +            app = self._search_regex( +                'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) +            if app: +                f.update({ +                    'url': rtmp_url.split('?', 1)[0], +                    'app': app, +                }) +            else: +                f['url'] = rtmp_url +            formats.append(f)          ios_playlist_url = params.get('data-video-playlist')          hmac = params.get('data-video-hmac') @@ -172,7 +184,9 @@ class ITVIE(InfoExtractor):                          href = ios_base_url + href                      ext = determine_ext(href)                      if ext == 'm3u8': -                        formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                        formats.extend(self._extract_m3u8_formats( +                            href, video_id, 'mp4', entry_protocol='m3u8_native', +                            m3u8_id='hls', fatal=False))                      else:                          formats.append({                              'url': href, @@ -189,7 +203,8 @@ class ITVIE(InfoExtractor):                  'ext': 'ttml' if ext == 'xml' else ext,              }) -        return { +        info = self._search_json_ld(webpage, video_id, default={}) +        info.update({              'id': video_id,              'title': 
title,              'formats': formats, @@ -198,4 +213,5 @@ class ITVIE(InfoExtractor):              'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),              'series': xpath_text(playlist, 'ProgrammeTitle'),              'duartion': parse_duration(xpath_text(playlist, 'Duration')), -        } +        }) +        return info diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 9eda956d2..0a07c1320 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,7 +23,6 @@ from ..utils import (      str_or_none,      url_basename,      urshift, -    update_url_query,  ) @@ -51,7 +50,7 @@ class LeIE(InfoExtractor):              'id': '1415246',              'ext': 'mp4',              'title': '美人天下01', -            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', +            'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',          },          'params': {              'hls_prefer_native': True, @@ -69,7 +68,6 @@ class LeIE(InfoExtractor):          'params': {              'hls_prefer_native': True,          }, -        'skip': 'Only available in China',      }, {          'url': 'http://sports.le.com/video/25737697.html',          'only_matching': True, @@ -81,7 +79,7 @@ class LeIE(InfoExtractor):          'only_matching': True,      }] -    # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf +    # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf      def ror(self, param1, param2):          _loc3_ = 0          while _loc3_ < param2: @@ -90,15 +88,8 @@ class LeIE(InfoExtractor):          return param1      def calc_time_key(self, param1): -        _loc2_ = 773625421 -        _loc3_ = self.ror(param1, _loc2_ % 13) -        _loc3_ = _loc3_ ^ _loc2_ -        _loc3_ = self.ror(_loc3_, _loc2_ % 17) -        return _loc3_ - -    # reversed from http://jstatic.letvcdn.com/sdk/player.js -    def get_mms_key(self, time): -   
     return self.ror(time, 8) ^ 185025305 +        _loc2_ = 185025305 +        return self.ror(param1, _loc2_ % 17) ^ _loc2_      # see M3U8Encryption class in KLetvPlayer.swf      @staticmethod @@ -122,7 +113,7 @@ class LeIE(InfoExtractor):      def _check_errors(self, play_json):          # Check for errors -        playstatus = play_json['playstatus'] +        playstatus = play_json['msgs']['playstatus']          if playstatus['status'] == 0:              flag = playstatus['flag']              if flag == 1: @@ -134,58 +125,31 @@ class LeIE(InfoExtractor):          media_id = self._match_id(url)          page = self._download_webpage(url, media_id) -        play_json_h5 = self._download_json( -            'http://api.le.com/mms/out/video/playJsonH5', -            media_id, 'Downloading html5 playJson data', query={ -                'id': media_id, -                'platid': 3, -                'splatid': 304, -                'format': 1, -                'tkey': self.get_mms_key(int(time.time())), -                'domain': 'www.le.com', -                'tss': 'no', -            }, -            headers=self.geo_verification_headers()) -        self._check_errors(play_json_h5) -          play_json_flash = self._download_json( -            'http://api.le.com/mms/out/video/playJson', +            'http://player-pc.le.com/mms/out/video/playJson',              media_id, 'Downloading flash playJson data', query={                  'id': media_id,                  'platid': 1,                  'splatid': 101,                  'format': 1, +                'source': 1000,                  'tkey': self.calc_time_key(int(time.time())),                  'domain': 'www.le.com', +                'region': 'cn',              },              headers=self.geo_verification_headers())          self._check_errors(play_json_flash) -        def get_h5_urls(media_url, format_id): -            location = self._download_json( -                media_url, media_id, -                
'Download JSON metadata for format %s' % format_id, query={ -                    'format': 1, -                    'expect': 3, -                    'tss': 'no', -                })['location'] - -            return { -                'http': update_url_query(location, {'tss': 'no'}), -                'hls': update_url_query(location, {'tss': 'ios'}), -            } -          def get_flash_urls(media_url, format_id): -            media_url += '&' + compat_urllib_parse_urlencode({ -                'm3v': 1, -                'format': 1, -                'expect': 3, -                'rateid': format_id, -            }) -              nodes_data = self._download_json(                  media_url, media_id, -                'Download JSON metadata for format %s' % format_id) +                'Download JSON metadata for format %s' % format_id, +                query={ +                    'm3v': 1, +                    'format': 1, +                    'expect': 3, +                    'tss': 'ios', +                })              req = self._request_webpage(                  nodes_data['nodelist'][0]['location'], media_id, @@ -199,29 +163,28 @@ class LeIE(InfoExtractor):          extracted_formats = []          formats = [] -        for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): -            playurl = play_json['playurl'] -            play_domain = playurl['domain'][0] - -            for format_id, format_data in playurl.get('dispatch', []).items(): -                if format_id in extracted_formats: -                    continue -                extracted_formats.append(format_id) - -                media_url = play_domain + format_data[0] -                for protocol, format_url in get_urls(media_url, format_id).items(): -                    f = { -                        'url': format_url, -                        'ext': determine_ext(format_data[1]), -                        'format_id': '%s-%s' % (protocol, 
format_id), -                        'protocol': 'm3u8_native' if protocol == 'hls' else 'http', -                        'quality': int_or_none(format_id), -                    } - -                    if format_id[-1:] == 'p': -                        f['height'] = int_or_none(format_id[:-1]) - -                    formats.append(f) +        playurl = play_json_flash['msgs']['playurl'] +        play_domain = playurl['domain'][0] + +        for format_id, format_data in playurl.get('dispatch', []).items(): +            if format_id in extracted_formats: +                continue +            extracted_formats.append(format_id) + +            media_url = play_domain + format_data[0] +            for protocol, format_url in get_flash_urls(media_url, format_id).items(): +                f = { +                    'url': format_url, +                    'ext': determine_ext(format_data[1]), +                    'format_id': '%s-%s' % (protocol, format_id), +                    'protocol': 'm3u8_native' if protocol == 'hls' else 'http', +                    'quality': int_or_none(format_id), +                } + +                if format_id[-1:] == 'p': +                    f['height'] = int_or_none(format_id[:-1]) + +                formats.append(f)          self._sort_formats(formats, ('height', 'quality', 'format_id'))          publish_time = parse_iso8601(self._html_search_regex( diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index d3bca6435..b312e77f1 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor):          formats = self._extract_akamai_formats(              '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)          m3u8_formats = list(filter( -            lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', +            lambda f: f.get('protocol') == 
'm3u8_native' and f.get('vcodec') != 'none',              formats))          if len(m3u8_formats) == len(self._BITRATES):              self._sort_formats(m3u8_formats) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index f52c2e169..0a5a3956c 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -9,6 +9,7 @@ from ..utils import (      determine_ext,      float_or_none,      int_or_none, +    smuggle_url,      unsmuggle_url,      ExtractorError,  ) @@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor):      _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'      _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' +    @classmethod +    def _extract_urls(cls, webpage, source_url): +        lm = { +            'Media': 'media', +            'Channel': 'channel', +            'ChannelList': 'channel_list', +        } +        entries = [] +        for kind, video_id in re.findall( +                r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', +                webpage): +            entries.append(cls.url_result( +                smuggle_url( +                    'limelight:%s:%s' % (lm[kind], video_id), +                    {'source_url': source_url}), +                'Limelight%s' % kind, video_id)) +        for mobj in re.finditer( +                # As per [1] class attribute should be exactly equal to +                # LimelightEmbeddedPlayerFlash but numerous examples seen +                # that don't exactly match it (e.g. [2]). +                # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage +                # 2. 
http://www.sedona.com/FacilitatorTraining2017 +                r'''(?sx) +                    <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? +                        <param[^>]+ +                            name=(["\'])flashVars\2[^>]+ +                            value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32}) +                ''', webpage): +            kind, video_id = mobj.group('kind'), mobj.group('id') +            entries.append(cls.url_result( +                smuggle_url( +                    'limelight:%s:%s' % (kind, video_id), +                    {'source_url': source_url}), +                'Limelight%s' % kind.capitalize(), video_id)) +        return entries +      def _call_playlist_service(self, item_id, method, fatal=True, referer=None):          headers = {}          if referer: diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py new file mode 100644 index 000000000..f7fa098a5 --- /dev/null +++ b/youtube_dl/extractor/noovo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    smuggle_url, +    try_get, +) + + +class NoovoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)' +    _TESTS = [{ +        # clip +        'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial', +        'info_dict': { +            'id': '5386045029001', +            'ext': 'mp4', +            'title': 'Chrysler Imperial', +            'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056', +            'timestamp': 1491399228, +            'upload_date': '20170405', +            'uploader_id': '618566855001', +            'creator': 'vtele', +            'view_count': int, +            'series': 'RPM+', +        }, +        'params': { +        
    'skip_download': True, +        }, +    }, { +        # episode +        'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8', +        'info_dict': { +            'id': '5395865725001', +            'title': 'Épisode 13 : Les retrouvailles', +            'description': 'md5:336d5ebc5436534e61d16e63ddfca327', +            'ext': 'mp4', +            'timestamp': 1492019320, +            'upload_date': '20170412', +            'uploader_id': '618566855001', +            'creator': 'vtele', +            'view_count': int, +            'series': "L'amour est dans le pré", +            'season_number': 5, +            'episode': 'Épisode 13', +            'episode_number': 13, +        }, +        'params': { +            'skip_download': True, +        }, +    }] +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s' + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        data = self._download_json( +            'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, +            video_id)['data'] + +        content = try_get(data, lambda x: x['contents'][0]) + +        brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + +        series = try_get( +            data, ( +                lambda x: x['show']['title'], +                lambda x: x['season']['show']['title']), +            compat_str) + +        episode = None +        og = data.get('og') +        if isinstance(og, dict) and og.get('type') == 'video.episode': +            episode = og.get('title') + +        video = content or data + +        return { +            '_type': 'url_transparent', +            'ie_key': BrightcoveNewIE.ie_key(), +            'url': smuggle_url( +                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, +                {'geo_countries': ['CA']}), +            'id': brightcove_id, +            'title': video.get('title'), +            
'creator': video.get('source'), +            'view_count': int_or_none(video.get('viewsCount')), +            'series': series, +            'season_number': int_or_none(try_get( +                data, lambda x: x['season']['seasonNumber'])), +            'episode': episode, +            'episode_number': int_or_none(data.get('episodeNumber')), +        } diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b6c5ee6e4..f26dafb8f 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor):                          bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)                          if bc_url:                              return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) -                        bc_url = BrightcoveNewIE._extract_url(player_code) +                        bc_url = BrightcoveNewIE._extract_url(self, player_code)                          if bc_url:                              return self.url_result(bc_url, BrightcoveNewIE.ie_key())                          raise ExtractorError('Could not find player definition') diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 0ee56a45b..854b6800c 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..compat import ( +    compat_etree_fromstring,      compat_parse_qs,      compat_urllib_parse_unquote,      compat_urllib_parse_urlparse, @@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor):      }, {          # metadataUrl          'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', -        'md5': '9676cf86eff5391d35dea675d224e131', +        'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',          'info_dict': {              'id': '63567059965189-0',              'ext': 'mp4', @@ -53,7 +54,7 @@ class 
OdnoklassnikiIE(InfoExtractor):      }, {          # YouTube embed (metadataUrl, provider == USER_YOUTUBE)          'url': 'http://ok.ru/video/64211978996595-1', -        'md5': '5d7475d428845cd2e13bae6f1a992278', +        'md5': '2f206894ffb5dbfcce2c5a14b909eea5',          'info_dict': {              'id': '64211978996595-1',              'ext': 'mp4', @@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor):              'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',              'duration': 440,              'upload_date': '20150826', -            'uploader_id': '750099571', -            'uploader': 'Алина П', +            'uploader_id': 'tvroscosmos', +            'uploader': 'Телестудия Роскосмоса',              'age_limit': 0,          },      }, { @@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor):          'params': {              'skip_download': True,          }, +        'skip': 'Video has not been found',      }, {          'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',          'only_matching': True, @@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor):              })              return info -        quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd', 'full')) +        quality = qualities(('4', '0', '1', '2', '3', '5'))          formats = [{              'url': f['url'],              'ext': 'mp4',              'format_id': f['name'], -            'quality': quality(f['name']),          } for f in metadata['videos']] + +        m3u8_url = metadata.get('hlsManifestUrl') +        if m3u8_url: +            formats.extend(self._extract_m3u8_formats( +                m3u8_url, video_id, 'mp4', 'm3u8_native', +                m3u8_id='hls', fatal=False)) + +        dash_manifest = metadata.get('metadataEmbedded') +        if dash_manifest: +            formats.extend(self._parse_mpd_formats( +                compat_etree_fromstring(dash_manifest), 'mpd')) + +        for fmt in formats: +            fmt_type = 
self._search_regex( +                r'\btype[/=](\d)', fmt['url'], +                'format type', default=None) +            if fmt_type: +                fmt['quality'] = quality(fmt_type) +          self._sort_formats(formats)          info['formats'] = formats diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3e51b4dd7..0727e381b 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ from ..utils import (      ExtractorError,      determine_ext,      int_or_none, +    float_or_none,      js_to_json,      strip_jsonp,      strip_or_none, @@ -464,6 +465,7 @@ class PBSIE(InfoExtractor):                      redirects.append(redirect)                      redirect_urls.add(redirect_url) +        chapters = []          # Player pages may also serve different qualities          for page in ('widget/partnerplayer', 'portalplayer'):              player = self._download_webpage( @@ -479,6 +481,20 @@ class PBSIE(InfoExtractor):                      extract_redirect_urls(video_info)                      if not info:                          info = video_info +                if not chapters: +                    for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): +                        chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) +                        if not chapter: +                            continue +                        start_time = float_or_none(chapter.get('start_time'), 1000) +                        duration = float_or_none(chapter.get('duration'), 1000) +                        if start_time is None or duration is None: +                            continue +                        chapters.append({ +                            'start_time': start_time, +                            'end_time': start_time + duration, +                            'title': chapter.get('title'), +                        })          formats = []          
http_url = None @@ -515,7 +531,7 @@ class PBSIE(InfoExtractor):                      http_url = format_url          self._remove_duplicate_formats(formats)          m3u8_formats = list(filter( -            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', +            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',              formats))          if http_url:              for m3u8_format in m3u8_formats: @@ -588,4 +604,5 @@ class PBSIE(InfoExtractor):              'upload_date': upload_date,              'formats': formats,              'subtitles': subtitles, +            'chapters': chapters,          } diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 073fc3e21..24c3600fe 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,10 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -from ..compat import ( -    compat_urllib_parse_unquote, -    compat_urllib_parse_urlencode, -)  from .common import InfoExtractor  from ..utils import (      parse_duration, @@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor):      _TEST = {          'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', -        'md5': '6df8f6d028bc8b14f5dbd73af742fb20', +        'md5': '7fcdb5349354f40d41689bd0fa8db05a',          'info_dict': {              'id': '7e42283b4f5ab36da134',              'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', @@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor):              r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')          title = title.replace('\n', '') -        # get real url -        file_id = self._search_regex( -            r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') -        sec_code = self._search_regex( -            r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') -        max_vid = self._search_regex( -            
r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') -        url_params = compat_urllib_parse_urlencode({ -            'VID': file_id, -            'mp4': '1', -            'seccode': sec_code, -            'max_vid': max_vid, -        }) -        info_cn = self._download_webpage( -            'http://91porn.com/getfile.php?' + url_params, video_id, -            'Downloading real video url') -        video_url = compat_urllib_parse_unquote(self._search_regex( -            r'file=([^&]+)&', info_cn, 'url')) +        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]          duration = parse_duration(self._search_regex(              r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) @@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor):          comment_count = int_or_none(self._search_regex(              r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) -        return { +        info_dict.update({              'id': video_id,              'title': title, -            'url': video_url,              'duration': duration,              'comment_count': comment_count,              'age_limit': self._rta_search(webpage), -        } +        }) + +        return info_dict diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index ed38c77eb..e2202d603 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -62,8 +62,7 @@ class R7IE(InfoExtractor):              # m3u8 format always matches the http format, let's copy metadata from              # one to another              m3u8_formats = list(filter( -                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', -                formats)) +                lambda f: f.get('vcodec') != 'none', formats))              if len(m3u8_formats) == 1:                  f_copy = m3u8_formats[0].copy()                  f_copy.update(f) diff --git a/youtube_dl/extractor/streamable.py 
b/youtube_dl/extractor/streamable.py index 9f5c237ef..34725274e 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -12,7 +12,7 @@ from ..utils import (  class StreamableIE(InfoExtractor): -    _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)' +    _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'      _TESTS = [          {              'url': 'https://streamable.com/dnd1', @@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor):          {              'url': 'https://streamable.com/e/dnd1',              'only_matching': True, +        }, +        { +            'url': 'https://streamable.com/s/okkqk/drxjds', +            'only_matching': True,          }      ] diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py new file mode 100644 index 000000000..aa4fad162 --- /dev/null +++ b/youtube_dl/extractor/streamango.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    int_or_none, +    js_to_json, +) + + +class StreamangoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', +        'md5': 'e992787515a182f55e38fc97588d802a', +        'info_dict': { +            'id': 'clapasobsptpkdfe', +            'ext': 'mp4', +            'title': '20170315_150006.mp4', +        } +    }, { +        'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        title = self._og_search_title(webpage) + +        formats = [] +        for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): +            video = 
self._parse_json( +                format_, video_id, transform_source=js_to_json, fatal=False) +            if not video: +                continue +            src = video.get('src') +            if not src: +                continue +            ext = determine_ext(src, default_ext=None) +            if video.get('type') == 'application/dash+xml' or ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    src, video_id, mpd_id='dash', fatal=False)) +            else: +                formats.append({ +                    'url': src, +                    'ext': ext or 'mp4', +                    'width': int_or_none(video.get('width')), +                    'height': int_or_none(video.get('height')), +                    'tbr': int_or_none(video.get('bitrate')), +                }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'url': url, +            'title': title, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1b1afab32..3f3c681ae 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -210,7 +210,7 @@ class TEDIE(InfoExtractor):                      resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))          m3u8_formats = list(filter( -            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', +            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',              formats))          if http_url:              for m3u8_format in m3u8_formats: diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 06ea2b40a..c5b3288ad 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor):                  'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)              
self._sort_formats(m3u8_formats)              m3u8_formats = list(filter( -                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', -                m3u8_formats)) +                lambda f: f.get('vcodec') != 'none', m3u8_formats))              formats.extend(m3u8_formats)              for i, m3u8_format in enumerate(m3u8_formats, 2):                  http_url = '%s-%d.mp4' % (video_url_base, i) diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index b6537141a..ebde6053f 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -2,9 +2,13 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( +    compat_HTTPError, +    compat_str, +)  from ..utils import (      extract_attributes, +    try_get,      urlencode_postdata,      ExtractorError,  ) @@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):              webpage, 'channel element'))          title = current_channel['data-name'] -        resource_id = self._search_regex( -            r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id') -        platform = self._search_regex( -            r'platform\s*=\s*"([^"]+)"', webpage, 'platform') +        resource_id = current_channel['data-id'] +          token = self._search_regex( -            r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null') -        validate = self._search_regex( -            r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null') +            r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage, +            'token', group='token') + +        context = self._download_json( +            'https://tvplayer.com/watch/context', display_id, +            'Downloading JSON context', query={ +                'resource': resource_id, +                'nonce': token, +            }) + +        validate = context['validate'] +        platform = try_get( +         
   context, lambda x: x['platform']['key'], compat_str) or 'firefox'          try:              response = self._download_json(                  'http://api.tvplayer.com/api/v2/stream/live', -                resource_id, headers={ +                display_id, 'Downloading JSON stream', headers={                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',                  }, data=urlencode_postdata({ +                    'id': resource_id,                      'service': 1,                      'platform': platform, -                    'id': resource_id, -                    'token': token,                      'validate': validate,                  }))['tvplayer']['response']          except ExtractorError as e: @@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):                      '%s said: %s' % (self.IE_NAME, response['error']), expected=True)              raise -        formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4') +        formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 9aa38bc5a..890a149ea 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,6 +1,7 @@  from __future__ import unicode_literals  import re +import json  from .common import InfoExtractor  from ..compat import ( @@ -11,7 +12,6 @@ from ..compat import (  from ..utils import (      ExtractorError,      int_or_none, -    sanitized_Request,      parse_iso8601,  ) @@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):      }      def _initialize_api(self, video_id): -        req = sanitized_Request( -            'http://www.vevo.com/auth', data=b'')          webpage = self._download_webpage( -            req, None, +            'https://accounts.vevo.com/token', None,              note='Retrieving oauth token', -            errnote='Unable to retrieve oauth 
token') +            errnote='Unable to retrieve oauth token', +            data=json.dumps({ +                'client_id': 'SPupX1tvqFEopQ1YS6SS', +                'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', +            }).encode('utf-8'), +            headers={ +                'Content-Type': 'application/json', +            })          if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):              self.raise_geo_restricted(                  '%s said: This page is currently unavailable in your region' % self.IE_NAME)          auth_info = self._parse_json(webpage, video_id) -        self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] +        self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']      def _call_api(self, path, *args, **kwargs):          try: diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25a5..e5f964d39 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import random  import re  from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import (      float_or_none,      parse_age_limit,      qualities, +    random_birthday,      try_get,      unified_timestamp,      urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) +        query = random_birthday('birth_year', 'birth_month', 'birth_day')          video = self._download_json(              'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, -            video_id, query={ -                'birth_month': random.randint(1, 12), -                'birth_day': random.randint(1, 31), -                'birth_year': random.randint(1950, 1995), -            }) +            video_id, query=query)  
        title = video['title'] diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 4e4b4e38c..701bb1d01 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -49,8 +49,11 @@ class VidioIE(InfoExtractor):              thumbnail = clip.get('image')          m3u8_url = m3u8_url or self._search_regex( -            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url') -        formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') +            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1', +            webpage, 'hls url') +        formats = self._extract_m3u8_formats( +            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') +        self._sort_formats(formats)          duration = int_or_none(duration or self._search_regex(              r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d0556297e..e64873bce 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -42,14 +42,15 @@ class VidziIE(InfoExtractor):          title = self._html_search_regex(              r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') -        packed_codes = [mobj.group(0) for mobj in re.finditer( -            PACKED_CODES_RE, webpage)] -        for num, pc in enumerate(packed_codes, 1): -            code = decode_packed_codes(pc).replace('\\\'', '\'') +        codes = [webpage] +        codes.extend([ +            decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') +            for mobj in re.finditer(PACKED_CODES_RE, webpage)]) +        for num, code in enumerate(codes, 1):              jwplayer_data = self._parse_json(                  self._search_regex(                      r'setup\(([^)]+)\)', code, 'jwplayer data', -                    default=NO_DEFAULT if num == len(packed_codes) else '{}'), +           
         default=NO_DEFAULT if num == len(codes) else '{}'),                  video_id, transform_source=js_to_json)              if jwplayer_data:                  break diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fcf0cb100..d5d5b4c69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor):                      if m3u8_formats:                          self._sort_formats(m3u8_formats)                          m3u8_formats = list(filter( -                            lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', -                            m3u8_formats)) +                            lambda f: f.get('vcodec') != 'none', m3u8_formats))                      if len(qualities) == len(m3u8_formats):                          for q, m3u8_format in zip(qualities, m3u8_formats):                              f = m3u8_format.copy() diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 839cad986..625d0a1cc 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -13,6 +13,7 @@ from ..utils import (  class WashingtonPostIE(InfoExtractor):      IE_NAME = 'washingtonpost'      _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'      _TEST = {          'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',          'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):          },      } +    @classmethod +    def _extract_urls(cls, webpage): +        return re.findall( +            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, 
webpage) +      def _real_extract(self, url):          video_id = self._match_id(url)          video_data = self._download_json( diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index deb7483ae..45cfca7c5 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -10,12 +10,14 @@ from ..utils import (  class WSJIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?:// -        (?: -            video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| -            (?:www\.)?wsj\.com/video/[^/]+/ -        ) -        (?P<id>[a-zA-Z0-9-]+)''' +    _VALID_URL = r'''(?x) +                        (?: +                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| +                            https?://(?:www\.)?wsj\.com/video/[^/]+/| +                            wsj: +                        ) +                        (?P<id>[a-fA-F0-9-]{36}) +                    '''      IE_DESC = 'Wall Street Journal'      _TESTS = [{          'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', @@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        api_url = ( -            'http://video-api.wsj.com/api-video/find_all_videos.asp?' 
-            'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' -            'thumbnailList,author,description,name,duration,videoURL,' -            'titletag,formattedCreationDate,keywords,editor' % video_id) -        info = self._download_json(api_url, video_id)['items'][0] +        info = self._download_json( +            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id, +            query={ +                'type': 'guid', +                'count': 1, +                'query': video_id, +                'fields': ','.join(( +                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author', +                    'description', 'name', 'duration', 'videoURL', 'titletag', +                    'formattedCreationDate', 'keywords', 'editor')), +            })['items'][0]          title = info.get('name', info.get('titletag'))          formats = [] @@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):              'title': title,              'categories': info.get('keywords'),          } + + +class WSJArticleIE(InfoExtractor): +    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' +    _TEST = { +        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', +        'info_dict': { +            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', +            'ext': 'mp4', +            'upload_date': '20170221', +            'uploader_id': 'ralcaraz', +            'title': 'Bao Bao the Panda Leaves for China', +        } +    } + +    def _real_extract(self, url): +        article_id = self._match_id(url) +        webpage = self._download_webpage(url, article_id) +        video_id = self._search_regex( +            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') +        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 1b5cd122a..13f8be6cb 100644 --- 
a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -17,24 +17,24 @@ from ..utils import (  class XFileShareIE(InfoExtractor):      _SITES = ( -        ('daclips.in', 'DaClips'), -        ('filehoot.com', 'FileHoot'), -        ('gorillavid.in', 'GorillaVid'), -        ('movpod.in', 'MovPod'), -        ('powerwatch.pw', 'PowerWatch'), -        ('rapidvideo.ws', 'Rapidvideo.ws'), -        ('thevideobee.to', 'TheVideoBee'), -        ('vidto.me', 'Vidto'), -        ('streamin.to', 'Streamin.To'), -        ('xvidstage.com', 'XVIDSTAGE'), -        ('vidabc.com', 'Vid ABC'), -        ('vidbom.com', 'VidBom'), -        ('vidlo.us', 'vidlo'), +        (r'daclips\.(?:in|com)', 'DaClips'), +        (r'filehoot\.com', 'FileHoot'), +        (r'gorillavid\.(?:in|com)', 'GorillaVid'), +        (r'movpod\.in', 'MovPod'), +        (r'powerwatch\.pw', 'PowerWatch'), +        (r'rapidvideo\.ws', 'Rapidvideo.ws'), +        (r'thevideobee\.to', 'TheVideoBee'), +        (r'vidto\.me', 'Vidto'), +        (r'streamin\.to', 'Streamin.To'), +        (r'xvidstage\.com', 'XVIDSTAGE'), +        (r'vidabc\.com', 'Vid ABC'), +        (r'vidbom\.com', 'VidBom'), +        (r'vidlo\.us', 'vidlo'),      )      IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])      _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' -                  % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0])) +                  % '|'.join(site for site in list(zip(*_SITES))[0]))      _FILE_NOT_FOUND_REGEXES = (          r'>(?:404 - )?File Not Found<', diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674a0..bea9b87ad 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,6 +6,7 @@ import re  from .common import InfoExtractor  from ..utils import (      int_or_none, +    js_to_json,      orderedSet,      parse_duration,      sanitized_Request, @@ -38,6 +39,22 @@ 
class XTubeIE(InfoExtractor):              'age_limit': 18,          }      }, { +        # FLV videos with duplicated formats +        'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', +        'md5': 'a406963eb349dd43692ec54631efd88b', +        'info_dict': { +            'id': '9299752', +            'display_id': 'A-Super-Run-Part-1-YT', +            'ext': 'flv', +            'title': 'A Super Run - Part 1 (YT)', +            'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', +            'uploader': 'tshirtguy59', +            'duration': 579, +            'view_count': int, +            'comment_count': int, +            'age_limit': 18, +        }, +    }, {          # new URL schema          'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',          'only_matching': True, @@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):              })          sources = self._parse_json(self._search_regex( -            r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', -            webpage, 'sources', group='sources'), video_id) +            r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', +            webpage, 'sources', group='sources'), video_id, +            transform_source=js_to_json)          formats = []          for format_id, format_url in sources.items(): @@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):                  'format_id': format_id,                  'height': int_or_none(format_id),              }) +        self._remove_duplicate_formats(formats)          self._sort_formats(formats)          title = self._search_regex( diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 30825daae..eca603028 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -6,8 +6,10 @@ from .common import InfoExtractor  from ..compat import compat_urllib_parse_unquote  from ..utils import (      clean_html, -    ExtractorError,      determine_ext, +    ExtractorError, +    
int_or_none, +    parse_duration,  ) @@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):              'id': '4588838',              'ext': 'mp4',              'title': 'Biker Takes his Girl', +            'duration': 108,              'age_limit': 18,          }      } @@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):              r'<title>(.*?)\s+-\s+XVID', webpage, 'title')          video_thumbnail = self._search_regex(              r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) +        video_duration = int_or_none(self._og_search_property( +            'duration', webpage, default=None)) or parse_duration( +            self._search_regex( +                r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', +                webpage, 'duration', fatal=False))          formats = [] @@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):              'id': video_id,              'formats': formats,              'title': video_title, +            'duration': video_duration,              'thumbnail': video_thumbnail,              'age_limit': 18,          } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4951414e9..38f82bf44 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -258,7 +258,7 @@ class YahooIE(InfoExtractor):              return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())          # Look for Brightcove New Studio embeds -        bc_url = BrightcoveNewIE._extract_url(webpage) +        bc_url = BrightcoveNewIE._extract_url(self, webpage)          if bc_url:              return self.url_result(bc_url, BrightcoveNewIE.ie_key()) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index fd6268ba4..eb1062142 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):                  'overembed': 'false',              })['playlist'] -        
tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) +        tracks = playlist['tracks'] +        track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]          # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,          # missing tracks should be retrieved manually. diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2b9115c..480f403da 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -963,7 +963,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_signature_function(self, video_id, player_url, example_sig):          id_m = re.match( -            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', +            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',              player_url)          if not id_m:              raise ExtractorError('Cannot identify player %r' % player_url) @@ -1629,7 +1629,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  player_desc = 'flash player %s' % player_version                              else:                                  player_version = self._search_regex( -                                    [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], +                                    [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', +                                     r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],                                      player_url,                                      'html5 player', fatal=False)                                  player_desc = 'html5 player %s' % player_version diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py new file mode 100644 index 000000000..889aff5d8 --- /dev/null +++ b/youtube_dl/extractor/zaq1.py @@ -0,0 
+1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'http://zaq1.pl/video/xev0e', +        'md5': '24a5eb3f052e604ae597c4d0d19b351e', +        'info_dict': { +            'id': 'xev0e', +            'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', +            'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', +            'ext': 'mp4', +            'duration': 511, +            'timestamp': 1490896361, +            'uploader': 'Anonim', +            'upload_date': '20170330', +            'view_count': int, +        } +    }, { +        # malformed JSON-LD +        'url': 'http://zaq1.pl/video/x81vn', +        'info_dict': { +            'id': 'x81vn', +            'title': 'SEKRETNE ŻYCIE WALTERA MITTY', +            'ext': 'mp4', +            'duration': 6234, +            'timestamp': 1493494860, +            'uploader': 'Anonim', +            'upload_date': '20170429', +            'view_count': int, +        }, +        'params': { +            'skip_download': True, +        }, +        'expected_warnings': ['Failed to parse JSON'], +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        video_url = self._search_regex( +            r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, +            'video url', group='url') + +        info = self._search_json_ld(webpage, video_id, fatal=False) + +        def extract_data(field, name, fatal=False): +            return self._search_regex( +                r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, +                webpage, field, fatal=fatal, group='field') + +      
  if not info.get('title'): +            info['title'] = extract_data('file-name', 'title', fatal=True) + +        if not info.get('duration'): +            info['duration'] = int_or_none(extract_data('duration', 'duration')) + +        if not info.get('thumbnail'): +            info['thumbnail'] = extract_data('photo-url', 'thumbnail') + +        if not info.get('timestamp'): +            info['timestamp'] = unified_timestamp(self._html_search_meta( +                'uploadDate', webpage, 'timestamp')) + +        if not info.get('interactionCount'): +            info['view_count'] = int_or_none(self._html_search_meta( +                'interactionCount', webpage, 'view count')) + +        uploader = self._html_search_regex( +            r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', +            fatal=False) + +        width = int_or_none(self._html_search_meta( +            'width', webpage, fatal=False)) +        height = int_or_none(self._html_search_meta( +            'height', webpage, fatal=False)) + +        info.update({ +            'id': video_id, +            'formats': [{ +                'url': video_url, +                'width': width, +                'height': height, +                'http_headers': { +                    'Referer': url, +                }, +            }], +            'uploader': uploader, +        }) + +        return info diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 2d2f5e47b..52309fb84 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -469,6 +469,10 @@ def parseOpts(overrideArguments=None):          action='store_false', dest='skip_unavailable_fragments',          help='Abort downloading when some fragment is not available')      downloader.add_option( +        '--keep-fragments', +        action='store_true', dest='keep_fragments', default=False, +        help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default') +    
downloader.add_option(          '--buffer-size',          dest='buffersize', metavar='SIZE', default='1024',          help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 665109558..c91ec8588 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -4,6 +4,7 @@ import io  import os  import subprocess  import time +import re  from .common import AudioConversionError, PostProcessor @@ -22,6 +23,7 @@ from ..utils import (      subtitles_filename,      dfxp2srt,      ISO639Utils, +    replace_extension,  ) @@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor):          filename = info['filepath']          temp_filename = prepend_extension(filename, 'temp') +        in_filenames = [filename] +        options = []          if info['ext'] == 'm4a': -            options = ['-vn', '-acodec', 'copy'] +            options.extend(['-vn', '-acodec', 'copy'])          else: -            options = ['-c', 'copy'] +            options.extend(['-c', 'copy'])          for (name, value) in metadata.items():              options.extend(['-metadata', '%s=%s' % (name, value)]) +        chapters = info.get('chapters', []) +        if chapters: +            metadata_filename = encodeFilename(replace_extension(filename, 'meta')) +            with io.open(metadata_filename, 'wt', encoding='utf-8') as f: +                def ffmpeg_escape(text): +                    return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) + +                metadata_file_content = ';FFMETADATA1\n' +                for chapter in chapters: +                    metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' +                    metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) +                    metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) +                    chapter_title = chapter.get('title') +                
    if chapter_title: +                        metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) +                f.write(metadata_file_content) +                in_filenames.append(metadata_filename) +                options.extend(['-map_metadata', '1']) +          self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) -        self.run_ffmpeg(filename, temp_filename, options) +        self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) +        if chapters: +            os.remove(metadata_filename)          os.remove(encodeFilename(filename))          os.rename(encodeFilename(temp_filename), encodeFilename(filename))          return [], info diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index 0f5d7bdb2..5d4adbe72 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -193,9 +193,10 @@ class sockssocket(socket.socket):          self._check_response_version(SOCKS5_VERSION, version) -        if method == Socks5Auth.AUTH_NO_ACCEPTABLE: +        if method == Socks5Auth.AUTH_NO_ACCEPTABLE or ( +                method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)):              self.close() -            raise Socks5Error(method) +            raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE)          if method == Socks5Auth.AUTH_USER_PASS:              username = self._proxy.username.encode('utf-8') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 84aaac664..c67f95ac9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -11,6 +11,7 @@ import contextlib  import ctypes  import datetime  import email.utils +import email.header  import errno  import functools  import gzip @@ -421,8 +422,8 @@ def clean_html(html):      # Newline vs <br />      html = html.replace('\n', ' ') -    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) -    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) +    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', 
'\n', html) +    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)      # Strip html tags      html = re.sub('<.*?>', '', html)      # Replace html entities @@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True):      # Remove AM/PM + timezone      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) +    # Remove unrecognized timezones from ISO 8601 alike timestamps +    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) +    if m: +        date_str = date_str[:-len(m.group('tz'))] +      for expression in date_formats(day_first):          try:              dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):      return new_req +def try_multipart_encode(data, boundary): +    content_type = 'multipart/form-data; boundary=%s' % boundary + +    out = b'' +    for k, v in data.items(): +        out += b'--' + boundary.encode('ascii') + b'\r\n' +        if isinstance(k, compat_str): +            k = k.encode('utf-8') +        if isinstance(v, compat_str): +            v = v.encode('utf-8') +        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 +        # suggests sending UTF-8 directly. Firefox sends UTF-8, too +        content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n' +        if boundary.encode('ascii') in content: +            raise ValueError('Boundary overlaps with data') +        out += content + +    out += b'--' + boundary.encode('ascii') + b'--\r\n' + +    return out, content_type + + +def multipart_encode(data, boundary=None): +    ''' +    Encode a dict to RFC 7578-compliant form-data + +    data: +        A dict where keys and values can be either Unicode or bytes-like +        objects. +    boundary: +        If specified a Unicode object, it's used as the boundary. 
Otherwise +        a random boundary is generated. + +    Reference: https://tools.ietf.org/html/rfc7578 +    ''' +    has_specified_boundary = boundary is not None + +    while True: +        if boundary is None: +            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + +        try: +            out, content_type = try_multipart_encode(data, boundary) +            break +        except ValueError: +            if has_specified_boundary: +                raise +            boundary = None + +    return out, content_type + +  def dict_get(d, key_or_keys, default=None, skip_false_values=True):      if isinstance(key_or_keys, (list, tuple)):          for key in key_or_keys: @@ -2103,13 +2161,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):  def try_get(src, getter, expected_type=None): -    try: -        v = getter(src) -    except (AttributeError, KeyError, TypeError, IndexError): -        pass -    else: -        if expected_type is None or isinstance(v, expected_type): -            return v +    if not isinstance(getter, (list, tuple)): +        getter = [getter] +    for get in getter: +        try: +            v = get(src) +        except (AttributeError, KeyError, TypeError, IndexError): +            pass +        else: +            if expected_type is None or isinstance(v, expected_type): +                return v  def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): @@ -2270,10 +2331,8 @@ def mimetype2ext(mt):      return {          '3gpp': '3gp',          'smptett+xml': 'tt', -        'srt': 'srt',          'ttaf+xml': 'dfxp',          'ttml+xml': 'ttml', -        'vtt': 'vtt',          'x-flv': 'flv',          'x-mp4-fragmented': 'mp4',          'x-ms-wmv': 'wmv', @@ -2281,11 +2340,11 @@ def mimetype2ext(mt):          'x-mpegurl': 'm3u8',          'vnd.apple.mpegurl': 'm3u8',          'dash+xml': 'mpd', -        'f4m': 'f4m',          'f4m+xml': 'f4m',          'hds+xml': 
'f4m',          'vnd.ms-sstr+xml': 'ism',          'quicktime': 'mov', +        'mp2t': 'ts',      }.get(res, res) @@ -2508,27 +2567,97 @@ def srt_subtitles_timecode(seconds):  def dfxp2srt(dfxp_data): +    LEGACY_NAMESPACES = ( +        ('http://www.w3.org/ns/ttml', [ +            'http://www.w3.org/2004/11/ttaf1', +            'http://www.w3.org/2006/04/ttaf1', +            'http://www.w3.org/2006/10/ttaf1', +        ]), +        ('http://www.w3.org/ns/ttml#styling', [ +            'http://www.w3.org/ns/ttml#style', +        ]), +    ) + +    SUPPORTED_STYLING = [ +        'color', +        'fontFamily', +        'fontSize', +        'fontStyle', +        'fontWeight', +        'textDecoration' +    ] +      _x = functools.partial(xpath_with_ns, ns_map={          'ttml': 'http://www.w3.org/ns/ttml', -        'ttaf1': 'http://www.w3.org/2006/10/ttaf1', -        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', +        'tts': 'http://www.w3.org/ns/ttml#styling',      }) +    styles = {} +    default_style = {} +      class TTMLPElementParser(object): -        out = '' +        _out = '' +        _unclosed_elements = [] +        _applied_styles = []          def start(self, tag, attrib): -            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): -                self.out += '\n' +            if tag in (_x('ttml:br'), 'br'): +                self._out += '\n' +            else: +                unclosed_elements = [] +                style = {} +                element_style_id = attrib.get('style') +                if default_style: +                    style.update(default_style) +                if element_style_id: +                    style.update(styles.get(element_style_id, {})) +                for prop in SUPPORTED_STYLING: +                    prop_val = attrib.get(_x('tts:' + prop)) +                    if prop_val: +                        style[prop] = prop_val +                if style: +                    font = '' +                    for k, v in 
sorted(style.items()): +                        if self._applied_styles and self._applied_styles[-1].get(k) == v: +                            continue +                        if k == 'color': +                            font += ' color="%s"' % v +                        elif k == 'fontSize': +                            font += ' size="%s"' % v +                        elif k == 'fontFamily': +                            font += ' face="%s"' % v +                        elif k == 'fontWeight' and v == 'bold': +                            self._out += '<b>' +                            unclosed_elements.append('b') +                        elif k == 'fontStyle' and v == 'italic': +                            self._out += '<i>' +                            unclosed_elements.append('i') +                        elif k == 'textDecoration' and v == 'underline': +                            self._out += '<u>' +                            unclosed_elements.append('u') +                    if font: +                        self._out += '<font' + font + '>' +                        unclosed_elements.append('font') +                    applied_style = {} +                    if self._applied_styles: +                        applied_style.update(self._applied_styles[-1]) +                    applied_style.update(style) +                    self._applied_styles.append(applied_style) +                self._unclosed_elements.append(unclosed_elements)          def end(self, tag): -            pass +            if tag not in (_x('ttml:br'), 'br'): +                unclosed_elements = self._unclosed_elements.pop() +                for element in reversed(unclosed_elements): +                    self._out += '</%s>' % element +                if unclosed_elements and self._applied_styles: +                    self._applied_styles.pop()          def data(self, data): -            self.out += data +            self._out += data          def close(self): -            return 
self.out.strip() +            return self._out.strip()      def parse_node(node):          target = TTMLPElementParser() @@ -2536,13 +2665,45 @@ def dfxp2srt(dfxp_data):          parser.feed(xml.etree.ElementTree.tostring(node))          return parser.close() +    for k, v in LEGACY_NAMESPACES: +        for ns in v: +            dfxp_data = dfxp_data.replace(ns, k) +      dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))      out = [] -    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') +    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')      if not paras:          raise ValueError('Invalid dfxp/TTML subtitle') +    repeat = False +    while True: +        for style in dfxp.findall(_x('.//ttml:style')): +            style_id = style.get('id') +            parent_style_id = style.get('style') +            if parent_style_id: +                if parent_style_id not in styles: +                    repeat = True +                    continue +                styles[style_id] = styles[parent_style_id].copy() +            for prop in SUPPORTED_STYLING: +                prop_val = style.get(_x('tts:' + prop)) +                if prop_val: +                    styles.setdefault(style_id, {})[prop] = prop_val +        if repeat: +            repeat = False +        else: +            break + +    for p in ('body', 'div'): +        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) +        if ele is None: +            continue +        style = styles.get(ele.get('style')) +        if not style: +            continue +        default_style.update(style) +      for para, index in zip(paras, itertools.count(1)):          begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))          end_time = parse_dfxp_time_expr(para.attrib.get('end')) @@ -3862,3 +4023,10 @@ class PhantomJSwrapper(object):          return (html, encodeArgument(out)) + +def 
random_birthday(year_field, month_field, day_field): +    return { +        year_field: str(random.randint(1950, 1995)), +        month_field: str(random.randint(1, 12)), +        day_field: str(random.randint(1, 31)), +    } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 612b50f7b..c19ac49b0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2017.04.15' +__version__ = '2017.05.01' | 
