diff options
45 files changed, 1605 insertions, 540 deletions
| diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0decf19a1..5469c73cf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@  --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.18*. If it's not, read [this FAQ entry](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.04.07*. If it's not, read [this FAQ entry](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.04.07**  ### Before submitting an *issue* make sure you have:  - [ ] At least skimmed through the [README](https://github.com/ytdl-org/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/ytdl-org/youtube-dl#faq) and [BUGS](https://github.com/ytdl-org/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl  [debug] User config: []  [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']  [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.03.18 +[debug] youtube-dl version 2019.04.07  [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2  [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4  [debug] Proxy map: {} @@ -1,3 +1,49 @@ +version 2019.04.07 + +Core ++ [downloader/external] Pass rtmp_conn to ffmpeg + +Extractors ++ [ruutu] Add support for audio podcasts (#20473, #20545) ++ [xvideos] Extract all thumbnails (#20432) 
++ [platzi] Add support for platzi.com (#20562) +* [dvtv] Fix extraction (#18514, #19174) ++ [vrv] Add basic support for individual movie links (#19229) ++ [bfi:player] Add support for player.bfi.org.uk (#19235) +* [hbo] Fix extraction and extract subtitles (#14629, #13709) +* [youtube] Extract srv[1-3] subtitle formats (#20566) +* [adultswim] Fix extraction (#18025) +* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) +* [adn] Fix subtitle compatibility with ffmpeg +* [adn] Fix extraction and add support for positioning styles (#20549) +* [vk] Use unique video id (#17848) +* [newstube] Fix extraction +* [rtl2] Actualize extraction ++ [adobeconnect] Add support for adobeconnect.com (#20283) ++ [gaia] Add support for authentication (#14605) ++ [mediasite] Add support for dashed ids and named catalogs (#20531) + + +version 2019.04.01 + +Core +* [utils] Improve int_or_none and float_or_none (#20403) +* Check for valid --min-sleep-interval when --max-sleep-interval is specified +  (#20435) + +Extractors ++ [weibo] Extend URL regular expression (#20496) ++ [xhamster] Add support for xhamster.one (#20508) ++ [mediasite] Add support for catalogs (#20507) ++ [teamtreehouse] Add support for teamtreehouse.com (#9836) ++ [ina] Add support for audio URLs +* [ina] Improve extraction +* [cwtv] Fix episode number extraction (#20461) +* [npo] Improve DRM detection ++ [pornhub] Add support for DASH formats (#20403) +* [svtplay] Update API endpoint (#20430) + +  version 2019.03.18  Core @@ -642,6 +642,7 @@ The simplest case is requesting a specific format, for example with `-f 22` you  You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file.  
You can also use special names to select particular edge case formats: +   - `best`: Select the best quality format represented by a single file with video and audio.   - `worst`: Select the worst quality format represented by a single file with video and audio.   - `bestvideo`: Select the best quality video-only format (e.g. DASH video). May not be available. @@ -658,6 +659,7 @@ If you want to download several formats of the same video use a comma as a separ  You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`).  The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals): +   - `filesize`: The number of bytes, if known in advance   - `width`: Width of the video, if known   - `height`: Height of the video, if known @@ -668,6 +670,7 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `   - `fps`: Frame rate  Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields: +   - `ext`: File extension   - `acodec`: Name of the audio codec in use   - `vcodec`: Name of the video codec in use @@ -697,7 +700,7 @@ Note that on Windows you may need to use double quotes instead of single.  
# Download best mp4 format available or any other best if no mp4 available  $ youtube-dl -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' -# Download best format available but not better that 480p +# Download best format available but no better than 480p  $ youtube-dl -f 'bestvideo[height<=480]+bestaudio/best[height<=480]'  # Download best video only format but no bigger than 50 MB diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a3d4447a8..df272c479 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,6 +28,7 @@   - **acast:channel**   - **AddAnime**   - **ADN**: Anime Digital Network + - **AdobeConnect**   - **AdobeTV**   - **AdobeTVChannel**   - **AdobeTVShow** @@ -101,6 +102,7 @@   - **Bellator**   - **BellMedia**   - **Bet** + - **bfi:player**   - **Bigflix**   - **Bild**: Bild.de   - **BiliBili** @@ -345,7 +347,6 @@   - **Groupon**   - **Hark**   - **hbo** - - **hbo:episode**   - **HearThisAt**   - **Heise**   - **HellPorno** @@ -488,6 +489,8 @@   - **Medialaan**   - **Mediaset**   - **Mediasite** + - **MediasiteCatalog** + - **MediasiteNamedCatalog**   - **Medici**   - **megaphone.fm**: megaphone.fm embedded players   - **Meipai**: 美拍 @@ -670,6 +673,8 @@   - **Piksel**   - **Pinkbike**   - **Pladform** + - **Platzi** + - **PlatziCourse**   - **play.fm**   - **PlayPlusTV**   - **PlaysTV** @@ -869,6 +874,7 @@   - **teachertube:user:collection**: teachertube.com user and collection videos   - **TeachingChannel**   - **Teamcoco** + - **TeamTreeHouse**   - **TechTalks**   - **techtv.mit.edu**   - **ted** diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3b92acd97..57f52f888 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -309,6 +309,8 @@ class YoutubeDL(object):      The following options are used by the post processors:      prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,                         otherwise prefer ffmpeg. 
+    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path +                       to the binary or its containing directory.      postprocessor_args: A list of additional command-line arguments for the                          postprocessor. diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 5f73f7f0f..acdb27712 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -289,6 +289,7 @@ class FFmpegFD(ExternalFD):              tc_url = info_dict.get('tc_url')              flash_version = info_dict.get('flash_version')              live = info_dict.get('rtmp_live', False) +            conn = info_dict.get('rtmp_conn')              if player_url is not None:                  args += ['-rtmp_swfverify', player_url]              if page_url is not None: @@ -303,6 +304,11 @@ class FFmpegFD(ExternalFD):                  args += ['-rtmp_flashver', flash_version]              if live:                  args += ['-rtmp_live', 'live'] +            if isinstance(conn, list): +                for entry in conn: +                    args += ['-rtmp_conn', entry] +            elif isinstance(conn, compat_str): +                args += ['-rtmp_conn', conn]          args += ['-i', url, '-c', 'copy'] diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 1eb99c39a..923c351e4 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -21,7 +21,6 @@ from ..utils import (      intlist_to_bytes,      long_to_bytes,      pkcs1pad, -    srt_subtitles_timecode,      strip_or_none,      urljoin,  ) @@ -42,6 +41,18 @@ class ADNIE(InfoExtractor):      }      _BASE_URL = 'http://animedigitalnetwork.fr'      _RSA_KEY = 
(0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) +    _POS_ALIGN_MAP = { +        'start': 1, +        'end': 3, +    } +    _LINE_ALIGN_MAP = { +        'middle': 8, +        'end': 4, +    } + +    @staticmethod +    def _ass_subtitles_timecode(seconds): +        return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)      def _get_subtitles(self, sub_path, video_id):          if not sub_path: @@ -49,14 +60,19 @@ class ADNIE(InfoExtractor):          enc_subtitles = self._download_webpage(              urljoin(self._BASE_URL, sub_path), -            video_id, fatal=False) +            video_id, 'Downloading subtitles location', fatal=False) or '{}' +        subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') +        if subtitle_location: +            enc_subtitles = self._download_webpage( +                urljoin(self._BASE_URL, subtitle_location), +                video_id, 'Downloading subtitles data', fatal=False)          if not enc_subtitles:              return None          # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js          dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(              bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), -            bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), +            bytes_to_intlist(binascii.unhexlify(self._K + '4421de0a5f0814ba')),              bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))          ))          subtitles_json = self._parse_json( @@ -67,23 +83,27 @@ class ADNIE(InfoExtractor):          subtitles = {}          for sub_lang, sub in subtitles_json.items(): -            srt = '' -            for num, current in 
enumerate(sub): -                start, end, text = ( +            ssa = '''[Script Info] +ScriptType:V4.00 +[V4 Styles] +Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding +Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0 +[Events] +Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' +            for current in sub: +                start, end, text, line_align, position_align = (                      float_or_none(current.get('startTime')),                      float_or_none(current.get('endTime')), -                    current.get('text')) +                    current.get('text'), current.get('lineAlign'), +                    current.get('positionAlign'))                  if start is None or end is None or text is None:                      continue -                srt += os.linesep.join( -                    ( -                        '%d' % num, -                        '%s --> %s' % ( -                            srt_subtitles_timecode(start), -                            srt_subtitles_timecode(end)), -                        text, -                        os.linesep, -                    )) +                alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0) +                ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % ( +                    self._ass_subtitles_timecode(start), +                    self._ass_subtitles_timecode(end), +                    '{\\a%d}' % alignment if alignment != 2 else '', +                    text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))              if sub_lang == 'vostf':                  sub_lang = 'fr' @@ -91,8 +111,8 @@ class ADNIE(InfoExtractor):                  'ext': 'json',                  'data': json.dumps(sub),              }, { -     
           'ext': 'srt', -                'data': srt, +                'ext': 'ssa', +                'data': ssa,              }])          return subtitles @@ -100,7 +120,15 @@ class ADNIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          player_config = self._parse_json(self._search_regex( -            r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id) +            r'playerConfig\s*=\s*({.+});', webpage, +            'player config', default='{}'), video_id, fatal=False) +        if not player_config: +            config_url = urljoin(self._BASE_URL, self._search_regex( +                r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', +                webpage, 'config url')) +            player_config = self._download_json( +                config_url, video_id, +                'Downloading player config JSON metadata')['player']          video_info = {}          video_info_str = self._search_regex( @@ -129,12 +157,15 @@ class ADNIE(InfoExtractor):              encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))              authorization = base64.b64encode(encrypted_message).decode()              links_data = self._download_json( -                urljoin(self._BASE_URL, links_url), video_id, headers={ +                urljoin(self._BASE_URL, links_url), video_id, +                'Downloading links JSON metadata', headers={                      'Authorization': 'Bearer ' + authorization,                  })              links = links_data.get('links') or {}              metas = metas or links_data.get('meta') or {} -            sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token +            sub_path = sub_path or links_data.get('subtitles') or \ +                'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id +            sub_path += '&token=' + token             
 error = links_data.get('error')          title = metas.get('title') or video_info['title'] @@ -142,9 +173,11 @@ class ADNIE(InfoExtractor):          for format_id, qualities in links.items():              if not isinstance(qualities, dict):                  continue -            for load_balancer_url in qualities.values(): +            for quality, load_balancer_url in qualities.items():                  load_balancer_data = self._download_json( -                    load_balancer_url, video_id, fatal=False) or {} +                    load_balancer_url, video_id, +                    'Downloading %s %s JSON metadata' % (format_id, quality), +                    fatal=False) or {}                  m3u8_url = load_balancer_data.get('location')                  if not m3u8_url:                      continue diff --git a/youtube_dl/extractor/adobeconnect.py b/youtube_dl/extractor/adobeconnect.py new file mode 100644 index 000000000..728549eb9 --- /dev/null +++ b/youtube_dl/extractor/adobeconnect.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urlparse, +) + + +class AdobeConnectIE(InfoExtractor): +    _VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P<id>[\w-]+)' + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') +        qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) +        is_live = qs.get('isLive', ['false'])[0] == 'true' +        formats = [] +        for con_string in qs['conStrings'][0].split(','): +            formats.append({ +                'format_id': con_string.split('://')[0], +                'app': compat_urlparse.quote('?' 
+ con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]), +                'ext': 'flv', +                'play_path': 'mp4:' + qs['streamName'][0], +                'rtmp_conn': 'S:' + qs['ticket'][0], +                'rtmp_live': is_live, +                'url': con_string, +            }) + +        return { +            'id': video_id, +            'title': self._live_title(title) if is_live else title, +            'formats': formats, +            'is_live': is_live, +        } diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 88c96a950..8d1d9ac7d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -1,13 +1,19 @@  # coding: utf-8  from __future__ import unicode_literals +import json  import re  from .turner import TurnerBaseIE  from ..utils import ( +    determine_ext, +    float_or_none,      int_or_none, +    mimetype2ext, +    parse_age_limit, +    parse_iso8601,      strip_or_none, -    url_or_none, +    try_get,  ) @@ -21,8 +27,8 @@ class AdultSwimIE(TurnerBaseIE):              'ext': 'mp4',              'title': 'Rick and Morty - Pilot',              'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', -            'timestamp': 1493267400, -            'upload_date': '20170427', +            'timestamp': 1543294800, +            'upload_date': '20181127',          },          'params': {              # m3u8 download @@ -43,6 +49,7 @@ class AdultSwimIE(TurnerBaseIE):              # m3u8 download              'skip_download': True,          }, +        'skip': '404 Not Found',      }, {          'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',          'info_dict': { @@ -61,9 +68,9 @@ class AdultSwimIE(TurnerBaseIE):      }, {          'url': 'http://www.adultswim.com/videos/attack-on-titan',          'info_dict': { -            'id': 'b7A69dzfRzuaXIECdxW8XQ', +         
   'id': 'attack-on-titan',              'title': 'Attack on Titan', -            'description': 'md5:6c8e003ea0777b47013e894767f5e114', +            'description': 'md5:41caa9416906d90711e31dc00cb7db7e',          },          'playlist_mincount': 12,      }, { @@ -78,83 +85,118 @@ class AdultSwimIE(TurnerBaseIE):              # m3u8 download              'skip_download': True,          }, +        'skip': '404 Not Found',      }]      def _real_extract(self, url):          show_path, episode_path = re.match(self._VALID_URL, url).groups()          display_id = episode_path or show_path -        webpage = self._download_webpage(url, display_id) -        initial_data = self._parse_json(self._search_regex( -            r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', -            webpage, 'initial data'), display_id) - -        is_stream = show_path == 'streams' -        if is_stream: -            if not episode_path: -                episode_path = 'live-stream' +        query = '''query { +  getShowBySlug(slug:"%s") { +    %%s +  } +}''' % show_path +        if episode_path: +            query = query % '''title +    getVideoBySlug(slug:"%s") { +      _id +      auth +      description +      duration +      episodeNumber +      launchDate +      mediaID +      seasonNumber +      poster +      title +      tvRating +    }''' % episode_path +            ['getVideoBySlug'] +        else: +            query = query % '''metaDescription +    title +    videos(first:1000,sort:["episode_number"]) { +      edges { +        node { +           _id +           slug +        } +      } +    }''' +        show_data = self._download_json( +            'https://www.adultswim.com/api/search', display_id, +            data=json.dumps({'query': query}).encode(), +            headers={'Content-Type': 'application/json'})['data']['getShowBySlug'] +        if episode_path: +            video_data = show_data['getVideoBySlug'] +            video_id = video_data['_id'] +            
episode_title = title = video_data['title'] +            series = show_data.get('title') +            if series: +                title = '%s - %s' % (series, title) +            info = { +                'id': video_id, +                'title': title, +                'description': strip_or_none(video_data.get('description')), +                'duration': float_or_none(video_data.get('duration')), +                'formats': [], +                'subtitles': {}, +                'age_limit': parse_age_limit(video_data.get('tvRating')), +                'thumbnail': video_data.get('poster'), +                'timestamp': parse_iso8601(video_data.get('launchDate')), +                'series': series, +                'season_number': int_or_none(video_data.get('seasonNumber')), +                'episode': episode_title, +                'episode_number': int_or_none(video_data.get('episodeNumber')), +            } -            video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) -            video_id = video_data.get('stream') +            auth = video_data.get('auth') +            media_id = video_data.get('mediaID') +            if media_id: +                info.update(self._extract_ngtv_info(media_id, { +                    # CDN_TOKEN_APP_ID from: +                    # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js +                    'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE', +                }, { +                    'url': url, +                    'site_name': 'AdultSwim', +                    'auth_required': auth, +                })) -            if not video_id: -                entries = [] -                for episode in video_data.get('archiveEpisodes', []): -         
           episode_url = url_or_none(episode.get('url')) -                    if not episode_url: +            if not auth: +                extract_data = self._download_json( +                    'https://www.adultswim.com/api/shows/v1/videos/' + video_id, +                    video_id, query={'fields': 'stream'}, fatal=False) or {} +                assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or [] +                for asset in assets: +                    asset_url = asset.get('url') +                    if not asset_url:                          continue -                    entries.append(self.url_result( -                        episode_url, 'AdultSwim', episode.get('id'))) -                return self.playlist_result( -                    entries, video_data.get('id'), video_data.get('title'), -                    strip_or_none(video_data.get('description'))) -        else: -            show_data = initial_data['show'] - -            if not episode_path: -                entries = [] -                for video in show_data.get('videos', []): -                    slug = video.get('slug') -                    if not slug: +                    ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) +                    if ext == 'm3u8': +                        info['formats'].extend(self._extract_m3u8_formats( +                            asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                    elif ext == 'f4m':                          continue -                    entries.append(self.url_result( -                        'http://adultswim.com/videos/%s/%s' % (show_path, slug), -                        'AdultSwim', video.get('id'))) -                return self.playlist_result( -                    entries, show_data.get('id'), show_data.get('title'), -                    strip_or_none(show_data.get('metadata', {}).get('description'))) - -            video_data = 
show_data['sluggedVideo'] -            video_id = video_data['id'] +                        # info['formats'].extend(self._extract_f4m_formats( +                        #     asset_url, video_id, f4m_id='hds', fatal=False)) +                    elif ext in ('scc', 'ttml', 'vtt'): +                        info['subtitles'].setdefault('en', []).append({ +                            'url': asset_url, +                        }) +            self._sort_formats(info['formats']) -        info = self._extract_cvp_info( -            'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, -            video_id, { -                'secure': { -                    'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', -                    'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', -                }, -            }, { -                'url': url, -                'site_name': 'AdultSwim', -                'auth_required': video_data.get('auth'), -            }) - -        info.update({ -            'id': video_id, -            'display_id': display_id, -            'description': info.get('description') or strip_or_none(video_data.get('description')), -        }) -        if not is_stream: -            info.update({ -                'duration': info.get('duration') or int_or_none(video_data.get('duration')), -                'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), -                'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), -                'episode': info['title'], -                'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), -            }) - -            info['series'] = video_data.get('collection_title') or info.get('series') -            if info['series'] and info['series'] != info['title']: -                info['title'] = '%s - %s' % 
(info['series'], info['title']) - -        return info +            return info +        else: +            entries = [] +            for edge in show_data.get('videos', {}).get('edges', []): +                video = edge.get('node') or {} +                slug = video.get('slug') +                if not slug: +                    continue +                entries.append(self.url_result( +                    'http://adultswim.com/videos/%s/%s' % (show_path, slug), +                    'AdultSwim', video.get('_id'))) +            return self.playlist_result( +                entries, show_path, show_data.get('title'), +                strip_or_none(show_data.get('metaDescription'))) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 85ec6392d..611b948f5 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -1,14 +1,15 @@ +# coding: utf-8  from __future__ import unicode_literals  import re  from .theplatform import ThePlatformIE  from ..utils import ( +    extract_attributes, +    ExtractorError, +    int_or_none,      smuggle_url,      update_url_query, -    unescapeHTML, -    extract_attributes, -    get_element_by_attribute,  )  from ..compat import (      compat_urlparse, @@ -19,6 +20,43 @@ class AENetworksBaseIE(ThePlatformIE):      _THEPLATFORM_KEY = 'crazyjava'      _THEPLATFORM_SECRET = 's3cr3t' +    def _extract_aen_smil(self, smil_url, video_id, auth=None): +        query = {'mbr': 'true'} +        if auth: +            query['auth'] = auth +        TP_SMIL_QUERY = [{ +            'assetTypes': 'high_video_ak', +            'switch': 'hls_high_ak' +        }, { +            'assetTypes': 'high_video_s3' +        }, { +            'assetTypes': 'high_video_s3', +            'switch': 'hls_ingest_fastly' +        }] +        formats = [] +        subtitles = {} +        last_e = None +        for q in TP_SMIL_QUERY: +            q.update(query) +            m_url = 
update_url_query(smil_url, q) +            m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) +            try: +                tp_formats, tp_subtitles = self._extract_theplatform_smil( +                    m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) +            except ExtractorError as e: +                last_e = e +                continue +            formats.extend(tp_formats) +            subtitles = self._merge_subtitles(subtitles, tp_subtitles) +        if last_e and not formats: +            raise last_e +        self._sort_formats(formats) +        return { +            'id': video_id, +            'formats': formats, +            'subtitles': subtitles, +        } +  class AENetworksIE(AENetworksBaseIE):      IE_NAME = 'aenetworks' @@ -33,22 +71,25 @@ class AENetworksIE(AENetworksBaseIE):                          (?:                              shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|                              movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| -                            specials/(?P<special_display_id>[^/]+)/full-special| +                            specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)|                              collections/[^/]+/(?P<collection_display_id>[^/]+)                          )                      '''      _TESTS = [{          'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', -        'md5': 'a97a65f7e823ae10e9244bc5433d5fe6',          'info_dict': {              'id': '22253814',              'ext': 'mp4', -            'title': 'Winter Is Coming', +            'title': 'Winter is Coming',              'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',              'timestamp': 1338306241,              'upload_date': '20120529',              'uploader': 'AENE-NEW',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },          'add_ie': 
['ThePlatform'],      }, {          'url': 'http://www.history.com/shows/ancient-aliens/season-1', @@ -84,6 +125,9 @@ class AENetworksIE(AENetworksBaseIE):      }, {          'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward',          'only_matching': True +    }, { +        'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', +        'only_matching': True      }]      _DOMAIN_TO_REQUESTOR_ID = {          'history.com': 'HISTORY', @@ -124,11 +168,6 @@ class AENetworksIE(AENetworksBaseIE):                  return self.playlist_result(                      entries, self._html_search_meta('aetn:SeasonId', webpage)) -        query = { -            'mbr': 'true', -            'assetTypes': 'high_video_ak', -            'switch': 'hls_high_ak', -        }          video_id = self._html_search_meta('aetn:VideoID', webpage)          media_url = self._search_regex(              [r"media_url\s*=\s*'(?P<url>[^']+)'", @@ -138,64 +177,39 @@ class AENetworksIE(AENetworksBaseIE):          theplatform_metadata = self._download_theplatform_metadata(self._search_regex(              r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)          info = self._parse_theplatform_metadata(theplatform_metadata) +        auth = None          if theplatform_metadata.get('AETN$isBehindWall'):              requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]              resource = self._get_mvpd_resource(                  requestor_id, theplatform_metadata['title'],                  theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),                  theplatform_metadata['ratings'][0]['rating']) -            query['auth'] = self._extract_mvpd_auth( +            auth = self._extract_mvpd_auth(                  url, video_id, requestor_id, resource)          
info.update(self._search_json_ld(webpage, video_id, fatal=False)) -        media_url = update_url_query(media_url, query) -        media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) -        formats, subtitles = self._extract_theplatform_smil(media_url, video_id) -        self._sort_formats(formats) -        info.update({ -            'id': video_id, -            'formats': formats, -            'subtitles': subtitles, -        }) +        info.update(self._extract_aen_smil(media_url, video_id, auth))          return info  class HistoryTopicIE(AENetworksBaseIE):      IE_NAME = 'history:topic'      IE_DESC = 'History.com Topic' -    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?' +    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video'      _TESTS = [{ -        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', +        'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video',          'info_dict': {              'id': '40700995724',              'ext': 'mp4', -            'title': "Bet You Didn't Know: Valentine's Day", +            'title': "History of Valentine’s Day",              'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',              'timestamp': 1375819729,              'upload_date': '20130806', -            'uploader': 'AENE-NEW',          },          'params': {              # m3u8 download              'skip_download': True,          },          'add_ie': ['ThePlatform'], -    }, { -        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', -        'info_dict': -        { -            'id': 'world-war-i-history', -            'title': 'World War I History', -        }, -        'playlist_mincount': 23, -    }, { -        
'url': 'http://www.history.com/topics/world-war-i-history/videos', -        'only_matching': True, -    }, { -        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', -        'only_matching': True, -    }, { -        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', -        'only_matching': True,      }]      def theplatform_url_result(self, theplatform_url, video_id, query): @@ -215,27 +229,19 @@ class HistoryTopicIE(AENetworksBaseIE):          }      def _real_extract(self, url): -        topic_id, video_display_id = re.match(self._VALID_URL, url).groups() -        if video_display_id: -            webpage = self._download_webpage(url, video_display_id) -            release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() -            release_url = unescapeHTML(release_url) - -            return self.theplatform_url_result( -                release_url, video_id, { -                    'mbr': 'true', -                    'switch': 'hls', -                    'assetTypes': 'high_video_ak', -                }) -        else: -            webpage = self._download_webpage(url, topic_id) -            entries = [] -            for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage): -                video_attributes = extract_attributes(episode_item) -                entries.append(self.theplatform_url_result( -                    video_attributes['data-release-url'], video_attributes['data-id'], { -                        'mbr': 'true', -                        'switch': 'hls', -                        'assetTypes': 'high_video_ak', -                    })) -            return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        video_id = self._search_regex( +            
r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid') +        result = self._download_json( +            'https://feeds.video.aetnd.com/api/v2/history/videos', +            video_id, query={'filter[id]': video_id})['results'][0] +        title = result['title'] +        info = self._extract_aen_smil(result['publicUrl'], video_id) +        info.update({ +            'title': title, +            'description': result.get('description'), +            'duration': int_or_none(result.get('duration')), +            'timestamp': int_or_none(result.get('added'), 1000), +        }) +        return info diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index cb9279193..e87994a6a 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse_urlparse, +)  from ..utils import (      ExtractorError,      int_or_none, @@ -12,12 +16,12 @@ from ..utils import (  class AolIE(InfoExtractor): -    IE_NAME = 'on.aol.com' -    _VALID_URL = r'(?:aol-video:|https?://(?:(?:www|on)\.)?aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)' +    IE_NAME = 'aol.com' +    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'      _TESTS = [{          # video with 5min ID -        'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', +        'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/',          'md5': '18ef68f48740e86ae94b98da815eec42',          'info_dict': {              'id': '518167793', @@ -34,7 +38,7 @@ class AolIE(InfoExtractor):          }      }, {          # video with vidible ID -        'url': 'http://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', +     
   'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/',          'info_dict': {              'id': '5707d6b8e4b090497b04f706',              'ext': 'mp4', @@ -49,16 +53,28 @@ class AolIE(InfoExtractor):              'skip_download': True,          }      }, { -        'url': 'http://on.aol.com/partners/abc-551438d309eab105804dbfe8/sneak-peek-was-haley-really-framed-570eaebee4b0448640a5c944', +        'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/',          'only_matching': True,      }, { -        'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', +        'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/',          'only_matching': True,      }, { -        'url': 'http://on.aol.com/video/519442220', +        'url': 'aol-video:5707d6b8e4b090497b04f706',          'only_matching': True,      }, { -        'url': 'aol-video:5707d6b8e4b090497b04f706', +        'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/', +        'only_matching': True, +    }, { +        'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/', +        'only_matching': True, +    }, { +        'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/', +        'only_matching': True, +    }, { +        'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/', +        'only_matching': True, +    }, { +        'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',          'only_matching': True,      }] @@ -73,7 +89,7 @@ class AolIE(InfoExtractor):          video_data = response['data']          formats = [] -        
m3u8_url = video_data.get('videoMasterPlaylist') +        m3u8_url = url_or_none(video_data.get('videoMasterPlaylist'))          if m3u8_url:              formats.extend(self._extract_m3u8_formats(                  m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) @@ -96,6 +112,12 @@ class AolIE(InfoExtractor):                          'width': int(mobj.group(1)),                          'height': int(mobj.group(2)),                      }) +                else: +                    qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query) +                    f.update({ +                        'width': int_or_none(qs.get('w', [None])[0]), +                        'height': int_or_none(qs.get('h', [None])[0]), +                    })                  formats.append(f)          self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) diff --git a/youtube_dl/extractor/bfi.py b/youtube_dl/extractor/bfi.py new file mode 100644 index 000000000..60c8944b5 --- /dev/null +++ b/youtube_dl/extractor/bfi.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFIPlayerIE(InfoExtractor): +    IE_NAME = 'bfi:player' +    _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online' +    _TEST = { +        'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online', +        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8', +        'info_dict': { +            'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63', +            'ext': 'mp4', +            'title': 'Computer Doctor', +            'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b', +        }, +        'skip': 'BFI Player films cannot be played outside of the UK', +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        entries = [] +        for 
player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage): +            player_attr = extract_attributes(player_el) +            ooyala_id = player_attr.get('data-video-id') +            if not ooyala_id: +                continue +            entries.append(self.url_result( +                'ooyala:' + ooyala_id, 'Ooyala', +                ooyala_id, player_attr.get('data-label'))) +        return self.playlist_result(entries) diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index 4c5c6be10..3707dc97f 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -28,7 +28,7 @@ class BIQLEIE(InfoExtractor):          'url': 'http://biqle.org/watch/-44781847_168547604',          'md5': '7f24e72af1db0edf7c1aaba513174f97',          'info_dict': { -            'id': '168547604', +            'id': '-44781847_168547604',              'ext': 'mp4',              'title': 'Ребенок в шоке от автоматической мойки',              'timestamp': 1396633454, diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1799d63ea..376db7263 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -13,13 +13,17 @@ from ..utils import (  class CBSBaseIE(ThePlatformFeedIE):      def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): -        closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') -        return { -            'en': [{ -                'ext': 'ttml', -                'url': closed_caption_e.attrib['value'], -            }] -        } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] +        subtitles = {} +        for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: +            cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) +            if cc_e is not None: +                cc_url = 
cc_e.get('value') +                if cc_url: +                    subtitles.setdefault(subtitles_lang, []).append({ +                        'ext': ext, +                        'url': cc_url, +                    }) +        return subtitles  class CBSIE(CBSBaseIE): diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f9bd535f6..73382431b 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -79,7 +79,7 @@ class CWTVIE(InfoExtractor):          season = str_or_none(video_data.get('season'))          episode = str_or_none(video_data.get('episode'))          if episode and season: -            episode = episode.lstrip(season) +            episode = episode[len(season):]          return {              '_type': 'url_transparent', diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c05f601e2..c345e0274 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -58,10 +58,17 @@ class DigitallySpeakingIE(InfoExtractor):              stream_name = xpath_text(a_format, 'streamName', fatal=True)              video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path')              url = video_root + video_path -            vbr = xpath_text(a_format, 'bitrate') +            bitrate = xpath_text(a_format, 'bitrate') +            tbr = int_or_none(bitrate) +            vbr = int_or_none(self._search_regex( +                r'-(\d+)\.mp4', video_path, 'vbr', default=None)) +            abr = tbr - vbr if tbr and vbr else None              video_formats.append({ +                'format_id': bitrate,                  'url': url, -                'vbr': int_or_none(vbr), +                'tbr': tbr, +                'vbr': vbr, +                'abr': abr,              })          return video_formats diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 20996962a..de7f6d670 100644 --- a/youtube_dl/extractor/dvtv.py +++ 
b/youtube_dl/extractor/dvtv.py @@ -10,16 +10,16 @@ from ..utils import (      int_or_none,      js_to_json,      mimetype2ext, +    try_get,      unescapeHTML, +    parse_iso8601,  )  class DVTVIE(InfoExtractor):      IE_NAME = 'dvtv'      IE_DESC = 'http://video.aktualne.cz/' -      _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})' -      _TESTS = [{          'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',          'md5': '67cb83e4a955d36e1b5d31993134a0c2', @@ -28,11 +28,13 @@ class DVTVIE(InfoExtractor):              'ext': 'mp4',              'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',              'duration': 1484, +            'upload_date': '20141217', +            'timestamp': 1418792400,          }      }, {          'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',          'info_dict': { -            'title': r're:^DVTV 16\. 12\. 2014: útok Talibanu, boj o kliniku, uprchlíci', +            'title': r'DVTV 16. 12. 
2014: útok Talibanu, boj o kliniku, uprchlíci',              'id': '973eb3bc854e11e498be002590604f2e',          },          'playlist': [{ @@ -84,6 +86,8 @@ class DVTVIE(InfoExtractor):              'ext': 'mp4',              'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta',              'duration': 1103, +            'upload_date': '20170511', +            'timestamp': 1494514200,          },          'params': {              'skip_download': True, @@ -91,43 +95,59 @@ class DVTVIE(InfoExtractor):      }, {          'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',          'only_matching': True, +    }, { +        # Test live stream video (liveStarter) parsing +        'url': 'https://video.aktualne.cz/dvtv/zive-mistryne-sveta-eva-samkova-po-navratu-ze-sampionatu/r~182654c2288811e990fd0cc47ab5f122/', +        'md5': '2e552e483f2414851ca50467054f9d5d', +        'info_dict': { +            'id': '8d116360288011e98c840cc47ab5f122', +            'ext': 'mp4', +            'title': 'Živě: Mistryně světa Eva Samková po návratu ze šampionátu', +            'upload_date': '20190204', +            'timestamp': 1549289591, +        }, +        'params': { +            # Video content is no longer available +            'skip_download': True, +        },      }] -    def _parse_video_metadata(self, js, video_id, live_js=None): +    def _parse_video_metadata(self, js, video_id, timestamp):          data = self._parse_json(js, video_id, transform_source=js_to_json) -        if live_js: -            data.update(self._parse_json( -                live_js, video_id, transform_source=js_to_json)) -          title = unescapeHTML(data['title']) +        live_starter = try_get(data, lambda x: x['plugins']['liveStarter'], dict) +        if live_starter: +            data.update(live_starter) +          formats = [] -        for video in data['sources']: -            
video_url = video.get('file') -            if not video_url: -                continue -            video_type = video.get('type') -            ext = determine_ext(video_url, mimetype2ext(video_type)) -            if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8': -                formats.extend(self._extract_m3u8_formats( -                    video_url, video_id, 'mp4', entry_protocol='m3u8_native', -                    m3u8_id='hls', fatal=False)) -            elif video_type == 'application/dash+xml' or ext == 'mpd': -                formats.extend(self._extract_mpd_formats( -                    video_url, video_id, mpd_id='dash', fatal=False)) -            else: -                label = video.get('label') -                height = self._search_regex( -                    r'^(\d+)[pP]', label or '', 'height', default=None) -                format_id = ['http'] -                for f in (ext, label): -                    if f: -                        format_id.append(f) -                formats.append({ -                    'url': video_url, -                    'format_id': '-'.join(format_id), -                    'height': int_or_none(height), -                }) +        for tracks in data.get('tracks', {}).values(): +            for video in tracks: +                video_url = video.get('src') +                if not video_url: +                    continue +                video_type = video.get('type') +                ext = determine_ext(video_url, mimetype2ext(video_type)) +                if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        m3u8_id='hls', fatal=False)) +                elif video_type == 'application/dash+xml' or ext == 'mpd': +                    formats.extend(self._extract_mpd_formats( +                        video_url, video_id, 
mpd_id='dash', fatal=False)) +                else: +                    label = video.get('label') +                    height = self._search_regex( +                        r'^(\d+)[pP]', label or '', 'height', default=None) +                    format_id = ['http'] +                    for f in (ext, label): +                        if f: +                            format_id.append(f) +                    formats.append({ +                        'url': video_url, +                        'format_id': '-'.join(format_id), +                        'height': int_or_none(height), +                    })          self._sort_formats(formats)          return { @@ -136,41 +156,29 @@ class DVTVIE(InfoExtractor):              'description': data.get('description'),              'thumbnail': data.get('image'),              'duration': int_or_none(data.get('duration')), -            'timestamp': int_or_none(data.get('pubtime')), +            'timestamp': int_or_none(timestamp),              'formats': formats          }      def _real_extract(self, url):          video_id = self._match_id(url) -          webpage = self._download_webpage(url, video_id) +        timestamp = parse_iso8601(self._html_search_meta( +            'article:published_time', webpage, 'published time', default=None)) -        # live content -        live_item = self._search_regex( -            r'(?s)embedData[0-9a-f]{32}\.asset\.liveStarter\s*=\s*(\{.+?\});', -            webpage, 'video', default=None) +        items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage) +        if items: +            return self.playlist_result( +                [self._parse_video_metadata(i, video_id, timestamp) for i in items], +                video_id, self._html_search_meta('twitter:title', webpage)) -        # single video          item = self._search_regex( -            r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});', +            r'(?s)BBXPlayer\.setup\((.+?)\);',              webpage, 
'video', default=None) -          if item: -            return self._parse_video_metadata(item, video_id, live_item) - -        # playlist -        items = re.findall( -            r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);", -            webpage) -        if not items: -            items = re.findall(r'(?s)var\s+asset\s*=\s*({.+?});\n', webpage) - -        if items: -            return { -                '_type': 'playlist', -                'id': video_id, -                'title': self._og_search_title(webpage), -                'entries': [self._parse_video_metadata(i, video_id) for i in items] -            } +            # remove function calls (ex. htmldeentitize) +            # TODO this should be fixed in a general way in the js_to_json +            item = re.sub(r'\w+?\((.+)\)', r'\1', item) +            return self._parse_video_metadata(item, video_id, timestamp)          raise ExtractorError('Could not find neither video nor playlist') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8e7a5bf41..86ecc0b66 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,6 +20,7 @@ from .acast import (  )  from .addanime import AddAnimeIE  from .adn import ADNIE +from .adobeconnect import AdobeConnectIE  from .adobetv import (      AdobeTVIE,      AdobeTVShowIE, @@ -106,6 +107,7 @@ from .behindkink import BehindKinkIE  from .bellmedia import BellMediaIE  from .beatport import BeatportIE  from .bet import BetIE +from .bfi import BFIPlayerIE  from .bigflix import BigflixIE  from .bild import BildIE  from .bilibili import ( @@ -440,10 +442,7 @@ from .goshgay import GoshgayIE  from .gputechconf import GPUTechConfIE  from .groupon import GrouponIE  from .hark import HarkIE -from .hbo import ( -    HBOIE, -    HBOEpisodeIE, -) +from .hbo import HBOIE  from .hearthisat import HearThisAtIE  from .heise import HeiseIE  from .hellporno import HellPornoIE @@ -632,7 +631,11 @@ 
from .massengeschmacktv import MassengeschmackTVIE  from .matchtv import MatchTVIE  from .mdr import MDRIE  from .mediaset import MediasetIE -from .mediasite import MediasiteIE +from .mediasite import ( +    MediasiteIE, +    MediasiteCatalogIE, +    MediasiteNamedCatalogIE, +)  from .medici import MediciIE  from .megaphone import MegaphoneIE  from .meipai import MeipaiIE @@ -865,6 +868,10 @@ from .picarto import (  from .piksel import PikselIE  from .pinkbike import PinkbikeIE  from .pladform import PladformIE +from .platzi import ( +    PlatziIE, +    PlatziCourseIE, +)  from .playfm import PlayFMIE  from .playplustv import PlayPlusTVIE  from .plays import PlaysTVIE @@ -1086,6 +1093,7 @@ from .streamcloud import StreamcloudIE  from .streamcz import StreamCZIE  from .streetvoice import StreetVoiceIE  from .stretchinternet import StretchInternetIE +from .stv import STVPlayerIE  from .sunporno import SunPornoIE  from .svt import (      SVTIE, @@ -1114,6 +1122,7 @@ from .teachertube import (  )  from .teachingchannel import TeachingChannelIE  from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE  from .techtalks import TechTalksIE  from .ted import TEDIE  from .tele5 import Tele5IE @@ -1443,6 +1452,8 @@ from .xxxymovies import XXXYMoviesIE  from .yahoo import (      YahooIE,      YahooSearchIE, +    YahooGyaOPlayerIE, +    YahooGyaOIE,  )  from .yandexdisk import YandexDiskIE  from .yandexmusic import ( diff --git a/youtube_dl/extractor/gaia.py b/youtube_dl/extractor/gaia.py index f2eef3f4c..e9527758f 100644 --- a/youtube_dl/extractor/gaia.py +++ b/youtube_dl/extractor/gaia.py @@ -4,12 +4,17 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_str, +    compat_urllib_parse_unquote, +)  from ..utils import ( +    ExtractorError,      int_or_none,      str_or_none,      strip_or_none,      try_get, +    urlencode_postdata,  ) @@ -46,6 +51,29 
@@ class GaiaIE(InfoExtractor):              'skip_download': True,          },      }] +    _NETRC_MACHINE = 'gaia' +    _jwt = None + +    def _real_initialize(self): +        auth = self._get_cookies('https://www.gaia.com/').get('auth') +        if auth: +            auth = self._parse_json( +                compat_urllib_parse_unquote(auth.value), +                None, fatal=False) +        if not auth: +            username, password = self._get_login_info() +            if username is None: +                return +            auth = self._download_json( +                'https://auth.gaia.com/v1/login', +                None, data=urlencode_postdata({ +                    'username': username, +                    'password': password +                })) +            if auth.get('success') is False: +                raise ExtractorError(', '.join(auth['messages']), expected=True) +        if auth: +            self._jwt = auth.get('jwt')      def _real_extract(self, url):          display_id, vtype = re.search(self._VALID_URL, url).groups() @@ -59,8 +87,12 @@ class GaiaIE(InfoExtractor):          media_id = compat_str(vdata['nid'])          title = node['title'] +        headers = None +        if self._jwt: +            headers = {'Authorization': 'Bearer ' + self._jwt}          media = self._download_json( -            'https://brooklyn.gaia.com/media/' + media_id, media_id) +            'https://brooklyn.gaia.com/media/' + media_id, +            media_id, headers=headers)          formats = self._extract_m3u8_formats(              media['mediaUrls']['bcHLS'], media_id, 'mp4')          self._sort_formats(formats) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 8806dc48a..2f555c1d4 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -3,22 +3,24 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from .kaltura import KalturaIE  from ..utils 
import (      HEADRequest,      sanitized_Request, +    smuggle_url,      urlencode_postdata,  )  class GDCVaultIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)?' +    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?'      _NETRC_MACHINE = 'gdcvault'      _TESTS = [          {              'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',              'md5': '7ce8388f544c88b7ac11c7ab1b593704',              'info_dict': { -                'id': '1019721', +                'id': '201311826596_AWNY',                  'display_id': 'Doki-Doki-Universe-Sweet-Simple',                  'ext': 'mp4',                  'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' @@ -27,7 +29,7 @@ class GDCVaultIE(InfoExtractor):          {              'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',              'info_dict': { -                'id': '1015683', +                'id': '201203272_1330951438328RSXR',                  'display_id': 'Embracing-the-Dark-Art-of',                  'ext': 'flv',                  'title': 'Embracing the Dark Art of Mathematical Modeling in AI' @@ -56,7 +58,7 @@ class GDCVaultIE(InfoExtractor):              'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface',              'md5': 'a8efb6c31ed06ca8739294960b2dbabd',              'info_dict': { -                'id': '1023460', +                'id': '840376_BQRC',                  'ext': 'mp4',                  'display_id': 'Tenacious-Design-and-The-Interface',                  'title': 'Tenacious Design and The Interface of \'Destiny\'', @@ -66,26 +68,38 @@ class GDCVaultIE(InfoExtractor):              # Multiple audios              'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC',              'info_dict': { -                'id': '1014631', -                'ext': 'flv', +       
         'id': '12396_1299111843500GMPX', +                'ext': 'mp4',                  'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man',              }, -            'params': { -                'skip_download': True,  # Requires rtmpdump -                'format': 'jp',  # The japanese audio -            } +            # 'params': { +            #     'skip_download': True,  # Requires rtmpdump +            #     'format': 'jp',  # The japanese audio +            # }          },          {              # gdc-player.html              'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',              'info_dict': { -                'id': '1435', +                'id': '9350_1238021887562UHXB',                  'display_id': 'An-American-engine-in-Tokyo', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',              }, +        }, +        { +            # Kaltura Embed +            'url': 'https://www.gdcvault.com/play/1026180/Mastering-the-Apex-of-Scaling', +            'info_dict': { +                'id': '0_h1fg8j3p', +                'ext': 'mp4', +                'title': 'Mastering the Apex of Scaling Game Servers (Presented by Multiplay)', +                'timestamp': 1554401811, +                'upload_date': '20190404', +                'uploader_id': 'joe@blazestreaming.com', +            },              'params': { -                'skip_download': True,  # Requires rtmpdump +                'format': 'mp4-408',              },          },      ] @@ -114,10 +128,8 @@ class GDCVaultIE(InfoExtractor):          return start_page      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('id') -        display_id = mobj.group('name') or video_id +        video_id, name = re.match(self._VALID_URL, url).groups() 
+        display_id = name or video_id          webpage_url = 'http://www.gdcvault.com/play/' + video_id          start_page = self._download_webpage(webpage_url, display_id) @@ -127,12 +139,12 @@ class GDCVaultIE(InfoExtractor):              start_page, 'url', default=None)          if direct_url:              title = self._html_search_regex( -                r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>', +                r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>',                  start_page, 'title')              video_url = 'http://www.gdcvault.com' + direct_url              # resolve the url so that we can detect the correct extension -            head = self._request_webpage(HEADRequest(video_url), video_id) -            video_url = head.geturl() +            video_url = self._request_webpage( +                HEADRequest(video_url), video_id).geturl()              return {                  'id': video_id, @@ -141,34 +153,36 @@ class GDCVaultIE(InfoExtractor):                  'title': title,              } -        PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' - -        xml_root = self._html_search_regex( -            PLAYER_REGEX, start_page, 'xml root', default=None) -        if xml_root is None: -            # Probably need to authenticate -            login_res = self._login(webpage_url, display_id) -            if login_res is None: -                self.report_warning('Could not login.') -            else: -                start_page = login_res -                # Grab the url from the authenticated page -                xml_root = self._html_search_regex( -                    PLAYER_REGEX, start_page, 'xml root') - -        xml_name = self._html_search_regex( -            r'<iframe src=".*?\?xml=(.+?\.xml).*?".*?</iframe>', -            start_page, 'xml filename', default=None) -        if xml_name is None: -            # Fallback to the older format +        embed_url = 
KalturaIE._extract_url(start_page) +        if embed_url: +            embed_url = smuggle_url(embed_url, {'source_url': url}) +            ie_key = 'Kaltura' +        else: +            PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' + +            xml_root = self._html_search_regex( +                PLAYER_REGEX, start_page, 'xml root', default=None) +            if xml_root is None: +                # Probably need to authenticate +                login_res = self._login(webpage_url, display_id) +                if login_res is None: +                    self.report_warning('Could not login.') +                else: +                    start_page = login_res +                    # Grab the url from the authenticated page +                    xml_root = self._html_search_regex( +                        PLAYER_REGEX, start_page, 'xml root') +              xml_name = self._html_search_regex( -                r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', +                r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>',                  start_page, 'xml filename') +            embed_url = '%s/xml/%s' % (xml_root, xml_name) +            ie_key = 'DigitallySpeaking'          return {              '_type': 'url_transparent',              'id': video_id,              'display_id': display_id, -            'url': '%s/xml/%s' % (xml_root, xml_name), -            'ie_key': 'DigitallySpeaking', +            'url': embed_url, +            'ie_key': ie_key,          } diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 859ad5429..44440233d 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -4,16 +4,28 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_str  from ..utils import (      xpath_text,      xpath_element,      int_or_none,      parse_duration, +    urljoin,  ) -class 
HBOBaseIE(InfoExtractor): +class HBOIE(InfoExtractor): +    IE_NAME = 'hbo' +    _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?:video|embed)(?:/[^/]+)*/(?P<id>[^/?#]+)' +    _TEST = { +        'url': 'https://www.hbo.com/video/game-of-thrones/seasons/season-8/videos/trailer', +        'md5': '8126210656f433c452a21367f9ad85b3', +        'info_dict': { +            'id': '22113301', +            'ext': 'mp4', +            'title': 'Game of Thrones - Trailer', +        }, +        'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], +    }      _FORMATS_INFO = {          'pro7': {              'width': 1280, @@ -53,10 +65,17 @@ class HBOBaseIE(InfoExtractor):          },      } -    def _extract_from_id(self, video_id): -        video_data = self._download_xml( -            'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) -        title = xpath_text(video_data, 'title', 'title', True) +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        location_path = self._parse_json(self._html_search_regex( +            r'data-state="({.+?})"', webpage, 'state'), display_id)['video']['locationUrl'] +        video_data = self._download_xml(urljoin(url, location_path), display_id) +        video_id = xpath_text(video_data, 'id', fatal=True) +        episode_title = title = xpath_text(video_data, 'title', fatal=True) +        series = xpath_text(video_data, 'program') +        if series: +            title = '%s - %s' % (series, title)          formats = []          for source in xpath_element(video_data, 'videos', 'sources', True): @@ -128,68 +147,23 @@ class HBOBaseIE(InfoExtractor):                      'width': width,                  }) +        subtitles = None +        caption_url = xpath_text(video_data, 'captionUrl') +        if caption_url: +            subtitles = { +                'en': [{ +                    'url': 
caption_url, +                    'ext': 'ttml' +                }], +            } +          return {              'id': video_id,              'title': title,              'duration': parse_duration(xpath_text(video_data, 'duration/tv14')), +            'series': series, +            'episode': episode_title,              'formats': formats,              'thumbnails': thumbnails, +            'subtitles': subtitles,          } - - -class HBOIE(HBOBaseIE): -    IE_NAME = 'hbo' -    _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', -        'md5': '2c6a6bc1222c7e91cb3334dad1746e5a', -        'info_dict': { -            'id': '1437839', -            'ext': 'mp4', -            'title': 'Ep. 64 Clip: Encryption', -            'thumbnail': r're:https?://.*\.jpg$', -            'duration': 1072, -        } -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        return self._extract_from_id(video_id) - - -class HBOEpisodeIE(HBOBaseIE): -    IE_NAME = 'hbo:episode' -    _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?' - -    _TESTS = [{ -        'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', -        'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb', -        'info_dict': { -            'id': '1439518', -            'display_id': 'ep-52-inside-the-episode', -            'ext': 'mp4', -            'title': 'Ep. 
52: Inside the Episode', -            'thumbnail': r're:https?://.*\.jpg$', -            'duration': 240, -        }, -    }, { -        'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true', -        'only_matching': True, -    }, { -        'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver', -        'only_matching': True, -    }] - -    def _real_extract(self, url): -        path, display_id = re.match(self._VALID_URL, url).groups() - -        content = self._download_json( -            'http://www.hbo.com/api/content/' + path, display_id)['content'] - -        video_id = compat_str((content.get('parsed', {}).get( -            'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId']) - -        info_dict = self._extract_from_id(video_id) -        info_dict['display_id'] = display_id - -        return info_dict diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 9544ff9d4..12695af27 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -1,36 +1,83 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    int_or_none, +    strip_or_none, +    xpath_attr, +    xpath_text, +)  class InaIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' +    _TESTS = [{          'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',          'md5': 'a667021bf2b41f8dc6049479d9bb38a3',          'info_dict': {              'id': 'I12055569',              'ext': 'mp4',              'title': 'François Hollande "Je crois que c\'est clair"', +            'description': 'md5:3f09eb072a06cb286b8f7e4f77109663',          } -    } +    }, { +        'url': 
'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', +        'only_matching': True, +    }, { +        'url': 'https://www.ina.fr/audio/P16173408', +        'only_matching': True, +    }, { +        'url': 'https://www.ina.fr/video/P16173408-video.html', +        'only_matching': True, +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) - -        video_id = mobj.group('id') -        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id -        info_doc = self._download_xml(mrss_url, video_id) +        video_id = self._match_id(url) +        info_doc = self._download_xml( +            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) +        item = info_doc.find('channel/item') +        title = xpath_text(item, 'title', fatal=True) +        media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') +        content = item.find(media_ns_xpath('content')) -        self.report_extraction(video_id) +        get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') +        formats = [] +        for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): +            q_url = get_furl(q) +            if not q_url: +                continue +            formats.append({ +                'format_id': q, +                'url': q_url, +                'width': w, +                'height': h, +            }) +        if not formats: +            furl = get_furl('player') or content.attrib['url'] +            ext = determine_ext(furl) +            formats = [{ +                'url': furl, +                'vcodec': 'none' if ext == 'mp3' else None, +                'ext': ext, +            }] -        video_url = info_doc.find('.//{http://search.yahoo.com/mrss/}player').attrib['url'] +        thumbnails = [] +        for thumbnail in content.findall(media_ns_xpath('thumbnail')): +            thumbnail_url = thumbnail.get('url') 
+            if not thumbnail_url: +                continue +            thumbnails.append({ +                'url': thumbnail_url, +                'height': int_or_none(thumbnail.get('height')), +                'width': int_or_none(thumbnail.get('width')), +            })          return {              'id': video_id, -            'url': video_url, -            'title': info_doc.find('.//title').text, +            'formats': formats, +            'title': title, +            'description': strip_or_none(xpath_text(item, 'description')), +            'thumbnails': thumbnails,          } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index d19a6a774..647b905f1 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,7 +7,7 @@ from .common import InfoExtractor  class JWPlatformIE(InfoExtractor): -    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' +    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'      _TESTS = [{          'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',          'md5': 'fa8899fa601eb7c83a64e9d568bdf325', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index fdf7f5bbc..79162f665 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -145,6 +145,8 @@ class KalturaIE(InfoExtractor):          )          if mobj:              embed_info = mobj.groupdict() +            for k, v in embed_info.items(): +                embed_info[k] = v.strip()              url = 'kaltura:%(partner_id)s:%(id)s' % embed_info              escaped_pid = re.escape(embed_info['partner_id'])              service_url = re.search( diff --git a/youtube_dl/extractor/linkedin.py 
b/youtube_dl/extractor/linkedin.py index 5a86b0064..26fc703d1 100644 --- a/youtube_dl/extractor/linkedin.py +++ b/youtube_dl/extractor/linkedin.py @@ -9,11 +9,13 @@ from ..utils import (      float_or_none,      int_or_none,      urlencode_postdata, +    urljoin,  )  class LinkedInLearningBaseIE(InfoExtractor):      _NETRC_MACHINE = 'linkedin' +    _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'      def _call_api(self, course_slug, fields, video_slug=None, resolution=None):          query = { @@ -50,11 +52,10 @@ class LinkedInLearningBaseIE(InfoExtractor):              return          login_page = self._download_webpage( -            'https://www.linkedin.com/uas/login?trk=learning', -            None, 'Downloading login page') -        action_url = self._search_regex( +            self._LOGIN_URL, None, 'Downloading login page') +        action_url = urljoin(self._LOGIN_URL, self._search_regex(              r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', -            default='https://www.linkedin.com/uas/login-submit', group='url') +            default='https://www.linkedin.com/uas/login-submit', group='url'))          data = self._hidden_inputs(login_page)          data.update({              'session_key': email, diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index ef9628e65..694a264d6 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -13,6 +13,8 @@ from ..utils import (      ExtractorError,      float_or_none,      mimetype2ext, +    str_or_none, +    try_get,      unescapeHTML,      unsmuggle_url,      url_or_none, @@ -20,8 +22,11 @@ from ..utils import (  ) +_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' + +  class MediasiteIE(InfoExtractor): -    _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)' +    
_VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE      _TESTS = [          {              'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -93,6 +98,11 @@ class MediasiteIE(InfoExtractor):              'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d',              'only_matching': True,          }, +        { +            # dashed id +            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', +            'only_matching': True, +        }      ]      # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) @@ -109,7 +119,7 @@ class MediasiteIE(InfoExtractor):          return [              unescapeHTML(mobj.group('url'))              for mobj in re.finditer( -                r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', +                r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE,                  webpage)]      def _real_extract(self, url): @@ -221,3 +231,136 @@ class MediasiteIE(InfoExtractor):              'formats': formats,              'thumbnails': thumbnails,          } + + +class MediasiteCatalogIE(InfoExtractor): +    _VALID_URL = r'''(?xi) +                        (?P<url>https?://[^/]+/Mediasite) +                        /Catalog/Full/ +                        (?P<catalog_id>{0}) +                        (?: +                            /(?P<current_folder_id>{0}) +                            /(?P<root_dynamic_folder_id>{0}) +                        )? 
+                    '''.format(_ID_RE) +    _TESTS = [{ +        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', +        'info_dict': { +            'id': '631f9e48530d454381549f955d08c75e21', +            'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', +        }, +        'playlist_count': 6, +        'expected_warnings': ['is not a supported codec'], +    }, { +        # with CurrentFolderId and RootDynamicFolderId +        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', +        'info_dict': { +            'id': '9518c4a6c5cf4993b21cbd53e828a92521', +            'title': 'IUSM Family and Friends Sessions', +        }, +        'playlist_count': 2, +    }, { +        'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', +        'only_matching': True, +    }, { +        # no AntiForgeryToken +        'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', +        'only_matching': True, +    }, { +        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', +        'only_matching': True, +    }, { +        # dashed id +        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        mediasite_url = mobj.group('url') +        catalog_id = mobj.group('catalog_id') +        current_folder_id = mobj.group('current_folder_id') or catalog_id +        root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') + +        webpage = self._download_webpage(url, catalog_id) + +        # 
AntiForgeryToken is optional (e.g. [1]) +        # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 +        anti_forgery_token = self._search_regex( +            r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', +            webpage, 'anti forgery token', default=None, group='value') +        if anti_forgery_token: +            anti_forgery_header = self._search_regex( +                r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', +                webpage, 'anti forgery header name', +                default='X-SOFO-AntiForgeryHeader', group='value') + +        data = { +            'IsViewPage': True, +            'IsNewFolder': True, +            'AuthTicket': None, +            'CatalogId': catalog_id, +            'CurrentFolderId': current_folder_id, +            'RootDynamicFolderId': root_dynamic_folder_id, +            'ItemsPerPage': 1000, +            'PageIndex': 0, +            'PermissionMask': 'Execute', +            'CatalogSearchType': 'SearchInFolder', +            'SortBy': 'Date', +            'SortDirection': 'Descending', +            'StartDate': None, +            'EndDate': None, +            'StatusFilterList': None, +            'PreviewKey': None, +            'Tags': [], +        } + +        headers = { +            'Content-Type': 'application/json; charset=UTF-8', +            'Referer': url, +            'X-Requested-With': 'XMLHttpRequest', +        } +        if anti_forgery_token: +            headers[anti_forgery_header] = anti_forgery_token + +        catalog = self._download_json( +            '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, +            catalog_id, data=json.dumps(data).encode(), headers=headers) + +        entries = [] +        for video in catalog['PresentationDetailsList']: +            if not isinstance(video, dict): +                continue +            video_id = str_or_none(video.get('Id')) +            if not 
video_id: +                continue +            entries.append(self.url_result( +                '%s/Play/%s' % (mediasite_url, video_id), +                ie=MediasiteIE.ie_key(), video_id=video_id)) + +        title = try_get( +            catalog, lambda x: x['CurrentFolder']['Name'], compat_str) + +        return self.playlist_result(entries, catalog_id, title,) + + +class MediasiteNamedCatalogIE(InfoExtractor): +    _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)' +    _TESTS = [{ +        'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        mediasite_url = mobj.group('url') +        catalog_name = mobj.group('catalog_name') + +        webpage = self._download_webpage(url, catalog_name) + +        catalog_id = self._search_regex( +            r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') + +        return self.url_result( +            '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), +            ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id) diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index d53d96aae..84137df50 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -1,22 +1,32 @@  # coding: utf-8  from __future__ import unicode_literals +import base64 +import time +import uuid +  from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none +from ..compat import ( +    compat_HTTPError, +    compat_str, +) +from ..utils import ( +    ExtractorError, +    int_or_none, +)  class MGTVIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'      IE_DESC = '芒果TV' +    _GEO_COUNTRIES = ['CN']      _TESTS = [{          'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', -     
   'md5': 'b1ffc0fc163152acf6beaa81832c9ee7',          'info_dict': {              'id': '3116640',              'ext': 'mp4', -            'title': '我是歌手第四季双年巅峰会:韩红李玟“双王”领军对抗', +            'title': '我是歌手 第四季',              'description': '我是歌手第四季双年巅峰会',              'duration': 7461,              'thumbnail': r're:^https?://.*\.jpg$', @@ -28,16 +38,30 @@ class MGTVIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        api_data = self._download_json( -            'http://pcweb.api.mgtv.com/player/video', video_id, -            query={'video_id': video_id}, -            headers=self.geo_verification_headers())['data'] +        try: +            api_data = self._download_json( +                'https://pcweb.api.mgtv.com/player/video', video_id, query={ +                    'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], +                    'video_id': video_id, +                }, headers=self.geo_verification_headers())['data'] +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: +                error = self._parse_json(e.cause.read().decode(), None) +                if error.get('code') == 40005: +                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES) +                raise ExtractorError(error['msg'], expected=True) +            raise          info = api_data['info']          title = info['title'].strip() -        stream_domain = api_data['stream_domain'][0] +        stream_data = self._download_json( +            'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ +                'pm2': api_data['atc']['pm2'], +                'video_id': video_id, +            }, headers=self.geo_verification_headers())['data'] +        stream_domain = stream_data['stream_domain'][0]          formats = [] -        for idx, stream in 
enumerate(api_data['stream']): +        for idx, stream in enumerate(stream_data['stream']):              stream_path = stream.get('url')              if not stream_path:                  continue @@ -47,7 +71,7 @@ class MGTVIE(InfoExtractor):              format_url = format_data.get('info')              if not format_url:                  continue -            tbr = int_or_none(self._search_regex( +            tbr = int_or_none(stream.get('filebitrate') or self._search_regex(                  r'_(\d+)_mp4/', format_url, 'tbr', default=None))              formats.append({                  'format_id': compat_str(tbr or idx), diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index e3f35f1d8..dab4aec44 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -1,12 +1,17 @@  # coding: utf-8  from __future__ import unicode_literals -import re +import base64 +import hashlib  from .common import InfoExtractor +from ..aes import aes_cbc_decrypt  from ..utils import ( -    ExtractorError, +    bytes_to_intlist,      int_or_none, +    intlist_to_bytes, +    parse_codecs, +    parse_duration,  ) @@ -14,7 +19,7 @@ class NewstubeIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'      _TEST = {          'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', -        'md5': '801eef0c2a9f4089fa04e4fe3533abdc', +        'md5': '9d10320ad473444352f72f746ccb8b8c',          'info_dict': {              'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',              'ext': 'mp4', @@ -25,84 +30,45 @@ class NewstubeIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url) -        page = self._download_webpage(url, video_id, 'Downloading page') +        page = self._download_webpage(url, video_id) +        title = 
self._html_search_meta(['og:title', 'twitter:title'], page, fatal=True)          video_guid = self._html_search_regex( -            r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', +            r'<meta\s+property="og:video(?::(?:(?:secure_)?url|iframe))?"\s+content="https?://(?:www\.)?newstube\.ru/embed/(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',              page, 'video GUID') -        player = self._download_xml( -            'http://p.newstube.ru/v2/player.asmx/GetAutoPlayInfo6?state=&url=%s&sessionId=&id=%s&placement=profile&location=n2' % (url, video_guid), -            video_guid, 'Downloading player XML') - -        def ns(s): -            return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'} - -        error_message = player.find(ns('./ErrorMessage')) -        if error_message is not None: -            raise ExtractorError('%s returned error: %s' % (self.IE_NAME, error_message.text), expected=True) - -        session_id = player.find(ns('./SessionId')).text -        media_info = player.find(ns('./Medias/MediaInfo')) -        title = media_info.find(ns('./Name')).text -        description = self._og_search_description(page) -        thumbnail = media_info.find(ns('./KeyFrame')).text -        duration = int(media_info.find(ns('./Duration')).text) / 1000.0 +        enc_data = base64.b64decode(self._download_webpage( +            'https://www.newstube.ru/embed/api/player/getsources2', +            video_guid, query={ +                'guid': video_guid, +                'ff': 3, +            })) +        key = hashlib.pbkdf2_hmac( +            'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16] +        dec_data = aes_cbc_decrypt( +            bytes_to_intlist(enc_data[32:]), bytes_to_intlist(key), +            bytes_to_intlist(enc_data[16:32])) +        sources = 
self._parse_json(intlist_to_bytes(dec_data[:-dec_data[-1]]), video_guid)          formats = [] - -        for stream_info in media_info.findall(ns('./Streams/StreamInfo')): -            media_location = stream_info.find(ns('./MediaLocation')) -            if media_location is None: +        for source in sources: +            source_url = source.get('Src') +            if not source_url:                  continue - -            server = media_location.find(ns('./Server')).text -            app = media_location.find(ns('./App')).text -            media_id = stream_info.find(ns('./Id')).text -            name = stream_info.find(ns('./Name')).text -            width = int(stream_info.find(ns('./Width')).text) -            height = int(stream_info.find(ns('./Height')).text) - -            formats.append({ -                'url': 'rtmp://%s/%s' % (server, app), -                'app': app, -                'play_path': '01/%s' % video_guid.upper(), -                'rtmp_conn': ['S:%s' % session_id, 'S:%s' % media_id, 'S:n2'], -                'page_url': url, -                'ext': 'flv', -                'format_id': 'rtmp' + ('-%s' % name if name else ''), -                'width': width, +            height = int_or_none(source.get('Height')) +            f = { +                'format_id': 'http' + ('-%dp' % height if height else ''), +                'url': source_url, +                'width': int_or_none(source.get('Width')),                  'height': height, -            }) - -        sources_data = self._download_json( -            'http://www.newstube.ru/player2/getsources?guid=%s' % video_guid, -            video_guid, fatal=False) -        if sources_data: -            for source in sources_data.get('Sources', []): -                source_url = source.get('Src') -                if not source_url: -                    continue -                height = int_or_none(source.get('Height')) -                f = { -                    'format_id': 'http' + 
('-%dp' % height if height else ''), -                    'url': source_url, -                    'width': int_or_none(source.get('Width')), -                    'height': height, -                } -                source_type = source.get('Type') -                if source_type: -                    mobj = re.search(r'codecs="([^,]+),\s*([^"]+)"', source_type) -                    if mobj: -                        vcodec, acodec = mobj.groups() -                        f.update({ -                            'vcodec': vcodec, -                            'acodec': acodec, -                        }) -                formats.append(f) +            } +            source_type = source.get('Type') +            if source_type: +                f.update(parse_codecs(self._search_regex( +                    r'codecs="([^"]+)"', source_type, 'codecs', fatal=False))) +            formats.append(f)          self._check_formats(formats, video_guid)          self._sort_formats(formats) @@ -110,8 +76,8 @@ class NewstubeIE(InfoExtractor):          return {              'id': video_guid,              'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, +            'description': self._html_search_meta(['description', 'og:description'], page), +            'thumbnail': self._html_search_meta(['og:image:secure_url', 'og:image', 'twitter:image'], page), +            'duration': parse_duration(self._html_search_meta('duration', page)),              'formats': formats,          } diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py new file mode 100644 index 000000000..557b2b5ad --- /dev/null +++ b/youtube_dl/extractor/platzi.py @@ -0,0 +1,217 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( +    compat_b64decode, +    compat_str, +) +from ..utils import ( +    clean_html, +    ExtractorError, +    int_or_none, 
+    str_or_none, +    try_get, +    url_or_none, +    urlencode_postdata, +    urljoin, +) + + +class PlatziIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            platzi\.com/clases|           # es version +                            courses\.platzi\.com/classes  # en version +                        )/[^/]+/(?P<id>\d+)-[^/?\#&]+ +                    ''' +    _LOGIN_URL = 'https://platzi.com/login/' +    _NETRC_MACHINE = 'platzi' + +    _TESTS = [{ +        'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', +        'md5': '8f56448241005b561c10f11a595b37e3', +        'info_dict': { +            'id': '12074', +            'ext': 'mp4', +            'title': 'Creando nuestra primera página', +            'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', +            'duration': 420, +        }, +        'skip': 'Requires platzi account credentials', +    }, { +        'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', +        'info_dict': { +            'id': '13430', +            'ext': 'mp4', +            'title': 'Background', +            'description': 'md5:49c83c09404b15e6e71defaf87f6b305', +            'duration': 360, +        }, +        'skip': 'Requires platzi account credentials', +        'params': { +            'skip_download': True, +        }, +    }] + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        username, password = self._get_login_info() +        if username is None: +            return + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, 'Downloading login page') + +        login_form = self._hidden_inputs(login_page) + +        login_form.update({ +            'email': username, +            'password': password, +        }) + +        urlh = self._request_webpage( +            self._LOGIN_URL, None, 
'Logging in', +            data=urlencode_postdata(login_form), +            headers={'Referer': self._LOGIN_URL}) + +        # login succeeded +        if 'platzi.com/login' not in compat_str(urlh.geturl()): +            return + +        login_error = self._webpage_read_content( +            urlh, self._LOGIN_URL, None, 'Downloading login error page') + +        login = self._parse_json( +            self._search_regex( +                r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), +            None) + +        for kind in ('error', 'password', 'nonFields'): +            error = str_or_none(login.get('%sError' % kind)) +            if error: +                raise ExtractorError( +                    'Unable to login: %s' % error, expected=True) +        raise ExtractorError('Unable to log in') + +    def _real_extract(self, url): +        lecture_id = self._match_id(url) + +        webpage = self._download_webpage(url, lecture_id) + +        data = self._parse_json( +            self._search_regex( +                r'client_data\s*=\s*({.+?})\s*;', webpage, 'client data'), +            lecture_id) + +        material = data['initialState']['material'] +        desc = material['description'] +        title = desc['title'] + +        formats = [] +        for server_id, server in material['videos'].items(): +            if not isinstance(server, dict): +                continue +            for format_id in ('hls', 'dash'): +                format_url = url_or_none(server.get(format_id)) +                if not format_url: +                    continue +                if format_id == 'hls': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, lecture_id, 'mp4', +                        entry_protocol='m3u8_native', m3u8_id=format_id, +                        note='Downloading %s m3u8 information' % server_id, +                        fatal=False)) +                elif format_id == 'dash': +     
               formats.extend(self._extract_mpd_formats( +                        format_url, lecture_id, mpd_id=format_id, +                        note='Downloading %s MPD manifest' % server_id, +                        fatal=False)) +        self._sort_formats(formats) + +        content = str_or_none(desc.get('content')) +        description = (clean_html(compat_b64decode(content).decode('utf-8')) +                       if content else None) +        duration = int_or_none(material.get('duration'), invscale=60) + +        return { +            'id': lecture_id, +            'title': title, +            'description': description, +            'duration': duration, +            'formats': formats, +        } + + +class PlatziCourseIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            platzi\.com/clases|           # es version +                            courses\.platzi\.com/classes  # en version +                        )/(?P<id>[^/?\#&]+) +                    ''' +    _TESTS = [{ +        'url': 'https://platzi.com/clases/next-js/', +        'info_dict': { +            'id': '1311', +            'title': 'Curso de Next.js', +        }, +        'playlist_count': 22, +    }, { +        'url': 'https://courses.platzi.com/classes/communication-codestream/', +        'info_dict': { +            'id': '1367', +            'title': 'Codestream Course', +        }, +        'playlist_count': 14, +    }] + +    @classmethod +    def suitable(cls, url): +        return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + +    def _real_extract(self, url): +        course_name = self._match_id(url) + +        webpage = self._download_webpage(url, course_name) + +        props = self._parse_json( +            self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), +            course_name)['initialProps'] + +        entries = [] +        for chapter_num, 
chapter in enumerate(props['concepts'], 1): +            if not isinstance(chapter, dict): +                continue +            materials = chapter.get('materials') +            if not materials or not isinstance(materials, list): +                continue +            chapter_title = chapter.get('title') +            chapter_id = str_or_none(chapter.get('id')) +            for material in materials: +                if not isinstance(material, dict): +                    continue +                if material.get('material_type') != 'video': +                    continue +                video_url = urljoin(url, material.get('url')) +                if not video_url: +                    continue +                entries.append({ +                    '_type': 'url_transparent', +                    'url': video_url, +                    'title': str_or_none(material.get('name')), +                    'id': str_or_none(material.get('id')), +                    'ie_key': PlatziIE.ie_key(), +                    'chapter': chapter_title, +                    'chapter_number': chapter_num, +                    'chapter_id': chapter_id, +                }) + +        course_id = compat_str(try_get(props, lambda x: x['course']['id'])) +        course_title = try_get(props, lambda x: x['course']['name'], compat_str) + +        return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 18a327d81..70f000ca8 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -21,7 +21,7 @@ from ..utils import (  class RTL2IE(InfoExtractor):      IE_NAME = 'rtl2' -    _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' +    _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)'      _TESTS = [{          'url': 
'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',          'info_dict': { @@ -34,10 +34,11 @@ class RTL2IE(InfoExtractor):              # rtmp download              'skip_download': True,          }, +        'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],      }, {          'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/',          'info_dict': { -            'id': '21040-anna-erwischt-alex', +            'id': 'anna-erwischt-alex',              'ext': 'mp4',              'title': 'Anna erwischt Alex!',              'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' @@ -46,31 +47,29 @@ class RTL2IE(InfoExtractor):              # rtmp download              'skip_download': True,          }, +        'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],      }]      def _real_extract(self, url): -        # Some rtl2 urls have no slash at the end, so append it. 
-        if not url.endswith('/'): -            url += '/' - -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        mobj = re.search( -            r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', -            webpage) -        if mobj: -            vico_id = mobj.group('vico_id') -            vivi_id = mobj.group('vivi_id') -        else: -            vico_id = self._html_search_regex( -                r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') -            vivi_id = self._html_search_regex( -                r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') +        vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups() +        if not vico_id: +            webpage = self._download_webpage(url, display_id) + +            mobj = re.search( +                r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', +                webpage) +            if mobj: +                vico_id = mobj.group('vico_id') +                vivi_id = mobj.group('vivi_id') +            else: +                vico_id = self._html_search_regex( +                    r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') +                vivi_id = self._html_search_regex( +                    r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')          info = self._download_json( -            'http://www.rtl2.de/sites/default/modules/rtl2/mediathek/php/get_video_jw.php', -            video_id, query={ +            'https://service.rtl2.de/api-player-vipo/video.php', +            display_id, query={                  'vico_id': vico_id,                  'vivi_id': vivi_id,              }) @@ -89,7 +88,7 @@ class RTL2IE(InfoExtractor):                  'format_id': 'rtmp',                  'url': rtmp_url,                  'play_path': stream_url, -                'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', +                'player_url': 
'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf',                  'page_url': url,                  'flash_version': 'LNX 11,2,202,429',                  'rtmp_conn': rtmp_conn, @@ -99,12 +98,12 @@ class RTL2IE(InfoExtractor):          m3u8_url = video_info.get('streamurl_hls')          if m3u8_url: -            formats.extend(self._extract_akamai_formats(m3u8_url, video_id)) +            formats.extend(self._extract_akamai_formats(m3u8_url, display_id))          self._sort_formats(formats)          return { -            'id': video_id, +            'id': display_id,              'title': title,              'thumbnail': video_info.get('image'),              'description': video_info.get('beschreibung'), diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index f530f0083..f05401b36 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -59,6 +59,20 @@ class RuutuIE(InfoExtractor):              'url': 'http://www.ruutu.fi/video/3193728',              'only_matching': True,          }, +        { +            # audio podcast +            'url': 'https://www.supla.fi/supla/3382410', +            'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', +            'info_dict': { +                'id': '3382410', +                'ext': 'mp3', +                'title': 'Mikä ihmeen poltergeist?', +                'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', +                'thumbnail': r're:^https?://.*\.jpg$', +                'age_limit': 0, +            }, +            'expected_warnings': ['HTTP Error 502: Bad Gateway'], +        }      ]      def _real_extract(self, url): @@ -94,6 +108,12 @@ class RuutuIE(InfoExtractor):                          continue                          formats.extend(self._extract_mpd_formats(                              video_url, video_id, mpd_id='dash', fatal=False)) +                    elif ext == 'mp3' or child.tag == 'AudioMediaFile': +     
                   formats.append({ +                            'format_id': 'audio', +                            'url': video_url, +                            'vcodec': 'none', +                        })                      else:                          proto = compat_urllib_parse_urlparse(video_url).scheme                          if not child.tag.startswith('HTTP') and proto != 'rtmp': diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index efb259f96..f1e17dd88 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -14,7 +14,7 @@ from ..utils import (  class StreamangoIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P<id>[^/?#&]+)' +    _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P<id>[^/?#&]+)'      _TESTS = [{          'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',          'md5': 'e992787515a182f55e38fc97588d802a', @@ -41,6 +41,9 @@ class StreamangoIE(InfoExtractor):      }, {          'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4',          'only_matching': True, +    }, { +        'url': 'https://streamcherry.com/f/clapasobsptpkdfe/', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py new file mode 100644 index 000000000..ccb074cd4 --- /dev/null +++ b/youtube_dl/extractor/stv.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse_urlparse +) +from ..utils import ( +    extract_attributes, +    float_or_none, +    int_or_none, +    str_or_none, +) + + +class STVPlayerIE(InfoExtractor): +    IE_NAME = 'stv:player' +    _VALID_URL = 
r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' +    _TEST = { +        'url': 'https://player.stv.tv/video/7srz/victoria/interview-with-the-cast-ahead-of-new-victoria/', +        'md5': '2ad867d4afd641fa14187596e0fbc91b', +        'info_dict': { +            'id': '6016487034001', +            'ext': 'mp4', +            'upload_date': '20190321', +            'title': 'Interview with the cast ahead of new Victoria', +            'description': 'Nell Hudson and Lily Travers tell us what to expect in the new season of Victoria.', +            'timestamp': 1553179628, +            'uploader_id': '1486976045', +        }, +        'skip': 'this resource is unavailable outside of the UK', +    } +    _PUBLISHER_ID = '1486976045' +    _PTYPE_MAP = { +        'episode': 'episodes', +        'video': 'shortform', +    } + +    def _real_extract(self, url): +        ptype, video_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, video_id) + +        qs = compat_parse_qs(compat_urllib_parse_urlparse(self._search_regex( +            r'itemprop="embedURL"[^>]+href="([^"]+)', +            webpage, 'embed URL', default=None)).query) +        publisher_id = qs.get('publisherID', [None])[0] or self._PUBLISHER_ID + +        player_attr = extract_attributes(self._search_regex( +            r'(<[^>]+class="bcplayer"[^>]+>)', webpage, 'player', default=None)) or {} + +        info = {} +        duration = ref_id = series = video_id = None +        api_ref_id = player_attr.get('data-player-api-refid') +        if api_ref_id: +            resp = self._download_json( +                'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], api_ref_id), +                api_ref_id, fatal=False) +            if resp: +                result = resp.get('results') or {} +                video = result.get('video') or {} +                video_id = str_or_none(video.get('id')) +                ref_id = video.get('guid') 
+                duration = video.get('length') +                programme = result.get('programme') or {} +                series = programme.get('name') or programme.get('shortName') +                subtitles = {} +                _subtitles = result.get('_subtitles') or {} +                for ext, sub_url in _subtitles.items(): +                    subtitles.setdefault('en', []).append({ +                        'ext': 'vtt' if ext == 'webvtt' else ext, +                        'url': sub_url, +                    }) +                info.update({ +                    'description': result.get('summary'), +                    'subtitles': subtitles, +                    'view_count': int_or_none(result.get('views')), +                }) +        if not video_id: +            video_id = qs.get('videoId', [None])[0] or self._search_regex( +                r'<link\s+itemprop="url"\s+href="(\d+)"', +                webpage, 'video id', default=None) or 'ref:' + (ref_id or player_attr['data-refid']) + +        info.update({ +            '_type': 'url_transparent', +            'duration': float_or_none(duration or player_attr.get('data-duration'), 1000), +            'id': video_id, +            'ie_key': 'BrightcoveNew', +            'series': series or player_attr.get('data-programme-name'), +            'url': 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id), +        }) +        return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 73469cc5d..7640cf00a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..utils import (  class TeamcocoIE(TurnerBaseIE): -    _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' +    _VALID_URL = r'https?://(?:\w+\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'      _TESTS = [          {              'url': 'http://teamcoco.com/video/mary-kay-remote', @@ -79,15 +79,20 @@ 
class TeamcocoIE(TurnerBaseIE):          }, {              'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv',              'only_matching': True, +        }, { +            'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft', +            'only_matching': True,          }      ]      def _graphql_call(self, query_template, object_type, object_id):          find_object = 'find' + object_type          return self._download_json( -            'http://teamcoco.com/graphql/', object_id, data=json.dumps({ +            'https://teamcoco.com/graphql', object_id, data=json.dumps({                  'query': query_template % (find_object, object_id) -            }))['data'][find_object] +            }).encode(), headers={ +                'Content-Type': 'application/json', +            })['data'][find_object]      def _real_extract(self, url):          display_id = self._match_id(url) @@ -145,7 +150,12 @@ class TeamcocoIE(TurnerBaseIE):                  'accessTokenType': 'jws',              }))          else: -            video_sources = self._graphql_call('''{ +            d = self._download_json( +                'https://teamcoco.com/_truman/d/' + video_id, +                video_id, fatal=False) or {} +            video_sources = d.get('meta') or {} +            if not video_sources: +                video_sources = self._graphql_call('''{    %s(id: "%s") {      src    } diff --git a/youtube_dl/extractor/teamtreehouse.py b/youtube_dl/extractor/teamtreehouse.py new file mode 100644 index 000000000..d347e97ef --- /dev/null +++ b/youtube_dl/extractor/teamtreehouse.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    determine_ext, +    ExtractorError, +    float_or_none, +    get_element_by_class, +    get_element_by_id, +    parse_duration, +    remove_end, +    urlencode_postdata, +    urljoin, 
+) + + +class TeamTreeHouseIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)' +    _TESTS = [{ +        # Course +        'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php', +        'info_dict': { +            'id': 'introduction-to-user-authentication-in-php', +            'title': 'Introduction to User Authentication in PHP', +            'description': 'md5:405d7b4287a159b27ddf30ca72b5b053', +        }, +        'playlist_mincount': 24, +    }, { +        # WorkShop +        'url': 'https://teamtreehouse.com/library/deploying-a-react-app', +        'info_dict': { +            'id': 'deploying-a-react-app', +            'title': 'Deploying a React App', +            'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921', +        }, +        'playlist_mincount': 4, +    }, { +        # Video +        'url': 'https://teamtreehouse.com/library/application-overview-2', +        'info_dict': { +            'id': 'application-overview-2', +            'ext': 'mp4', +            'title': 'Application Overview', +            'description': 'md5:4b0a234385c27140a4378de5f1e15127', +        }, +        'expected_warnings': ['This is just a preview'], +    }] +    _NETRC_MACHINE = 'teamtreehouse' + +    def _real_initialize(self): +        email, password = self._get_login_info() +        if email is None: +            return + +        signin_page = self._download_webpage( +            'https://teamtreehouse.com/signin', +            None, 'Downloading signin page') +        data = self._form_hidden_inputs('new_user_session', signin_page) +        data.update({ +            'user_session[email]': email, +            'user_session[password]': password, +        }) +        error_message = get_element_by_class('error-message', self._download_webpage( +            'https://teamtreehouse.com/person_session', +            None, 'Logging in', data=urlencode_postdata(data))) +        if 
error_message: +            raise ExtractorError(clean_html(error_message), expected=True) + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        title = self._html_search_meta(['og:title', 'twitter:title'], webpage) +        description = self._html_search_meta( +            ['description', 'og:description', 'twitter:description'], webpage) +        entries = self._parse_html5_media_entries(url, webpage, display_id) +        if entries: +            info = entries[0] + +            for subtitles in info.get('subtitles', {}).values(): +                for subtitle in subtitles: +                    subtitle['ext'] = determine_ext(subtitle['url'], 'srt') + +            is_preview = 'data-preview="true"' in webpage +            if is_preview: +                self.report_warning( +                    'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id) +                duration = 30 +            else: +                duration = float_or_none(self._search_regex( +                    r'data-duration="(\d+)"', webpage, 'duration'), 1000) +                if not duration: +                    duration = parse_duration(get_element_by_id( +                        'video-duration', webpage)) + +            info.update({ +                'id': display_id, +                'title': title, +                'description': description, +                'duration': duration, +            }) +            return info +        else: +            def extract_urls(html, extract_info=None): +                for path in re.findall(r'<a[^>]+href="([^"]+)"', html): +                    page_url = urljoin(url, path) +                    entry = { +                        '_type': 'url_transparent', +                        'id': self._match_id(page_url), +                        'url': page_url, +                        'id_key': 
self.ie_key(), +                    } +                    if extract_info: +                        entry.update(extract_info) +                    entries.append(entry) + +            workshop_videos = self._search_regex( +                r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>', +                webpage, 'workshop videos', default=None) +            if workshop_videos: +                extract_urls(workshop_videos) +            else: +                stages_path = self._search_regex( +                    r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"', +                    webpage, 'stages path') +                if stages_path: +                    stages_page = self._download_webpage( +                        urljoin(url, stages_path), display_id, 'Downloading stages page') +                    for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1): +                        extract_urls(steps_list, { +                            'chapter': chapter, +                            'chapter_number': chapter_number, +                        }) +                    title = remove_end(title, ' Course') + +            return self.playlist_result( +                entries, display_id, title, description) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 083e9f36d..66088b9ab 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -65,8 +65,15 @@ class TikTokBaseIE(InfoExtractor):  class TikTokIE(TikTokBaseIE): -    _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)' -    _TEST = { +    _VALID_URL = r'''(?x) +                        https?:// +                            (?: +                                (?:m\.)?tiktok\.com/v| +                                (?:www\.)?tiktok\.com/share/video +                            ) +                            /(?P<id>\d+) +                    ''' +    
_TESTS = [{          'url': 'https://m.tiktok.com/v/6606727368545406213.html',          'md5': 'd584b572e92fcd48888051f238022420',          'info_dict': { @@ -81,25 +88,39 @@ class TikTokIE(TikTokBaseIE):              'comment_count': int,              'repost_count': int,          } -    } +    }, { +        'url': 'https://www.tiktok.com/share/video/6606727368545406213', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_webpage( +            'https://m.tiktok.com/v/%s.html' % video_id, video_id)          data = self._parse_json(self._search_regex(              r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)          return self._extract_aweme(data)  class TikTokUserIE(TikTokBaseIE): -    _VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)' -    _TEST = { +    _VALID_URL = r'''(?x) +                        https?:// +                            (?: +                                (?:m\.)?tiktok\.com/h5/share/usr| +                                (?:www\.)?tiktok\.com/share/user +                            ) +                            /(?P<id>\d+) +                    ''' +    _TESTS = [{          'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',          'info_dict': {              'id': '188294915489964032',          },          'playlist_mincount': 24, -    } +    }, { +        'url': 'https://www.tiktok.com/share/user/188294915489964032', +        'only_matching': True, +    }]      def _real_extract(self, url):          user_id = self._match_id(url) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d1fe95654..1072550f1 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -6,10 +6,7 @@ import re  import sys  from .common import InfoExtractor -from ..compat import ( -    compat_str, -    compat_urlparse, -) +from 
..compat import compat_urlparse  from ..utils import (      clean_html,      ExtractorError, @@ -103,7 +100,7 @@ class VKIE(VKBaseIE):              'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',              'md5': '7babad3b85ea2e91948005b1b8b0cb84',              'info_dict': { -                'id': '162222515', +                'id': '-77521_162222515',                  'ext': 'mp4',                  'title': 'ProtivoGunz - Хуёвая песня',                  'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', @@ -117,7 +114,7 @@ class VKIE(VKBaseIE):              'url': 'http://vk.com/video205387401_165548505',              'md5': '6c0aeb2e90396ba97035b9cbde548700',              'info_dict': { -                'id': '165548505', +                'id': '205387401_165548505',                  'ext': 'mp4',                  'title': 'No name',                  'uploader': 'Tom Cruise', @@ -132,7 +129,7 @@ class VKIE(VKBaseIE):              'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',              'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',              'info_dict': { -                'id': '162925554', +                'id': '32194266_162925554',                  'ext': 'mp4',                  'uploader': 'Vladimir Gavrin',                  'title': 'Lin Dan', @@ -149,7 +146,7 @@ class VKIE(VKBaseIE):              'md5': 'a590bcaf3d543576c9bd162812387666',              'note': 'Only available for registered users',              'info_dict': { -                'id': '164049491', +                'id': '-8871596_164049491',                  'ext': 'mp4',                  'uploader': 'Триллеры',                  'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', @@ -163,7 +160,7 @@ class VKIE(VKBaseIE):              'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',              'md5': '4d7a5ef8cf114dfa09577e57b2993202',              'info_dict': { -           
     'id': '168067957', +                'id': '-43215063_168067957',                  'ext': 'mp4',                  'uploader': 'Киномания - лучшее из мира кино',                  'title': ' ', @@ -177,7 +174,7 @@ class VKIE(VKBaseIE):              'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',              'note': 'ivi.ru embed',              'info_dict': { -                'id': '60690', +                'id': '-43215063_169084319',                  'ext': 'mp4',                  'title': 'Книга Илая',                  'duration': 6771, @@ -191,7 +188,7 @@ class VKIE(VKBaseIE):              'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',              'md5': '091287af5402239a1051c37ec7b92913',              'info_dict': { -                'id': '171201961', +                'id': '30481095_171201961',                  'ext': 'mp4',                  'title': 'ТюменцевВВ_09.07.2015',                  'uploader': 'Anton Ivanov', @@ -206,10 +203,10 @@ class VKIE(VKBaseIE):              'url': 'https://vk.com/video276849682_170681728',              'info_dict': {                  'id': 'V3K4mi0SYkc', -                'ext': 'webm', +                'ext': 'mp4',                  'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate",                  'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', -                'duration': 179, +                'duration': 178,                  'upload_date': '20130116',                  'uploader': "Children's Joy Foundation Inc.",                  'uploader_id': 'thecjf', @@ -239,7 +236,7 @@ class VKIE(VKBaseIE):              'url': 'http://vk.com/video-110305615_171782105',              'md5': 'e13fcda136f99764872e739d13fac1d1',              'info_dict': { -                'id': '171782105', +                'id': '-110305615_171782105',                  'ext': 'mp4',                  'title': 'S-Dance, репетиции к The way show',                  'uploader': 'THE WAY SHOW | 17 апреля', @@ -254,14 +251,17 @@ class VKIE(VKBaseIE):          {              # finished live stream, postlive_mp4              'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', -            'md5': '90d22d051fccbbe9becfccc615be6791',              'info_dict': { -                'id': '456242764', +                'id': '-387766_456242764',                  'ext': 'mp4', -                'title': 'ИгроМир 2016 — день 1', +                'title': 'ИгроМир 2016 День 1 — Игромания Утром',                  'uploader': 'Игромания',                  'duration': 5239, -                'view_count': int, +                # TODO: use act=show to extract view_count +                # 'view_count': int, +                'upload_date': '20160929', +                'uploader_id': '-387766', +                'timestamp': 1475137527,              },          },          { @@ -465,7 +465,7 @@ class VKIE(VKBaseIE):          self._sort_formats(formats)          return { -            'id': compat_str(data.get('vid') or video_id), +            'id': video_id,              'formats': formats,              'title': title,              'thumbnail': data.get('jpg'), diff --git a/youtube_dl/extractor/vrv.py 
b/youtube_dl/extractor/vrv.py index 6c060ae76..c11da97de 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -150,9 +150,10 @@ class VRVIE(VRVBaseIE):      def _real_extract(self, url):          video_id = self._match_id(url) -        episode_path = self._get_cms_resource( -            'cms:/episodes/' + video_id, video_id) -        video_data = self._call_cms(episode_path, video_id, 'video') +        object_data = self._call_cms(self._get_cms_resource( +            'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0] +        resource_path = object_data['__links__']['resource']['href'] +        video_data = self._call_cms(resource_path, video_id, 'video')          title = video_data['title']          streams_path = video_data['__links__'].get('streams', {}).get('href') diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 3cb4d71a6..621df5b54 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -19,7 +19,7 @@ from ..utils import (  class WeiboIE(InfoExtractor): -    _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'      _TEST = {          'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',          'info_dict': { diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 68a48034e..d268372e6 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -20,7 +20,7 @@ from ..utils import (  class XHamsterIE(InfoExtractor):      _VALID_URL = r'''(?x)                      https?:// -                        (?:.+?\.)?xhamster\.com/ +                        (?:.+?\.)?xhamster\.(?:com|one)/                          (?:                              movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|                              videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+) @@ -91,6 +91,9 @@ class XHamsterIE(InfoExtractor):  
        # new URL schema          'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',          'only_matching': True, +    }, { +        'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index ec2d913fc..166bcf443 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -57,10 +57,17 @@ class XVideosIE(InfoExtractor):              webpage, 'title', default=None,              group='title') or self._og_search_title(webpage) -        thumbnail = self._search_regex( -            (r'setThumbUrl\(\s*(["\'])(?P<thumbnail>(?:(?!\1).)+)\1', -             r'url_bigthumb=(?P<thumbnail>.+?)&'), -            webpage, 'thumbnail', fatal=False, group='thumbnail') +        thumbnails = [] +        for preference, thumbnail in enumerate(('', '169')): +            thumbnail_url = self._search_regex( +                r'setThumbUrl%s\(\s*(["\'])(?P<thumbnail>(?:(?!\1).)+)\1' % thumbnail, +                webpage, 'thumbnail', default=None, group='thumbnail') +            if thumbnail_url: +                thumbnails.append({ +                    'url': thumbnail_url, +                    'preference': preference, +                }) +          duration = int_or_none(self._og_search_property(              'duration', webpage, default=None)) or parse_duration(              self._search_regex( @@ -98,6 +105,6 @@ class XVideosIE(InfoExtractor):              'formats': formats,              'title': title,              'duration': duration, -            'thumbnail': thumbnail, +            'thumbnails': thumbnails,              'age_limit': 18,          } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 552013a74..86ba7d3c9 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -477,3 +477,77 @@ class 
YahooSearchIE(SearchInfoExtractor):              'id': query,              'entries': entries,          } + + +class YahooGyaOPlayerIE(InfoExtractor): +    IE_NAME = 'yahoo:gyao:player' +    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' +    _TESTS = [{ +        'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', +        'info_dict': { +            'id': '5993125228001', +            'ext': 'mp4', +            'title': 'フューリー 【字幕版】', +            'description': 'md5:21e691c798a15330eda4db17a8fe45a5', +            'uploader_id': '4235717419001', +            'upload_date': '20190124', +            'timestamp': 1548294365, +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', +        'only_matching': True, +    }, { +        'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url).replace('/', ':') +        video = self._download_json( +            'https://gyao.yahoo.co.jp/dam/v1/videos/' + video_id, +            video_id, query={ +                'fields': 'longDescription,title,videoId', +            }) +        return { +            '_type': 'url_transparent', +            'id': video_id, +            'title': video['title'], +            'url': smuggle_url( +                'http://players.brightcove.net/4235717419001/default_default/index.html?videoId=' + video['videoId'], +                {'geo_countries': ['JP']}), +            'description': video.get('longDescription'), +    
        'ie_key': BrightcoveNewIE.ie_key(), +        } + + +class YahooGyaOIE(InfoExtractor): +    IE_NAME = 'yahoo:gyao' +    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/p|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+)' +    _TESTS = [{ +        'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', +        'info_dict': { +            'id': '00449:v03102', +        }, +        'playlist_count': 2, +    }, { +        'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        program_id = self._match_id(url).replace('/', ':') +        videos = self._download_json( +            'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos'] +        entries = [] +        for video in videos: +            video_id = video.get('id') +            if not video_id: +                continue +            entries.append(self.url_result( +                'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), +                YahooGyaOPlayerIE.ie_key(), video_id)) +        return self.playlist_result(entries, program_id) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 2c63f9752..b1d1eb6b6 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -8,8 +8,8 @@ from ..utils import (  class YourPornIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?yourporn\.sexy/post/(?P<id>[^/?#&.]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:yourporn\.sexy|sxyprn\.com)/post/(?P<id>[^/?#&.]+)' +    _TESTS = [{          'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html',          'md5': '6f8682b6464033d87acaa7a8ff0c092e',          'info_dict': { @@ -23,7 +23,10 @@ class YourPornIE(InfoExtractor):          'params': {              'skip_download': True,          }, -    } +    }, { +        'url': 'https://sxyprn.com/post/57ffcb2e1179b.html', +        'only_matching': True, +    }]     
 def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 886fc1591..132572c88 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -484,7 +484,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # RTMP (unnamed)          '_rtmp': {'protocol': 'rtmp'},      } -    _SUBTITLE_FORMATS = ('ttml', 'vtt') +    _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')      _GEO_BYPASS = False diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5e86bc4d5..5c7d550f5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2019.03.18' +__version__ = '2019.04.07' | 
