-rwxr-xr-x  youtube_dl/YoutubeDL.py              |   1
-rw-r--r--  youtube_dl/extractor/brightcove.py   |  12
-rw-r--r--  youtube_dl/extractor/cinemassacre.py | 119
-rw-r--r--  youtube_dl/extractor/extractors.py   |   7
-rw-r--r--  youtube_dl/extractor/imdb.py         |  37
-rw-r--r--  youtube_dl/extractor/mgtv.py         |  43
-rw-r--r--  youtube_dl/extractor/nrk.py          | 430
-rw-r--r--  youtube_dl/extractor/openload.py     |   2
-rw-r--r--  youtube_dl/extractor/sina.py         | 124
-rw-r--r--  youtube_dl/extractor/twitch.py       |  27
-rw-r--r--  youtube_dl/extractor/ustudio.py      |  66
-rw-r--r--  youtube_dl/utils.py                  |   4
12 files changed, 403 insertions, 469 deletions
| diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 34eeb77c5..03a6a1890 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -720,6 +720,7 @@ class YoutubeDL(object):          result_type = ie_result.get('_type', 'video')          if result_type in ('url', 'url_transparent'): +            ie_result['url'] = sanitize_url(ie_result['url'])              extract_flat = self.params.get('extract_flat', False)              if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or                      extract_flat is True): diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f0781fc27..fc7fc5b16 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,9 +307,10 @@ class BrightcoveLegacyIE(InfoExtractor):                                      playlist_title=playlist_info['mediaCollectionDTO']['displayName'])      def _extract_video_info(self, video_info): +        video_id = compat_str(video_info['id'])          publisher_id = video_info.get('publisherId')          info = { -            'id': compat_str(video_info['id']), +            'id': video_id,              'title': video_info['displayName'].strip(),              'description': video_info.get('shortDescription'),              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -331,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor):                      url_comp = compat_urllib_parse_urlparse(url)                      if url_comp.path.endswith('.m3u8'):                          formats.extend( -                            self._extract_m3u8_formats(url, info['id'], 'mp4')) +                            self._extract_m3u8_formats( +                                url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))                          continue                      elif 'akamaihd.net' in url_comp.netloc:                          # This type of renditions are served through @@ -365,7 +367,7 @@ class BrightcoveLegacyIE(InfoExtractor):                      a_format.update({                          'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''),                          'ext': 'mp4', -                        'protocol': 'm3u8', +                        'protocol': 'm3u8_native',                      })                  formats.append(a_format) @@ -395,7 +397,7 @@ class BrightcoveLegacyIE(InfoExtractor):                      return ad_info          if 'url' not in info and not info.get('formats'): -            raise ExtractorError('Unable to extract video url for %s' % info['id']) +            raise ExtractorError('Unable to extract video url for %s' % video_id)          return info @@ -527,7 +529,7 @@ class BrightcoveNewIE(InfoExtractor):                  if not src:                      continue                  formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', m3u8_id='hls', fatal=False)) +                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))              elif source_type == 'application/dash+xml':                  if not src:                      continue diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py deleted file mode 100644 index 042c4f2f1..000000000 --- a/youtube_dl/extractor/cinemassacre.py +++ /dev/null @@ -1,119 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import 
ExtractorError -from .screenwavemedia import ScreenwaveMediaIE - - -class CinemassacreIE(InfoExtractor): -    _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' -    _TESTS = [ -        { -            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', -            'md5': 'fde81fbafaee331785f58cd6c0d46190', -            'info_dict': { -                'id': 'Cinemassacre-19911', -                'ext': 'mp4', -                'upload_date': '20121110', -                'title': '“Angry Video Game Nerd: The Movie” – Trailer', -                'description': 'md5:fb87405fcb42a331742a0dce2708560b', -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', -            'md5': 'd72f10cd39eac4215048f62ab477a511', -            'info_dict': { -                'id': 'Cinemassacre-521be8ef82b16', -                'ext': 'mp4', -                'upload_date': '20131002', -                'title': 'The Mummy’s Hand (1940)', -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            # Youtube embedded video -            'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', -            'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', -            'info_dict': { -                'id': 'OEVzPCY2T-g', -                'ext': 'webm', -                'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', -                'upload_date': '20061207', -                'uploader': 'Cinemassacre', -                'uploader_id': 'JamesNintendoNerd', -                'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', -            } -        }, -        { -            # Youtube embedded video -            'url': 'http://cinemassacre.com/2006/09/01/mckids/', -            'md5': '7393c4e0f54602ad110c793eb7a6513a', -            'info_dict': { -                'id': 'FnxsNhuikpo', -                'ext': 'webm', -                'upload_date': '20060901', -                'uploader': 'Cinemassacre Extra', -                'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', -                'uploader_id': 'Cinemassacre', -                'title': 'AVGN: McKids', -            } -        }, -        { -            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', -            'md5': '1376908e49572389e7b06251a53cdd08', -            'info_dict': { -                'id': 'Cinemassacre-555779690c440', -                'ext': 'mp4', -                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', -                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', -                'upload_date': '20150525', -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        } -    ] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        display_id = mobj.group('display_id') -        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') - -        webpage = self._download_webpage(url, display_id) - -        playerdata_url = self._search_regex( -            [ -                ScreenwaveMediaIE.EMBED_PATTERN, -                r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', -            ], -            webpage, 'player data URL', default=None, group='url') -        if not playerdata_url: -            raise ExtractorError('Unable to find player data') - -        video_title = self._html_search_regex( -            r'<title>(?P<title>.+?)\|', webpage, 'title') -        video_description = self._html_search_regex( -            r'<div class="entry-content">(?P<description>.+?)</div>', -            webpage, 'description', flags=re.DOTALL, fatal=False) -        video_thumbnail = self._og_search_thumbnail(webpage) - -        return { -            '_type': 'url_transparent', -            'display_id': display_id, -            'title': video_title, -            'description': video_description, -            'upload_date': video_date, -            'thumbnail': video_thumbnail, -            'url': playerdata_url, -        } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a0bb3d4c2..b6f4ccc5d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -124,7 +124,6 @@ from .chirbit import (      ChirbitProfileIE,  )  from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE  from .cliprs import ClipRsIE  from .clipfish import ClipfishIE  from .cliphunter import CliphunterIE @@ -834,7 +833,6 @@ from .twitch import (      TwitchVodIE,      TwitchProfileIE,      TwitchPastBroadcastsIE, -    TwitchBookmarksIE,      TwitchStreamIE,  )  from .twitter import ( @@ -852,7 +850,10 @@ from .unistra import UnistraIE  from .urort import UrortIE  from .usatoday import USATodayIE  from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE +from .ustudio import ( +    UstudioIE, +    UstudioEmbedIE, +)  from .varzesh3 import Varzesh3IE  from .vbox7 import Vbox7IE  from .veehd import VeeHDIE diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 8bed8ccd0..3a2b7cec5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,10 +1,10 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import ( +    mimetype2ext,      qualities,  ) @@ -12,9 +12,9 @@ from ..utils import (  class ImdbIE(InfoExtractor):      IE_NAME = 'imdb'      IE_DESC = 'Internet Movie Database trailers' -    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.imdb.com/video/imdb/vi2524815897',          'info_dict': {              'id': '2524815897', @@ -22,7 +22,10 @@ class ImdbIE(InfoExtractor):              'title': 'Ice Age: Continental 
Drift Trailer (No. 2) - IMDb',              'description': 'md5:9061c2219254e5d14e03c25c98e96a81',          } -    } +    }, { +        'url': 'http://www.imdb.com/video/_/vi2524815897', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -48,13 +51,27 @@ class ImdbIE(InfoExtractor):              json_data = self._search_regex(                  r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',                  format_page, 'json data', flags=re.DOTALL) -            info = json.loads(json_data) -            format_info = info['videoPlayerObject']['video'] -            f_id = format_info['ffname'] +            info = self._parse_json(json_data, video_id, fatal=False) +            if not info: +                continue +            format_info = info.get('videoPlayerObject', {}).get('video', {}) +            if not format_info: +                continue +            video_info_list = format_info.get('videoInfoList') +            if not video_info_list or not isinstance(video_info_list, list): +                continue +            video_info = video_info_list[0] +            if not video_info or not isinstance(video_info, dict): +                continue +            video_url = video_info.get('videoUrl') +            if not video_url: +                continue +            format_id = format_info.get('ffname')              formats.append({ -                'format_id': f_id, -                'url': format_info['videoInfoList'][0]['videoUrl'], -                'quality': quality(f_id), +                'format_id': format_id, +                'url': video_url, +                'ext': mimetype2ext(video_info.get('videoMimeType')), +                'quality': quality(format_id),              })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index a14d176a5..9fbc74f5d 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -11,7 +11,7 @@ class MGTVIE(InfoExtractor):      _TEST = {          'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', -        'md5': '', +        'md5': '1bdadcf760a0b90946ca68ee9a2db41a',          'info_dict': {              'id': '3116640',              'ext': 'mp4', @@ -20,15 +20,6 @@ class MGTVIE(InfoExtractor):              'duration': 7461,              'thumbnail': 're:^https?://.*\.jpg$',          }, -        'params': { -            'skip_download': True,  # m3u8 download -        }, -    } - -    _FORMAT_MAP = { -        '标清': ('Standard', 0), -        '高清': ('High', 1), -        '超清': ('SuperHigh', 2),      }      def _real_extract(self, url): @@ -40,17 +31,27 @@ class MGTVIE(InfoExtractor):          formats = []          for idx, stream in enumerate(api_data['stream']): -            format_name = stream.get('name') -            format_id, preference = self._FORMAT_MAP.get(format_name, (None, None)) -            format_info = self._download_json( -                stream['url'], video_id, -                note='Download video info for format %s' % format_id or '#%d' % idx) -            formats.append({ -                'format_id': format_id, -                'url': format_info['info'], -                'ext': 'mp4',  # These are m3u8 playlists -                'preference': preference, -            }) +            stream_url = stream.get('url') +            if not stream_url: +                continue +            tbr = int_or_none(self._search_regex( +                r'(\d+)\.mp4', stream_url, 
'tbr', default=None)) + +            def extract_format(stream_url, format_id, idx, query={}): +                format_info = self._download_json( +                    stream_url, video_id, +                    note='Download video info for format %s' % format_id or '#%d' % idx, query=query) +                return { +                    'format_id': format_id, +                    'url': format_info['info'], +                    'ext': 'mp4', +                    'tbr': tbr, +                } + +            formats.append(extract_format( +                stream_url, 'hls-%d' % tbr if tbr else None, idx * 2)) +            formats.append(extract_format(stream_url.replace( +                '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031}))          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 51dfc27ac..7532f40c1 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,91 +4,217 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urlparse, -    compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote  from ..utils import ( -    determine_ext,      ExtractorError, -    float_or_none, +    int_or_none, +    parse_age_limit,      parse_duration, -    unified_strdate,  ) -class NRKIE(InfoExtractor): -    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' - -    _TESTS = [ -        { -            'url': 'http://www.nrk.no/video/PS*150533', -            # MD5 is unstable -            'info_dict': { -                'id': '150533', -                'ext': 'flv', -                'title': 'Dompap og andre fugler i Piip-Show', -                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', -                'duration': 263, -            } -        }, -        { -            'url': 'http://www.nrk.no/video/PS*154915', -            # MD5 is unstable -            'info_dict': { -                'id': '154915', -                'ext': 'flv', -                'title': 'Slik høres internett ut når du er blind', -                'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', -                'duration': 20, -            } -        }, -    ] +class NRKBaseIE(InfoExtractor): +    def _extract_formats(self, manifest_url, video_id, fatal=True): +        formats = [] +        formats.extend(self._extract_f4m_formats( +            manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', +            video_id, f4m_id='hds', fatal=fatal)) +        formats.extend(self._extract_m3u8_formats(manifest_url.replace( +            'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), +            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) +        return formats      def _real_extract(self, url):          video_id = self._match_id(url)          data = self._download_json( -            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, -            video_id, 'Downloading media JSON') +            'http://%s/mediaelement/%s' % (self._API_HOST, video_id), +            video_id, 'Downloading mediaelement JSON') + +        title = data.get('fullTitle') or data.get('mainTitle') or data['title'] +        video_id = data.get('id') or video_id + +        entries = [] + +        media_assets = data.get('mediaAssets') +        if media_assets and isinstance(media_assets, list): +            def video_id_and_title(idx): 
+                return ((video_id, title) if len(media_assets) == 1 +                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) +            for num, asset in enumerate(media_assets, 1): +                asset_url = asset.get('url') +                if not asset_url: +                    continue +                formats = self._extract_formats(asset_url, video_id, fatal=False) +                if not formats: +                    continue +                self._sort_formats(formats) +                entry_id, entry_title = video_id_and_title(num) +                duration = parse_duration(asset.get('duration')) +                subtitles = {} +                for subtitle in ('webVtt', 'timedText'): +                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) +                    if subtitle_url: +                        subtitles.setdefault('no', []).append({'url': subtitle_url}) +                entries.append({ +                    'id': asset.get('carrierId') or entry_id, +                    'title': entry_title, +                    'duration': duration, +                    'subtitles': subtitles, +                    'formats': formats, +                }) -        media_url = data.get('mediaUrl') +        if not entries: +            media_url = data.get('mediaUrl') +            if media_url: +                formats = self._extract_formats(media_url, video_id) +                self._sort_formats(formats) +                duration = parse_duration(data.get('duration')) +                entries = [{ +                    'id': video_id, +                    'title': title, +                    'duration': duration, +                    'formats': formats, +                }] -        if not media_url: -            if data['usageRights']['isGeoBlocked']: +        if not entries: +            if data.get('usageRights', {}).get('isGeoBlocked'):                  raise ExtractorError(                      'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',                      expected=True) -        if determine_ext(media_url) == 'f4m': -            formats = self._extract_f4m_formats( -                media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') -            self._sort_formats(formats) -        else: -            formats = [{ -                'url': media_url, -                'ext': 'flv', -            }] - -        duration = parse_duration(data.get('duration')) +        conviva = data.get('convivaStatistics') or {} +        series = conviva.get('seriesName') or data.get('seriesTitle') +        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') +        thumbnails = None          images = data.get('images') -        if images: -            thumbnails = images['webImages'] -            thumbnails.sort(key=lambda image: image['pixelWidth']) -            thumbnail = thumbnails[-1]['imageUrl'] -        else: -            thumbnail = None - -        return { -            'id': video_id, -            'title': data['title'], -            'description': data['description'], -            'duration': duration, -            'thumbnail': thumbnail, -            'formats': formats, +        if images and isinstance(images, dict): +            web_images = images.get('webImages') +            if isinstance(web_images, list): +                thumbnails = [{ +                    'url': image['imageUrl'], +                    'width': int_or_none(image.get('width')), +                
    'height': int_or_none(image.get('height')), +                } for image in web_images if image.get('imageUrl')] + +        description = data.get('description') + +        common_info = { +            'description': description, +            'series': series, +            'episode': episode, +            'age_limit': parse_age_limit(data.get('legalAge')), +            'thumbnails': thumbnails,          } +        vcodec = 'none' if data.get('mediaType') == 'Audio' else None + +        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged + +        for entry in entries: +            entry.update(common_info) +            for f in entry['formats']: +                f['vcodec'] = vcodec + +        return self.playlist_result(entries, video_id, title, description) + + +class NRKIE(NRKBaseIE): +    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' +    _API_HOST = 'v8.psapi.nrk.no' +    _TESTS = [{ +        # video +        'url': 'http://www.nrk.no/video/PS*150533', +        'md5': '2f7f6eeb2aacdd99885f355428715cfa', +        'info_dict': { +            'id': '150533', +            'ext': 'mp4', +            'title': 'Dompap og andre fugler i Piip-Show', +            'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', +            'duration': 263, +        } +    }, { +        # audio +        'url': 'http://www.nrk.no/video/PS*154915', +        # MD5 is unstable +        'info_dict': { +            'id': '154915', +            'ext': 'flv', +            'title': 'Slik høres internett ut når du er blind', +            'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', +            'duration': 20, +        } +    }] + + +class NRKTVIE(NRKBaseIE): +    IE_DESC = 'NRK TV and NRK Radio' +    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
+    _API_HOST = 'psapi-we.nrk.no' + +    _TESTS = [{ +        'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', +        'md5': '4e9ca6629f09e588ed240fb11619922a', +        'info_dict': { +            'id': 'MUHH48000314AA', +            'ext': 'mp4', +            'title': '20 spørsmål 23.05.2014', +            'description': 'md5:bdea103bc35494c143c6a9acdd84887a', +            'duration': 1741.52, +        }, +    }, { +        'url': 'https://tv.nrk.no/program/mdfp15000514', +        'md5': '43d0be26663d380603a9cf0c24366531', +        'info_dict': { +            'id': 'MDFP15000514CA', +            'ext': 'mp4', +            'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', +            'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', +            'duration': 4605.08, +        }, +    }, { +        # single playlist video +        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', +        'md5': 'adbd1dbd813edaf532b0a253780719c2', +        'info_dict': { +            'id': 'MSPO40010515-part2', +            'ext': 'flv', +            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', +            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +        }, +        'skip': 'Only works from Norway', +    }, { +        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', +        'playlist': [{ +            'md5': '9480285eff92d64f06e02a5367970a7a', +            'info_dict': { +                'id': 'MSPO40010515-part1', +                'ext': 'flv', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            }, +        }, { +            'md5': 'adbd1dbd813edaf532b0a253780719c2', +            'info_dict': { +                'id': 'MSPO40010515-part2', +                'ext': 'flv', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            }, +        }], +        'info_dict': { +            'id': 'MSPO40010515', +            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', +            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +            'duration': 6947.52, +        }, +        'skip': 'Only works from Norway', +    }, { +        'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', +        'only_matching': True, +    }] +  class NRKPlaylistIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' @@ -159,179 +285,3 @@ class NRKSkoleIE(InfoExtractor):          nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')          return self.url_result('nrk:%s' % nrk_id) - - -class NRKTVIE(InfoExtractor): -    IE_DESC = 'NRK TV and NRK Radio' -    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
- -    _TESTS = [ -        { -            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', -            'info_dict': { -                'id': 'MUHH48000314', -                'ext': 'mp4', -                'title': '20 spørsmål', -                'description': 'md5:bdea103bc35494c143c6a9acdd84887a', -                'upload_date': '20140523', -                'duration': 1741.52, -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            'url': 'https://tv.nrk.no/program/mdfp15000514', -            'info_dict': { -                'id': 'mdfp15000514', -                'ext': 'mp4', -                'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', -                'description': 'md5:654c12511f035aed1e42bdf5db3b206a', -                'upload_date': '20140524', -                'duration': 4605.08, -            }, -            'params': { -                # m3u8 download -                'skip_download': True, -            }, -        }, -        { -            # single playlist video -            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', -            'md5': 'adbd1dbd813edaf532b0a253780719c2', -            'info_dict': { -                'id': 'MSPO40010515-part2', -                'ext': 'flv', -                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', -                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                'upload_date': '20150106', -            }, -            'skip': 'Only works from Norway', -        }, -        { -            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', -            'playlist': [ -                { -                    'md5': '9480285eff92d64f06e02a5367970a7a', -                    'info_dict': { -                        'id': 'MSPO40010515-part1', -                        'ext': 'flv', -                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', -                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                        'upload_date': '20150106', -                    }, -                }, -                { -                    'md5': 'adbd1dbd813edaf532b0a253780719c2', -                    'info_dict': { -                        'id': 'MSPO40010515-part2', -                        'ext': 'flv', -                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', -                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                        'upload_date': '20150106', -                    }, -                }, -            ], -            'info_dict': { -                'id': 'MSPO40010515', -                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', -                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', -                'upload_date': '20150106', -                'duration': 6947.5199999999995, -            }, -            'skip': 'Only works from Norway', -        }, -        { -            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', -            'only_matching': True, -        } -    ] - -    def _extract_f4m(self, manifest_url, video_id): -        return self._extract_f4m_formats( -            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') - -    def 
_real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        part_id = mobj.group('part_id') -        base_url = mobj.group('baseurl') - -        webpage = self._download_webpage(url, video_id) - -        title = self._html_search_meta( -            'title', webpage, 'title') -        description = self._html_search_meta( -            'description', webpage, 'description') - -        thumbnail = self._html_search_regex( -            r'data-posterimage="([^"]+)"', -            webpage, 'thumbnail', fatal=False) -        upload_date = unified_strdate(self._html_search_meta( -            'rightsfrom', webpage, 'upload date', fatal=False)) -        duration = float_or_none(self._html_search_regex( -            r'data-duration="([^"]+)"', -            webpage, 'duration', fatal=False)) - -        # playlist -        parts = re.findall( -            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) -        if parts: -            entries = [] -            for current_part_id, stream_url, part_title in parts: -                if part_id and current_part_id != part_id: -                    continue -                video_part_id = '%s-part%s' % (video_id, current_part_id) -                formats = self._extract_f4m(stream_url, video_part_id) -                entries.append({ -                    'id': video_part_id, -                    'title': part_title, -                    'description': description, -                    'thumbnail': thumbnail, -                    'upload_date': upload_date, -                    'formats': formats, -                }) -            if part_id: -                if entries: -                    return entries[0] -            else: -                playlist = self.playlist_result(entries, video_id, title, description) -                playlist.update({ -                    'thumbnail': thumbnail, -                    'upload_date': upload_date, -                    'duration': duration, -                }) -                return playlist - -        formats = [] - -        f4m_url = re.search(r'data-media="([^"]+)"', webpage) -        if f4m_url: -            formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - -        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) -        if m3u8_url: -            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls')) -        self._sort_formats(formats) - -        subtitles_url = self._html_search_regex( -            r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1', -            webpage, 'subtitle URL', default=None, group='url') -        subtitles = {} -        if subtitles_url: -            subtitles['no'] = [{ -                'ext': 'ttml', -                'url': compat_urlparse.urljoin(base_url, subtitles_url), -            }] - -        return { -            'id': video_id, -            'title': title, -            'description': description, -            'thumbnail': thumbnail, -            'upload_date': upload_date, -            'duration': duration, -            'formats': formats, -            'subtitles': subtitles, -        } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 456561bcc..5049b870e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -100,7 +100,7 @@ class OpenloadIE(InfoExtractor):              raise ExtractorError('File not found', expected=True)          code = self._search_regex( 
-            r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', +            r'</video>\s*</div>\s*<script[^>]+>([^<]+)</script>',              webpage, 'JS code')          decoded = self.openload_decode(code) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index d03f1b1d4..8fc66732a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,28 +4,35 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import sanitized_Request +from ..utils import ( +    HEADRequest, +    ExtractorError, +    int_or_none, +    update_url_query, +    qualities, +    get_element_by_attribute, +    clean_html, +)  class SinaIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/ -                        ( -                            (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-)))) -                            | +    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ +                        (?: +                            (?:view/|.*\#)(?P<video_id>\d+)| +                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|                              # This is used by external sites like Weibo -                            (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf) +                            api/sinawebApi/outplay.php/(?P<token>.+?)\.swf                          )                    '''      _TESTS = [          { -            'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', -            'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', +            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', +            'md5': 'd38433e2fc886007729735650ae4b3e9',              'info_dict': { -                'id': '110028898', -                'ext': 'flv', -                'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', +                'id': '250576622', +                'ext': 'mp4', +                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',              }          },          { @@ -35,37 +42,74 @@ class SinaIE(InfoExtractor):                  'ext': 'flv',                  'title': '军方提高对朝情报监视级别',              }, +            'skip': 'the page does not exist or has been deleted', +        }, +        { +            'url': 'http://video.sina.com.cn/view/250587748.html', +            'md5': '3d1807a25c775092aab3bc157fff49b4', +            'info_dict': { +                'id': '250587748', +                'ext': 'mp4', +                'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', +            },          },      ] -    def _extract_video(self, video_id): -        data = compat_urllib_parse_urlencode({'vid': video_id}) -        url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, -                                     video_id, 'Downloading video url') -        image_page = self._download_webpage( -            'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, -            video_id, 'Downloading thumbnail info') - -        return {'id': video_id, -                'url': url_doc.find('./durl/url').text, -                'ext': 'flv', -                'title': url_doc.find('./vname').text, -                'thumbnail': image_page.split('=')[1], -                } -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        if mobj.group('token') is 
not None: -            # The video id is in the redirected url -            self.to_screen('Getting video id') -            request = sanitized_Request(url) -            request.get_method = lambda: 'HEAD' -            (_, urlh) = self._download_webpage_handle(request, 'NA', False) -            return self._real_extract(urlh.geturl()) -        elif video_id is None: -            pseudo_id = mobj.group('pseudo_id') -            webpage = self._download_webpage(url, pseudo_id) -            video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') -        return self._extract_video(video_id) +        video_id = mobj.group('video_id') +        if not video_id: +            if mobj.group('token') is not None: +                # The video id is in the redirected url +                self.to_screen('Getting video id') +                request = HEADRequest(url) +                (_, urlh) = self._download_webpage_handle(request, 'NA', False) +                return self._real_extract(urlh.geturl()) +            else: +                pseudo_id = mobj.group('pseudo_id') +                webpage = self._download_webpage(url, pseudo_id) +                error = get_element_by_attribute('class', 'errtitle', webpage) +                if error: +                    raise ExtractorError('%s said: %s' % ( +                        self.IE_NAME, clean_html(error)), expected=True) +                video_id = self._search_regex( +                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id') + +        video_data = self._download_json( +            'http://s.video.sina.com.cn/video/h5play', +            video_id, query={'video_id': video_id}) +        if video_data['code'] != 1: +            raise ExtractorError('%s said: %s' % ( +                self.IE_NAME, video_data['message']), expected=True) +        else: +            video_data = video_data['data'] +            title = video_data['title'] +            description = video_data.get('description') +            if description: +                description = description.strip() + +            preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) +            formats = [] +            for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): +                file_api = quality.get('file_api') +                file_id = quality.get('file_id') +                if not file_api or not file_id: +                    continue +                formats.append({ +                    'format_id': quality_id, +                    'url': update_url_query(file_api, {'vid': file_id}), +                    'preference': preference(quality_id), +                    'ext': 'mp4', +                }) +            self._sort_formats(formats) + +            return { +                'id': video_id, +                'title': title, +                'description': description, +                'thumbnail': video_data.get('image'), +                'duration': int_or_none(video_data.get('length')), +                'timestamp': int_or_none(video_data.get('create_time')), +                'formats': formats, +            } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 36ee1adff..f7b98e190 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -171,6 +171,7 @@ class TwitchVideoIE(TwitchItemBaseIE):              'title': 'Worlds Semifinals - Star Horn Royal Club vs. 
OMG',          },          'playlist_mincount': 12, +        'skip': 'HTTP Error 404: Not Found',      } @@ -187,6 +188,7 @@ class TwitchChapterIE(TwitchItemBaseIE):              'title': 'ACRL Off Season - Sports Cars @ Nordschleife',          },          'playlist_mincount': 3, +        'skip': 'HTTP Error 404: Not Found',      }, {          'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',          'only_matching': True, @@ -355,31 +357,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):      } -class TwitchBookmarksIE(TwitchPlaylistBaseIE): -    IE_NAME = 'twitch:bookmarks' -    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE -    _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE -    _PLAYLIST_TYPE = 'bookmarks' - -    _TEST = { -        'url': 'http://www.twitch.tv/ognos/profile/bookmarks', -        'info_dict': { -            'id': 'ognos', -            'title': 'Ognos', -        }, -        'playlist_mincount': 3, -    } - -    def _extract_playlist_page(self, response): -        entries = [] -        for bookmark in response.get('bookmarks', []): -            video = bookmark.get('video') -            if not video: -                continue -            entries.append(video['url']) -        return entries - -  class TwitchStreamIE(TwitchBaseIE):      IE_NAME = 'twitch:stream'      _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index cafc082b6..3484a2046 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -6,10 +6,12 @@ from .common import InfoExtractor  from ..utils import (      int_or_none,      unified_strdate, +    unescapeHTML,  )  class UstudioIE(InfoExtractor): +    IE_NAME = 'ustudio'      _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'      _TEST = {          'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', @@ -27,9 +29,7 @@ class UstudioIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        display_id = mobj.group('display_id') +        video_id, display_id = re.match(self._VALID_URL, url).groups()          config = self._download_xml(              'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, @@ -37,7 +37,7 @@ class UstudioIE(InfoExtractor):          def extract(kind):              return [{ -                'url': item.attrib['url'], +                'url': unescapeHTML(item.attrib['url']),                  'width': int_or_none(item.get('width')),                  'height': int_or_none(item.get('height')),              } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] @@ -65,3 +65,61 @@ class UstudioIE(InfoExtractor):              'uploader': uploader,              'formats': formats,          } + + +class UstudioEmbedIE(InfoExtractor): +    IE_NAME = 'ustudio:embed' +    _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' +    _TEST = { +        'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', +        'md5': '47c0be52a09b23a7f40de9469cec58f4', +        'info_dict': { +            'id': 'Uw7G1kMCe65T', +            'ext': 'mp4', +            'title': '5 Things IT Should Know About Video', +            'description': 
'md5:93d32650884b500115e158c5677d25ad', +            'uploader_id': 'DeN7VdYRDKhP', +        } +    } + +    def _real_extract(self, url): +        uploader_id, video_id = re.match(self._VALID_URL, url).groups() +        video_data = self._download_json( +            'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), +            video_id)['videos'][0] +        title = video_data['name'] + +        formats = [] +        for ext, qualities in video_data.get('transcodes', {}).items(): +            for quality in qualities: +                quality_url = quality.get('url') +                if not quality_url: +                    continue +                height = int_or_none(quality.get('height')) +                formats.append({ +                    'format_id': '%s-%dp' % (ext, height) if height else ext, +                    'url': quality_url, +                    'width': int_or_none(quality.get('width')), +                    'height': height, +                }) +        self._sort_formats(formats) + +        thumbnails = [] +        for image in video_data.get('images', []): +            image_url = image.get('url') +            if not image_url: +                continue +            thumbnails.append({ +                'url': image_url, +            }) + +        return { +            'id': video_id, +            'title': title, +            'description': video_data.get('description'), +            'duration': int_or_none(video_data.get('duration')), +            'uploader_id': uploader_id, +            'tags': video_data.get('keywords'), +            'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6592c8ec2..d6f94f8cd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -50,6 +50,7 @@ from .compat import (      compat_urllib_parse,      compat_urllib_parse_urlencode,      compat_urllib_parse_urlparse, +    compat_urllib_parse_unquote_plus,      compat_urllib_request,      compat_urlparse,      compat_xpath, @@ -886,7 +887,8 @@ def make_socks_conn_class(base_class, socks_proxy):          socks_type,          url_components.hostname, url_components.port or 1080,          True,  # Remote DNS -        url_components.username, url_components.password +        compat_urllib_parse_unquote_plus(url_components.username), +        compat_urllib_parse_unquote_plus(url_components.password),      )      class SocksConnection(base_class): | 
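
A minimal illustration (not part of the patch) of the final utils.py hunk: the SOCKS proxy username and password are now run through compat_urllib_parse_unquote_plus, so credentials containing reserved characters can be percent-encoded in the proxy URL. The proxy address and credentials below are hypothetical; the sketch uses the Python 3 standard-library equivalents of the compat helpers.

    # Decoding step equivalent to the one added in make_socks_conn_class:
    # urlparse leaves the userinfo percent-encoded, unquote_plus restores it.
    from urllib.parse import unquote_plus, urlparse

    parts = urlparse('socks5://user:p%40ss%25word@127.0.0.1:1080')  # hypothetical proxy URL
    print(unquote_plus(parts.username), unquote_plus(parts.password))
    # prints: user p@ss%word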
